diff --git a/.gitattributes b/.gitattributes index 0cf8d3352a97171461b10ac4e720588f18c440a3..679e2b483ed96d522c57cea4e5c99211135ce407 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2440,3 +2440,12 @@ Qwen2-7B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lor Qwen2-7B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-10000/checkpoint-714/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-10000/checkpoint-816/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-926-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d87ec1347107e335c324ad5fb5c049217911e3f6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825fda2e0f779accd405aa421ad4f67319e2d7a0b9107c0e455fda1229924f5b +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d674ff2343fd56068148b18f680580381d6c123c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23ccd16c59a2788dfc5448614fd05f1eb2aec624e1d9fb6cd9ae959c4279163d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..35d425c42978b1d11034815fbbddf2a6072703a9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6127cfb60d3be961606285a1c225a0023ef3688656b6440241a5751ea3abf6a4 +size 55532922 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fc2c5dd28f06a5da0a1e2d5cd0660ae1634c21cc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b274eb874b6911a07b62158d2091c3c185cd0a59c55ee659794926bf44e76c0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b218728bced1051aa39ec2c6d9641bde92a9e369 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2421659337795e065a3b66ba5d3ac62bc25221896bc42738ea478bd7d9b952be +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ebdbb2b853f49469472e4cca77704ba06a126560 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/trainer_state.json @@ -0,0 +1,78077 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 111400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + }, + { + "epoch": 3.000359066427289, + "grad_norm": 0.8630077838897705, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 41780 + }, + { + "epoch": 3.001077199281867, + "grad_norm": 0.8901806473731995, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 41790 + }, + { + "epoch": 3.0017953321364454, + "grad_norm": 0.8291767835617065, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 41800 + }, + { + "epoch": 3.0025134649910235, + "grad_norm": 0.792519211769104, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 3.0032315978456015, + "grad_norm": 1.1330063343048096, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 41820 + }, + { + "epoch": 3.0039497307001795, + "grad_norm": 0.9401350617408752, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 41830 + }, + { + "epoch": 3.0046678635547575, + "grad_norm": 0.8065463304519653, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 41840 + }, + { + "epoch": 3.005385996409336, + "grad_norm": 0.8309979438781738, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 41850 + }, + { + "epoch": 3.006104129263914, + "grad_norm": 0.7432689070701599, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 41860 + }, + { + "epoch": 3.006822262118492, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 41870 + }, + { + "epoch": 3.00754039497307, + "grad_norm": 1.4364255666732788, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 41880 + }, + { + "epoch": 3.008258527827648, + "grad_norm": 0.9023072123527527, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 41890 + }, + { + "epoch": 3.0089766606822264, + "grad_norm": 0.7790587544441223, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 41900 + }, + { + "epoch": 3.0096947935368044, + "grad_norm": 0.9163706302642822, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 41910 + }, + { + "epoch": 3.0104129263913824, + "grad_norm": 0.8147963285446167, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 41920 + }, + { + "epoch": 3.0111310592459604, + "grad_norm": 0.8432748913764954, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 41930 + }, + { + "epoch": 3.011849192100539, + "grad_norm": 0.9216182231903076, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 41940 + }, + { + "epoch": 3.012567324955117, + "grad_norm": 0.62154221534729, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 41950 + }, + { + "epoch": 3.013285457809695, + "grad_norm": 0.8902392387390137, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 41960 + }, + { + "epoch": 3.014003590664273, + "grad_norm": 0.9601083993911743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 41970 + }, + { + "epoch": 3.014721723518851, + "grad_norm": 0.8938809037208557, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 41980 + }, + { + "epoch": 3.0154398563734293, + "grad_norm": 1.0621999502182007, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 41990 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 0.7310585379600525, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 42000 + }, + { + "epoch": 3.0168761220825853, + "grad_norm": 0.8475853800773621, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 42010 + }, + { + "epoch": 3.0175942549371633, + "grad_norm": 0.8509864807128906, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 42020 + }, + { + "epoch": 3.0183123877917413, + "grad_norm": 0.7461876273155212, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 42030 + }, + { + "epoch": 3.0190305206463197, + "grad_norm": 0.7734265327453613, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 42040 + }, + { + "epoch": 3.0197486535008977, + "grad_norm": 0.9056455492973328, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 42050 + }, + { + "epoch": 3.0204667863554757, + "grad_norm": 0.9183889031410217, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 42060 + }, + { + "epoch": 3.0211849192100537, + "grad_norm": 1.0777326822280884, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 42070 + }, + { + "epoch": 3.021903052064632, + "grad_norm": 0.9217308163642883, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 42080 + }, + { + "epoch": 3.02262118491921, + "grad_norm": 0.8220202326774597, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42090 + }, + { + "epoch": 3.023339317773788, + "grad_norm": 0.8454978466033936, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 42100 + }, + { + "epoch": 3.024057450628366, + "grad_norm": 0.8116370439529419, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 42110 + }, + { + "epoch": 3.024775583482944, + "grad_norm": 0.8064935207366943, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 42120 + }, + { + "epoch": 3.0254937163375226, + "grad_norm": 0.9718650579452515, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 42130 + }, + { + "epoch": 3.0262118491921006, + "grad_norm": 0.8817588090896606, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 42140 + }, + { + "epoch": 3.0269299820466786, + "grad_norm": 0.7757318615913391, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 42150 + }, + { + "epoch": 3.0276481149012566, + "grad_norm": 0.7500545382499695, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 42160 + }, + { + "epoch": 3.0283662477558346, + "grad_norm": 0.72913658618927, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 42170 + }, + { + "epoch": 3.029084380610413, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 42180 + }, + { + "epoch": 3.029802513464991, + "grad_norm": 0.7682021856307983, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 42190 + }, + { + "epoch": 3.030520646319569, + "grad_norm": 0.8145958781242371, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 42200 + }, + { + "epoch": 3.031238779174147, + "grad_norm": 1.0546396970748901, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 42210 + }, + { + "epoch": 3.0319569120287255, + "grad_norm": 0.8222804665565491, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 42220 + }, + { + "epoch": 3.0326750448833035, + "grad_norm": 0.8245829343795776, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 42230 + }, + { + "epoch": 3.0333931777378815, + "grad_norm": 0.9059963822364807, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 42240 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 1.026747465133667, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 42250 + }, + { + "epoch": 3.0348294434470375, + "grad_norm": 0.9108404517173767, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42260 + }, + { + "epoch": 3.035547576301616, + "grad_norm": 0.9828516840934753, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 42270 + }, + { + "epoch": 3.036265709156194, + "grad_norm": 0.9664266705513, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 42280 + }, + { + "epoch": 3.036983842010772, + "grad_norm": 0.7577654719352722, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42290 + }, + { + "epoch": 3.03770197486535, + "grad_norm": 0.8331853151321411, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 42300 + }, + { + "epoch": 3.038420107719928, + "grad_norm": 0.8017228245735168, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 42310 + }, + { + "epoch": 3.0391382405745064, + "grad_norm": 1.0316718816757202, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 42320 + }, + { + "epoch": 3.0398563734290844, + "grad_norm": 0.9379803538322449, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 42330 + }, + { + "epoch": 3.0405745062836624, + "grad_norm": 0.7554476857185364, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 42340 + }, + { + "epoch": 3.0412926391382404, + "grad_norm": 0.7377917766571045, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 42350 + }, + { + "epoch": 3.042010771992819, + "grad_norm": 1.0655276775360107, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 42360 + }, + { + "epoch": 3.042728904847397, + "grad_norm": 0.7748511433601379, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 42370 + }, + { + "epoch": 3.043447037701975, + "grad_norm": 0.848649799823761, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 42380 + }, + { + "epoch": 3.044165170556553, + "grad_norm": 0.7754636406898499, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 42390 + }, + { + "epoch": 3.044883303411131, + "grad_norm": 0.8173656463623047, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 42400 + }, + { + "epoch": 3.0456014362657093, + "grad_norm": 0.7881983518600464, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 42410 + }, + { + "epoch": 3.0463195691202873, + "grad_norm": 0.971072256565094, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 42420 + }, + { + "epoch": 3.0470377019748653, + "grad_norm": 0.8400143384933472, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 42430 + }, + { + "epoch": 3.0477558348294433, + "grad_norm": 1.0028647184371948, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 42440 + }, + { + "epoch": 3.0484739676840213, + "grad_norm": 0.9728034734725952, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 42450 + }, + { + "epoch": 3.0491921005386, + "grad_norm": 0.937633752822876, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 42460 + }, + { + "epoch": 3.049910233393178, + "grad_norm": 1.0265642404556274, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 42470 + }, + { + "epoch": 3.050628366247756, + "grad_norm": 0.9733216762542725, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 42480 + }, + { + "epoch": 3.051346499102334, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 42490 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 0.7515231370925903, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 42500 + }, + { + "epoch": 3.0527827648114902, + "grad_norm": 0.9115300178527832, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 42510 + }, + { + "epoch": 3.0535008976660682, + "grad_norm": 0.7403655648231506, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 42520 + }, + { + "epoch": 3.0542190305206462, + "grad_norm": 0.7826810479164124, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 42530 + }, + { + "epoch": 3.0549371633752243, + "grad_norm": 0.8007349371910095, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 42540 + }, + { + "epoch": 3.0556552962298027, + "grad_norm": 0.7975959777832031, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 42550 + }, + { + "epoch": 3.0563734290843807, + "grad_norm": 0.9665228128433228, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42560 + }, + { + "epoch": 3.0570915619389587, + "grad_norm": 0.8386123180389404, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 42570 + }, + { + "epoch": 3.0578096947935367, + "grad_norm": 0.7437782287597656, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 42580 + }, + { + "epoch": 3.0585278276481147, + "grad_norm": 0.8360698223114014, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 42590 + }, + { + "epoch": 3.059245960502693, + "grad_norm": 0.8982073664665222, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42600 + }, + { + "epoch": 3.059964093357271, + "grad_norm": 0.9425758719444275, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 42610 + }, + { + "epoch": 3.060682226211849, + "grad_norm": 0.8567131161689758, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42620 + }, + { + "epoch": 3.061400359066427, + "grad_norm": 0.9322942495346069, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 42630 + }, + { + "epoch": 3.0621184919210056, + "grad_norm": 0.8283235430717468, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 42640 + }, + { + "epoch": 3.0628366247755836, + "grad_norm": 0.8457967638969421, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 42650 + }, + { + "epoch": 3.0635547576301616, + "grad_norm": 0.8205100893974304, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42660 + }, + { + "epoch": 3.0642728904847396, + "grad_norm": 0.8385181427001953, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 42670 + }, + { + "epoch": 3.0649910233393176, + "grad_norm": 1.2959390878677368, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 42680 + }, + { + "epoch": 3.065709156193896, + "grad_norm": 0.7150540351867676, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 42690 + }, + { + "epoch": 3.066427289048474, + "grad_norm": 0.6647360920906067, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 42700 + }, + { + "epoch": 3.067145421903052, + "grad_norm": 0.9148316979408264, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 42710 + }, + { + "epoch": 3.06786355475763, + "grad_norm": 0.8606209754943848, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 42720 + }, + { + "epoch": 3.068581687612208, + "grad_norm": 1.4255632162094116, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42730 + }, + { + "epoch": 3.0692998204667865, + "grad_norm": 0.9131710529327393, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 42740 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 0.9560360908508301, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 42750 + }, + { + "epoch": 3.0707360861759425, + "grad_norm": 0.9278100728988647, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42760 + }, + { + "epoch": 3.0714542190305205, + "grad_norm": 0.7258471846580505, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 42770 + }, + { + "epoch": 3.072172351885099, + "grad_norm": 1.1537690162658691, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 42780 + }, + { + "epoch": 3.072890484739677, + "grad_norm": 0.8562588691711426, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 42790 + }, + { + "epoch": 3.073608617594255, + "grad_norm": 1.0271626710891724, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 42800 + }, + { + "epoch": 3.074326750448833, + "grad_norm": 0.85148024559021, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 42810 + }, + { + "epoch": 3.075044883303411, + "grad_norm": 0.805772602558136, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 42820 + }, + { + "epoch": 3.0757630161579894, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 42830 + }, + { + "epoch": 3.0764811490125674, + "grad_norm": 0.7997274994850159, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 42840 + }, + { + "epoch": 3.0771992818671454, + "grad_norm": 0.8739321231842041, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 42850 + }, + { + "epoch": 3.0779174147217234, + "grad_norm": 0.833951473236084, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 42860 + }, + { + "epoch": 3.0786355475763014, + "grad_norm": 0.8813839554786682, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 42870 + }, + { + "epoch": 3.07935368043088, + "grad_norm": 0.9020521640777588, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 42880 + }, + { + "epoch": 3.080071813285458, + "grad_norm": 0.888148844242096, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 42890 + }, + { + "epoch": 3.080789946140036, + "grad_norm": 0.8110589385032654, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 42900 + }, + { + "epoch": 3.081508078994614, + "grad_norm": 0.818738579750061, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 42910 + }, + { + "epoch": 3.082226211849192, + "grad_norm": 0.9607479572296143, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 42920 + }, + { + "epoch": 3.0829443447037703, + "grad_norm": 0.8162698745727539, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 42930 + }, + { + "epoch": 3.0836624775583483, + "grad_norm": 0.8170801997184753, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 42940 + }, + { + "epoch": 3.0843806104129263, + "grad_norm": 0.9250763654708862, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 42950 + }, + { + "epoch": 3.0850987432675043, + "grad_norm": 0.898097813129425, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 42960 + }, + { + "epoch": 3.0858168761220828, + "grad_norm": 0.9398433566093445, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 42970 + }, + { + "epoch": 3.0865350089766608, + "grad_norm": 1.052808165550232, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 42980 + }, + { + "epoch": 3.087253141831239, + "grad_norm": 0.8974723219871521, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 42990 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 0.7517408728599548, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 43000 + }, + { + "epoch": 3.088689407540395, + "grad_norm": 0.8054485321044922, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 43010 + }, + { + "epoch": 3.0894075403949732, + "grad_norm": 0.9896154999732971, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 43020 + }, + { + "epoch": 3.0901256732495512, + "grad_norm": 0.7887356281280518, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 43030 + }, + { + "epoch": 3.0908438061041292, + "grad_norm": 1.0119125843048096, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 43040 + }, + { + "epoch": 3.0915619389587072, + "grad_norm": 0.8753892779350281, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 43050 + }, + { + "epoch": 3.0922800718132857, + "grad_norm": 0.8322654962539673, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43060 + }, + { + "epoch": 3.0929982046678637, + "grad_norm": 1.0605992078781128, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 43070 + }, + { + "epoch": 3.0937163375224417, + "grad_norm": 0.8783912062644958, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 43080 + }, + { + "epoch": 3.0944344703770197, + "grad_norm": 0.8839107751846313, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 43090 + }, + { + "epoch": 3.0951526032315977, + "grad_norm": 1.1655086278915405, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 43100 + }, + { + "epoch": 3.095870736086176, + "grad_norm": 0.7051523327827454, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 43110 + }, + { + "epoch": 3.096588868940754, + "grad_norm": 0.7793807983398438, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43120 + }, + { + "epoch": 3.097307001795332, + "grad_norm": 0.8352194428443909, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 43130 + }, + { + "epoch": 3.09802513464991, + "grad_norm": 0.9684847593307495, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 43140 + }, + { + "epoch": 3.098743267504488, + "grad_norm": 1.1106340885162354, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 43150 + }, + { + "epoch": 3.0994614003590666, + "grad_norm": 0.7814911603927612, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 43160 + }, + { + "epoch": 3.1001795332136446, + "grad_norm": 0.7923110723495483, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 43170 + }, + { + "epoch": 3.1008976660682226, + "grad_norm": 0.87022864818573, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 43180 + }, + { + "epoch": 3.1016157989228006, + "grad_norm": 0.9352855682373047, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 43190 + }, + { + "epoch": 3.1023339317773786, + "grad_norm": 0.8548445105552673, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 43200 + }, + { + "epoch": 3.103052064631957, + "grad_norm": 0.9576025009155273, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 43210 + }, + { + "epoch": 3.103770197486535, + "grad_norm": 0.7430430054664612, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 43220 + }, + { + "epoch": 3.104488330341113, + "grad_norm": 0.9619144797325134, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 43230 + }, + { + "epoch": 3.105206463195691, + "grad_norm": 0.8622338771820068, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 43240 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 0.853489339351654, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43250 + }, + { + "epoch": 3.1066427289048475, + "grad_norm": 0.9253206849098206, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 43260 + }, + { + "epoch": 3.1073608617594255, + "grad_norm": 0.9700671434402466, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 43270 + }, + { + "epoch": 3.1080789946140035, + "grad_norm": 1.0550731420516968, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 43280 + }, + { + "epoch": 3.1087971274685815, + "grad_norm": 0.939452052116394, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 43290 + }, + { + "epoch": 3.10951526032316, + "grad_norm": 0.8855276107788086, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 43300 + }, + { + "epoch": 3.110233393177738, + "grad_norm": 0.92197185754776, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 43310 + }, + { + "epoch": 3.110951526032316, + "grad_norm": 0.8825578689575195, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 43320 + }, + { + "epoch": 3.111669658886894, + "grad_norm": 0.9964608550071716, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 43330 + }, + { + "epoch": 3.1123877917414724, + "grad_norm": 0.9070520401000977, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 43340 + }, + { + "epoch": 3.1131059245960504, + "grad_norm": 0.9699633717536926, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 43350 + }, + { + "epoch": 3.1138240574506284, + "grad_norm": 0.7384091019630432, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 43360 + }, + { + "epoch": 3.1145421903052064, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 43370 + }, + { + "epoch": 3.1152603231597844, + "grad_norm": 0.8906524181365967, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 43380 + }, + { + "epoch": 3.115978456014363, + "grad_norm": 0.8850129246711731, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 43390 + }, + { + "epoch": 3.116696588868941, + "grad_norm": 0.7091860771179199, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 43400 + }, + { + "epoch": 3.117414721723519, + "grad_norm": 0.8992764949798584, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 43410 + }, + { + "epoch": 3.118132854578097, + "grad_norm": 0.9166698455810547, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43420 + }, + { + "epoch": 3.118850987432675, + "grad_norm": 1.1195749044418335, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 43430 + }, + { + "epoch": 3.1195691202872533, + "grad_norm": 0.9414069652557373, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 43440 + }, + { + "epoch": 3.1202872531418313, + "grad_norm": 0.7641217112541199, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 43450 + }, + { + "epoch": 3.1210053859964093, + "grad_norm": 1.2659285068511963, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 43460 + }, + { + "epoch": 3.1217235188509873, + "grad_norm": 0.9968213438987732, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 43470 + }, + { + "epoch": 3.1224416517055653, + "grad_norm": 0.8819042444229126, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 43480 + }, + { + "epoch": 3.1231597845601438, + "grad_norm": 0.9124775528907776, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 43490 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 0.868354082107544, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 3.1245960502692998, + "grad_norm": 0.7367526292800903, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 43510 + }, + { + "epoch": 3.1253141831238778, + "grad_norm": 0.7553679943084717, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43520 + }, + { + "epoch": 3.126032315978456, + "grad_norm": 0.7970008850097656, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 43530 + }, + { + "epoch": 3.126750448833034, + "grad_norm": 0.9117488861083984, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 43540 + }, + { + "epoch": 3.127468581687612, + "grad_norm": 0.8004103899002075, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 43550 + }, + { + "epoch": 3.12818671454219, + "grad_norm": 0.736518919467926, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 43560 + }, + { + "epoch": 3.128904847396768, + "grad_norm": 0.8568395376205444, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 43570 + }, + { + "epoch": 3.1296229802513467, + "grad_norm": 0.9344052672386169, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 43580 + }, + { + "epoch": 3.1303411131059247, + "grad_norm": 0.7986525297164917, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 43590 + }, + { + "epoch": 3.1310592459605027, + "grad_norm": 0.8283242583274841, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 43600 + }, + { + "epoch": 3.1317773788150807, + "grad_norm": 0.6534292101860046, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 43610 + }, + { + "epoch": 3.132495511669659, + "grad_norm": 0.9585428833961487, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 43620 + }, + { + "epoch": 3.133213644524237, + "grad_norm": 0.8299157023429871, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 43630 + }, + { + "epoch": 3.133931777378815, + "grad_norm": 0.9050052762031555, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 43640 + }, + { + "epoch": 3.134649910233393, + "grad_norm": 1.0457062721252441, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 43650 + }, + { + "epoch": 3.135368043087971, + "grad_norm": 0.907691240310669, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 43660 + }, + { + "epoch": 3.1360861759425496, + "grad_norm": 0.8868935108184814, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 43670 + }, + { + "epoch": 3.1368043087971276, + "grad_norm": 0.8585456609725952, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 43680 + }, + { + "epoch": 3.1375224416517056, + "grad_norm": 1.0402741432189941, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 43690 + }, + { + "epoch": 3.1382405745062836, + "grad_norm": 1.0866798162460327, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 43700 + }, + { + "epoch": 3.1389587073608616, + "grad_norm": 0.7637296915054321, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 43710 + }, + { + "epoch": 3.13967684021544, + "grad_norm": 0.755235493183136, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 43720 + }, + { + "epoch": 3.140394973070018, + "grad_norm": 0.7258853316307068, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 43730 + }, + { + "epoch": 3.141113105924596, + "grad_norm": 1.0425268411636353, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 43740 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 43750 + }, + { + "epoch": 3.142549371633752, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 43760 + }, + { + "epoch": 3.1432675044883305, + "grad_norm": 0.9879246354103088, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 43770 + }, + { + "epoch": 3.1439856373429085, + "grad_norm": 0.7853389382362366, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 43780 + }, + { + "epoch": 3.1447037701974865, + "grad_norm": 1.0245232582092285, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 43790 + }, + { + "epoch": 3.1454219030520645, + "grad_norm": 0.8486390113830566, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 43800 + }, + { + "epoch": 3.146140035906643, + "grad_norm": 0.8536406755447388, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 43810 + }, + { + "epoch": 3.146858168761221, + "grad_norm": 0.9653734564781189, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 43820 + }, + { + "epoch": 3.147576301615799, + "grad_norm": 0.8292608857154846, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 43830 + }, + { + "epoch": 3.148294434470377, + "grad_norm": 1.147524118423462, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 43840 + }, + { + "epoch": 3.149012567324955, + "grad_norm": 0.9317546486854553, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 43850 + }, + { + "epoch": 3.1497307001795334, + "grad_norm": 0.8651045560836792, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 43860 + }, + { + "epoch": 3.1504488330341114, + "grad_norm": 0.8718969225883484, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 43870 + }, + { + "epoch": 3.1511669658886894, + "grad_norm": 1.0140702724456787, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 43880 + }, + { + "epoch": 3.1518850987432674, + "grad_norm": 0.75941401720047, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43890 + }, + { + "epoch": 3.152603231597846, + "grad_norm": 0.6618940234184265, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 43900 + }, + { + "epoch": 3.153321364452424, + "grad_norm": 1.0013338327407837, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 43910 + }, + { + "epoch": 3.154039497307002, + "grad_norm": 0.8735299706459045, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 43920 + }, + { + "epoch": 3.15475763016158, + "grad_norm": 1.141914963722229, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 43930 + }, + { + "epoch": 3.155475763016158, + "grad_norm": 1.0916038751602173, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 43940 + }, + { + "epoch": 3.1561938958707363, + "grad_norm": 0.7042547464370728, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 43950 + }, + { + "epoch": 3.1569120287253143, + "grad_norm": 0.9885236620903015, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 43960 + }, + { + "epoch": 3.1576301615798923, + "grad_norm": 0.8083009719848633, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 43970 + }, + { + "epoch": 3.1583482944344703, + "grad_norm": 1.082627296447754, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 43980 + }, + { + "epoch": 3.1590664272890483, + "grad_norm": 0.9293290376663208, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 43990 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 0.861003041267395, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 44000 + }, + { + "epoch": 3.1605026929982047, + "grad_norm": 0.9565994143486023, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 44010 + }, + { + "epoch": 3.1612208258527827, + "grad_norm": 0.9609305262565613, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 44020 + }, + { + "epoch": 3.1619389587073607, + "grad_norm": 0.847830593585968, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 44030 + }, + { + "epoch": 3.1626570915619387, + "grad_norm": 0.852357804775238, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 44040 + }, + { + "epoch": 3.163375224416517, + "grad_norm": 0.8634562492370605, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44050 + }, + { + "epoch": 3.164093357271095, + "grad_norm": 1.0259950160980225, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 44060 + }, + { + "epoch": 3.164811490125673, + "grad_norm": 0.9615250825881958, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 44070 + }, + { + "epoch": 3.165529622980251, + "grad_norm": 0.9892165660858154, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 44080 + }, + { + "epoch": 3.1662477558348296, + "grad_norm": 0.8827354907989502, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 44090 + }, + { + "epoch": 3.1669658886894076, + "grad_norm": 0.9258168339729309, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 44100 + }, + { + "epoch": 3.1676840215439857, + "grad_norm": 0.7983399033546448, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 44110 + }, + { + "epoch": 3.1684021543985637, + "grad_norm": 0.9917809963226318, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 44120 + }, + { + "epoch": 3.1691202872531417, + "grad_norm": 1.058927297592163, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44130 + }, + { + "epoch": 3.16983842010772, + "grad_norm": 1.0095895528793335, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44140 + }, + { + "epoch": 3.170556552962298, + "grad_norm": 0.9032495617866516, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 44150 + }, + { + "epoch": 3.171274685816876, + "grad_norm": 0.9391272664070129, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 44160 + }, + { + "epoch": 3.171992818671454, + "grad_norm": 0.990755558013916, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44170 + }, + { + "epoch": 3.172710951526032, + "grad_norm": 0.9310759902000427, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 44180 + }, + { + "epoch": 3.1734290843806106, + "grad_norm": 0.7698856592178345, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 44190 + }, + { + "epoch": 3.1741472172351886, + "grad_norm": 0.7735867500305176, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 44200 + }, + { + "epoch": 3.1748653500897666, + "grad_norm": 1.1447525024414062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 44210 + }, + { + "epoch": 3.1755834829443446, + "grad_norm": 0.8667060136795044, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 44220 + }, + { + "epoch": 3.176301615798923, + "grad_norm": 0.8596829771995544, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 44230 + }, + { + "epoch": 3.177019748653501, + "grad_norm": 0.8607654571533203, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 44240 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 0.9346948266029358, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 44250 + }, + { + "epoch": 3.178456014362657, + "grad_norm": 0.852344810962677, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 44260 + }, + { + "epoch": 3.179174147217235, + "grad_norm": 0.9260450005531311, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 44270 + }, + { + "epoch": 3.1798922800718135, + "grad_norm": 0.924053430557251, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 44280 + }, + { + "epoch": 3.1806104129263915, + "grad_norm": 1.001965045928955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 44290 + }, + { + "epoch": 3.1813285457809695, + "grad_norm": 0.943215012550354, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44300 + }, + { + "epoch": 3.1820466786355475, + "grad_norm": 1.006977915763855, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 44310 + }, + { + "epoch": 3.1827648114901255, + "grad_norm": 0.9768950343132019, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 44320 + }, + { + "epoch": 3.183482944344704, + "grad_norm": 0.9297489523887634, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 44330 + }, + { + "epoch": 3.184201077199282, + "grad_norm": 0.9110919237136841, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 44340 + }, + { + "epoch": 3.18491921005386, + "grad_norm": 0.9821381568908691, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 44350 + }, + { + "epoch": 3.185637342908438, + "grad_norm": 0.8451243042945862, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 44360 + }, + { + "epoch": 3.1863554757630164, + "grad_norm": 0.9676638245582581, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 44370 + }, + { + "epoch": 3.1870736086175944, + "grad_norm": 0.9826035499572754, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 44380 + }, + { + "epoch": 3.1877917414721724, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 44390 + }, + { + "epoch": 3.1885098743267504, + "grad_norm": 0.7766330242156982, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 44400 + }, + { + "epoch": 3.1892280071813284, + "grad_norm": 0.9302349090576172, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 44410 + }, + { + "epoch": 3.189946140035907, + "grad_norm": 0.8335331082344055, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 44420 + }, + { + "epoch": 3.190664272890485, + "grad_norm": 0.6722736358642578, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 44430 + }, + { + "epoch": 3.191382405745063, + "grad_norm": 0.9047536849975586, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 44440 + }, + { + "epoch": 3.192100538599641, + "grad_norm": 0.9653822183609009, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 44450 + }, + { + "epoch": 3.192818671454219, + "grad_norm": 0.7750703692436218, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 44460 + }, + { + "epoch": 3.1935368043087973, + "grad_norm": 0.7767539024353027, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 44470 + }, + { + "epoch": 3.1942549371633753, + "grad_norm": 0.8597778081893921, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44480 + }, + { + "epoch": 3.1949730700179533, + "grad_norm": 1.1711493730545044, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 44490 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 0.9025220274925232, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 44500 + }, + { + "epoch": 3.1964093357271093, + "grad_norm": 0.8084979057312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44510 + }, + { + "epoch": 3.1971274685816877, + "grad_norm": 0.8475074172019958, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44520 + }, + { + "epoch": 3.1978456014362657, + "grad_norm": 0.9915644526481628, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 44530 + }, + { + "epoch": 3.1985637342908437, + "grad_norm": 0.992231547832489, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 44540 + }, + { + "epoch": 3.1992818671454217, + "grad_norm": 0.9804556369781494, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 44550 + }, + { + "epoch": 3.2, + "grad_norm": 1.045558214187622, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 44560 + }, + { + "epoch": 3.200718132854578, + "grad_norm": 1.0880261659622192, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 44570 + }, + { + "epoch": 3.201436265709156, + "grad_norm": 0.9511138200759888, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44580 + }, + { + "epoch": 3.202154398563734, + "grad_norm": 0.9115344882011414, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 44590 + }, + { + "epoch": 3.202872531418312, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 44600 + }, + { + "epoch": 3.2035906642728906, + "grad_norm": 0.8209697604179382, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44610 + }, + { + "epoch": 3.2043087971274686, + "grad_norm": 0.9220197796821594, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44620 + }, + { + "epoch": 3.2050269299820466, + "grad_norm": 0.8859700560569763, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 44630 + }, + { + "epoch": 3.2057450628366246, + "grad_norm": 0.9772757291793823, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 44640 + }, + { + "epoch": 3.206463195691203, + "grad_norm": 0.9385574460029602, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 44650 + }, + { + "epoch": 3.207181328545781, + "grad_norm": 0.839958906173706, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 44660 + }, + { + "epoch": 3.207899461400359, + "grad_norm": 0.860478401184082, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 44670 + }, + { + "epoch": 3.208617594254937, + "grad_norm": 0.846886396408081, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 44680 + }, + { + "epoch": 3.209335727109515, + "grad_norm": 0.8591006398200989, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 44690 + }, + { + "epoch": 3.2100538599640935, + "grad_norm": 0.9236023426055908, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 44700 + }, + { + "epoch": 3.2107719928186715, + "grad_norm": 0.7348999977111816, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44710 + }, + { + "epoch": 3.2114901256732495, + "grad_norm": 1.0041730403900146, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 44720 + }, + { + "epoch": 3.2122082585278275, + "grad_norm": 0.8382687568664551, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 44730 + }, + { + "epoch": 3.2129263913824055, + "grad_norm": 0.8253511190414429, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 44740 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 0.9589242935180664, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 44750 + }, + { + "epoch": 3.214362657091562, + "grad_norm": 0.8938157558441162, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 44760 + }, + { + "epoch": 3.21508078994614, + "grad_norm": 1.0085135698318481, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 44770 + }, + { + "epoch": 3.215798922800718, + "grad_norm": 0.8647134304046631, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 44780 + }, + { + "epoch": 3.216517055655296, + "grad_norm": 1.09453284740448, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 44790 + }, + { + "epoch": 3.2172351885098744, + "grad_norm": 0.8710666298866272, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 44800 + }, + { + "epoch": 3.2179533213644524, + "grad_norm": 0.8080880641937256, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 44810 + }, + { + "epoch": 3.2186714542190304, + "grad_norm": 1.0440675020217896, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 44820 + }, + { + "epoch": 3.2193895870736084, + "grad_norm": 1.1036376953125, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 44830 + }, + { + "epoch": 3.220107719928187, + "grad_norm": 0.8783546686172485, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44840 + }, + { + "epoch": 3.220825852782765, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 44850 + }, + { + "epoch": 3.221543985637343, + "grad_norm": 1.0099157094955444, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 44860 + }, + { + "epoch": 3.222262118491921, + "grad_norm": 1.054928183555603, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 44870 + }, + { + "epoch": 3.222980251346499, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 44880 + }, + { + "epoch": 3.2236983842010773, + "grad_norm": 0.9730798602104187, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 44890 + }, + { + "epoch": 3.2244165170556554, + "grad_norm": 0.7911382913589478, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 44900 + }, + { + "epoch": 3.2251346499102334, + "grad_norm": 0.9574400782585144, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 44910 + }, + { + "epoch": 3.2258527827648114, + "grad_norm": 0.8101068139076233, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 44920 + }, + { + "epoch": 3.22657091561939, + "grad_norm": 0.754146933555603, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 44930 + }, + { + "epoch": 3.227289048473968, + "grad_norm": 0.7471939921379089, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 44940 + }, + { + "epoch": 3.228007181328546, + "grad_norm": 1.0040855407714844, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 44950 + }, + { + "epoch": 3.228725314183124, + "grad_norm": 1.0016074180603027, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 44960 + }, + { + "epoch": 3.229443447037702, + "grad_norm": 1.0432976484298706, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 44970 + }, + { + "epoch": 3.2301615798922803, + "grad_norm": 0.8517055511474609, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 44980 + }, + { + "epoch": 3.2308797127468583, + "grad_norm": 0.9174178242683411, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 44990 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 0.9733774065971375, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 45000 + }, + { + "epoch": 3.2323159784560143, + "grad_norm": 0.9074714779853821, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 45010 + }, + { + "epoch": 3.2330341113105923, + "grad_norm": 0.8802759051322937, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 45020 + }, + { + "epoch": 3.2337522441651707, + "grad_norm": 1.0620871782302856, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 45030 + }, + { + "epoch": 3.2344703770197487, + "grad_norm": 0.8069542050361633, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 45040 + }, + { + "epoch": 3.2351885098743267, + "grad_norm": 0.9139137864112854, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 45050 + }, + { + "epoch": 3.2359066427289047, + "grad_norm": 0.8936411142349243, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 45060 + }, + { + "epoch": 3.2366247755834827, + "grad_norm": 0.9098079204559326, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 45070 + }, + { + "epoch": 3.237342908438061, + "grad_norm": 1.062953233718872, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45080 + }, + { + "epoch": 3.238061041292639, + "grad_norm": 0.8656470775604248, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 45090 + }, + { + "epoch": 3.238779174147217, + "grad_norm": 0.9299449920654297, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 45100 + }, + { + "epoch": 3.239497307001795, + "grad_norm": 1.0102022886276245, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 45110 + }, + { + "epoch": 3.2402154398563736, + "grad_norm": 0.8074561953544617, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 45120 + }, + { + "epoch": 3.2409335727109516, + "grad_norm": 1.044105887413025, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 45130 + }, + { + "epoch": 3.2416517055655296, + "grad_norm": 0.8742762207984924, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 45140 + }, + { + "epoch": 3.2423698384201076, + "grad_norm": 0.8240015506744385, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 45150 + }, + { + "epoch": 3.2430879712746856, + "grad_norm": 0.8438951373100281, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 45160 + }, + { + "epoch": 3.243806104129264, + "grad_norm": 1.02358877658844, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 45170 + }, + { + "epoch": 3.244524236983842, + "grad_norm": 0.8824774026870728, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 45180 + }, + { + "epoch": 3.24524236983842, + "grad_norm": 0.971015989780426, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 45190 + }, + { + "epoch": 3.245960502692998, + "grad_norm": 0.9282383918762207, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 45200 + }, + { + "epoch": 3.2466786355475765, + "grad_norm": 0.7908362746238708, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 45210 + }, + { + "epoch": 3.2473967684021545, + "grad_norm": 1.0721662044525146, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 45220 + }, + { + "epoch": 3.2481149012567325, + "grad_norm": 0.9516810774803162, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 45230 + }, + { + "epoch": 3.2488330341113105, + "grad_norm": 0.7914131283760071, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 45240 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 0.8492292761802673, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 45250 + }, + { + "epoch": 3.250269299820467, + "grad_norm": 0.8880114555358887, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 45260 + }, + { + "epoch": 3.250987432675045, + "grad_norm": 0.7808310985565186, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 45270 + }, + { + "epoch": 3.251705565529623, + "grad_norm": 0.8566828966140747, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 45280 + }, + { + "epoch": 3.252423698384201, + "grad_norm": 0.7929658889770508, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45290 + }, + { + "epoch": 3.253141831238779, + "grad_norm": 0.678207516670227, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 45300 + }, + { + "epoch": 3.2538599640933574, + "grad_norm": 0.9963029623031616, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45310 + }, + { + "epoch": 3.2545780969479354, + "grad_norm": 0.835304856300354, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 45320 + }, + { + "epoch": 3.2552962298025134, + "grad_norm": 0.7281617522239685, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 45330 + }, + { + "epoch": 3.2560143626570914, + "grad_norm": 1.244890570640564, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 45340 + }, + { + "epoch": 3.2567324955116694, + "grad_norm": 0.8372750282287598, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 45350 + }, + { + "epoch": 3.257450628366248, + "grad_norm": 1.0029667615890503, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 45360 + }, + { + "epoch": 3.258168761220826, + "grad_norm": 0.8561908602714539, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 45370 + }, + { + "epoch": 3.258886894075404, + "grad_norm": 1.0058085918426514, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 45380 + }, + { + "epoch": 3.259605026929982, + "grad_norm": 0.7768221497535706, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 45390 + }, + { + "epoch": 3.2603231597845603, + "grad_norm": 0.8443793058395386, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 45400 + }, + { + "epoch": 3.2610412926391383, + "grad_norm": 1.0140392780303955, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 45410 + }, + { + "epoch": 3.2617594254937163, + "grad_norm": 0.8397058248519897, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 45420 + }, + { + "epoch": 3.2624775583482943, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 45430 + }, + { + "epoch": 3.2631956912028723, + "grad_norm": 1.0279473066329956, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 45440 + }, + { + "epoch": 3.263913824057451, + "grad_norm": 1.207457184791565, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 45450 + }, + { + "epoch": 3.264631956912029, + "grad_norm": 0.8121998906135559, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 45460 + }, + { + "epoch": 3.265350089766607, + "grad_norm": 1.037733554840088, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 45470 + }, + { + "epoch": 3.266068222621185, + "grad_norm": 0.9305754899978638, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 45480 + }, + { + "epoch": 3.2667863554757632, + "grad_norm": 0.9733602404594421, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 45500 + }, + { + "epoch": 3.2682226211849192, + "grad_norm": 0.8601692318916321, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45510 + }, + { + "epoch": 3.2689407540394972, + "grad_norm": 0.7921277284622192, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 45520 + }, + { + "epoch": 3.2696588868940752, + "grad_norm": 0.8324153423309326, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 45530 + }, + { + "epoch": 3.2703770197486537, + "grad_norm": 0.85141521692276, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 45540 + }, + { + "epoch": 3.2710951526032317, + "grad_norm": 0.9399608373641968, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 45550 + }, + { + "epoch": 3.2718132854578097, + "grad_norm": 0.9829166531562805, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 45560 + }, + { + "epoch": 3.2725314183123877, + "grad_norm": 0.9936266541481018, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 45570 + }, + { + "epoch": 3.2732495511669657, + "grad_norm": 1.036165714263916, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 45580 + }, + { + "epoch": 3.273967684021544, + "grad_norm": 0.8988680243492126, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45590 + }, + { + "epoch": 3.274685816876122, + "grad_norm": 0.9173405766487122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 45600 + }, + { + "epoch": 3.2754039497307, + "grad_norm": 0.9967324733734131, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 45610 + }, + { + "epoch": 3.276122082585278, + "grad_norm": 0.9097777009010315, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 45620 + }, + { + "epoch": 3.276840215439856, + "grad_norm": 1.0559430122375488, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 45630 + }, + { + "epoch": 3.2775583482944346, + "grad_norm": 0.9583360552787781, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 45640 + }, + { + "epoch": 3.2782764811490126, + "grad_norm": 0.7630334496498108, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 45650 + }, + { + "epoch": 3.2789946140035906, + "grad_norm": 0.9955230355262756, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 45660 + }, + { + "epoch": 3.2797127468581686, + "grad_norm": 0.8685793876647949, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45670 + }, + { + "epoch": 3.280430879712747, + "grad_norm": 0.919913113117218, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 45680 + }, + { + "epoch": 3.281149012567325, + "grad_norm": 0.826144814491272, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 45690 + }, + { + "epoch": 3.281867145421903, + "grad_norm": 0.9750179052352905, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 45700 + }, + { + "epoch": 3.282585278276481, + "grad_norm": 0.7931897640228271, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 45710 + }, + { + "epoch": 3.283303411131059, + "grad_norm": 1.0380089282989502, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 45720 + }, + { + "epoch": 3.2840215439856375, + "grad_norm": 0.8220566511154175, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 45730 + }, + { + "epoch": 3.2847396768402155, + "grad_norm": 0.9688239693641663, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 45740 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 0.8760311603546143, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 45750 + }, + { + "epoch": 3.2861759425493715, + "grad_norm": 0.8103382587432861, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 45760 + }, + { + "epoch": 3.28689407540395, + "grad_norm": 0.8835865259170532, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 45770 + }, + { + "epoch": 3.287612208258528, + "grad_norm": 0.9021160006523132, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45780 + }, + { + "epoch": 3.288330341113106, + "grad_norm": 0.8182386159896851, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 45790 + }, + { + "epoch": 3.289048473967684, + "grad_norm": 0.8555024862289429, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45800 + }, + { + "epoch": 3.289766606822262, + "grad_norm": 1.0982348918914795, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 45810 + }, + { + "epoch": 3.2904847396768404, + "grad_norm": 1.06246817111969, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 45820 + }, + { + "epoch": 3.2912028725314184, + "grad_norm": 1.1727149486541748, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 45830 + }, + { + "epoch": 3.2919210053859964, + "grad_norm": 0.8224700093269348, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 45840 + }, + { + "epoch": 3.2926391382405744, + "grad_norm": 0.8195698261260986, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 45850 + }, + { + "epoch": 3.2933572710951524, + "grad_norm": 0.8424476981163025, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 45860 + }, + { + "epoch": 3.294075403949731, + "grad_norm": 0.9804632067680359, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 45870 + }, + { + "epoch": 3.294793536804309, + "grad_norm": 0.8701804876327515, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 45880 + }, + { + "epoch": 3.295511669658887, + "grad_norm": 0.8876864910125732, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 45890 + }, + { + "epoch": 3.296229802513465, + "grad_norm": 1.0105448961257935, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 45900 + }, + { + "epoch": 3.296947935368043, + "grad_norm": 0.847017228603363, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 45910 + }, + { + "epoch": 3.2976660682226213, + "grad_norm": 0.7610297799110413, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 45920 + }, + { + "epoch": 3.2983842010771993, + "grad_norm": 0.7272670269012451, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 45930 + }, + { + "epoch": 3.2991023339317773, + "grad_norm": 0.8243510127067566, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 45940 + }, + { + "epoch": 3.2998204667863553, + "grad_norm": 1.0113074779510498, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 45950 + }, + { + "epoch": 3.3005385996409338, + "grad_norm": 0.8578087687492371, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 45960 + }, + { + "epoch": 3.3012567324955118, + "grad_norm": 0.9511606097221375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 45970 + }, + { + "epoch": 3.3019748653500898, + "grad_norm": 0.8612566590309143, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 45980 + }, + { + "epoch": 3.3026929982046678, + "grad_norm": 0.8702331185340881, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 45990 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 1.0229583978652954, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 46000 + }, + { + "epoch": 3.304129263913824, + "grad_norm": 1.1775577068328857, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 46010 + }, + { + "epoch": 3.3048473967684022, + "grad_norm": 0.9922171831130981, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 46020 + }, + { + "epoch": 3.3055655296229802, + "grad_norm": 0.8246880769729614, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 46030 + }, + { + "epoch": 3.3062836624775582, + "grad_norm": 0.9351653456687927, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 46040 + }, + { + "epoch": 3.3070017953321367, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 46050 + }, + { + "epoch": 3.3077199281867147, + "grad_norm": 0.9753885269165039, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 46060 + }, + { + "epoch": 3.3084380610412927, + "grad_norm": 0.8532425165176392, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 46070 + }, + { + "epoch": 3.3091561938958707, + "grad_norm": 0.9722012877464294, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 46080 + }, + { + "epoch": 3.3098743267504487, + "grad_norm": 0.8950021266937256, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 46090 + }, + { + "epoch": 3.3105924596050267, + "grad_norm": 0.8536333441734314, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46100 + }, + { + "epoch": 3.311310592459605, + "grad_norm": 0.9423946738243103, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46110 + }, + { + "epoch": 3.312028725314183, + "grad_norm": 0.8573169112205505, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 46120 + }, + { + "epoch": 3.312746858168761, + "grad_norm": 1.0122376680374146, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 46130 + }, + { + "epoch": 3.313464991023339, + "grad_norm": 0.7492560744285583, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 46140 + }, + { + "epoch": 3.3141831238779176, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 46150 + }, + { + "epoch": 3.3149012567324956, + "grad_norm": 1.1191970109939575, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 46160 + }, + { + "epoch": 3.3156193895870736, + "grad_norm": 0.9847373962402344, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 46170 + }, + { + "epoch": 3.3163375224416516, + "grad_norm": 0.7315911054611206, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 46180 + }, + { + "epoch": 3.3170556552962296, + "grad_norm": 0.8267890214920044, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 46190 + }, + { + "epoch": 3.317773788150808, + "grad_norm": 0.8898099064826965, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 46200 + }, + { + "epoch": 3.318491921005386, + "grad_norm": 0.8525369167327881, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 46210 + }, + { + "epoch": 3.319210053859964, + "grad_norm": 0.8074760437011719, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 46220 + }, + { + "epoch": 3.319928186714542, + "grad_norm": 0.8473616242408752, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 46230 + }, + { + "epoch": 3.3206463195691205, + "grad_norm": 0.8678314089775085, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 46240 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 0.8718782067298889, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 46250 + }, + { + "epoch": 3.3220825852782765, + "grad_norm": 0.9384858012199402, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 46260 + }, + { + "epoch": 3.3228007181328545, + "grad_norm": 0.9295032620429993, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 46270 + }, + { + "epoch": 3.3235188509874325, + "grad_norm": 0.9472482800483704, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 46280 + }, + { + "epoch": 3.324236983842011, + "grad_norm": 0.7970638275146484, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 46290 + }, + { + "epoch": 3.324955116696589, + "grad_norm": 0.9508723020553589, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 46300 + }, + { + "epoch": 3.325673249551167, + "grad_norm": 0.9153636693954468, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 46310 + }, + { + "epoch": 3.326391382405745, + "grad_norm": 0.7890323400497437, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 46320 + }, + { + "epoch": 3.3271095152603234, + "grad_norm": 0.8711825609207153, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46330 + }, + { + "epoch": 3.3278276481149014, + "grad_norm": 0.9938926696777344, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 46340 + }, + { + "epoch": 3.3285457809694794, + "grad_norm": 0.8497524857521057, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 46350 + }, + { + "epoch": 3.3292639138240574, + "grad_norm": 0.9191650748252869, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 46360 + }, + { + "epoch": 3.3299820466786354, + "grad_norm": 0.8974085450172424, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 46370 + }, + { + "epoch": 3.3307001795332134, + "grad_norm": 0.9928934574127197, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 46380 + }, + { + "epoch": 3.331418312387792, + "grad_norm": 0.9011030197143555, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46390 + }, + { + "epoch": 3.33213644524237, + "grad_norm": 0.898594856262207, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 46400 + }, + { + "epoch": 3.332854578096948, + "grad_norm": 0.7506672143936157, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 46410 + }, + { + "epoch": 3.333572710951526, + "grad_norm": 0.9239172339439392, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 46420 + }, + { + "epoch": 3.3342908438061043, + "grad_norm": 1.0749682188034058, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46430 + }, + { + "epoch": 3.3350089766606823, + "grad_norm": 0.9262617230415344, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 46440 + }, + { + "epoch": 3.3357271095152603, + "grad_norm": 0.8681274056434631, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 46450 + }, + { + "epoch": 3.3364452423698383, + "grad_norm": 0.9558620452880859, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 46460 + }, + { + "epoch": 3.3371633752244163, + "grad_norm": 0.8907097578048706, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 46470 + }, + { + "epoch": 3.3378815080789948, + "grad_norm": 1.0941565036773682, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 46480 + }, + { + "epoch": 3.3385996409335728, + "grad_norm": 0.8971590995788574, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 46490 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 1.0315606594085693, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 46500 + }, + { + "epoch": 3.3400359066427288, + "grad_norm": 0.7717124223709106, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 46510 + }, + { + "epoch": 3.340754039497307, + "grad_norm": 0.8060970902442932, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 46520 + }, + { + "epoch": 3.341472172351885, + "grad_norm": 0.969510018825531, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 46530 + }, + { + "epoch": 3.342190305206463, + "grad_norm": 0.8837248682975769, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 46540 + }, + { + "epoch": 3.342908438061041, + "grad_norm": 0.9561076164245605, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 46550 + }, + { + "epoch": 3.343626570915619, + "grad_norm": 0.8529208898544312, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 46560 + }, + { + "epoch": 3.3443447037701977, + "grad_norm": 1.1300519704818726, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 46570 + }, + { + "epoch": 3.3450628366247757, + "grad_norm": 0.8330956101417542, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46580 + }, + { + "epoch": 3.3457809694793537, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 46590 + }, + { + "epoch": 3.3464991023339317, + "grad_norm": 1.0470821857452393, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 46600 + }, + { + "epoch": 3.34721723518851, + "grad_norm": 0.9933704137802124, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46610 + }, + { + "epoch": 3.347935368043088, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 46620 + }, + { + "epoch": 3.348653500897666, + "grad_norm": 0.9746946692466736, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46630 + }, + { + "epoch": 3.349371633752244, + "grad_norm": 0.8607267141342163, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46640 + }, + { + "epoch": 3.350089766606822, + "grad_norm": 0.800335705280304, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 46650 + }, + { + "epoch": 3.3508078994614, + "grad_norm": 1.0083239078521729, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 46660 + }, + { + "epoch": 3.3515260323159786, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 46670 + }, + { + "epoch": 3.3522441651705566, + "grad_norm": 0.9378824234008789, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46680 + }, + { + "epoch": 3.3529622980251346, + "grad_norm": 0.8490564227104187, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 46690 + }, + { + "epoch": 3.3536804308797126, + "grad_norm": 1.0415582656860352, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 46700 + }, + { + "epoch": 3.354398563734291, + "grad_norm": 0.8514367938041687, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 46710 + }, + { + "epoch": 3.355116696588869, + "grad_norm": 0.7691360712051392, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 46720 + }, + { + "epoch": 3.355834829443447, + "grad_norm": 0.8345438241958618, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 46730 + }, + { + "epoch": 3.356552962298025, + "grad_norm": 1.023492693901062, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 46740 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 46750 + }, + { + "epoch": 3.3579892280071815, + "grad_norm": 0.9029248356819153, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 46760 + }, + { + "epoch": 3.3587073608617595, + "grad_norm": 0.9109513759613037, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 46770 + }, + { + "epoch": 3.3594254937163375, + "grad_norm": 0.7757390141487122, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 46780 + }, + { + "epoch": 3.3601436265709155, + "grad_norm": 0.794035792350769, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46790 + }, + { + "epoch": 3.360861759425494, + "grad_norm": 0.8211429715156555, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 46800 + }, + { + "epoch": 3.361579892280072, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46810 + }, + { + "epoch": 3.36229802513465, + "grad_norm": 0.9392538070678711, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 46820 + }, + { + "epoch": 3.363016157989228, + "grad_norm": 0.8297873139381409, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 46830 + }, + { + "epoch": 3.363734290843806, + "grad_norm": 0.9158190488815308, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 46840 + }, + { + "epoch": 3.3644524236983844, + "grad_norm": 1.1449424028396606, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 46850 + }, + { + "epoch": 3.3651705565529624, + "grad_norm": 0.8718444108963013, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 46860 + }, + { + "epoch": 3.3658886894075404, + "grad_norm": 0.7744014263153076, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 46870 + }, + { + "epoch": 3.3666068222621184, + "grad_norm": 0.8392460942268372, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 46880 + }, + { + "epoch": 3.367324955116697, + "grad_norm": 1.0424989461898804, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 46890 + }, + { + "epoch": 3.368043087971275, + "grad_norm": 1.4696359634399414, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 46900 + }, + { + "epoch": 3.368761220825853, + "grad_norm": 0.9298201203346252, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46910 + }, + { + "epoch": 3.369479353680431, + "grad_norm": 0.8965262770652771, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 46920 + }, + { + "epoch": 3.370197486535009, + "grad_norm": 0.9395381808280945, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 46930 + }, + { + "epoch": 3.370915619389587, + "grad_norm": 0.9069047570228577, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 46940 + }, + { + "epoch": 3.3716337522441653, + "grad_norm": 0.9208605885505676, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46950 + }, + { + "epoch": 3.3723518850987433, + "grad_norm": 0.9493077397346497, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 46960 + }, + { + "epoch": 3.3730700179533213, + "grad_norm": 1.0804208517074585, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 46970 + }, + { + "epoch": 3.3737881508078993, + "grad_norm": 0.9465714693069458, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 46980 + }, + { + "epoch": 3.3745062836624777, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 46990 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 1.0199357271194458, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 47000 + }, + { + "epoch": 3.3759425493716337, + "grad_norm": 0.8999426960945129, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 47010 + }, + { + "epoch": 3.3766606822262117, + "grad_norm": 0.8923690319061279, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 47020 + }, + { + "epoch": 3.3773788150807897, + "grad_norm": 0.7459347248077393, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 47030 + }, + { + "epoch": 3.378096947935368, + "grad_norm": 0.7702858448028564, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 47040 + }, + { + "epoch": 3.378815080789946, + "grad_norm": 0.8296625018119812, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 47050 + }, + { + "epoch": 3.379533213644524, + "grad_norm": 1.2952555418014526, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47060 + }, + { + "epoch": 3.380251346499102, + "grad_norm": 0.7778869271278381, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 47070 + }, + { + "epoch": 3.3809694793536806, + "grad_norm": 0.9151549339294434, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 47080 + }, + { + "epoch": 3.3816876122082586, + "grad_norm": 0.7883925437927246, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 47090 + }, + { + "epoch": 3.3824057450628366, + "grad_norm": 0.9602295756340027, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 47100 + }, + { + "epoch": 3.3831238779174146, + "grad_norm": 0.7953121066093445, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47110 + }, + { + "epoch": 3.3838420107719926, + "grad_norm": 1.110148549079895, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 47120 + }, + { + "epoch": 3.384560143626571, + "grad_norm": 0.9359608888626099, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 47130 + }, + { + "epoch": 3.385278276481149, + "grad_norm": 0.7877762317657471, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 47140 + }, + { + "epoch": 3.385996409335727, + "grad_norm": 0.8586933016777039, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47150 + }, + { + "epoch": 3.386714542190305, + "grad_norm": 0.8920878767967224, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 47160 + }, + { + "epoch": 3.3874326750448835, + "grad_norm": 0.9692603349685669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 47170 + }, + { + "epoch": 3.3881508078994615, + "grad_norm": 0.9038610458374023, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 47180 + }, + { + "epoch": 3.3888689407540395, + "grad_norm": 1.6299188137054443, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 47190 + }, + { + "epoch": 3.3895870736086176, + "grad_norm": 0.9704291820526123, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 47200 + }, + { + "epoch": 3.3903052064631956, + "grad_norm": 0.9503401517868042, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 47210 + }, + { + "epoch": 3.3910233393177736, + "grad_norm": 1.0051378011703491, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 47220 + }, + { + "epoch": 3.391741472172352, + "grad_norm": 0.7336357235908508, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 47230 + }, + { + "epoch": 3.39245960502693, + "grad_norm": 0.9847398996353149, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47240 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 0.8100917339324951, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 47250 + }, + { + "epoch": 3.393895870736086, + "grad_norm": 0.9752838611602783, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 47260 + }, + { + "epoch": 3.3946140035906645, + "grad_norm": 0.9400623440742493, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 47270 + }, + { + "epoch": 3.3953321364452425, + "grad_norm": 0.7310057878494263, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 47280 + }, + { + "epoch": 3.3960502692998205, + "grad_norm": 0.8898789286613464, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 47290 + }, + { + "epoch": 3.3967684021543985, + "grad_norm": 1.0157585144042969, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 47300 + }, + { + "epoch": 3.3974865350089765, + "grad_norm": 0.9108527898788452, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 47310 + }, + { + "epoch": 3.398204667863555, + "grad_norm": 0.9796249270439148, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 47320 + }, + { + "epoch": 3.398922800718133, + "grad_norm": 0.8176435232162476, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 47330 + }, + { + "epoch": 3.399640933572711, + "grad_norm": 0.9981188178062439, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 47340 + }, + { + "epoch": 3.400359066427289, + "grad_norm": 0.9774404764175415, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47350 + }, + { + "epoch": 3.4010771992818674, + "grad_norm": 0.8624991774559021, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 47360 + }, + { + "epoch": 3.4017953321364454, + "grad_norm": 0.9191665053367615, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 47370 + }, + { + "epoch": 3.4025134649910234, + "grad_norm": 0.7971290946006775, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 47380 + }, + { + "epoch": 3.4032315978456014, + "grad_norm": 0.8336732983589172, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 47390 + }, + { + "epoch": 3.4039497307001794, + "grad_norm": 0.7730334401130676, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 47400 + }, + { + "epoch": 3.404667863554758, + "grad_norm": 0.8559145927429199, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 47410 + }, + { + "epoch": 3.405385996409336, + "grad_norm": 1.0261447429656982, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 47420 + }, + { + "epoch": 3.406104129263914, + "grad_norm": 0.9931781888008118, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 47430 + }, + { + "epoch": 3.406822262118492, + "grad_norm": 0.8971807360649109, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 47440 + }, + { + "epoch": 3.4075403949730703, + "grad_norm": 0.8886999487876892, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 47450 + }, + { + "epoch": 3.4082585278276483, + "grad_norm": 0.9551735520362854, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 47460 + }, + { + "epoch": 3.4089766606822263, + "grad_norm": 0.9066859483718872, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 47470 + }, + { + "epoch": 3.4096947935368043, + "grad_norm": 0.9192125201225281, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 47480 + }, + { + "epoch": 3.4104129263913823, + "grad_norm": 0.9332839250564575, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 47490 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 0.745563805103302, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47500 + }, + { + "epoch": 3.4118491921005387, + "grad_norm": 0.6843905448913574, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 47510 + }, + { + "epoch": 3.4125673249551167, + "grad_norm": 0.8063111305236816, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 47520 + }, + { + "epoch": 3.4132854578096947, + "grad_norm": 0.9666593670845032, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 47530 + }, + { + "epoch": 3.4140035906642727, + "grad_norm": 0.8112747073173523, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47540 + }, + { + "epoch": 3.414721723518851, + "grad_norm": 0.820807933807373, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 47550 + }, + { + "epoch": 3.415439856373429, + "grad_norm": 0.8476285338401794, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 47560 + }, + { + "epoch": 3.416157989228007, + "grad_norm": 1.0232552289962769, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47570 + }, + { + "epoch": 3.416876122082585, + "grad_norm": 0.8749372363090515, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 47580 + }, + { + "epoch": 3.417594254937163, + "grad_norm": 0.8117937445640564, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 47590 + }, + { + "epoch": 3.4183123877917416, + "grad_norm": 0.9010460376739502, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 47600 + }, + { + "epoch": 3.4190305206463196, + "grad_norm": 0.8955527544021606, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 47610 + }, + { + "epoch": 3.4197486535008976, + "grad_norm": 0.884186327457428, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 47620 + }, + { + "epoch": 3.4204667863554756, + "grad_norm": 0.8995241522789001, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 47630 + }, + { + "epoch": 3.421184919210054, + "grad_norm": 1.0627013444900513, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47640 + }, + { + "epoch": 3.421903052064632, + "grad_norm": 0.8619979619979858, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 47650 + }, + { + "epoch": 3.42262118491921, + "grad_norm": 0.9682498574256897, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 47660 + }, + { + "epoch": 3.423339317773788, + "grad_norm": 0.9614400863647461, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 47670 + }, + { + "epoch": 3.424057450628366, + "grad_norm": 0.7986962795257568, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 47680 + }, + { + "epoch": 3.4247755834829445, + "grad_norm": 0.8255957961082458, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 47690 + }, + { + "epoch": 3.4254937163375225, + "grad_norm": 0.9139757752418518, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 47700 + }, + { + "epoch": 3.4262118491921005, + "grad_norm": 0.8086292743682861, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 47710 + }, + { + "epoch": 3.4269299820466785, + "grad_norm": 0.8852273225784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 47720 + }, + { + "epoch": 3.427648114901257, + "grad_norm": 0.7568784356117249, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 47730 + }, + { + "epoch": 3.428366247755835, + "grad_norm": 0.8933039903640747, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 47740 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 0.8101669549942017, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 47750 + }, + { + "epoch": 3.429802513464991, + "grad_norm": 0.7021054625511169, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 47760 + }, + { + "epoch": 3.430520646319569, + "grad_norm": 0.8282538652420044, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 47770 + }, + { + "epoch": 3.431238779174147, + "grad_norm": 0.8168348670005798, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 47780 + }, + { + "epoch": 3.4319569120287254, + "grad_norm": 0.9504001140594482, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 47790 + }, + { + "epoch": 3.4326750448833034, + "grad_norm": 0.7500190734863281, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47800 + }, + { + "epoch": 3.4333931777378814, + "grad_norm": 0.8645710945129395, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 47810 + }, + { + "epoch": 3.4341113105924594, + "grad_norm": 0.8088704943656921, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 47820 + }, + { + "epoch": 3.434829443447038, + "grad_norm": 0.9981673955917358, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 47830 + }, + { + "epoch": 3.435547576301616, + "grad_norm": 0.9363315105438232, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 47840 + }, + { + "epoch": 3.436265709156194, + "grad_norm": 0.8471030592918396, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 47850 + }, + { + "epoch": 3.436983842010772, + "grad_norm": 0.9447668790817261, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 47860 + }, + { + "epoch": 3.43770197486535, + "grad_norm": 0.9494127631187439, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 47870 + }, + { + "epoch": 3.4384201077199283, + "grad_norm": 0.8340432643890381, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47880 + }, + { + "epoch": 3.4391382405745063, + "grad_norm": 0.8466387987136841, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 47890 + }, + { + "epoch": 3.4398563734290843, + "grad_norm": 0.9498962759971619, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47900 + }, + { + "epoch": 3.4405745062836623, + "grad_norm": 0.8490501046180725, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 47910 + }, + { + "epoch": 3.441292639138241, + "grad_norm": 0.9506490230560303, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 47920 + }, + { + "epoch": 3.442010771992819, + "grad_norm": 0.7944257855415344, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 47930 + }, + { + "epoch": 3.442728904847397, + "grad_norm": 0.9725518226623535, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 47940 + }, + { + "epoch": 3.443447037701975, + "grad_norm": 0.7823024392127991, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47950 + }, + { + "epoch": 3.444165170556553, + "grad_norm": 0.810565173625946, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 47960 + }, + { + "epoch": 3.4448833034111312, + "grad_norm": 0.9809024333953857, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 47970 + }, + { + "epoch": 3.4456014362657092, + "grad_norm": 0.8818578720092773, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 47980 + }, + { + "epoch": 3.4463195691202873, + "grad_norm": 0.9843092560768127, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 47990 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 0.916313886642456, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 48000 + }, + { + "epoch": 3.4477558348294433, + "grad_norm": 0.908442497253418, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 48010 + }, + { + "epoch": 3.4484739676840217, + "grad_norm": 0.9880178570747375, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 48020 + }, + { + "epoch": 3.4491921005385997, + "grad_norm": 0.9276854991912842, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 48030 + }, + { + "epoch": 3.4499102333931777, + "grad_norm": 1.0879448652267456, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 48040 + }, + { + "epoch": 3.4506283662477557, + "grad_norm": 0.7430389523506165, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 48050 + }, + { + "epoch": 3.4513464991023337, + "grad_norm": 1.0880072116851807, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 48060 + }, + { + "epoch": 3.452064631956912, + "grad_norm": 1.0424141883850098, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 48070 + }, + { + "epoch": 3.45278276481149, + "grad_norm": 0.926330029964447, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 48080 + }, + { + "epoch": 3.453500897666068, + "grad_norm": 0.8911219239234924, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 48090 + }, + { + "epoch": 3.454219030520646, + "grad_norm": 0.8727201223373413, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 48100 + }, + { + "epoch": 3.4549371633752246, + "grad_norm": 0.8573940396308899, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48110 + }, + { + "epoch": 3.4556552962298026, + "grad_norm": 1.0427064895629883, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 48120 + }, + { + "epoch": 3.4563734290843806, + "grad_norm": 0.8688231706619263, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 48130 + }, + { + "epoch": 3.4570915619389586, + "grad_norm": 0.8856009244918823, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 48140 + }, + { + "epoch": 3.4578096947935366, + "grad_norm": 0.9535353183746338, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 48150 + }, + { + "epoch": 3.458527827648115, + "grad_norm": 0.9466010928153992, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 48160 + }, + { + "epoch": 3.459245960502693, + "grad_norm": 0.9783535599708557, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 48170 + }, + { + "epoch": 3.459964093357271, + "grad_norm": 0.8010456562042236, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 48180 + }, + { + "epoch": 3.460682226211849, + "grad_norm": 0.8928955793380737, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 48190 + }, + { + "epoch": 3.4614003590664275, + "grad_norm": 0.7565838694572449, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 48200 + }, + { + "epoch": 3.4621184919210055, + "grad_norm": 1.0044180154800415, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 48210 + }, + { + "epoch": 3.4628366247755835, + "grad_norm": 0.8161038160324097, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 48220 + }, + { + "epoch": 3.4635547576301615, + "grad_norm": 1.1000211238861084, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 48230 + }, + { + "epoch": 3.4642728904847395, + "grad_norm": 0.7942240238189697, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 48240 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 0.7546432018280029, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 48250 + }, + { + "epoch": 3.465709156193896, + "grad_norm": 0.7705255150794983, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 48260 + }, + { + "epoch": 3.466427289048474, + "grad_norm": 0.7958067059516907, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 48270 + }, + { + "epoch": 3.467145421903052, + "grad_norm": 0.9199120402336121, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48280 + }, + { + "epoch": 3.46786355475763, + "grad_norm": 1.118672251701355, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 48290 + }, + { + "epoch": 3.4685816876122084, + "grad_norm": 0.9161015748977661, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 48300 + }, + { + "epoch": 3.4692998204667864, + "grad_norm": 1.1086218357086182, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 48310 + }, + { + "epoch": 3.4700179533213644, + "grad_norm": 1.0123368501663208, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 48320 + }, + { + "epoch": 3.4707360861759424, + "grad_norm": 0.7380602359771729, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 48330 + }, + { + "epoch": 3.4714542190305204, + "grad_norm": 0.8967105150222778, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 48340 + }, + { + "epoch": 3.472172351885099, + "grad_norm": 1.0134044885635376, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48350 + }, + { + "epoch": 3.472890484739677, + "grad_norm": 1.080815076828003, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 48360 + }, + { + "epoch": 3.473608617594255, + "grad_norm": 1.151721477508545, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 48370 + }, + { + "epoch": 3.474326750448833, + "grad_norm": 0.9436505436897278, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 48380 + }, + { + "epoch": 3.4750448833034113, + "grad_norm": 0.9154609441757202, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 48390 + }, + { + "epoch": 3.4757630161579893, + "grad_norm": 0.8943037986755371, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 48400 + }, + { + "epoch": 3.4764811490125673, + "grad_norm": 0.936988115310669, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 48410 + }, + { + "epoch": 3.4771992818671453, + "grad_norm": 0.826960027217865, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 48420 + }, + { + "epoch": 3.4779174147217233, + "grad_norm": 1.0487587451934814, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 48430 + }, + { + "epoch": 3.478635547576302, + "grad_norm": 0.729163646697998, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 48440 + }, + { + "epoch": 3.47935368043088, + "grad_norm": 0.8156948089599609, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 48450 + }, + { + "epoch": 3.480071813285458, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 48460 + }, + { + "epoch": 3.480789946140036, + "grad_norm": 0.9632692337036133, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 48470 + }, + { + "epoch": 3.4815080789946142, + "grad_norm": 1.0950212478637695, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 48480 + }, + { + "epoch": 3.4822262118491922, + "grad_norm": 0.8574318885803223, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 48490 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 0.8552606701850891, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 48500 + }, + { + "epoch": 3.4836624775583482, + "grad_norm": 0.9698445200920105, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 48510 + }, + { + "epoch": 3.4843806104129262, + "grad_norm": 0.9427815675735474, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 48520 + }, + { + "epoch": 3.4850987432675042, + "grad_norm": 0.7902070879936218, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 48530 + }, + { + "epoch": 3.4858168761220827, + "grad_norm": 1.0300066471099854, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 48540 + }, + { + "epoch": 3.4865350089766607, + "grad_norm": 1.1688778400421143, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 48550 + }, + { + "epoch": 3.4872531418312387, + "grad_norm": 1.0012071132659912, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 48560 + }, + { + "epoch": 3.4879712746858167, + "grad_norm": 1.112094759941101, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 48570 + }, + { + "epoch": 3.488689407540395, + "grad_norm": 0.8547284603118896, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 48580 + }, + { + "epoch": 3.489407540394973, + "grad_norm": 0.8827278017997742, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 48590 + }, + { + "epoch": 3.490125673249551, + "grad_norm": 0.9255490303039551, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 48600 + }, + { + "epoch": 3.490843806104129, + "grad_norm": 0.8000030517578125, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 48610 + }, + { + "epoch": 3.491561938958707, + "grad_norm": 0.9327391386032104, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 48620 + }, + { + "epoch": 3.4922800718132856, + "grad_norm": 0.9004138708114624, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 48630 + }, + { + "epoch": 3.4929982046678636, + "grad_norm": 0.9886971116065979, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 48640 + }, + { + "epoch": 3.4937163375224416, + "grad_norm": 0.9890487194061279, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 48650 + }, + { + "epoch": 3.4944344703770196, + "grad_norm": 0.7024438977241516, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 48660 + }, + { + "epoch": 3.495152603231598, + "grad_norm": 0.8397303223609924, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 48670 + }, + { + "epoch": 3.495870736086176, + "grad_norm": 0.9120950698852539, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 48680 + }, + { + "epoch": 3.496588868940754, + "grad_norm": 1.057299017906189, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48690 + }, + { + "epoch": 3.497307001795332, + "grad_norm": 0.821325957775116, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 48700 + }, + { + "epoch": 3.49802513464991, + "grad_norm": 1.0029970407485962, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 48710 + }, + { + "epoch": 3.4987432675044885, + "grad_norm": 0.9483712911605835, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 48720 + }, + { + "epoch": 3.4994614003590665, + "grad_norm": 0.9637855291366577, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 48730 + }, + { + "epoch": 3.5001795332136445, + "grad_norm": 0.6848894357681274, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 48740 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 0.7848573327064514, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 48750 + }, + { + "epoch": 3.501615798922801, + "grad_norm": 1.0341308116912842, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 48760 + }, + { + "epoch": 3.502333931777379, + "grad_norm": 0.8858218193054199, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 48770 + }, + { + "epoch": 3.503052064631957, + "grad_norm": 0.8366939425468445, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 48780 + }, + { + "epoch": 3.503770197486535, + "grad_norm": 0.7926092147827148, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 48790 + }, + { + "epoch": 3.504488330341113, + "grad_norm": 0.8503843545913696, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 48800 + }, + { + "epoch": 3.505206463195691, + "grad_norm": 0.8867869973182678, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 48810 + }, + { + "epoch": 3.5059245960502694, + "grad_norm": 1.0336930751800537, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 48820 + }, + { + "epoch": 3.5066427289048474, + "grad_norm": 0.8564051985740662, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 48830 + }, + { + "epoch": 3.5073608617594254, + "grad_norm": 0.9202605485916138, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 48840 + }, + { + "epoch": 3.508078994614004, + "grad_norm": 0.8838639855384827, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 48850 + }, + { + "epoch": 3.508797127468582, + "grad_norm": 0.8975196480751038, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48860 + }, + { + "epoch": 3.50951526032316, + "grad_norm": 0.8842370510101318, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 48870 + }, + { + "epoch": 3.510233393177738, + "grad_norm": 0.9195886254310608, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 48880 + }, + { + "epoch": 3.510951526032316, + "grad_norm": 0.986130952835083, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 48890 + }, + { + "epoch": 3.511669658886894, + "grad_norm": 0.8119593858718872, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 48900 + }, + { + "epoch": 3.5123877917414723, + "grad_norm": 0.9027136564254761, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 48910 + }, + { + "epoch": 3.5131059245960503, + "grad_norm": 0.8560537099838257, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 48920 + }, + { + "epoch": 3.5138240574506283, + "grad_norm": 0.7073559165000916, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 48930 + }, + { + "epoch": 3.5145421903052063, + "grad_norm": 0.8753304481506348, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 48940 + }, + { + "epoch": 3.5152603231597848, + "grad_norm": 0.9151145815849304, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 48950 + }, + { + "epoch": 3.5159784560143628, + "grad_norm": 0.7794315814971924, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 48960 + }, + { + "epoch": 3.5166965888689408, + "grad_norm": 0.9226023554801941, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 48970 + }, + { + "epoch": 3.5174147217235188, + "grad_norm": 0.8442051410675049, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48980 + }, + { + "epoch": 3.5181328545780968, + "grad_norm": 0.9769423007965088, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 48990 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 0.740347146987915, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 49000 + }, + { + "epoch": 3.519569120287253, + "grad_norm": 0.8963457345962524, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 49010 + }, + { + "epoch": 3.520287253141831, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 49020 + }, + { + "epoch": 3.521005385996409, + "grad_norm": 1.0486022233963013, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 49030 + }, + { + "epoch": 3.5217235188509877, + "grad_norm": 0.95393967628479, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 49040 + }, + { + "epoch": 3.5224416517055657, + "grad_norm": 0.8261157274246216, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49050 + }, + { + "epoch": 3.5231597845601437, + "grad_norm": 0.9321704506874084, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 49060 + }, + { + "epoch": 3.5238779174147217, + "grad_norm": 1.2596088647842407, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 49070 + }, + { + "epoch": 3.5245960502692997, + "grad_norm": 0.8584637641906738, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 49080 + }, + { + "epoch": 3.5253141831238777, + "grad_norm": 0.850520670413971, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 49090 + }, + { + "epoch": 3.526032315978456, + "grad_norm": 0.8915920257568359, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 49100 + }, + { + "epoch": 3.526750448833034, + "grad_norm": 0.9070239067077637, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 49110 + }, + { + "epoch": 3.527468581687612, + "grad_norm": 0.699878990650177, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 49120 + }, + { + "epoch": 3.5281867145421906, + "grad_norm": 0.9003779888153076, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 49130 + }, + { + "epoch": 3.5289048473967686, + "grad_norm": 0.7886711955070496, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 49140 + }, + { + "epoch": 3.5296229802513466, + "grad_norm": 0.7368922233581543, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 49150 + }, + { + "epoch": 3.5303411131059246, + "grad_norm": 0.8585197329521179, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 49160 + }, + { + "epoch": 3.5310592459605026, + "grad_norm": 1.0205435752868652, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 49170 + }, + { + "epoch": 3.5317773788150806, + "grad_norm": 0.8756650686264038, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 49180 + }, + { + "epoch": 3.532495511669659, + "grad_norm": 1.0278643369674683, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 49190 + }, + { + "epoch": 3.533213644524237, + "grad_norm": 0.8641911745071411, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 49200 + }, + { + "epoch": 3.533931777378815, + "grad_norm": 0.8730159401893616, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 49210 + }, + { + "epoch": 3.534649910233393, + "grad_norm": 0.918637216091156, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 49220 + }, + { + "epoch": 3.5353680430879715, + "grad_norm": 1.0467222929000854, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 49230 + }, + { + "epoch": 3.5360861759425495, + "grad_norm": 1.005009412765503, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 49240 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 0.9775063395500183, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 49250 + }, + { + "epoch": 3.5375224416517055, + "grad_norm": 0.8198322057723999, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 49260 + }, + { + "epoch": 3.5382405745062835, + "grad_norm": 0.8184829354286194, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 49270 + }, + { + "epoch": 3.5389587073608615, + "grad_norm": 0.9520270824432373, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 49280 + }, + { + "epoch": 3.53967684021544, + "grad_norm": 0.7816803455352783, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 49290 + }, + { + "epoch": 3.540394973070018, + "grad_norm": 0.6915702819824219, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 49300 + }, + { + "epoch": 3.541113105924596, + "grad_norm": 0.8282375931739807, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 49310 + }, + { + "epoch": 3.5418312387791744, + "grad_norm": 1.0797513723373413, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 49320 + }, + { + "epoch": 3.5425493716337524, + "grad_norm": 0.868671715259552, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 49330 + }, + { + "epoch": 3.5432675044883304, + "grad_norm": 0.8534455895423889, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 49340 + }, + { + "epoch": 3.5439856373429084, + "grad_norm": 0.816411554813385, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 49350 + }, + { + "epoch": 3.5447037701974864, + "grad_norm": 0.7813423275947571, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 49360 + }, + { + "epoch": 3.5454219030520644, + "grad_norm": 0.8002013564109802, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 49370 + }, + { + "epoch": 3.546140035906643, + "grad_norm": 0.9740113615989685, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 49380 + }, + { + "epoch": 3.546858168761221, + "grad_norm": 0.9046127200126648, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 49390 + }, + { + "epoch": 3.547576301615799, + "grad_norm": 0.8635150194168091, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 49400 + }, + { + "epoch": 3.5482944344703773, + "grad_norm": 0.9488558769226074, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 49410 + }, + { + "epoch": 3.5490125673249553, + "grad_norm": 0.9637090563774109, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 49420 + }, + { + "epoch": 3.5497307001795333, + "grad_norm": 1.042245626449585, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 49430 + }, + { + "epoch": 3.5504488330341113, + "grad_norm": 0.9076175689697266, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 49440 + }, + { + "epoch": 3.5511669658886893, + "grad_norm": 0.8480596542358398, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 49450 + }, + { + "epoch": 3.5518850987432673, + "grad_norm": 0.8483007550239563, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 49460 + }, + { + "epoch": 3.5526032315978457, + "grad_norm": 0.7855815887451172, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 49470 + }, + { + "epoch": 3.5533213644524237, + "grad_norm": 0.8435823917388916, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 49480 + }, + { + "epoch": 3.5540394973070017, + "grad_norm": 0.8613026142120361, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 49490 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 0.9654812812805176, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 49500 + }, + { + "epoch": 3.555475763016158, + "grad_norm": 0.8888838887214661, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 49510 + }, + { + "epoch": 3.556193895870736, + "grad_norm": 0.7718146443367004, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49520 + }, + { + "epoch": 3.556912028725314, + "grad_norm": 0.9487382173538208, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 49530 + }, + { + "epoch": 3.557630161579892, + "grad_norm": 0.9256559610366821, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 49540 + }, + { + "epoch": 3.55834829443447, + "grad_norm": 0.8879945874214172, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 49550 + }, + { + "epoch": 3.559066427289048, + "grad_norm": 0.8498744368553162, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 49560 + }, + { + "epoch": 3.5597845601436267, + "grad_norm": 0.9550948143005371, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 49570 + }, + { + "epoch": 3.5605026929982047, + "grad_norm": 0.8386164903640747, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 49580 + }, + { + "epoch": 3.5612208258527827, + "grad_norm": 0.925573468208313, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 49590 + }, + { + "epoch": 3.561938958707361, + "grad_norm": 0.8867112398147583, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 49600 + }, + { + "epoch": 3.562657091561939, + "grad_norm": 0.7638537883758545, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 49610 + }, + { + "epoch": 3.563375224416517, + "grad_norm": 0.9491845965385437, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 49620 + }, + { + "epoch": 3.564093357271095, + "grad_norm": 0.8384189605712891, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 49630 + }, + { + "epoch": 3.564811490125673, + "grad_norm": 0.8850575089454651, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 49640 + }, + { + "epoch": 3.565529622980251, + "grad_norm": 1.020916223526001, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 49650 + }, + { + "epoch": 3.5662477558348296, + "grad_norm": 0.9298280477523804, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 49660 + }, + { + "epoch": 3.5669658886894076, + "grad_norm": 0.9795742034912109, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 49670 + }, + { + "epoch": 3.5676840215439856, + "grad_norm": 0.9401193261146545, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 49680 + }, + { + "epoch": 3.568402154398564, + "grad_norm": 1.0383585691452026, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49690 + }, + { + "epoch": 3.569120287253142, + "grad_norm": 0.8370866179466248, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 49700 + }, + { + "epoch": 3.56983842010772, + "grad_norm": 0.8207486271858215, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 49710 + }, + { + "epoch": 3.570556552962298, + "grad_norm": 0.8551223278045654, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49720 + }, + { + "epoch": 3.571274685816876, + "grad_norm": 0.8041176199913025, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 49730 + }, + { + "epoch": 3.571992818671454, + "grad_norm": 0.9862527847290039, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 49740 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 0.7557165622711182, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 49750 + }, + { + "epoch": 3.5734290843806105, + "grad_norm": 1.0908563137054443, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 49760 + }, + { + "epoch": 3.5741472172351885, + "grad_norm": 0.7245369553565979, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 49770 + }, + { + "epoch": 3.5748653500897665, + "grad_norm": 0.7851184010505676, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 49780 + }, + { + "epoch": 3.575583482944345, + "grad_norm": 0.9443599581718445, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 49790 + }, + { + "epoch": 3.576301615798923, + "grad_norm": 1.021196961402893, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 49800 + }, + { + "epoch": 3.577019748653501, + "grad_norm": 0.9099196195602417, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 49810 + }, + { + "epoch": 3.577737881508079, + "grad_norm": 0.9397716522216797, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 49820 + }, + { + "epoch": 3.578456014362657, + "grad_norm": 0.9214922785758972, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 49830 + }, + { + "epoch": 3.579174147217235, + "grad_norm": 1.0053879022598267, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 49840 + }, + { + "epoch": 3.5798922800718134, + "grad_norm": 0.9415460228919983, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 49850 + }, + { + "epoch": 3.5806104129263914, + "grad_norm": 1.0807833671569824, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 49860 + }, + { + "epoch": 3.5813285457809694, + "grad_norm": 1.0070871114730835, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 49870 + }, + { + "epoch": 3.582046678635548, + "grad_norm": 0.9707024693489075, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 49880 + }, + { + "epoch": 3.582764811490126, + "grad_norm": 0.9979593753814697, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 49890 + }, + { + "epoch": 3.583482944344704, + "grad_norm": 0.7238648533821106, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 49900 + }, + { + "epoch": 3.584201077199282, + "grad_norm": 0.8168631792068481, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 49910 + }, + { + "epoch": 3.58491921005386, + "grad_norm": 0.8156409859657288, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 49920 + }, + { + "epoch": 3.585637342908438, + "grad_norm": 0.9256414175033569, + "learning_rate": 0.0002, + "loss": 0.6248, + "step": 49930 + }, + { + "epoch": 3.5863554757630163, + "grad_norm": 1.0090070962905884, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 49940 + }, + { + "epoch": 3.5870736086175943, + "grad_norm": 0.8257701992988586, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 49950 + }, + { + "epoch": 3.5877917414721723, + "grad_norm": 0.9189013242721558, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 49960 + }, + { + "epoch": 3.5885098743267507, + "grad_norm": 0.8497788310050964, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 49970 + }, + { + "epoch": 3.5892280071813287, + "grad_norm": 0.9596505761146545, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 49980 + }, + { + "epoch": 3.5899461400359067, + "grad_norm": 0.8773331642150879, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 49990 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 0.8952302932739258, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50000 + }, + { + "epoch": 3.5913824057450627, + "grad_norm": 0.7713809609413147, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 50010 + }, + { + "epoch": 3.5921005385996407, + "grad_norm": 1.0151346921920776, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 50020 + }, + { + "epoch": 3.592818671454219, + "grad_norm": 0.8793733716011047, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 50030 + }, + { + "epoch": 3.593536804308797, + "grad_norm": 0.8881325721740723, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 50040 + }, + { + "epoch": 3.594254937163375, + "grad_norm": 0.9346749782562256, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 50050 + }, + { + "epoch": 3.594973070017953, + "grad_norm": 0.8705052137374878, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 50060 + }, + { + "epoch": 3.5956912028725316, + "grad_norm": 1.039197564125061, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 50070 + }, + { + "epoch": 3.5964093357271096, + "grad_norm": 0.7053273320198059, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 50080 + }, + { + "epoch": 3.5971274685816876, + "grad_norm": 0.8268665671348572, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 50090 + }, + { + "epoch": 3.5978456014362656, + "grad_norm": 0.8921764492988586, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 50100 + }, + { + "epoch": 3.5985637342908436, + "grad_norm": 0.9756084680557251, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 50110 + }, + { + "epoch": 3.5992818671454216, + "grad_norm": 0.9275530576705933, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 50120 + }, + { + "epoch": 3.6, + "grad_norm": 0.9030009508132935, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 50130 + }, + { + "epoch": 3.600718132854578, + "grad_norm": 0.7805638909339905, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 50140 + }, + { + "epoch": 3.601436265709156, + "grad_norm": 0.7627325057983398, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 50150 + }, + { + "epoch": 3.6021543985637345, + "grad_norm": 0.7809714078903198, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 50160 + }, + { + "epoch": 3.6028725314183125, + "grad_norm": 0.7910378575325012, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 50170 + }, + { + "epoch": 3.6035906642728905, + "grad_norm": 1.004438042640686, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 50180 + }, + { + "epoch": 3.6043087971274685, + "grad_norm": 0.825969934463501, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 50190 + }, + { + "epoch": 3.6050269299820465, + "grad_norm": 0.8866565227508545, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 50200 + }, + { + "epoch": 3.6057450628366245, + "grad_norm": 0.8920543193817139, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 50210 + }, + { + "epoch": 3.606463195691203, + "grad_norm": 1.106584906578064, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 50220 + }, + { + "epoch": 3.607181328545781, + "grad_norm": 0.916607677936554, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 50230 + }, + { + "epoch": 3.607899461400359, + "grad_norm": 0.8014767169952393, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 50240 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 0.9556822776794434, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 50250 + }, + { + "epoch": 3.6093357271095154, + "grad_norm": 0.9630016684532166, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50260 + }, + { + "epoch": 3.6100538599640934, + "grad_norm": 0.9862125515937805, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 50270 + }, + { + "epoch": 3.6107719928186714, + "grad_norm": 1.0043333768844604, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 50280 + }, + { + "epoch": 3.6114901256732495, + "grad_norm": 0.9255319833755493, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 50290 + }, + { + "epoch": 3.6122082585278275, + "grad_norm": 1.012023687362671, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 50300 + }, + { + "epoch": 3.612926391382406, + "grad_norm": 1.0701122283935547, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50310 + }, + { + "epoch": 3.613644524236984, + "grad_norm": 0.8270810842514038, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 50320 + }, + { + "epoch": 3.614362657091562, + "grad_norm": 0.8881328105926514, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 50330 + }, + { + "epoch": 3.61508078994614, + "grad_norm": 0.9536844491958618, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 50340 + }, + { + "epoch": 3.6157989228007184, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 50350 + }, + { + "epoch": 3.6165170556552964, + "grad_norm": 0.834591805934906, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50360 + }, + { + "epoch": 3.6172351885098744, + "grad_norm": 0.903752863407135, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 50370 + }, + { + "epoch": 3.6179533213644524, + "grad_norm": 0.9148632884025574, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 50380 + }, + { + "epoch": 3.6186714542190304, + "grad_norm": 0.9280176162719727, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 50390 + }, + { + "epoch": 3.6193895870736084, + "grad_norm": 0.9524136781692505, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 50400 + }, + { + "epoch": 3.620107719928187, + "grad_norm": 1.1751197576522827, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 50410 + }, + { + "epoch": 3.620825852782765, + "grad_norm": 1.032279133796692, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 50420 + }, + { + "epoch": 3.621543985637343, + "grad_norm": 0.790741503238678, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 50430 + }, + { + "epoch": 3.6222621184919213, + "grad_norm": 0.9584221243858337, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 50440 + }, + { + "epoch": 3.6229802513464993, + "grad_norm": 0.7792508006095886, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 50450 + }, + { + "epoch": 3.6236983842010773, + "grad_norm": 0.8273448944091797, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 50460 + }, + { + "epoch": 3.6244165170556553, + "grad_norm": 0.8001132607460022, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 50470 + }, + { + "epoch": 3.6251346499102333, + "grad_norm": 1.077109694480896, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 50480 + }, + { + "epoch": 3.6258527827648113, + "grad_norm": 1.111274003982544, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 50490 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 50500 + }, + { + "epoch": 3.6272890484739677, + "grad_norm": 0.9217049479484558, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 50510 + }, + { + "epoch": 3.6280071813285457, + "grad_norm": 0.9362251162528992, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 50520 + }, + { + "epoch": 3.6287253141831237, + "grad_norm": 0.9435479044914246, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 50530 + }, + { + "epoch": 3.629443447037702, + "grad_norm": 0.7748915553092957, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 50540 + }, + { + "epoch": 3.63016157989228, + "grad_norm": 0.8238945007324219, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 50550 + }, + { + "epoch": 3.630879712746858, + "grad_norm": 0.8421505093574524, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 50560 + }, + { + "epoch": 3.631597845601436, + "grad_norm": 1.0272293090820312, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 50570 + }, + { + "epoch": 3.632315978456014, + "grad_norm": 0.7643818259239197, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 50580 + }, + { + "epoch": 3.6330341113105926, + "grad_norm": 0.9756225347518921, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 50590 + }, + { + "epoch": 3.6337522441651706, + "grad_norm": 0.9311570525169373, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 50600 + }, + { + "epoch": 3.6344703770197486, + "grad_norm": 0.8829827904701233, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 50610 + }, + { + "epoch": 3.6351885098743266, + "grad_norm": 0.9473454356193542, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 50620 + }, + { + "epoch": 3.635906642728905, + "grad_norm": 1.1023668050765991, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 50630 + }, + { + "epoch": 3.636624775583483, + "grad_norm": 0.8490299582481384, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 50640 + }, + { + "epoch": 3.637342908438061, + "grad_norm": 1.1129392385482788, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 50650 + }, + { + "epoch": 3.638061041292639, + "grad_norm": 1.0334501266479492, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 50660 + }, + { + "epoch": 3.638779174147217, + "grad_norm": 0.8397296667098999, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 50670 + }, + { + "epoch": 3.639497307001795, + "grad_norm": 0.7984256744384766, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 50680 + }, + { + "epoch": 3.6402154398563735, + "grad_norm": 1.1182054281234741, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 50690 + }, + { + "epoch": 3.6409335727109515, + "grad_norm": 0.8743279576301575, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 50700 + }, + { + "epoch": 3.6416517055655295, + "grad_norm": 0.9101628661155701, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 50710 + }, + { + "epoch": 3.642369838420108, + "grad_norm": 0.8866934180259705, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 50720 + }, + { + "epoch": 3.643087971274686, + "grad_norm": 0.863945484161377, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 50730 + }, + { + "epoch": 3.643806104129264, + "grad_norm": 1.0845744609832764, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 50740 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 0.8610911965370178, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 50750 + }, + { + "epoch": 3.64524236983842, + "grad_norm": 0.8502625226974487, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 50760 + }, + { + "epoch": 3.645960502692998, + "grad_norm": 0.847372829914093, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 50770 + }, + { + "epoch": 3.6466786355475764, + "grad_norm": 0.8649292588233948, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 50780 + }, + { + "epoch": 3.6473967684021544, + "grad_norm": 0.8742905855178833, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 50790 + }, + { + "epoch": 3.6481149012567324, + "grad_norm": 0.9546048641204834, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 50800 + }, + { + "epoch": 3.6488330341113104, + "grad_norm": 0.7893161773681641, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 50810 + }, + { + "epoch": 3.649551166965889, + "grad_norm": 0.9350247979164124, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 50820 + }, + { + "epoch": 3.650269299820467, + "grad_norm": 0.772149384021759, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 50830 + }, + { + "epoch": 3.650987432675045, + "grad_norm": 0.8281718492507935, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 50840 + }, + { + "epoch": 3.651705565529623, + "grad_norm": 0.8063850402832031, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 50850 + }, + { + "epoch": 3.652423698384201, + "grad_norm": 0.8101351261138916, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 50860 + }, + { + "epoch": 3.6531418312387793, + "grad_norm": 0.8747833371162415, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 50870 + }, + { + "epoch": 3.6538599640933573, + "grad_norm": 0.9634656310081482, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 50880 + }, + { + "epoch": 3.6545780969479353, + "grad_norm": 1.1646045446395874, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 50890 + }, + { + "epoch": 3.6552962298025133, + "grad_norm": 0.8538454174995422, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 3.656014362657092, + "grad_norm": 0.7639184594154358, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 50910 + }, + { + "epoch": 3.65673249551167, + "grad_norm": 0.8750212788581848, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 50920 + }, + { + "epoch": 3.657450628366248, + "grad_norm": 0.9161198735237122, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 50930 + }, + { + "epoch": 3.658168761220826, + "grad_norm": 0.7987924814224243, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 50940 + }, + { + "epoch": 3.658886894075404, + "grad_norm": 0.8939290642738342, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 50950 + }, + { + "epoch": 3.659605026929982, + "grad_norm": 0.9803797602653503, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 50960 + }, + { + "epoch": 3.6603231597845602, + "grad_norm": 1.2423512935638428, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 50970 + }, + { + "epoch": 3.6610412926391382, + "grad_norm": 1.0023225545883179, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 50980 + }, + { + "epoch": 3.6617594254937162, + "grad_norm": 0.9066677689552307, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 50990 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 0.8906226754188538, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 51000 + }, + { + "epoch": 3.6631956912028727, + "grad_norm": 0.7449954152107239, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51010 + }, + { + "epoch": 3.6639138240574507, + "grad_norm": 0.812612771987915, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 51020 + }, + { + "epoch": 3.6646319569120287, + "grad_norm": 0.861818253993988, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 51030 + }, + { + "epoch": 3.6653500897666067, + "grad_norm": 0.849726676940918, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 51040 + }, + { + "epoch": 3.6660682226211847, + "grad_norm": 0.9738494753837585, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 51050 + }, + { + "epoch": 3.666786355475763, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 51060 + }, + { + "epoch": 3.667504488330341, + "grad_norm": 0.9725563526153564, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 51070 + }, + { + "epoch": 3.668222621184919, + "grad_norm": 0.9366095066070557, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51080 + }, + { + "epoch": 3.668940754039497, + "grad_norm": 0.8012986779212952, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 51090 + }, + { + "epoch": 3.6696588868940756, + "grad_norm": 1.0646892786026, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51100 + }, + { + "epoch": 3.6703770197486536, + "grad_norm": 0.7245157361030579, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 51110 + }, + { + "epoch": 3.6710951526032316, + "grad_norm": 0.6938936114311218, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 51120 + }, + { + "epoch": 3.6718132854578096, + "grad_norm": 0.8461366295814514, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 51130 + }, + { + "epoch": 3.6725314183123876, + "grad_norm": 0.8392583131790161, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 51140 + }, + { + "epoch": 3.673249551166966, + "grad_norm": 0.7245259284973145, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 51150 + }, + { + "epoch": 3.673967684021544, + "grad_norm": 1.0742167234420776, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 51160 + }, + { + "epoch": 3.674685816876122, + "grad_norm": 0.9553889036178589, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 51170 + }, + { + "epoch": 3.6754039497307, + "grad_norm": 0.8713715672492981, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 51180 + }, + { + "epoch": 3.6761220825852785, + "grad_norm": 0.7499800324440002, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 51190 + }, + { + "epoch": 3.6768402154398565, + "grad_norm": 1.1118139028549194, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 51200 + }, + { + "epoch": 3.6775583482944345, + "grad_norm": 0.8146613836288452, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 51210 + }, + { + "epoch": 3.6782764811490125, + "grad_norm": 0.9331285357475281, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 51220 + }, + { + "epoch": 3.6789946140035905, + "grad_norm": 1.0497597455978394, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 51230 + }, + { + "epoch": 3.6797127468581685, + "grad_norm": 0.879814863204956, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51240 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 0.9896606802940369, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 51250 + }, + { + "epoch": 3.681149012567325, + "grad_norm": 0.928236186504364, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 51260 + }, + { + "epoch": 3.681867145421903, + "grad_norm": 0.8436732292175293, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 51270 + }, + { + "epoch": 3.6825852782764814, + "grad_norm": 0.93634432554245, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51280 + }, + { + "epoch": 3.6833034111310594, + "grad_norm": 0.8477143049240112, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 51290 + }, + { + "epoch": 3.6840215439856374, + "grad_norm": 0.8720934987068176, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 51300 + }, + { + "epoch": 3.6847396768402154, + "grad_norm": 0.7322931289672852, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 51310 + }, + { + "epoch": 3.6854578096947934, + "grad_norm": 1.0064427852630615, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 51320 + }, + { + "epoch": 3.6861759425493714, + "grad_norm": 1.0197817087173462, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 51330 + }, + { + "epoch": 3.68689407540395, + "grad_norm": 0.8764060139656067, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 51340 + }, + { + "epoch": 3.687612208258528, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 51350 + }, + { + "epoch": 3.688330341113106, + "grad_norm": 0.8389105200767517, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 51360 + }, + { + "epoch": 3.689048473967684, + "grad_norm": 0.9215750694274902, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 51370 + }, + { + "epoch": 3.6897666068222623, + "grad_norm": 0.8444913625717163, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 51380 + }, + { + "epoch": 3.6904847396768403, + "grad_norm": 0.9635153412818909, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 51390 + }, + { + "epoch": 3.6912028725314183, + "grad_norm": 1.0397378206253052, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 51400 + }, + { + "epoch": 3.6919210053859963, + "grad_norm": 0.9154748320579529, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 51410 + }, + { + "epoch": 3.6926391382405743, + "grad_norm": 0.906445324420929, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 51420 + }, + { + "epoch": 3.6933572710951523, + "grad_norm": 0.9237992763519287, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 51430 + }, + { + "epoch": 3.6940754039497308, + "grad_norm": 0.8796338438987732, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 51440 + }, + { + "epoch": 3.6947935368043088, + "grad_norm": 0.8613203763961792, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 51450 + }, + { + "epoch": 3.6955116696588868, + "grad_norm": 0.7957607507705688, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 51460 + }, + { + "epoch": 3.6962298025134652, + "grad_norm": 0.9183711409568787, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 51470 + }, + { + "epoch": 3.6969479353680432, + "grad_norm": 1.0108308792114258, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 51480 + }, + { + "epoch": 3.6976660682226212, + "grad_norm": 0.7768247127532959, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 51490 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 1.0051485300064087, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 51500 + }, + { + "epoch": 3.6991023339317772, + "grad_norm": 0.82451993227005, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 51510 + }, + { + "epoch": 3.6998204667863552, + "grad_norm": 0.9542286992073059, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 51520 + }, + { + "epoch": 3.7005385996409337, + "grad_norm": 0.693890392780304, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 51530 + }, + { + "epoch": 3.7012567324955117, + "grad_norm": 0.9068924784660339, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 51540 + }, + { + "epoch": 3.7019748653500897, + "grad_norm": 0.8694922924041748, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 51550 + }, + { + "epoch": 3.702692998204668, + "grad_norm": 0.941081702709198, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 51560 + }, + { + "epoch": 3.703411131059246, + "grad_norm": 0.7385984659194946, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 51570 + }, + { + "epoch": 3.704129263913824, + "grad_norm": 1.0399216413497925, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51580 + }, + { + "epoch": 3.704847396768402, + "grad_norm": 0.9802294969558716, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 51590 + }, + { + "epoch": 3.70556552962298, + "grad_norm": 1.0409669876098633, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51600 + }, + { + "epoch": 3.706283662477558, + "grad_norm": 0.8972786068916321, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 51610 + }, + { + "epoch": 3.7070017953321366, + "grad_norm": 1.1916245222091675, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 51620 + }, + { + "epoch": 3.7077199281867146, + "grad_norm": 0.9545385241508484, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 51630 + }, + { + "epoch": 3.7084380610412926, + "grad_norm": 1.0773427486419678, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 51640 + }, + { + "epoch": 3.7091561938958706, + "grad_norm": 1.0856024026870728, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 51650 + }, + { + "epoch": 3.709874326750449, + "grad_norm": 0.7678500413894653, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51660 + }, + { + "epoch": 3.710592459605027, + "grad_norm": 0.7276270985603333, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 51670 + }, + { + "epoch": 3.711310592459605, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 51680 + }, + { + "epoch": 3.712028725314183, + "grad_norm": 0.9037614464759827, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 51690 + }, + { + "epoch": 3.712746858168761, + "grad_norm": 0.9223412275314331, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51700 + }, + { + "epoch": 3.713464991023339, + "grad_norm": 0.8812923431396484, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 51710 + }, + { + "epoch": 3.7141831238779175, + "grad_norm": 0.8242456912994385, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 51720 + }, + { + "epoch": 3.7149012567324955, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 51730 + }, + { + "epoch": 3.7156193895870735, + "grad_norm": 0.8624704480171204, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 51740 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 0.9138273596763611, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51750 + }, + { + "epoch": 3.71705565529623, + "grad_norm": 0.8088571429252625, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 51760 + }, + { + "epoch": 3.717773788150808, + "grad_norm": 0.882808268070221, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 51770 + }, + { + "epoch": 3.718491921005386, + "grad_norm": 0.9368035197257996, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 51780 + }, + { + "epoch": 3.719210053859964, + "grad_norm": 0.8341794013977051, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 51790 + }, + { + "epoch": 3.719928186714542, + "grad_norm": 0.8692073225975037, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 51800 + }, + { + "epoch": 3.7206463195691204, + "grad_norm": 0.7566918730735779, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 51810 + }, + { + "epoch": 3.7213644524236984, + "grad_norm": 1.113138198852539, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 51820 + }, + { + "epoch": 3.7220825852782764, + "grad_norm": 0.8793158531188965, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 51830 + }, + { + "epoch": 3.722800718132855, + "grad_norm": 0.8856439590454102, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 51840 + }, + { + "epoch": 3.723518850987433, + "grad_norm": 1.0182029008865356, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 51850 + }, + { + "epoch": 3.724236983842011, + "grad_norm": 1.1177181005477905, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 51860 + }, + { + "epoch": 3.724955116696589, + "grad_norm": 0.6600990295410156, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 51870 + }, + { + "epoch": 3.725673249551167, + "grad_norm": 1.0563536882400513, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 51880 + }, + { + "epoch": 3.726391382405745, + "grad_norm": 1.1067734956741333, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 51890 + }, + { + "epoch": 3.7271095152603233, + "grad_norm": 1.0204616785049438, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 51900 + }, + { + "epoch": 3.7278276481149013, + "grad_norm": 0.8647155165672302, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51910 + }, + { + "epoch": 3.7285457809694793, + "grad_norm": 1.0754971504211426, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 51920 + }, + { + "epoch": 3.7292639138240573, + "grad_norm": 1.0448992252349854, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 51930 + }, + { + "epoch": 3.7299820466786358, + "grad_norm": 0.963434100151062, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 51940 + }, + { + "epoch": 3.7307001795332138, + "grad_norm": 0.8112701773643494, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51950 + }, + { + "epoch": 3.7314183123877918, + "grad_norm": 0.7975119948387146, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 51960 + }, + { + "epoch": 3.7321364452423698, + "grad_norm": 0.7953376173973083, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 51970 + }, + { + "epoch": 3.7328545780969478, + "grad_norm": 0.9519981741905212, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 51980 + }, + { + "epoch": 3.7335727109515258, + "grad_norm": 0.8705791234970093, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 51990 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 0.870205283164978, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 52000 + }, + { + "epoch": 3.735008976660682, + "grad_norm": 0.9558930993080139, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 52010 + }, + { + "epoch": 3.73572710951526, + "grad_norm": 0.9330434799194336, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 52020 + }, + { + "epoch": 3.7364452423698387, + "grad_norm": 0.783620297908783, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 52030 + }, + { + "epoch": 3.7371633752244167, + "grad_norm": 0.7575166821479797, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52040 + }, + { + "epoch": 3.7378815080789947, + "grad_norm": 1.0592705011367798, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 52050 + }, + { + "epoch": 3.7385996409335727, + "grad_norm": 0.9309433102607727, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 52060 + }, + { + "epoch": 3.7393177737881507, + "grad_norm": 0.972861647605896, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 52070 + }, + { + "epoch": 3.7400359066427287, + "grad_norm": 0.9318740963935852, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 52080 + }, + { + "epoch": 3.740754039497307, + "grad_norm": 0.7938477396965027, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 52090 + }, + { + "epoch": 3.741472172351885, + "grad_norm": 1.1515966653823853, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 52100 + }, + { + "epoch": 3.742190305206463, + "grad_norm": 1.076869010925293, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 52110 + }, + { + "epoch": 3.7429084380610416, + "grad_norm": 0.8516066670417786, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 52120 + }, + { + "epoch": 3.7436265709156196, + "grad_norm": 0.6853429079055786, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 52130 + }, + { + "epoch": 3.7443447037701976, + "grad_norm": 0.8179695010185242, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52140 + }, + { + "epoch": 3.7450628366247756, + "grad_norm": 0.8395232558250427, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 52150 + }, + { + "epoch": 3.7457809694793536, + "grad_norm": 1.0178003311157227, + "learning_rate": 0.0002, + "loss": 0.6902, + "step": 52160 + }, + { + "epoch": 3.7464991023339316, + "grad_norm": 1.1801023483276367, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 52170 + }, + { + "epoch": 3.74721723518851, + "grad_norm": 0.8215751647949219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 52180 + }, + { + "epoch": 3.747935368043088, + "grad_norm": 1.17083740234375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 52190 + }, + { + "epoch": 3.748653500897666, + "grad_norm": 0.9230290651321411, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 52200 + }, + { + "epoch": 3.749371633752244, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 52210 + }, + { + "epoch": 3.7500897666068225, + "grad_norm": 0.9690840244293213, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 52220 + }, + { + "epoch": 3.7508078994614005, + "grad_norm": 1.0022395849227905, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 52230 + }, + { + "epoch": 3.7515260323159785, + "grad_norm": 1.0489065647125244, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 52240 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 0.7880696058273315, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 52250 + }, + { + "epoch": 3.7529622980251345, + "grad_norm": 1.0255829095840454, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 52260 + }, + { + "epoch": 3.7536804308797125, + "grad_norm": 0.8470141291618347, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 52270 + }, + { + "epoch": 3.754398563734291, + "grad_norm": 0.9040523171424866, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 52280 + }, + { + "epoch": 3.755116696588869, + "grad_norm": 0.9564392566680908, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 52290 + }, + { + "epoch": 3.755834829443447, + "grad_norm": 0.907857358455658, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 52300 + }, + { + "epoch": 3.7565529622980254, + "grad_norm": 0.8929873704910278, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 52310 + }, + { + "epoch": 3.7572710951526034, + "grad_norm": 0.854434072971344, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 52320 + }, + { + "epoch": 3.7579892280071814, + "grad_norm": 0.8744779229164124, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 52330 + }, + { + "epoch": 3.7587073608617594, + "grad_norm": 0.9022667407989502, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52340 + }, + { + "epoch": 3.7594254937163374, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52350 + }, + { + "epoch": 3.7601436265709154, + "grad_norm": 1.0228430032730103, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 52360 + }, + { + "epoch": 3.760861759425494, + "grad_norm": 0.8593528270721436, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 52370 + }, + { + "epoch": 3.761579892280072, + "grad_norm": 0.9435563087463379, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 52380 + }, + { + "epoch": 3.76229802513465, + "grad_norm": 0.7545679807662964, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52390 + }, + { + "epoch": 3.7630161579892283, + "grad_norm": 0.9411585927009583, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52400 + }, + { + "epoch": 3.7637342908438063, + "grad_norm": 0.9764377474784851, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 52410 + }, + { + "epoch": 3.7644524236983843, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 52420 + }, + { + "epoch": 3.7651705565529623, + "grad_norm": 0.8765230774879456, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52430 + }, + { + "epoch": 3.7658886894075403, + "grad_norm": 0.9275036454200745, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 52440 + }, + { + "epoch": 3.7666068222621183, + "grad_norm": 0.967410147190094, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 52450 + }, + { + "epoch": 3.7673249551166967, + "grad_norm": 0.7738949060440063, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 52460 + }, + { + "epoch": 3.7680430879712747, + "grad_norm": 1.0828070640563965, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 52470 + }, + { + "epoch": 3.7687612208258527, + "grad_norm": 0.9570213556289673, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 52480 + }, + { + "epoch": 3.7694793536804307, + "grad_norm": 1.0688215494155884, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 52490 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 0.7970073223114014, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 52500 + }, + { + "epoch": 3.770915619389587, + "grad_norm": 0.7132976651191711, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 52510 + }, + { + "epoch": 3.771633752244165, + "grad_norm": 1.152268648147583, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 52520 + }, + { + "epoch": 3.772351885098743, + "grad_norm": 0.8645235896110535, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52530 + }, + { + "epoch": 3.773070017953321, + "grad_norm": 0.7725570201873779, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 52540 + }, + { + "epoch": 3.773788150807899, + "grad_norm": 0.9718102812767029, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 52550 + }, + { + "epoch": 3.7745062836624776, + "grad_norm": 0.7568017840385437, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 52560 + }, + { + "epoch": 3.7752244165170556, + "grad_norm": 0.9578912854194641, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 52570 + }, + { + "epoch": 3.7759425493716336, + "grad_norm": 0.8657314777374268, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 52580 + }, + { + "epoch": 3.776660682226212, + "grad_norm": 0.7564393281936646, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 52590 + }, + { + "epoch": 3.77737881508079, + "grad_norm": 0.7631160616874695, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 52600 + }, + { + "epoch": 3.778096947935368, + "grad_norm": 1.1852056980133057, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 52610 + }, + { + "epoch": 3.778815080789946, + "grad_norm": 1.0620790719985962, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 52620 + }, + { + "epoch": 3.779533213644524, + "grad_norm": 0.8677777647972107, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 52630 + }, + { + "epoch": 3.780251346499102, + "grad_norm": 0.9913218021392822, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 52640 + }, + { + "epoch": 3.7809694793536806, + "grad_norm": 0.9868429899215698, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 52650 + }, + { + "epoch": 3.7816876122082586, + "grad_norm": 0.8791782259941101, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 52660 + }, + { + "epoch": 3.7824057450628366, + "grad_norm": 0.9503955245018005, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 52670 + }, + { + "epoch": 3.7831238779174146, + "grad_norm": 0.8647131323814392, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 52680 + }, + { + "epoch": 3.783842010771993, + "grad_norm": 0.9819629788398743, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52690 + }, + { + "epoch": 3.784560143626571, + "grad_norm": 0.8548610210418701, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 52700 + }, + { + "epoch": 3.785278276481149, + "grad_norm": 0.8706230521202087, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 52710 + }, + { + "epoch": 3.785996409335727, + "grad_norm": 1.0032461881637573, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52720 + }, + { + "epoch": 3.786714542190305, + "grad_norm": 1.0578246116638184, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 52730 + }, + { + "epoch": 3.7874326750448835, + "grad_norm": 0.9854007363319397, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52740 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 0.8389187455177307, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 52750 + }, + { + "epoch": 3.7888689407540395, + "grad_norm": 0.9192399978637695, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 52760 + }, + { + "epoch": 3.7895870736086175, + "grad_norm": 0.9518283605575562, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 52770 + }, + { + "epoch": 3.790305206463196, + "grad_norm": 1.1296825408935547, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52780 + }, + { + "epoch": 3.791023339317774, + "grad_norm": 1.0589144229888916, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 52790 + }, + { + "epoch": 3.791741472172352, + "grad_norm": 0.8954343199729919, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 52800 + }, + { + "epoch": 3.79245960502693, + "grad_norm": 0.8283370733261108, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 52810 + }, + { + "epoch": 3.793177737881508, + "grad_norm": 0.910642683506012, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 52820 + }, + { + "epoch": 3.793895870736086, + "grad_norm": 0.9255108833312988, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 52830 + }, + { + "epoch": 3.7946140035906644, + "grad_norm": 0.8773723244667053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 52840 + }, + { + "epoch": 3.7953321364452424, + "grad_norm": 0.8454240560531616, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 52850 + }, + { + "epoch": 3.7960502692998204, + "grad_norm": 0.7636052966117859, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 52860 + }, + { + "epoch": 3.796768402154399, + "grad_norm": 0.9358382821083069, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 52870 + }, + { + "epoch": 3.797486535008977, + "grad_norm": 0.9662801623344421, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 52880 + }, + { + "epoch": 3.798204667863555, + "grad_norm": 0.995907187461853, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 52890 + }, + { + "epoch": 3.798922800718133, + "grad_norm": 0.8700127005577087, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 52900 + }, + { + "epoch": 3.799640933572711, + "grad_norm": 0.8987792134284973, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 52910 + }, + { + "epoch": 3.800359066427289, + "grad_norm": 0.9753904938697815, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 52920 + }, + { + "epoch": 3.8010771992818673, + "grad_norm": 0.7873555421829224, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 52930 + }, + { + "epoch": 3.8017953321364453, + "grad_norm": 0.8177929520606995, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 52940 + }, + { + "epoch": 3.8025134649910233, + "grad_norm": 0.8865532279014587, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 52950 + }, + { + "epoch": 3.8032315978456013, + "grad_norm": 0.9113775491714478, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 52960 + }, + { + "epoch": 3.8039497307001797, + "grad_norm": 0.9424585700035095, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 52970 + }, + { + "epoch": 3.8046678635547577, + "grad_norm": 0.8347237706184387, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 52980 + }, + { + "epoch": 3.8053859964093357, + "grad_norm": 0.826863169670105, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 52990 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 0.7313310503959656, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 53000 + }, + { + "epoch": 3.8068222621184917, + "grad_norm": 0.8352667093276978, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 53010 + }, + { + "epoch": 3.80754039497307, + "grad_norm": 0.748461127281189, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 53020 + }, + { + "epoch": 3.808258527827648, + "grad_norm": 0.943256139755249, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 53030 + }, + { + "epoch": 3.808976660682226, + "grad_norm": 1.0448410511016846, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 53040 + }, + { + "epoch": 3.809694793536804, + "grad_norm": 0.9047636985778809, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 53050 + }, + { + "epoch": 3.8104129263913826, + "grad_norm": 0.8594381213188171, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 53060 + }, + { + "epoch": 3.8111310592459606, + "grad_norm": 0.7593536972999573, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 53070 + }, + { + "epoch": 3.8118491921005386, + "grad_norm": 0.7189019918441772, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 53080 + }, + { + "epoch": 3.8125673249551166, + "grad_norm": 0.8569809198379517, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53090 + }, + { + "epoch": 3.8132854578096946, + "grad_norm": 0.923378050327301, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53100 + }, + { + "epoch": 3.8140035906642726, + "grad_norm": 0.9088824391365051, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 53110 + }, + { + "epoch": 3.814721723518851, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 53120 + }, + { + "epoch": 3.815439856373429, + "grad_norm": 0.8389552235603333, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 53130 + }, + { + "epoch": 3.816157989228007, + "grad_norm": 0.7940975427627563, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 53140 + }, + { + "epoch": 3.8168761220825855, + "grad_norm": 0.8389907479286194, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 53150 + }, + { + "epoch": 3.8175942549371635, + "grad_norm": 0.774206280708313, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 53160 + }, + { + "epoch": 3.8183123877917415, + "grad_norm": 1.189447283744812, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 53170 + }, + { + "epoch": 3.8190305206463195, + "grad_norm": 0.9875882863998413, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 53180 + }, + { + "epoch": 3.8197486535008975, + "grad_norm": 0.9205945134162903, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 53190 + }, + { + "epoch": 3.8204667863554755, + "grad_norm": 0.8312796354293823, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 53200 + }, + { + "epoch": 3.821184919210054, + "grad_norm": 0.9755756855010986, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 53210 + }, + { + "epoch": 3.821903052064632, + "grad_norm": 1.0722965002059937, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53220 + }, + { + "epoch": 3.82262118491921, + "grad_norm": 0.7720510959625244, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 53230 + }, + { + "epoch": 3.823339317773788, + "grad_norm": 1.020147681236267, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 53240 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 0.8241816759109497, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53250 + }, + { + "epoch": 3.8247755834829444, + "grad_norm": 0.8939895629882812, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 53260 + }, + { + "epoch": 3.8254937163375224, + "grad_norm": 1.010852336883545, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 53270 + }, + { + "epoch": 3.8262118491921004, + "grad_norm": 0.8201420307159424, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 53280 + }, + { + "epoch": 3.8269299820466784, + "grad_norm": 0.8797973990440369, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 53290 + }, + { + "epoch": 3.827648114901257, + "grad_norm": 0.9034950137138367, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 53300 + }, + { + "epoch": 3.828366247755835, + "grad_norm": 0.926802933216095, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 53310 + }, + { + "epoch": 3.829084380610413, + "grad_norm": 1.0205509662628174, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 53320 + }, + { + "epoch": 3.829802513464991, + "grad_norm": 0.9524099230766296, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 53330 + }, + { + "epoch": 3.8305206463195693, + "grad_norm": 0.9692625999450684, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 53340 + }, + { + "epoch": 3.8312387791741473, + "grad_norm": 0.7255275845527649, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 53350 + }, + { + "epoch": 3.8319569120287253, + "grad_norm": 0.7199059724807739, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53360 + }, + { + "epoch": 3.8326750448833034, + "grad_norm": 1.004464864730835, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 53370 + }, + { + "epoch": 3.8333931777378814, + "grad_norm": 0.9092583060264587, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53380 + }, + { + "epoch": 3.8341113105924594, + "grad_norm": 0.945091724395752, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 53390 + }, + { + "epoch": 3.834829443447038, + "grad_norm": 0.7980135679244995, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 53400 + }, + { + "epoch": 3.835547576301616, + "grad_norm": 0.7812868356704712, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 53410 + }, + { + "epoch": 3.836265709156194, + "grad_norm": 0.8957077860832214, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53420 + }, + { + "epoch": 3.8369838420107722, + "grad_norm": 0.9119600653648376, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 53430 + }, + { + "epoch": 3.8377019748653503, + "grad_norm": 0.8208187222480774, + "learning_rate": 0.0002, + "loss": 0.7346, + "step": 53440 + }, + { + "epoch": 3.8384201077199283, + "grad_norm": 0.7930439114570618, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 53450 + }, + { + "epoch": 3.8391382405745063, + "grad_norm": 0.8937777280807495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 53460 + }, + { + "epoch": 3.8398563734290843, + "grad_norm": 0.7583796977996826, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 53470 + }, + { + "epoch": 3.8405745062836623, + "grad_norm": 1.0735969543457031, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 53480 + }, + { + "epoch": 3.8412926391382407, + "grad_norm": 1.1106033325195312, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 53490 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 1.092631220817566, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 53500 + }, + { + "epoch": 3.8427289048473967, + "grad_norm": 0.9961787462234497, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 53510 + }, + { + "epoch": 3.8434470377019747, + "grad_norm": 0.833831250667572, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 53520 + }, + { + "epoch": 3.844165170556553, + "grad_norm": 1.0000009536743164, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 53530 + }, + { + "epoch": 3.844883303411131, + "grad_norm": 0.9784213304519653, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 53540 + }, + { + "epoch": 3.845601436265709, + "grad_norm": 0.8582558035850525, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 53550 + }, + { + "epoch": 3.846319569120287, + "grad_norm": 0.8267415761947632, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 53560 + }, + { + "epoch": 3.847037701974865, + "grad_norm": 0.8783000111579895, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 53570 + }, + { + "epoch": 3.8477558348294436, + "grad_norm": 0.9866999983787537, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 53580 + }, + { + "epoch": 3.8484739676840216, + "grad_norm": 0.8459296226501465, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 53590 + }, + { + "epoch": 3.8491921005385996, + "grad_norm": 0.9804834723472595, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 53600 + }, + { + "epoch": 3.8499102333931776, + "grad_norm": 0.951074481010437, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 53610 + }, + { + "epoch": 3.850628366247756, + "grad_norm": 0.8020104169845581, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 53620 + }, + { + "epoch": 3.851346499102334, + "grad_norm": 0.9296963214874268, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 53630 + }, + { + "epoch": 3.852064631956912, + "grad_norm": 0.8983652591705322, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 53640 + }, + { + "epoch": 3.85278276481149, + "grad_norm": 1.031858205795288, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 53650 + }, + { + "epoch": 3.853500897666068, + "grad_norm": 0.8943952918052673, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 53660 + }, + { + "epoch": 3.854219030520646, + "grad_norm": 1.0072312355041504, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 53670 + }, + { + "epoch": 3.8549371633752245, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 53680 + }, + { + "epoch": 3.8556552962298025, + "grad_norm": 0.834223210811615, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 53690 + }, + { + "epoch": 3.8563734290843805, + "grad_norm": 0.9872867465019226, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 53700 + }, + { + "epoch": 3.857091561938959, + "grad_norm": 0.7999459505081177, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53710 + }, + { + "epoch": 3.857809694793537, + "grad_norm": 0.717722475528717, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 53720 + }, + { + "epoch": 3.858527827648115, + "grad_norm": 1.0675442218780518, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 53730 + }, + { + "epoch": 3.859245960502693, + "grad_norm": 0.9789777398109436, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 53740 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 0.9318669438362122, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 53750 + }, + { + "epoch": 3.860682226211849, + "grad_norm": 0.9848631024360657, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 53760 + }, + { + "epoch": 3.8614003590664274, + "grad_norm": 0.8754391670227051, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 53770 + }, + { + "epoch": 3.8621184919210054, + "grad_norm": 0.9024585485458374, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 53780 + }, + { + "epoch": 3.8628366247755834, + "grad_norm": 0.8974794745445251, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 53790 + }, + { + "epoch": 3.8635547576301614, + "grad_norm": 0.8342790603637695, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 53800 + }, + { + "epoch": 3.86427289048474, + "grad_norm": 0.8177682757377625, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 53810 + }, + { + "epoch": 3.864991023339318, + "grad_norm": 1.0259089469909668, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 53820 + }, + { + "epoch": 3.865709156193896, + "grad_norm": 1.042290210723877, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 53830 + }, + { + "epoch": 3.866427289048474, + "grad_norm": 0.7316540479660034, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 53840 + }, + { + "epoch": 3.867145421903052, + "grad_norm": 0.9384970664978027, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53850 + }, + { + "epoch": 3.86786355475763, + "grad_norm": 0.9273143410682678, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53860 + }, + { + "epoch": 3.8685816876122083, + "grad_norm": 1.1183570623397827, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 53870 + }, + { + "epoch": 3.8692998204667863, + "grad_norm": 0.9455275535583496, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 53880 + }, + { + "epoch": 3.8700179533213643, + "grad_norm": 0.8702114820480347, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 53890 + }, + { + "epoch": 3.870736086175943, + "grad_norm": 0.8751053214073181, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53900 + }, + { + "epoch": 3.871454219030521, + "grad_norm": 0.9793110489845276, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 53910 + }, + { + "epoch": 3.872172351885099, + "grad_norm": 0.9705014824867249, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 53920 + }, + { + "epoch": 3.872890484739677, + "grad_norm": 1.051504373550415, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 53930 + }, + { + "epoch": 3.873608617594255, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 53940 + }, + { + "epoch": 3.874326750448833, + "grad_norm": 0.7828099727630615, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 53950 + }, + { + "epoch": 3.8750448833034112, + "grad_norm": 0.86341792345047, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 53960 + }, + { + "epoch": 3.8757630161579892, + "grad_norm": 1.114670991897583, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 53970 + }, + { + "epoch": 3.8764811490125672, + "grad_norm": 0.8559519052505493, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 53980 + }, + { + "epoch": 3.8771992818671457, + "grad_norm": 1.0518953800201416, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 53990 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 0.7157500982284546, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 54000 + }, + { + "epoch": 3.8786355475763017, + "grad_norm": 0.8390372395515442, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 54010 + }, + { + "epoch": 3.8793536804308797, + "grad_norm": 0.8486756086349487, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 54020 + }, + { + "epoch": 3.8800718132854577, + "grad_norm": 0.8361587524414062, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 54030 + }, + { + "epoch": 3.8807899461400357, + "grad_norm": 0.9490554928779602, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 54040 + }, + { + "epoch": 3.881508078994614, + "grad_norm": 1.0311323404312134, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 54050 + }, + { + "epoch": 3.882226211849192, + "grad_norm": 0.84800124168396, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54060 + }, + { + "epoch": 3.88294434470377, + "grad_norm": 0.8940879702568054, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 54070 + }, + { + "epoch": 3.883662477558348, + "grad_norm": 0.985542356967926, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 54080 + }, + { + "epoch": 3.8843806104129266, + "grad_norm": 0.8846475481987, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 54090 + }, + { + "epoch": 3.8850987432675046, + "grad_norm": 0.9186338186264038, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 54100 + }, + { + "epoch": 3.8858168761220826, + "grad_norm": 1.106598973274231, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 54110 + }, + { + "epoch": 3.8865350089766606, + "grad_norm": 0.8167300224304199, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 54120 + }, + { + "epoch": 3.8872531418312386, + "grad_norm": 0.9153622984886169, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 54130 + }, + { + "epoch": 3.8879712746858166, + "grad_norm": 0.8464475274085999, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 54140 + }, + { + "epoch": 3.888689407540395, + "grad_norm": 0.8889452815055847, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 54150 + }, + { + "epoch": 3.889407540394973, + "grad_norm": 0.7861065864562988, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 54160 + }, + { + "epoch": 3.890125673249551, + "grad_norm": 0.882674515247345, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 54170 + }, + { + "epoch": 3.8908438061041295, + "grad_norm": 0.8503835201263428, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 54180 + }, + { + "epoch": 3.8915619389587075, + "grad_norm": 0.888455331325531, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 54190 + }, + { + "epoch": 3.8922800718132855, + "grad_norm": 1.0473699569702148, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 54200 + }, + { + "epoch": 3.8929982046678635, + "grad_norm": 0.9548208713531494, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 54210 + }, + { + "epoch": 3.8937163375224415, + "grad_norm": 0.9158754944801331, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 54220 + }, + { + "epoch": 3.8944344703770195, + "grad_norm": 0.9001154899597168, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54230 + }, + { + "epoch": 3.895152603231598, + "grad_norm": 0.9736626148223877, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54240 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 0.8809846043586731, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 54250 + }, + { + "epoch": 3.896588868940754, + "grad_norm": 0.887583315372467, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 54260 + }, + { + "epoch": 3.8973070017953324, + "grad_norm": 0.8395712971687317, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 54270 + }, + { + "epoch": 3.8980251346499104, + "grad_norm": 0.8391315937042236, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 54280 + }, + { + "epoch": 3.8987432675044884, + "grad_norm": 0.8210049271583557, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54290 + }, + { + "epoch": 3.8994614003590664, + "grad_norm": 1.1364530324935913, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54300 + }, + { + "epoch": 3.9001795332136444, + "grad_norm": 0.7712056636810303, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 54310 + }, + { + "epoch": 3.9008976660682224, + "grad_norm": 0.9466049671173096, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 54320 + }, + { + "epoch": 3.901615798922801, + "grad_norm": 1.0367140769958496, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 54330 + }, + { + "epoch": 3.902333931777379, + "grad_norm": 1.0168321132659912, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 54340 + }, + { + "epoch": 3.903052064631957, + "grad_norm": 0.7830407619476318, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 54350 + }, + { + "epoch": 3.903770197486535, + "grad_norm": 0.9649789333343506, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 54360 + }, + { + "epoch": 3.9044883303411133, + "grad_norm": 0.681077778339386, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 54370 + }, + { + "epoch": 3.9052064631956913, + "grad_norm": 0.8970136046409607, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 54380 + }, + { + "epoch": 3.9059245960502693, + "grad_norm": 0.9155173301696777, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 54390 + }, + { + "epoch": 3.9066427289048473, + "grad_norm": 1.0447794198989868, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 54400 + }, + { + "epoch": 3.9073608617594253, + "grad_norm": 0.7823813557624817, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 54410 + }, + { + "epoch": 3.9080789946140033, + "grad_norm": 0.9289445877075195, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 54420 + }, + { + "epoch": 3.9087971274685818, + "grad_norm": 0.9983111619949341, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 54430 + }, + { + "epoch": 3.9095152603231598, + "grad_norm": 0.7952495813369751, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 54440 + }, + { + "epoch": 3.9102333931777378, + "grad_norm": 0.8045601844787598, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 54450 + }, + { + "epoch": 3.910951526032316, + "grad_norm": 0.936585009098053, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 54460 + }, + { + "epoch": 3.911669658886894, + "grad_norm": 0.745793879032135, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 54470 + }, + { + "epoch": 3.912387791741472, + "grad_norm": 0.9137616157531738, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 54480 + }, + { + "epoch": 3.9131059245960502, + "grad_norm": 0.826316237449646, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 54490 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 0.94313645362854, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 54500 + }, + { + "epoch": 3.9145421903052062, + "grad_norm": 1.045893907546997, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 54510 + }, + { + "epoch": 3.9152603231597847, + "grad_norm": 0.9122704863548279, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 54520 + }, + { + "epoch": 3.9159784560143627, + "grad_norm": 1.0999689102172852, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 54530 + }, + { + "epoch": 3.9166965888689407, + "grad_norm": 0.9281555414199829, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 54540 + }, + { + "epoch": 3.917414721723519, + "grad_norm": 1.1439622640609741, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 54550 + }, + { + "epoch": 3.918132854578097, + "grad_norm": 0.9375617504119873, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 54560 + }, + { + "epoch": 3.918850987432675, + "grad_norm": 0.92906653881073, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 54570 + }, + { + "epoch": 3.919569120287253, + "grad_norm": 1.0840893983840942, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 54580 + }, + { + "epoch": 3.920287253141831, + "grad_norm": 0.8145509362220764, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 54590 + }, + { + "epoch": 3.921005385996409, + "grad_norm": 0.973737895488739, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 54600 + }, + { + "epoch": 3.9217235188509876, + "grad_norm": 0.9302353858947754, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 54610 + }, + { + "epoch": 3.9224416517055656, + "grad_norm": 0.9167897701263428, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 54620 + }, + { + "epoch": 3.9231597845601436, + "grad_norm": 0.8096851706504822, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 54630 + }, + { + "epoch": 3.9238779174147216, + "grad_norm": 0.8006368279457092, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 54640 + }, + { + "epoch": 3.9245960502693, + "grad_norm": 0.7800863981246948, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 54650 + }, + { + "epoch": 3.925314183123878, + "grad_norm": 1.0331560373306274, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 54660 + }, + { + "epoch": 3.926032315978456, + "grad_norm": 1.0057517290115356, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 54670 + }, + { + "epoch": 3.926750448833034, + "grad_norm": 0.8920564651489258, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 54680 + }, + { + "epoch": 3.927468581687612, + "grad_norm": 0.7704599499702454, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 54690 + }, + { + "epoch": 3.92818671454219, + "grad_norm": 0.827032208442688, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 54700 + }, + { + "epoch": 3.9289048473967685, + "grad_norm": 1.0019268989562988, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 54710 + }, + { + "epoch": 3.9296229802513465, + "grad_norm": 0.862033486366272, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 54720 + }, + { + "epoch": 3.9303411131059245, + "grad_norm": 0.8965592980384827, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 54730 + }, + { + "epoch": 3.931059245960503, + "grad_norm": 0.7689077854156494, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 54740 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 0.846276581287384, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 54750 + }, + { + "epoch": 3.932495511669659, + "grad_norm": 0.8932713866233826, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 54760 + }, + { + "epoch": 3.933213644524237, + "grad_norm": 0.9711386561393738, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 54770 + }, + { + "epoch": 3.933931777378815, + "grad_norm": 0.9290250539779663, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 54780 + }, + { + "epoch": 3.934649910233393, + "grad_norm": 1.0897367000579834, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 54790 + }, + { + "epoch": 3.9353680430879714, + "grad_norm": 0.8451842665672302, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 54800 + }, + { + "epoch": 3.9360861759425494, + "grad_norm": 0.8400090336799622, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 54810 + }, + { + "epoch": 3.9368043087971274, + "grad_norm": 0.951383650302887, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 54820 + }, + { + "epoch": 3.937522441651706, + "grad_norm": 0.848838210105896, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 54830 + }, + { + "epoch": 3.938240574506284, + "grad_norm": 0.735763669013977, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 54840 + }, + { + "epoch": 3.938958707360862, + "grad_norm": 0.979037344455719, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 54850 + }, + { + "epoch": 3.93967684021544, + "grad_norm": 0.933674693107605, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 54860 + }, + { + "epoch": 3.940394973070018, + "grad_norm": 0.835593044757843, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 54870 + }, + { + "epoch": 3.941113105924596, + "grad_norm": 1.0034281015396118, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 54880 + }, + { + "epoch": 3.9418312387791743, + "grad_norm": 0.9732975959777832, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 54890 + }, + { + "epoch": 3.9425493716337523, + "grad_norm": 0.9666336178779602, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54900 + }, + { + "epoch": 3.9432675044883303, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 54910 + }, + { + "epoch": 3.9439856373429083, + "grad_norm": 0.8732092976570129, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 54920 + }, + { + "epoch": 3.9447037701974867, + "grad_norm": 1.139453649520874, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 54930 + }, + { + "epoch": 3.9454219030520647, + "grad_norm": 0.9044837951660156, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 54940 + }, + { + "epoch": 3.9461400359066428, + "grad_norm": 1.0496679544448853, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 54950 + }, + { + "epoch": 3.9468581687612208, + "grad_norm": 1.0099035501480103, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 54960 + }, + { + "epoch": 3.9475763016157988, + "grad_norm": 1.0694963932037354, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 54970 + }, + { + "epoch": 3.9482944344703768, + "grad_norm": 1.0012997388839722, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 54980 + }, + { + "epoch": 3.949012567324955, + "grad_norm": 0.8910513520240784, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 54990 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 1.0267579555511475, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 55000 + }, + { + "epoch": 3.950448833034111, + "grad_norm": 0.9786432385444641, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 55010 + }, + { + "epoch": 3.9511669658886897, + "grad_norm": 0.8703538775444031, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55020 + }, + { + "epoch": 3.9518850987432677, + "grad_norm": 0.8970484137535095, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 55030 + }, + { + "epoch": 3.9526032315978457, + "grad_norm": 0.8781577944755554, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 55040 + }, + { + "epoch": 3.9533213644524237, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 55050 + }, + { + "epoch": 3.9540394973070017, + "grad_norm": 0.851926326751709, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 55060 + }, + { + "epoch": 3.9547576301615797, + "grad_norm": 0.8597240447998047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 55070 + }, + { + "epoch": 3.955475763016158, + "grad_norm": 0.9461944699287415, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55080 + }, + { + "epoch": 3.956193895870736, + "grad_norm": 0.7576611042022705, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 55090 + }, + { + "epoch": 3.956912028725314, + "grad_norm": 0.9484710693359375, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 55100 + }, + { + "epoch": 3.957630161579892, + "grad_norm": 0.9487117528915405, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 55110 + }, + { + "epoch": 3.9583482944344706, + "grad_norm": 0.870090663433075, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55120 + }, + { + "epoch": 3.9590664272890486, + "grad_norm": 0.8496458530426025, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 55130 + }, + { + "epoch": 3.9597845601436266, + "grad_norm": 1.0121779441833496, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 55140 + }, + { + "epoch": 3.9605026929982046, + "grad_norm": 0.8912323713302612, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 55150 + }, + { + "epoch": 3.9612208258527826, + "grad_norm": 0.8398444652557373, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 55160 + }, + { + "epoch": 3.961938958707361, + "grad_norm": 0.8046348690986633, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 55170 + }, + { + "epoch": 3.962657091561939, + "grad_norm": 1.0369254350662231, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 55180 + }, + { + "epoch": 3.963375224416517, + "grad_norm": 1.172431230545044, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 55190 + }, + { + "epoch": 3.964093357271095, + "grad_norm": 0.8093554377555847, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 55200 + }, + { + "epoch": 3.9648114901256735, + "grad_norm": 0.8851078748703003, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 55210 + }, + { + "epoch": 3.9655296229802515, + "grad_norm": 0.7494266033172607, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 55220 + }, + { + "epoch": 3.9662477558348295, + "grad_norm": 0.9556898474693298, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 55230 + }, + { + "epoch": 3.9669658886894075, + "grad_norm": 1.016017198562622, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 55240 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 0.8425998091697693, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 55250 + }, + { + "epoch": 3.9684021543985635, + "grad_norm": 0.717673122882843, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 55260 + }, + { + "epoch": 3.969120287253142, + "grad_norm": 0.8366572856903076, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 55270 + }, + { + "epoch": 3.96983842010772, + "grad_norm": 0.8981583118438721, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 55280 + }, + { + "epoch": 3.970556552962298, + "grad_norm": 0.8868781328201294, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 55290 + }, + { + "epoch": 3.9712746858168764, + "grad_norm": 1.0632785558700562, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 55300 + }, + { + "epoch": 3.9719928186714544, + "grad_norm": 0.8813109993934631, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 55310 + }, + { + "epoch": 3.9727109515260324, + "grad_norm": 0.8225542306900024, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 55320 + }, + { + "epoch": 3.9734290843806104, + "grad_norm": 1.1391420364379883, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 55330 + }, + { + "epoch": 3.9741472172351884, + "grad_norm": 1.0371832847595215, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55340 + }, + { + "epoch": 3.9748653500897664, + "grad_norm": 1.0542186498641968, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 55350 + }, + { + "epoch": 3.975583482944345, + "grad_norm": 1.0178009271621704, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 55360 + }, + { + "epoch": 3.976301615798923, + "grad_norm": 0.7927802205085754, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 55370 + }, + { + "epoch": 3.977019748653501, + "grad_norm": 0.9350495934486389, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55380 + }, + { + "epoch": 3.977737881508079, + "grad_norm": 1.0240116119384766, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 55390 + }, + { + "epoch": 3.9784560143626573, + "grad_norm": 1.0279067754745483, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 55400 + }, + { + "epoch": 3.9791741472172353, + "grad_norm": 1.1228227615356445, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 55410 + }, + { + "epoch": 3.9798922800718133, + "grad_norm": 0.9500134587287903, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 55420 + }, + { + "epoch": 3.9806104129263913, + "grad_norm": 0.9229732155799866, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 55430 + }, + { + "epoch": 3.9813285457809693, + "grad_norm": 0.7946729063987732, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 55440 + }, + { + "epoch": 3.9820466786355477, + "grad_norm": 0.9987489581108093, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 55450 + }, + { + "epoch": 3.9827648114901257, + "grad_norm": 0.9670467972755432, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 55460 + }, + { + "epoch": 3.9834829443447037, + "grad_norm": 0.835028350353241, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 55470 + }, + { + "epoch": 3.9842010771992817, + "grad_norm": 0.8678702712059021, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 55480 + }, + { + "epoch": 3.98491921005386, + "grad_norm": 0.8581197261810303, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 55490 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 0.779848039150238, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 55500 + }, + { + "epoch": 3.986355475763016, + "grad_norm": 0.8827589154243469, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 55510 + }, + { + "epoch": 3.987073608617594, + "grad_norm": 1.0108301639556885, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55520 + }, + { + "epoch": 3.987791741472172, + "grad_norm": 0.8506004214286804, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 55530 + }, + { + "epoch": 3.98850987432675, + "grad_norm": 1.0297727584838867, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 55540 + }, + { + "epoch": 3.9892280071813286, + "grad_norm": 0.8579224944114685, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55550 + }, + { + "epoch": 3.9899461400359066, + "grad_norm": 0.8503788113594055, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 55560 + }, + { + "epoch": 3.9906642728904846, + "grad_norm": 1.1144801378250122, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 55570 + }, + { + "epoch": 3.991382405745063, + "grad_norm": 0.8418305516242981, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 55580 + }, + { + "epoch": 3.992100538599641, + "grad_norm": 1.0065871477127075, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 55590 + }, + { + "epoch": 3.992818671454219, + "grad_norm": 0.8160259127616882, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 55600 + }, + { + "epoch": 3.993536804308797, + "grad_norm": 0.8678009510040283, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55610 + }, + { + "epoch": 3.994254937163375, + "grad_norm": 0.863465428352356, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 55620 + }, + { + "epoch": 3.994973070017953, + "grad_norm": 0.9242135286331177, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 55630 + }, + { + "epoch": 3.9956912028725315, + "grad_norm": 1.0285470485687256, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 55640 + }, + { + "epoch": 3.9964093357271095, + "grad_norm": 0.8953320384025574, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 55650 + }, + { + "epoch": 3.9971274685816875, + "grad_norm": 0.915892481803894, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 55660 + }, + { + "epoch": 3.9978456014362656, + "grad_norm": 0.8235118985176086, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 55670 + }, + { + "epoch": 3.998563734290844, + "grad_norm": 1.0178656578063965, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 55680 + }, + { + "epoch": 3.999281867145422, + "grad_norm": 0.9926803708076477, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 55690 + }, + { + "epoch": 4.0, + "grad_norm": 0.9213629961013794, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 55700 + }, + { + "epoch": 4.0, + "eval_loss": 1.1152480840682983, + "eval_runtime": 55.2237, + "eval_samples_per_second": 13.273, + "eval_steps_per_second": 1.666, + "step": 55700 + }, + { + "epoch": 4.000718132854578, + "grad_norm": 1.0820496082305908, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 55710 + }, + { + "epoch": 4.001436265709156, + "grad_norm": 0.9036441445350647, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 55720 + }, + { + "epoch": 4.002154398563734, + "grad_norm": 1.102754831314087, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 55730 + }, + { + "epoch": 4.002872531418312, + "grad_norm": 0.98259437084198, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 55740 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 1.1935845613479614, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 55750 + }, + { + "epoch": 4.004308797127469, + "grad_norm": 0.9925830960273743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 55760 + }, + { + "epoch": 4.005026929982047, + "grad_norm": 1.075087070465088, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 55770 + }, + { + "epoch": 4.005745062836625, + "grad_norm": 0.8746396899223328, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 55780 + }, + { + "epoch": 4.006463195691203, + "grad_norm": 0.7635995745658875, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 55790 + }, + { + "epoch": 4.007181328545781, + "grad_norm": 0.9064885377883911, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 55800 + }, + { + "epoch": 4.007899461400359, + "grad_norm": 1.018478274345398, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 55810 + }, + { + "epoch": 4.008617594254937, + "grad_norm": 0.9797589778900146, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 55820 + }, + { + "epoch": 4.009335727109515, + "grad_norm": 0.7867457866668701, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 55830 + }, + { + "epoch": 4.010053859964093, + "grad_norm": 0.9998070597648621, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 55840 + }, + { + "epoch": 4.010771992818672, + "grad_norm": 0.8656311631202698, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 55850 + }, + { + "epoch": 4.01149012567325, + "grad_norm": 0.945469081401825, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 55860 + }, + { + "epoch": 4.012208258527828, + "grad_norm": 0.8809926509857178, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 55870 + }, + { + "epoch": 4.012926391382406, + "grad_norm": 0.8047897219657898, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 55880 + }, + { + "epoch": 4.013644524236984, + "grad_norm": 1.0563900470733643, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 55890 + }, + { + "epoch": 4.014362657091562, + "grad_norm": 0.8578300476074219, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 55900 + }, + { + "epoch": 4.01508078994614, + "grad_norm": 1.0304765701293945, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 55910 + }, + { + "epoch": 4.015798922800718, + "grad_norm": 0.8087666034698486, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 55920 + }, + { + "epoch": 4.016517055655296, + "grad_norm": 1.0192348957061768, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 55930 + }, + { + "epoch": 4.017235188509875, + "grad_norm": 1.061194658279419, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 55940 + }, + { + "epoch": 4.017953321364453, + "grad_norm": 0.93668133020401, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 55950 + }, + { + "epoch": 4.018671454219031, + "grad_norm": 1.1569286584854126, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 55960 + }, + { + "epoch": 4.019389587073609, + "grad_norm": 0.9853817224502563, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 55970 + }, + { + "epoch": 4.020107719928187, + "grad_norm": 0.851109504699707, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 55980 + }, + { + "epoch": 4.020825852782765, + "grad_norm": 1.053525447845459, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 55990 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 0.8307225704193115, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 56000 + }, + { + "epoch": 4.022262118491921, + "grad_norm": 1.2741150856018066, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 56010 + }, + { + "epoch": 4.022980251346499, + "grad_norm": 0.9708344340324402, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 56020 + }, + { + "epoch": 4.023698384201078, + "grad_norm": 1.265034556388855, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 56030 + }, + { + "epoch": 4.024416517055656, + "grad_norm": 0.9364367723464966, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 56040 + }, + { + "epoch": 4.025134649910234, + "grad_norm": 0.8643592000007629, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 56050 + }, + { + "epoch": 4.025852782764812, + "grad_norm": 0.9742133021354675, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 56060 + }, + { + "epoch": 4.02657091561939, + "grad_norm": 1.1793473958969116, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 56070 + }, + { + "epoch": 4.027289048473968, + "grad_norm": 0.9641149044036865, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 56080 + }, + { + "epoch": 4.028007181328546, + "grad_norm": 0.9426136016845703, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 56090 + }, + { + "epoch": 4.028725314183124, + "grad_norm": 0.9211869835853577, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 56100 + }, + { + "epoch": 4.029443447037702, + "grad_norm": 1.1576565504074097, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56110 + }, + { + "epoch": 4.03016157989228, + "grad_norm": 1.0014013051986694, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 56120 + }, + { + "epoch": 4.0308797127468585, + "grad_norm": 0.9307010769844055, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56130 + }, + { + "epoch": 4.0315978456014365, + "grad_norm": 0.8290148377418518, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 56140 + }, + { + "epoch": 4.0323159784560145, + "grad_norm": 1.0648446083068848, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 56150 + }, + { + "epoch": 4.0330341113105925, + "grad_norm": 1.1545547246932983, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 56160 + }, + { + "epoch": 4.0337522441651705, + "grad_norm": 0.9643545150756836, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 56170 + }, + { + "epoch": 4.0344703770197485, + "grad_norm": 0.8913900256156921, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56180 + }, + { + "epoch": 4.0351885098743265, + "grad_norm": 0.9445754289627075, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 56190 + }, + { + "epoch": 4.0359066427289045, + "grad_norm": 0.9353124499320984, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 56200 + }, + { + "epoch": 4.0366247755834825, + "grad_norm": 1.1780431270599365, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56210 + }, + { + "epoch": 4.037342908438061, + "grad_norm": 0.9208880662918091, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 56220 + }, + { + "epoch": 4.038061041292639, + "grad_norm": 0.9475517272949219, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 56230 + }, + { + "epoch": 4.038779174147217, + "grad_norm": 0.7478583455085754, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 56240 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 1.0026403665542603, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 56250 + }, + { + "epoch": 4.040215439856373, + "grad_norm": 0.9664973020553589, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 56260 + }, + { + "epoch": 4.040933572710951, + "grad_norm": 1.0655616521835327, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 56270 + }, + { + "epoch": 4.041651705565529, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 56280 + }, + { + "epoch": 4.042369838420107, + "grad_norm": 0.7982191443443298, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 56290 + }, + { + "epoch": 4.043087971274685, + "grad_norm": 0.8304495215415955, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 56300 + }, + { + "epoch": 4.043806104129264, + "grad_norm": 0.95123291015625, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56310 + }, + { + "epoch": 4.044524236983842, + "grad_norm": 0.9504102468490601, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 56320 + }, + { + "epoch": 4.04524236983842, + "grad_norm": 0.7432710528373718, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 56330 + }, + { + "epoch": 4.045960502692998, + "grad_norm": 0.9327874183654785, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 56340 + }, + { + "epoch": 4.046678635547576, + "grad_norm": 0.9161670804023743, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 56350 + }, + { + "epoch": 4.047396768402154, + "grad_norm": 0.9371771812438965, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 56360 + }, + { + "epoch": 4.048114901256732, + "grad_norm": 1.0332437753677368, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 56370 + }, + { + "epoch": 4.04883303411131, + "grad_norm": 0.7346320748329163, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 56380 + }, + { + "epoch": 4.049551166965888, + "grad_norm": 0.8247857689857483, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 56390 + }, + { + "epoch": 4.050269299820466, + "grad_norm": 0.925325334072113, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 56400 + }, + { + "epoch": 4.050987432675045, + "grad_norm": 0.7344088554382324, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 56410 + }, + { + "epoch": 4.051705565529623, + "grad_norm": 0.9204918146133423, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 56420 + }, + { + "epoch": 4.052423698384201, + "grad_norm": 0.8273472785949707, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 56430 + }, + { + "epoch": 4.053141831238779, + "grad_norm": 0.9524998068809509, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 56440 + }, + { + "epoch": 4.053859964093357, + "grad_norm": 0.9168205857276917, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 56450 + }, + { + "epoch": 4.054578096947935, + "grad_norm": 0.9634994864463806, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 56460 + }, + { + "epoch": 4.055296229802513, + "grad_norm": 1.2027593851089478, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 56470 + }, + { + "epoch": 4.056014362657091, + "grad_norm": 1.2347805500030518, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 56480 + }, + { + "epoch": 4.056732495511669, + "grad_norm": 0.8621458411216736, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56490 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 0.9194608330726624, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 56500 + }, + { + "epoch": 4.058168761220826, + "grad_norm": 1.0153663158416748, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 56510 + }, + { + "epoch": 4.058886894075404, + "grad_norm": 0.9170986413955688, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 56520 + }, + { + "epoch": 4.059605026929982, + "grad_norm": 1.033057689666748, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 56530 + }, + { + "epoch": 4.06032315978456, + "grad_norm": 1.0125197172164917, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 56540 + }, + { + "epoch": 4.061041292639138, + "grad_norm": 0.9429898262023926, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 56550 + }, + { + "epoch": 4.061759425493716, + "grad_norm": 0.9242179989814758, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56560 + }, + { + "epoch": 4.062477558348294, + "grad_norm": 0.9365091323852539, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 56570 + }, + { + "epoch": 4.063195691202872, + "grad_norm": 0.9148455858230591, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 56580 + }, + { + "epoch": 4.063913824057451, + "grad_norm": 0.8546709418296814, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 56590 + }, + { + "epoch": 4.064631956912029, + "grad_norm": 0.9743902087211609, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 56600 + }, + { + "epoch": 4.065350089766607, + "grad_norm": 1.0599974393844604, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56610 + }, + { + "epoch": 4.066068222621185, + "grad_norm": 0.9677841067314148, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 56620 + }, + { + "epoch": 4.066786355475763, + "grad_norm": 0.8892754316329956, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 56630 + }, + { + "epoch": 4.067504488330341, + "grad_norm": 0.8837814331054688, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 56640 + }, + { + "epoch": 4.068222621184919, + "grad_norm": 0.9284095764160156, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 56650 + }, + { + "epoch": 4.068940754039497, + "grad_norm": 1.0163567066192627, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 56660 + }, + { + "epoch": 4.069658886894075, + "grad_norm": 0.8713456988334656, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 56670 + }, + { + "epoch": 4.070377019748653, + "grad_norm": 0.8356686234474182, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 56680 + }, + { + "epoch": 4.071095152603232, + "grad_norm": 0.8998766541481018, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 56690 + }, + { + "epoch": 4.07181328545781, + "grad_norm": 1.0441967248916626, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 56700 + }, + { + "epoch": 4.072531418312388, + "grad_norm": 0.9313125610351562, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 56710 + }, + { + "epoch": 4.073249551166966, + "grad_norm": 0.9912964701652527, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 56720 + }, + { + "epoch": 4.073967684021544, + "grad_norm": 0.9048459529876709, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56730 + }, + { + "epoch": 4.074685816876122, + "grad_norm": 1.0248944759368896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 56740 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 1.4526786804199219, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 56750 + }, + { + "epoch": 4.076122082585278, + "grad_norm": 0.9813178181648254, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 56760 + }, + { + "epoch": 4.076840215439856, + "grad_norm": 1.0686813592910767, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 56770 + }, + { + "epoch": 4.077558348294435, + "grad_norm": 1.1093482971191406, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 56780 + }, + { + "epoch": 4.078276481149013, + "grad_norm": 0.9377819895744324, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 56790 + }, + { + "epoch": 4.078994614003591, + "grad_norm": 0.8043649196624756, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 56800 + }, + { + "epoch": 4.079712746858169, + "grad_norm": 0.7995415925979614, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 56810 + }, + { + "epoch": 4.080430879712747, + "grad_norm": 1.0076148509979248, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 56820 + }, + { + "epoch": 4.081149012567325, + "grad_norm": 0.8192076683044434, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 56830 + }, + { + "epoch": 4.081867145421903, + "grad_norm": 0.9226266145706177, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 56840 + }, + { + "epoch": 4.082585278276481, + "grad_norm": 0.8877972960472107, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 56850 + }, + { + "epoch": 4.083303411131059, + "grad_norm": 0.9578937888145447, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 56860 + }, + { + "epoch": 4.084021543985638, + "grad_norm": 0.8929167985916138, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 56870 + }, + { + "epoch": 4.084739676840216, + "grad_norm": 1.0015977621078491, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 56880 + }, + { + "epoch": 4.085457809694794, + "grad_norm": 0.9768750667572021, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 56890 + }, + { + "epoch": 4.086175942549372, + "grad_norm": 1.0834569931030273, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 56900 + }, + { + "epoch": 4.08689407540395, + "grad_norm": 0.8761230707168579, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 56910 + }, + { + "epoch": 4.087612208258528, + "grad_norm": 1.027064323425293, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 56920 + }, + { + "epoch": 4.088330341113106, + "grad_norm": 1.130336880683899, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 56930 + }, + { + "epoch": 4.089048473967684, + "grad_norm": 0.8157579898834229, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 56940 + }, + { + "epoch": 4.089766606822262, + "grad_norm": 1.071175217628479, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56950 + }, + { + "epoch": 4.09048473967684, + "grad_norm": 0.9534492492675781, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 56960 + }, + { + "epoch": 4.091202872531419, + "grad_norm": 0.9584037661552429, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 56970 + }, + { + "epoch": 4.091921005385997, + "grad_norm": 1.1513131856918335, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 56980 + }, + { + "epoch": 4.092639138240575, + "grad_norm": 1.0167666673660278, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 56990 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 1.0630987882614136, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57000 + }, + { + "epoch": 4.094075403949731, + "grad_norm": 1.0326893329620361, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 57010 + }, + { + "epoch": 4.094793536804309, + "grad_norm": 0.9701678156852722, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 57020 + }, + { + "epoch": 4.095511669658887, + "grad_norm": 0.839935302734375, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57030 + }, + { + "epoch": 4.096229802513465, + "grad_norm": 0.8995838761329651, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 57040 + }, + { + "epoch": 4.096947935368043, + "grad_norm": 0.8039916157722473, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 57050 + }, + { + "epoch": 4.097666068222622, + "grad_norm": 1.126122236251831, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 57060 + }, + { + "epoch": 4.0983842010772, + "grad_norm": 0.8749837875366211, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 57070 + }, + { + "epoch": 4.099102333931778, + "grad_norm": 0.8630341291427612, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 57080 + }, + { + "epoch": 4.099820466786356, + "grad_norm": 0.8889496922492981, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 57090 + }, + { + "epoch": 4.100538599640934, + "grad_norm": 0.9050310254096985, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 57100 + }, + { + "epoch": 4.101256732495512, + "grad_norm": 0.943072497844696, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 57110 + }, + { + "epoch": 4.10197486535009, + "grad_norm": 0.9031552672386169, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 57120 + }, + { + "epoch": 4.102692998204668, + "grad_norm": 0.939862847328186, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 57130 + }, + { + "epoch": 4.103411131059246, + "grad_norm": 0.8080634474754333, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 57140 + }, + { + "epoch": 4.1041292639138245, + "grad_norm": 0.9181693196296692, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 57150 + }, + { + "epoch": 4.1048473967684025, + "grad_norm": 0.9609217643737793, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 57160 + }, + { + "epoch": 4.1055655296229805, + "grad_norm": 1.1246516704559326, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 57170 + }, + { + "epoch": 4.1062836624775585, + "grad_norm": 1.0616880655288696, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 57180 + }, + { + "epoch": 4.1070017953321365, + "grad_norm": 0.9954505562782288, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 57190 + }, + { + "epoch": 4.1077199281867145, + "grad_norm": 1.0602279901504517, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 57200 + }, + { + "epoch": 4.1084380610412925, + "grad_norm": 0.8984764814376831, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 57210 + }, + { + "epoch": 4.1091561938958705, + "grad_norm": 0.845167875289917, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 57220 + }, + { + "epoch": 4.1098743267504485, + "grad_norm": 0.7901500463485718, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 57230 + }, + { + "epoch": 4.1105924596050265, + "grad_norm": 1.0462526082992554, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 57240 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 0.9098827838897705, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 57250 + }, + { + "epoch": 4.112028725314183, + "grad_norm": 0.9234077334403992, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 57260 + }, + { + "epoch": 4.112746858168761, + "grad_norm": 1.0033560991287231, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 57270 + }, + { + "epoch": 4.113464991023339, + "grad_norm": 1.0620051622390747, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 57280 + }, + { + "epoch": 4.114183123877917, + "grad_norm": 0.8679345846176147, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 57290 + }, + { + "epoch": 4.114901256732495, + "grad_norm": 0.7557345628738403, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 57300 + }, + { + "epoch": 4.115619389587073, + "grad_norm": 0.8970935344696045, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 57310 + }, + { + "epoch": 4.116337522441651, + "grad_norm": 1.0779842138290405, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 57320 + }, + { + "epoch": 4.117055655296229, + "grad_norm": 1.2036106586456299, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 57330 + }, + { + "epoch": 4.117773788150808, + "grad_norm": 0.8337953686714172, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 57340 + }, + { + "epoch": 4.118491921005386, + "grad_norm": 0.9850410223007202, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 57350 + }, + { + "epoch": 4.119210053859964, + "grad_norm": 0.8028770685195923, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 57360 + }, + { + "epoch": 4.119928186714542, + "grad_norm": 0.8693217039108276, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 57370 + }, + { + "epoch": 4.12064631956912, + "grad_norm": 0.8795534372329712, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 57380 + }, + { + "epoch": 4.121364452423698, + "grad_norm": 1.0081543922424316, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 57390 + }, + { + "epoch": 4.122082585278276, + "grad_norm": 0.8776742219924927, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 57400 + }, + { + "epoch": 4.122800718132854, + "grad_norm": 0.8247824311256409, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 57410 + }, + { + "epoch": 4.123518850987432, + "grad_norm": 1.1346335411071777, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 57420 + }, + { + "epoch": 4.124236983842011, + "grad_norm": 1.0671089887619019, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 57430 + }, + { + "epoch": 4.124955116696589, + "grad_norm": 0.8548333048820496, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 57440 + }, + { + "epoch": 4.125673249551167, + "grad_norm": 1.0221573114395142, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 57450 + }, + { + "epoch": 4.126391382405745, + "grad_norm": 0.9746617674827576, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 57460 + }, + { + "epoch": 4.127109515260323, + "grad_norm": 0.8104965090751648, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 57470 + }, + { + "epoch": 4.127827648114901, + "grad_norm": 1.0401487350463867, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 57480 + }, + { + "epoch": 4.128545780969479, + "grad_norm": 0.8828882575035095, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57490 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 1.0121098756790161, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 57500 + }, + { + "epoch": 4.129982046678635, + "grad_norm": 0.8789737820625305, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 57510 + }, + { + "epoch": 4.130700179533213, + "grad_norm": 1.0386744737625122, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 57520 + }, + { + "epoch": 4.131418312387792, + "grad_norm": 1.0092610120773315, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 57530 + }, + { + "epoch": 4.13213644524237, + "grad_norm": 0.8706282377243042, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 57540 + }, + { + "epoch": 4.132854578096948, + "grad_norm": 0.9270507097244263, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 57550 + }, + { + "epoch": 4.133572710951526, + "grad_norm": 1.0303068161010742, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 57560 + }, + { + "epoch": 4.134290843806104, + "grad_norm": 1.1169062852859497, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 57570 + }, + { + "epoch": 4.135008976660682, + "grad_norm": 0.8530599474906921, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 57580 + }, + { + "epoch": 4.13572710951526, + "grad_norm": 1.1395039558410645, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 57590 + }, + { + "epoch": 4.136445242369838, + "grad_norm": 0.8944115042686462, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 57600 + }, + { + "epoch": 4.137163375224416, + "grad_norm": 1.137966275215149, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 57610 + }, + { + "epoch": 4.137881508078995, + "grad_norm": 0.8244962692260742, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 57620 + }, + { + "epoch": 4.138599640933573, + "grad_norm": 1.1935817003250122, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 57630 + }, + { + "epoch": 4.139317773788151, + "grad_norm": 0.9774235486984253, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 57640 + }, + { + "epoch": 4.140035906642729, + "grad_norm": 1.066219449043274, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 57650 + }, + { + "epoch": 4.140754039497307, + "grad_norm": 0.8631396293640137, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 57660 + }, + { + "epoch": 4.141472172351885, + "grad_norm": 0.888410747051239, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 57670 + }, + { + "epoch": 4.142190305206463, + "grad_norm": 1.002642035484314, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 57680 + }, + { + "epoch": 4.142908438061041, + "grad_norm": 1.0092825889587402, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 57690 + }, + { + "epoch": 4.143626570915619, + "grad_norm": 0.9126971364021301, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 57700 + }, + { + "epoch": 4.144344703770198, + "grad_norm": 1.0303562879562378, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 57710 + }, + { + "epoch": 4.145062836624776, + "grad_norm": 1.1230897903442383, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 57720 + }, + { + "epoch": 4.145780969479354, + "grad_norm": 1.0494099855422974, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57730 + }, + { + "epoch": 4.146499102333932, + "grad_norm": 0.9555442333221436, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 57740 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 0.8255124092102051, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 57750 + }, + { + "epoch": 4.147935368043088, + "grad_norm": 1.097853660583496, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 57760 + }, + { + "epoch": 4.148653500897666, + "grad_norm": 1.0272663831710815, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 57770 + }, + { + "epoch": 4.149371633752244, + "grad_norm": 1.022571086883545, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 57780 + }, + { + "epoch": 4.150089766606822, + "grad_norm": 0.964543342590332, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 57790 + }, + { + "epoch": 4.1508078994614, + "grad_norm": 0.9251219034194946, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 57800 + }, + { + "epoch": 4.151526032315979, + "grad_norm": 1.081840991973877, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 57810 + }, + { + "epoch": 4.152244165170557, + "grad_norm": 0.8989445567131042, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57820 + }, + { + "epoch": 4.152962298025135, + "grad_norm": 0.903629720211029, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 57830 + }, + { + "epoch": 4.153680430879713, + "grad_norm": 0.8985397219657898, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 57840 + }, + { + "epoch": 4.154398563734291, + "grad_norm": 1.047778844833374, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 57850 + }, + { + "epoch": 4.155116696588869, + "grad_norm": 0.9804165363311768, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 57860 + }, + { + "epoch": 4.155834829443447, + "grad_norm": 1.187309980392456, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57870 + }, + { + "epoch": 4.156552962298025, + "grad_norm": 0.9854836463928223, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 57880 + }, + { + "epoch": 4.157271095152603, + "grad_norm": 0.8494308590888977, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 57890 + }, + { + "epoch": 4.157989228007182, + "grad_norm": 0.9359684586524963, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 57900 + }, + { + "epoch": 4.15870736086176, + "grad_norm": 0.8971988558769226, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 57910 + }, + { + "epoch": 4.159425493716338, + "grad_norm": 0.8848021030426025, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57920 + }, + { + "epoch": 4.160143626570916, + "grad_norm": 0.982877790927887, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 57930 + }, + { + "epoch": 4.160861759425494, + "grad_norm": 0.8668819069862366, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 57940 + }, + { + "epoch": 4.161579892280072, + "grad_norm": 1.06569504737854, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 57950 + }, + { + "epoch": 4.16229802513465, + "grad_norm": 1.165740728378296, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 57960 + }, + { + "epoch": 4.163016157989228, + "grad_norm": 1.0534512996673584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 57970 + }, + { + "epoch": 4.163734290843806, + "grad_norm": 0.8785330653190613, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 57980 + }, + { + "epoch": 4.164452423698384, + "grad_norm": 1.1244874000549316, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57990 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 0.8839399218559265, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 58000 + }, + { + "epoch": 4.165888689407541, + "grad_norm": 1.0603798627853394, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 58010 + }, + { + "epoch": 4.166606822262119, + "grad_norm": 0.9737853407859802, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 58020 + }, + { + "epoch": 4.167324955116697, + "grad_norm": 1.0650558471679688, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 58030 + }, + { + "epoch": 4.168043087971275, + "grad_norm": 0.7528959512710571, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 58040 + }, + { + "epoch": 4.168761220825853, + "grad_norm": 0.9286156892776489, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 58050 + }, + { + "epoch": 4.169479353680431, + "grad_norm": 1.0225880146026611, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 58060 + }, + { + "epoch": 4.170197486535009, + "grad_norm": 0.9990654587745667, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58070 + }, + { + "epoch": 4.170915619389587, + "grad_norm": 1.052057147026062, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 58080 + }, + { + "epoch": 4.1716337522441655, + "grad_norm": 0.7366801500320435, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 58090 + }, + { + "epoch": 4.1723518850987436, + "grad_norm": 1.0943711996078491, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 58100 + }, + { + "epoch": 4.1730700179533216, + "grad_norm": 1.1297656297683716, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 58110 + }, + { + "epoch": 4.1737881508078996, + "grad_norm": 0.7861461639404297, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 58120 + }, + { + "epoch": 4.174506283662478, + "grad_norm": 0.8643335103988647, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 58130 + }, + { + "epoch": 4.175224416517056, + "grad_norm": 0.957288384437561, + "learning_rate": 0.0002, + "loss": 0.6103, + "step": 58140 + }, + { + "epoch": 4.175942549371634, + "grad_norm": 0.9175366759300232, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 58150 + }, + { + "epoch": 4.176660682226212, + "grad_norm": 1.129935622215271, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 58160 + }, + { + "epoch": 4.17737881508079, + "grad_norm": 0.9683087468147278, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 58170 + }, + { + "epoch": 4.1780969479353685, + "grad_norm": 1.045171856880188, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 58180 + }, + { + "epoch": 4.1788150807899465, + "grad_norm": 0.9858742952346802, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 58190 + }, + { + "epoch": 4.1795332136445245, + "grad_norm": 0.8513413071632385, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 58200 + }, + { + "epoch": 4.1802513464991025, + "grad_norm": 0.9584265947341919, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58210 + }, + { + "epoch": 4.1809694793536805, + "grad_norm": 0.8828920722007751, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 58220 + }, + { + "epoch": 4.1816876122082585, + "grad_norm": 0.9849961400032043, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 58230 + }, + { + "epoch": 4.1824057450628365, + "grad_norm": 1.0601637363433838, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 58240 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 1.2206604480743408, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 58250 + }, + { + "epoch": 4.1838420107719925, + "grad_norm": 1.1768009662628174, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 58260 + }, + { + "epoch": 4.184560143626571, + "grad_norm": 0.9521295428276062, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 58270 + }, + { + "epoch": 4.185278276481149, + "grad_norm": 0.892971932888031, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 58280 + }, + { + "epoch": 4.185996409335727, + "grad_norm": 0.8712016940116882, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 58290 + }, + { + "epoch": 4.186714542190305, + "grad_norm": 1.0190843343734741, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 58300 + }, + { + "epoch": 4.187432675044883, + "grad_norm": 1.0149270296096802, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 58310 + }, + { + "epoch": 4.188150807899461, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 58320 + }, + { + "epoch": 4.188868940754039, + "grad_norm": 0.7892335653305054, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 58330 + }, + { + "epoch": 4.189587073608617, + "grad_norm": 0.9792808890342712, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58340 + }, + { + "epoch": 4.190305206463195, + "grad_norm": 0.9946883320808411, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 58350 + }, + { + "epoch": 4.191023339317773, + "grad_norm": 1.0363789796829224, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 58360 + }, + { + "epoch": 4.191741472172352, + "grad_norm": 0.9285917282104492, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 58370 + }, + { + "epoch": 4.19245960502693, + "grad_norm": 0.9461679458618164, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 58380 + }, + { + "epoch": 4.193177737881508, + "grad_norm": 1.0344175100326538, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 58390 + }, + { + "epoch": 4.193895870736086, + "grad_norm": 0.9530242085456848, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 58400 + }, + { + "epoch": 4.194614003590664, + "grad_norm": 0.9171900749206543, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58410 + }, + { + "epoch": 4.195332136445242, + "grad_norm": 0.8094898462295532, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 58420 + }, + { + "epoch": 4.19605026929982, + "grad_norm": 0.921981930732727, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 58430 + }, + { + "epoch": 4.196768402154398, + "grad_norm": 0.9783532023429871, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 58440 + }, + { + "epoch": 4.197486535008976, + "grad_norm": 1.017805576324463, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 58450 + }, + { + "epoch": 4.198204667863555, + "grad_norm": 0.9244308471679688, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 58460 + }, + { + "epoch": 4.198922800718133, + "grad_norm": 0.9942585229873657, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 58470 + }, + { + "epoch": 4.199640933572711, + "grad_norm": 1.1045037508010864, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 58480 + }, + { + "epoch": 4.200359066427289, + "grad_norm": 0.9483149647712708, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58490 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 1.0807271003723145, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 58500 + }, + { + "epoch": 4.201795332136445, + "grad_norm": 0.7697445750236511, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 58510 + }, + { + "epoch": 4.202513464991023, + "grad_norm": 1.0761178731918335, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 58520 + }, + { + "epoch": 4.203231597845601, + "grad_norm": 0.9992024898529053, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 58530 + }, + { + "epoch": 4.203949730700179, + "grad_norm": 0.8741498589515686, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 58540 + }, + { + "epoch": 4.204667863554757, + "grad_norm": 0.8557528853416443, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 58550 + }, + { + "epoch": 4.205385996409336, + "grad_norm": 0.8853630423545837, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 58560 + }, + { + "epoch": 4.206104129263914, + "grad_norm": 0.9858933687210083, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 58570 + }, + { + "epoch": 4.206822262118492, + "grad_norm": 1.104732871055603, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 58580 + }, + { + "epoch": 4.20754039497307, + "grad_norm": 0.9345462322235107, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58590 + }, + { + "epoch": 4.208258527827648, + "grad_norm": 0.9620407819747925, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 58600 + }, + { + "epoch": 4.208976660682226, + "grad_norm": 0.8546963334083557, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 58610 + }, + { + "epoch": 4.209694793536804, + "grad_norm": 0.8125145435333252, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 58620 + }, + { + "epoch": 4.210412926391382, + "grad_norm": 0.8481138944625854, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 58630 + }, + { + "epoch": 4.21113105924596, + "grad_norm": 0.8884692788124084, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58640 + }, + { + "epoch": 4.211849192100539, + "grad_norm": 1.09279465675354, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 58650 + }, + { + "epoch": 4.212567324955117, + "grad_norm": 0.9806583523750305, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 58660 + }, + { + "epoch": 4.213285457809695, + "grad_norm": 0.9510366916656494, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 58670 + }, + { + "epoch": 4.214003590664273, + "grad_norm": 0.7517459988594055, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 58680 + }, + { + "epoch": 4.214721723518851, + "grad_norm": 1.1134123802185059, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 58690 + }, + { + "epoch": 4.215439856373429, + "grad_norm": 0.8307328820228577, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 58700 + }, + { + "epoch": 4.216157989228007, + "grad_norm": 0.8211639523506165, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 58710 + }, + { + "epoch": 4.216876122082585, + "grad_norm": 1.0749584436416626, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 58720 + }, + { + "epoch": 4.217594254937163, + "grad_norm": 1.1394833326339722, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58730 + }, + { + "epoch": 4.218312387791742, + "grad_norm": 1.05130934715271, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 58740 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 0.7949456572532654, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 58750 + }, + { + "epoch": 4.219748653500898, + "grad_norm": 0.906506359577179, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 58760 + }, + { + "epoch": 4.220466786355476, + "grad_norm": 0.8338989615440369, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 58770 + }, + { + "epoch": 4.221184919210054, + "grad_norm": 0.9325370788574219, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 58780 + }, + { + "epoch": 4.221903052064632, + "grad_norm": 1.0208096504211426, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 58790 + }, + { + "epoch": 4.22262118491921, + "grad_norm": 1.0075920820236206, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 58800 + }, + { + "epoch": 4.223339317773788, + "grad_norm": 0.9858701229095459, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 58810 + }, + { + "epoch": 4.224057450628366, + "grad_norm": 1.0010110139846802, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 58820 + }, + { + "epoch": 4.224775583482945, + "grad_norm": 0.9360540509223938, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 58830 + }, + { + "epoch": 4.225493716337523, + "grad_norm": 0.9021786451339722, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58840 + }, + { + "epoch": 4.226211849192101, + "grad_norm": 1.1778476238250732, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58850 + }, + { + "epoch": 4.226929982046679, + "grad_norm": 1.0061023235321045, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58860 + }, + { + "epoch": 4.227648114901257, + "grad_norm": 0.8839752674102783, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58870 + }, + { + "epoch": 4.228366247755835, + "grad_norm": 1.0078870058059692, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 58880 + }, + { + "epoch": 4.229084380610413, + "grad_norm": 0.8926451206207275, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 58890 + }, + { + "epoch": 4.229802513464991, + "grad_norm": 1.4018772840499878, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 58900 + }, + { + "epoch": 4.230520646319569, + "grad_norm": 0.9911289215087891, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 58910 + }, + { + "epoch": 4.231238779174147, + "grad_norm": 0.9374576807022095, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58920 + }, + { + "epoch": 4.231956912028726, + "grad_norm": 1.179650068283081, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 58930 + }, + { + "epoch": 4.232675044883304, + "grad_norm": 0.9434911012649536, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 58940 + }, + { + "epoch": 4.233393177737882, + "grad_norm": 1.0061911344528198, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 58950 + }, + { + "epoch": 4.23411131059246, + "grad_norm": 0.9663233757019043, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 58960 + }, + { + "epoch": 4.234829443447038, + "grad_norm": 0.8897581696510315, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 58970 + }, + { + "epoch": 4.235547576301616, + "grad_norm": 0.873281717300415, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 58980 + }, + { + "epoch": 4.236265709156194, + "grad_norm": 0.9146949052810669, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 58990 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 0.9381195306777954, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 59000 + }, + { + "epoch": 4.23770197486535, + "grad_norm": 0.9700697064399719, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 59010 + }, + { + "epoch": 4.238420107719929, + "grad_norm": 0.9050154685974121, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 59020 + }, + { + "epoch": 4.239138240574507, + "grad_norm": 0.9901503324508667, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 59030 + }, + { + "epoch": 4.239856373429085, + "grad_norm": 0.9009594321250916, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 59040 + }, + { + "epoch": 4.240574506283663, + "grad_norm": 1.0924968719482422, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59050 + }, + { + "epoch": 4.241292639138241, + "grad_norm": 0.9939947724342346, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 59060 + }, + { + "epoch": 4.242010771992819, + "grad_norm": 1.0577857494354248, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 59070 + }, + { + "epoch": 4.242728904847397, + "grad_norm": 1.0836747884750366, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59080 + }, + { + "epoch": 4.243447037701975, + "grad_norm": 0.97043377161026, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 59090 + }, + { + "epoch": 4.244165170556553, + "grad_norm": 0.7711901664733887, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 59100 + }, + { + "epoch": 4.244883303411131, + "grad_norm": 1.0143170356750488, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 59110 + }, + { + "epoch": 4.2456014362657095, + "grad_norm": 0.9151925444602966, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 59120 + }, + { + "epoch": 4.2463195691202875, + "grad_norm": 0.9252700209617615, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 59130 + }, + { + "epoch": 4.2470377019748655, + "grad_norm": 0.8429408073425293, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 59140 + }, + { + "epoch": 4.2477558348294435, + "grad_norm": 0.9645987153053284, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 59150 + }, + { + "epoch": 4.2484739676840215, + "grad_norm": 0.9949791431427002, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 59160 + }, + { + "epoch": 4.2491921005385995, + "grad_norm": 0.9128350615501404, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 59170 + }, + { + "epoch": 4.2499102333931775, + "grad_norm": 0.7406911849975586, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 59180 + }, + { + "epoch": 4.2506283662477555, + "grad_norm": 1.0237419605255127, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 59190 + }, + { + "epoch": 4.2513464991023335, + "grad_norm": 0.805459201335907, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 59200 + }, + { + "epoch": 4.252064631956912, + "grad_norm": 0.8477254509925842, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59210 + }, + { + "epoch": 4.25278276481149, + "grad_norm": 0.984023928642273, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 59220 + }, + { + "epoch": 4.253500897666068, + "grad_norm": 1.0667484998703003, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59230 + }, + { + "epoch": 4.254219030520646, + "grad_norm": 0.7192284464836121, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 59240 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 0.9557451009750366, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59250 + }, + { + "epoch": 4.255655296229802, + "grad_norm": 0.9209784865379333, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59260 + }, + { + "epoch": 4.25637342908438, + "grad_norm": 0.9785363674163818, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 59270 + }, + { + "epoch": 4.257091561938958, + "grad_norm": 0.910214364528656, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59280 + }, + { + "epoch": 4.257809694793536, + "grad_norm": 0.8945858478546143, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 59290 + }, + { + "epoch": 4.258527827648114, + "grad_norm": 1.0984420776367188, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 59300 + }, + { + "epoch": 4.259245960502693, + "grad_norm": 1.0256640911102295, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59310 + }, + { + "epoch": 4.259964093357271, + "grad_norm": 0.978397786617279, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 59320 + }, + { + "epoch": 4.260682226211849, + "grad_norm": 0.7587000727653503, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 59330 + }, + { + "epoch": 4.261400359066427, + "grad_norm": 0.9384620785713196, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59340 + }, + { + "epoch": 4.262118491921005, + "grad_norm": 0.893992006778717, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 59350 + }, + { + "epoch": 4.262836624775583, + "grad_norm": 1.0231536626815796, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 59360 + }, + { + "epoch": 4.263554757630161, + "grad_norm": 0.9810128211975098, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 59370 + }, + { + "epoch": 4.264272890484739, + "grad_norm": 1.0868116617202759, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 59380 + }, + { + "epoch": 4.264991023339318, + "grad_norm": 1.1433676481246948, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 59390 + }, + { + "epoch": 4.265709156193896, + "grad_norm": 0.9836946725845337, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 59400 + }, + { + "epoch": 4.266427289048474, + "grad_norm": 0.9473603963851929, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 59410 + }, + { + "epoch": 4.267145421903052, + "grad_norm": 0.9066835641860962, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 59420 + }, + { + "epoch": 4.26786355475763, + "grad_norm": 1.0534718036651611, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 59430 + }, + { + "epoch": 4.268581687612208, + "grad_norm": 1.0392775535583496, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 59440 + }, + { + "epoch": 4.269299820466786, + "grad_norm": 1.011472463607788, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 59450 + }, + { + "epoch": 4.270017953321364, + "grad_norm": 1.0704147815704346, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59460 + }, + { + "epoch": 4.270736086175942, + "grad_norm": 0.9349238872528076, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 59470 + }, + { + "epoch": 4.27145421903052, + "grad_norm": 0.8745087385177612, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 59480 + }, + { + "epoch": 4.272172351885099, + "grad_norm": 0.8823763728141785, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 59490 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 1.110912799835205, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 59500 + }, + { + "epoch": 4.273608617594255, + "grad_norm": 1.0000925064086914, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 59510 + }, + { + "epoch": 4.274326750448833, + "grad_norm": 1.1578227281570435, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 59520 + }, + { + "epoch": 4.275044883303411, + "grad_norm": 0.875720202922821, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 59530 + }, + { + "epoch": 4.275763016157989, + "grad_norm": 0.9562238454818726, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 59540 + }, + { + "epoch": 4.276481149012567, + "grad_norm": 0.8384222388267517, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 59550 + }, + { + "epoch": 4.277199281867145, + "grad_norm": 1.2719428539276123, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 59560 + }, + { + "epoch": 4.277917414721723, + "grad_norm": 1.0656434297561646, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 59570 + }, + { + "epoch": 4.278635547576302, + "grad_norm": 1.0766716003417969, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 59580 + }, + { + "epoch": 4.27935368043088, + "grad_norm": 0.8892807960510254, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 59590 + }, + { + "epoch": 4.280071813285458, + "grad_norm": 0.8956300020217896, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 59600 + }, + { + "epoch": 4.280789946140036, + "grad_norm": 0.9562926888465881, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 59610 + }, + { + "epoch": 4.281508078994614, + "grad_norm": 1.009141445159912, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 59620 + }, + { + "epoch": 4.282226211849192, + "grad_norm": 1.0546064376831055, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 59630 + }, + { + "epoch": 4.28294434470377, + "grad_norm": 0.8831254243850708, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 59640 + }, + { + "epoch": 4.283662477558348, + "grad_norm": 0.9560053944587708, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 59650 + }, + { + "epoch": 4.284380610412926, + "grad_norm": 1.030339241027832, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59660 + }, + { + "epoch": 4.285098743267504, + "grad_norm": 1.00662100315094, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 59670 + }, + { + "epoch": 4.285816876122083, + "grad_norm": 1.0759116411209106, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 59680 + }, + { + "epoch": 4.286535008976661, + "grad_norm": 0.9985393285751343, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 59690 + }, + { + "epoch": 4.287253141831239, + "grad_norm": 0.9044474959373474, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59700 + }, + { + "epoch": 4.287971274685817, + "grad_norm": 1.1224442720413208, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 59710 + }, + { + "epoch": 4.288689407540395, + "grad_norm": 0.8436414003372192, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 59720 + }, + { + "epoch": 4.289407540394973, + "grad_norm": 1.0695041418075562, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 59730 + }, + { + "epoch": 4.290125673249551, + "grad_norm": 0.8809951543807983, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 59740 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 1.0213792324066162, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 59750 + }, + { + "epoch": 4.291561938958707, + "grad_norm": 0.9660196900367737, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 59760 + }, + { + "epoch": 4.292280071813286, + "grad_norm": 0.8005787134170532, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 59770 + }, + { + "epoch": 4.292998204667864, + "grad_norm": 1.0016109943389893, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 59780 + }, + { + "epoch": 4.293716337522442, + "grad_norm": 0.9112903475761414, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 59790 + }, + { + "epoch": 4.29443447037702, + "grad_norm": 0.9999852180480957, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 59800 + }, + { + "epoch": 4.295152603231598, + "grad_norm": 0.9323953986167908, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 59810 + }, + { + "epoch": 4.295870736086176, + "grad_norm": 0.903037965297699, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 59820 + }, + { + "epoch": 4.296588868940754, + "grad_norm": 1.2462431192398071, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 59830 + }, + { + "epoch": 4.297307001795332, + "grad_norm": 1.2322230339050293, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 59840 + }, + { + "epoch": 4.29802513464991, + "grad_norm": 0.9584668278694153, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 59850 + }, + { + "epoch": 4.298743267504488, + "grad_norm": 0.9664767980575562, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 59860 + }, + { + "epoch": 4.299461400359067, + "grad_norm": 0.8860437273979187, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 59870 + }, + { + "epoch": 4.300179533213645, + "grad_norm": 1.0825127363204956, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 59880 + }, + { + "epoch": 4.300897666068223, + "grad_norm": 1.1312100887298584, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 59890 + }, + { + "epoch": 4.301615798922801, + "grad_norm": 0.8289751410484314, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 59900 + }, + { + "epoch": 4.302333931777379, + "grad_norm": 0.8990927934646606, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 59910 + }, + { + "epoch": 4.303052064631957, + "grad_norm": 0.9667525887489319, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 59920 + }, + { + "epoch": 4.303770197486535, + "grad_norm": 0.8656060695648193, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 59930 + }, + { + "epoch": 4.304488330341113, + "grad_norm": 0.8909396529197693, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 59940 + }, + { + "epoch": 4.305206463195692, + "grad_norm": 0.9533283114433289, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 59950 + }, + { + "epoch": 4.30592459605027, + "grad_norm": 0.9090739488601685, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 59960 + }, + { + "epoch": 4.306642728904848, + "grad_norm": 1.096656322479248, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 59970 + }, + { + "epoch": 4.307360861759426, + "grad_norm": 1.0392465591430664, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 59980 + }, + { + "epoch": 4.308078994614004, + "grad_norm": 0.8733913898468018, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 59990 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 0.8287094235420227, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 60000 + }, + { + "epoch": 4.30951526032316, + "grad_norm": 0.9267017245292664, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 60010 + }, + { + "epoch": 4.310233393177738, + "grad_norm": 0.9969515800476074, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 60020 + }, + { + "epoch": 4.310951526032316, + "grad_norm": 1.0005015134811401, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 60030 + }, + { + "epoch": 4.311669658886894, + "grad_norm": 1.1215369701385498, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60040 + }, + { + "epoch": 4.312387791741473, + "grad_norm": 1.0434890985488892, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 60050 + }, + { + "epoch": 4.313105924596051, + "grad_norm": 0.967989981174469, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 60060 + }, + { + "epoch": 4.313824057450629, + "grad_norm": 1.007599115371704, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 60070 + }, + { + "epoch": 4.314542190305207, + "grad_norm": 0.9356340765953064, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 60080 + }, + { + "epoch": 4.315260323159785, + "grad_norm": 0.9566757678985596, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 60090 + }, + { + "epoch": 4.315978456014363, + "grad_norm": 1.1066830158233643, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 60100 + }, + { + "epoch": 4.316696588868941, + "grad_norm": 0.9895772933959961, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 60110 + }, + { + "epoch": 4.317414721723519, + "grad_norm": 1.07423734664917, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60120 + }, + { + "epoch": 4.318132854578097, + "grad_norm": 1.0777037143707275, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 60130 + }, + { + "epoch": 4.3188509874326755, + "grad_norm": 1.1475656032562256, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 60140 + }, + { + "epoch": 4.3195691202872535, + "grad_norm": 1.0705864429473877, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 60150 + }, + { + "epoch": 4.3202872531418315, + "grad_norm": 0.8676854968070984, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 60160 + }, + { + "epoch": 4.3210053859964095, + "grad_norm": 0.9488174319267273, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 60170 + }, + { + "epoch": 4.3217235188509875, + "grad_norm": 1.1171153783798218, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 60180 + }, + { + "epoch": 4.3224416517055655, + "grad_norm": 1.091435194015503, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 60190 + }, + { + "epoch": 4.3231597845601435, + "grad_norm": 0.880944013595581, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60200 + }, + { + "epoch": 4.3238779174147215, + "grad_norm": 0.8458809852600098, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 60210 + }, + { + "epoch": 4.3245960502692995, + "grad_norm": 0.7900225520133972, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 60220 + }, + { + "epoch": 4.3253141831238775, + "grad_norm": 0.966742753982544, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 60230 + }, + { + "epoch": 4.326032315978456, + "grad_norm": 0.8948110342025757, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 60240 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 0.8598700165748596, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 60250 + }, + { + "epoch": 4.327468581687612, + "grad_norm": 1.127610206604004, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 60260 + }, + { + "epoch": 4.32818671454219, + "grad_norm": 0.8357340693473816, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 60270 + }, + { + "epoch": 4.328904847396768, + "grad_norm": 0.8771896362304688, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60280 + }, + { + "epoch": 4.329622980251346, + "grad_norm": 0.9202101826667786, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60290 + }, + { + "epoch": 4.330341113105924, + "grad_norm": 1.1427538394927979, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 60300 + }, + { + "epoch": 4.331059245960502, + "grad_norm": 0.8711863160133362, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 60310 + }, + { + "epoch": 4.33177737881508, + "grad_norm": 0.972723662853241, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60320 + }, + { + "epoch": 4.332495511669659, + "grad_norm": 1.1496877670288086, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 60330 + }, + { + "epoch": 4.333213644524237, + "grad_norm": 1.008581519126892, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 60340 + }, + { + "epoch": 4.333931777378815, + "grad_norm": 1.0802706480026245, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 60350 + }, + { + "epoch": 4.334649910233393, + "grad_norm": 0.8394291996955872, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60360 + }, + { + "epoch": 4.335368043087971, + "grad_norm": 0.8355905413627625, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 60370 + }, + { + "epoch": 4.336086175942549, + "grad_norm": 0.9583960175514221, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 60380 + }, + { + "epoch": 4.336804308797127, + "grad_norm": 1.138934850692749, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 60390 + }, + { + "epoch": 4.337522441651705, + "grad_norm": 1.0334709882736206, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 60400 + }, + { + "epoch": 4.338240574506283, + "grad_norm": 0.729686439037323, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 60410 + }, + { + "epoch": 4.338958707360861, + "grad_norm": 0.8735929727554321, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 60420 + }, + { + "epoch": 4.33967684021544, + "grad_norm": 0.9617681503295898, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 60430 + }, + { + "epoch": 4.340394973070018, + "grad_norm": 0.9439655542373657, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 60440 + }, + { + "epoch": 4.341113105924596, + "grad_norm": 0.9275408387184143, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60450 + }, + { + "epoch": 4.341831238779174, + "grad_norm": 1.0693308115005493, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60460 + }, + { + "epoch": 4.342549371633752, + "grad_norm": 0.9234438538551331, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 60470 + }, + { + "epoch": 4.34326750448833, + "grad_norm": 1.1376168727874756, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 60480 + }, + { + "epoch": 4.343985637342908, + "grad_norm": 0.9218108654022217, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 60490 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 1.1467362642288208, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 60500 + }, + { + "epoch": 4.345421903052064, + "grad_norm": 0.9459165930747986, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 60510 + }, + { + "epoch": 4.346140035906643, + "grad_norm": 0.9460827708244324, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 60520 + }, + { + "epoch": 4.346858168761221, + "grad_norm": 1.0845041275024414, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 60530 + }, + { + "epoch": 4.347576301615799, + "grad_norm": 1.082675576210022, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 60540 + }, + { + "epoch": 4.348294434470377, + "grad_norm": 0.8443698883056641, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 60550 + }, + { + "epoch": 4.349012567324955, + "grad_norm": 1.018393874168396, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 60560 + }, + { + "epoch": 4.349730700179533, + "grad_norm": 0.8796373009681702, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 60570 + }, + { + "epoch": 4.350448833034111, + "grad_norm": 1.097942590713501, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 60580 + }, + { + "epoch": 4.351166965888689, + "grad_norm": 0.8750485181808472, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 60590 + }, + { + "epoch": 4.351885098743267, + "grad_norm": 1.0339995622634888, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 60600 + }, + { + "epoch": 4.352603231597846, + "grad_norm": 0.9077731966972351, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 60610 + }, + { + "epoch": 4.353321364452424, + "grad_norm": 1.051321029663086, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 60620 + }, + { + "epoch": 4.354039497307002, + "grad_norm": 1.0018669366836548, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60630 + }, + { + "epoch": 4.35475763016158, + "grad_norm": 1.0349196195602417, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60640 + }, + { + "epoch": 4.355475763016158, + "grad_norm": 1.009589672088623, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 60650 + }, + { + "epoch": 4.356193895870736, + "grad_norm": 1.0463480949401855, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60660 + }, + { + "epoch": 4.356912028725314, + "grad_norm": 0.9815132021903992, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 60670 + }, + { + "epoch": 4.357630161579892, + "grad_norm": 1.0977262258529663, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60680 + }, + { + "epoch": 4.35834829443447, + "grad_norm": 0.8450005054473877, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 60690 + }, + { + "epoch": 4.359066427289049, + "grad_norm": 1.0959078073501587, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 60700 + }, + { + "epoch": 4.359784560143627, + "grad_norm": 0.9155098795890808, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60710 + }, + { + "epoch": 4.360502692998205, + "grad_norm": 0.9267987012863159, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 60720 + }, + { + "epoch": 4.361220825852783, + "grad_norm": 1.177472472190857, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 60730 + }, + { + "epoch": 4.361938958707361, + "grad_norm": 0.8615312576293945, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 60740 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 1.0939710140228271, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 60750 + }, + { + "epoch": 4.363375224416517, + "grad_norm": 1.0928049087524414, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 60760 + }, + { + "epoch": 4.364093357271095, + "grad_norm": 1.0796833038330078, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 60770 + }, + { + "epoch": 4.364811490125673, + "grad_norm": 0.9768339991569519, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 60780 + }, + { + "epoch": 4.365529622980251, + "grad_norm": 0.9082722067832947, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 60790 + }, + { + "epoch": 4.36624775583483, + "grad_norm": 0.9614832997322083, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 60800 + }, + { + "epoch": 4.366965888689408, + "grad_norm": 0.8874651789665222, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 60810 + }, + { + "epoch": 4.367684021543986, + "grad_norm": 0.8810178637504578, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 60820 + }, + { + "epoch": 4.368402154398564, + "grad_norm": 1.0893806219100952, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 60830 + }, + { + "epoch": 4.369120287253142, + "grad_norm": 0.9042278528213501, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 60840 + }, + { + "epoch": 4.36983842010772, + "grad_norm": 1.0832217931747437, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 60850 + }, + { + "epoch": 4.370556552962298, + "grad_norm": 0.9431114792823792, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 60860 + }, + { + "epoch": 4.371274685816876, + "grad_norm": 1.031553030014038, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 60870 + }, + { + "epoch": 4.371992818671454, + "grad_norm": 0.8702824711799622, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60880 + }, + { + "epoch": 4.372710951526033, + "grad_norm": 1.1109199523925781, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 60890 + }, + { + "epoch": 4.373429084380611, + "grad_norm": 0.8369361162185669, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 60900 + }, + { + "epoch": 4.374147217235189, + "grad_norm": 0.988915205001831, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60910 + }, + { + "epoch": 4.374865350089767, + "grad_norm": 0.9365919232368469, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 60920 + }, + { + "epoch": 4.375583482944345, + "grad_norm": 0.9789398908615112, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 60930 + }, + { + "epoch": 4.376301615798923, + "grad_norm": 0.8786931037902832, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 60940 + }, + { + "epoch": 4.377019748653501, + "grad_norm": 0.8891511559486389, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 60950 + }, + { + "epoch": 4.377737881508079, + "grad_norm": 0.9561707377433777, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 60960 + }, + { + "epoch": 4.378456014362657, + "grad_norm": 0.8674200177192688, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 60970 + }, + { + "epoch": 4.379174147217235, + "grad_norm": 0.9285916090011597, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 60980 + }, + { + "epoch": 4.379892280071814, + "grad_norm": 0.9185547232627869, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 60990 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 1.081664800643921, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 61000 + }, + { + "epoch": 4.38132854578097, + "grad_norm": 1.0475854873657227, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 61010 + }, + { + "epoch": 4.382046678635548, + "grad_norm": 1.1519653797149658, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 61020 + }, + { + "epoch": 4.382764811490126, + "grad_norm": 0.8757607936859131, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 61030 + }, + { + "epoch": 4.383482944344704, + "grad_norm": 0.8707934021949768, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 61040 + }, + { + "epoch": 4.384201077199282, + "grad_norm": 1.1807516813278198, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 61050 + }, + { + "epoch": 4.38491921005386, + "grad_norm": 1.0674688816070557, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 61060 + }, + { + "epoch": 4.385637342908438, + "grad_norm": 0.9321209788322449, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 61070 + }, + { + "epoch": 4.3863554757630165, + "grad_norm": 1.0786446332931519, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 61080 + }, + { + "epoch": 4.3870736086175945, + "grad_norm": 0.9733907580375671, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 61090 + }, + { + "epoch": 4.3877917414721725, + "grad_norm": 0.9476010203361511, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 61100 + }, + { + "epoch": 4.3885098743267505, + "grad_norm": 1.1321563720703125, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 61110 + }, + { + "epoch": 4.3892280071813286, + "grad_norm": 0.9379117488861084, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 61120 + }, + { + "epoch": 4.3899461400359066, + "grad_norm": 0.8409728407859802, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 61130 + }, + { + "epoch": 4.3906642728904846, + "grad_norm": 0.8309189081192017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 61140 + }, + { + "epoch": 4.391382405745063, + "grad_norm": 0.8922196626663208, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61150 + }, + { + "epoch": 4.392100538599641, + "grad_norm": 0.8274614214897156, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 61160 + }, + { + "epoch": 4.392818671454219, + "grad_norm": 1.0928618907928467, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 61170 + }, + { + "epoch": 4.3935368043087974, + "grad_norm": 0.9771125316619873, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 61180 + }, + { + "epoch": 4.3942549371633755, + "grad_norm": 0.8844535946846008, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 61190 + }, + { + "epoch": 4.3949730700179535, + "grad_norm": 1.0498822927474976, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 61200 + }, + { + "epoch": 4.3956912028725315, + "grad_norm": 0.9882155060768127, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61210 + }, + { + "epoch": 4.3964093357271095, + "grad_norm": 1.090356707572937, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 61220 + }, + { + "epoch": 4.3971274685816875, + "grad_norm": 1.0908088684082031, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 61230 + }, + { + "epoch": 4.3978456014362655, + "grad_norm": 1.0013501644134521, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 61240 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 1.0916062593460083, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 61250 + }, + { + "epoch": 4.399281867145422, + "grad_norm": 1.0817667245864868, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 61260 + }, + { + "epoch": 4.4, + "grad_norm": 0.9745162129402161, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 61270 + }, + { + "epoch": 4.400718132854578, + "grad_norm": 1.0653400421142578, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 61280 + }, + { + "epoch": 4.401436265709156, + "grad_norm": 1.0082067251205444, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 61290 + }, + { + "epoch": 4.402154398563734, + "grad_norm": 0.7963659167289734, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 61300 + }, + { + "epoch": 4.402872531418312, + "grad_norm": 1.0428845882415771, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 61310 + }, + { + "epoch": 4.40359066427289, + "grad_norm": 0.9205707311630249, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61320 + }, + { + "epoch": 4.404308797127468, + "grad_norm": 1.0103533267974854, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 61330 + }, + { + "epoch": 4.405026929982046, + "grad_norm": 1.113547682762146, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61340 + }, + { + "epoch": 4.405745062836624, + "grad_norm": 1.137488842010498, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 61350 + }, + { + "epoch": 4.406463195691203, + "grad_norm": 1.1284101009368896, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61360 + }, + { + "epoch": 4.407181328545781, + "grad_norm": 0.8010451197624207, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 61370 + }, + { + "epoch": 4.407899461400359, + "grad_norm": 0.8893977403640747, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 61380 + }, + { + "epoch": 4.408617594254937, + "grad_norm": 0.9098272323608398, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 61390 + }, + { + "epoch": 4.409335727109515, + "grad_norm": 1.0613329410552979, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 61400 + }, + { + "epoch": 4.410053859964093, + "grad_norm": 1.0070269107818604, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 61410 + }, + { + "epoch": 4.410771992818671, + "grad_norm": 0.8632227778434753, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 61420 + }, + { + "epoch": 4.411490125673249, + "grad_norm": 1.0183731317520142, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 61430 + }, + { + "epoch": 4.412208258527827, + "grad_norm": 0.9049941897392273, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61440 + }, + { + "epoch": 4.412926391382406, + "grad_norm": 1.0184082984924316, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61450 + }, + { + "epoch": 4.413644524236984, + "grad_norm": 0.9994277358055115, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 61460 + }, + { + "epoch": 4.414362657091562, + "grad_norm": 1.0112420320510864, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 61470 + }, + { + "epoch": 4.41508078994614, + "grad_norm": 0.9751759171485901, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 61480 + }, + { + "epoch": 4.415798922800718, + "grad_norm": 1.047135591506958, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 61490 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 0.886282742023468, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 61500 + }, + { + "epoch": 4.417235188509874, + "grad_norm": 0.971964418888092, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 61510 + }, + { + "epoch": 4.417953321364452, + "grad_norm": 0.9603846073150635, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 61520 + }, + { + "epoch": 4.41867145421903, + "grad_norm": 1.060042142868042, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 61530 + }, + { + "epoch": 4.419389587073608, + "grad_norm": 1.1231369972229004, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61540 + }, + { + "epoch": 4.420107719928187, + "grad_norm": 0.8269591331481934, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 61550 + }, + { + "epoch": 4.420825852782765, + "grad_norm": 1.0341241359710693, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 61560 + }, + { + "epoch": 4.421543985637343, + "grad_norm": 0.7276636958122253, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 61570 + }, + { + "epoch": 4.422262118491921, + "grad_norm": 1.0663669109344482, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 61580 + }, + { + "epoch": 4.422980251346499, + "grad_norm": 0.9764387011528015, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 61590 + }, + { + "epoch": 4.423698384201077, + "grad_norm": 1.0953258275985718, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61600 + }, + { + "epoch": 4.424416517055655, + "grad_norm": 0.8877012729644775, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 61610 + }, + { + "epoch": 4.425134649910233, + "grad_norm": 0.8781440854072571, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 61620 + }, + { + "epoch": 4.425852782764811, + "grad_norm": 0.8333432674407959, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61630 + }, + { + "epoch": 4.42657091561939, + "grad_norm": 0.9647989869117737, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 61640 + }, + { + "epoch": 4.427289048473968, + "grad_norm": 1.0801783800125122, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 61650 + }, + { + "epoch": 4.428007181328546, + "grad_norm": 0.8215882778167725, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61660 + }, + { + "epoch": 4.428725314183124, + "grad_norm": 0.9853931665420532, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 61670 + }, + { + "epoch": 4.429443447037702, + "grad_norm": 0.8658010959625244, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 61680 + }, + { + "epoch": 4.43016157989228, + "grad_norm": 1.124064326286316, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 61690 + }, + { + "epoch": 4.430879712746858, + "grad_norm": 1.009340763092041, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 61700 + }, + { + "epoch": 4.431597845601436, + "grad_norm": 0.8705293536186218, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 61710 + }, + { + "epoch": 4.432315978456014, + "grad_norm": 1.1323511600494385, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 61720 + }, + { + "epoch": 4.433034111310592, + "grad_norm": 1.1203019618988037, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 61730 + }, + { + "epoch": 4.433752244165171, + "grad_norm": 1.1683770418167114, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 61740 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 1.0735899209976196, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 61750 + }, + { + "epoch": 4.435188509874327, + "grad_norm": 1.142496109008789, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 61760 + }, + { + "epoch": 4.435906642728905, + "grad_norm": 1.1157732009887695, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 61770 + }, + { + "epoch": 4.436624775583483, + "grad_norm": 0.8845949172973633, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 61780 + }, + { + "epoch": 4.437342908438061, + "grad_norm": 1.1212759017944336, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 61790 + }, + { + "epoch": 4.438061041292639, + "grad_norm": 0.8832488656044006, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 61800 + }, + { + "epoch": 4.438779174147217, + "grad_norm": 0.9059590101242065, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 61810 + }, + { + "epoch": 4.439497307001796, + "grad_norm": 1.0625685453414917, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61820 + }, + { + "epoch": 4.440215439856374, + "grad_norm": 0.9565598368644714, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 61830 + }, + { + "epoch": 4.440933572710952, + "grad_norm": 0.8975377082824707, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 61840 + }, + { + "epoch": 4.44165170556553, + "grad_norm": 1.0412718057632446, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 61850 + }, + { + "epoch": 4.442369838420108, + "grad_norm": 0.9923529624938965, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 61860 + }, + { + "epoch": 4.443087971274686, + "grad_norm": 1.3025734424591064, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 61870 + }, + { + "epoch": 4.443806104129264, + "grad_norm": 1.0031960010528564, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 61880 + }, + { + "epoch": 4.444524236983842, + "grad_norm": 1.0974701642990112, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 61890 + }, + { + "epoch": 4.44524236983842, + "grad_norm": 1.1044024229049683, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 61900 + }, + { + "epoch": 4.445960502692998, + "grad_norm": 1.0782772302627563, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 61910 + }, + { + "epoch": 4.446678635547577, + "grad_norm": 1.006304383277893, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 61920 + }, + { + "epoch": 4.447396768402155, + "grad_norm": 0.9258833527565002, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 61930 + }, + { + "epoch": 4.448114901256733, + "grad_norm": 0.9888426065444946, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 61940 + }, + { + "epoch": 4.448833034111311, + "grad_norm": 0.9592963457107544, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 61950 + }, + { + "epoch": 4.449551166965889, + "grad_norm": 1.0527986288070679, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 61960 + }, + { + "epoch": 4.450269299820467, + "grad_norm": 0.8613291382789612, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 61970 + }, + { + "epoch": 4.450987432675045, + "grad_norm": 1.1083767414093018, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 61980 + }, + { + "epoch": 4.451705565529623, + "grad_norm": 0.772679328918457, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 61990 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 0.9052274227142334, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 62000 + }, + { + "epoch": 4.45314183123878, + "grad_norm": 1.129667043685913, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 62010 + }, + { + "epoch": 4.453859964093358, + "grad_norm": 0.9994529485702515, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 62020 + }, + { + "epoch": 4.454578096947936, + "grad_norm": 0.982155978679657, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 62030 + }, + { + "epoch": 4.455296229802514, + "grad_norm": 0.9139904975891113, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 62040 + }, + { + "epoch": 4.456014362657092, + "grad_norm": 1.0877810716629028, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 62050 + }, + { + "epoch": 4.45673249551167, + "grad_norm": 1.0535308122634888, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 62060 + }, + { + "epoch": 4.457450628366248, + "grad_norm": 1.0225313901901245, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62070 + }, + { + "epoch": 4.458168761220826, + "grad_norm": 0.8443132042884827, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 62080 + }, + { + "epoch": 4.458886894075404, + "grad_norm": 1.0426654815673828, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 62090 + }, + { + "epoch": 4.459605026929982, + "grad_norm": 1.1110700368881226, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 62100 + }, + { + "epoch": 4.4603231597845605, + "grad_norm": 1.0200893878936768, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62110 + }, + { + "epoch": 4.4610412926391385, + "grad_norm": 0.9102830290794373, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 62120 + }, + { + "epoch": 4.4617594254937165, + "grad_norm": 1.1395094394683838, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 62130 + }, + { + "epoch": 4.4624775583482945, + "grad_norm": 1.1202316284179688, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 62140 + }, + { + "epoch": 4.4631956912028725, + "grad_norm": 1.142580509185791, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 62150 + }, + { + "epoch": 4.4639138240574505, + "grad_norm": 0.9843677878379822, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 62160 + }, + { + "epoch": 4.4646319569120285, + "grad_norm": 1.0351676940917969, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 62170 + }, + { + "epoch": 4.4653500897666065, + "grad_norm": 0.9365093111991882, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 62180 + }, + { + "epoch": 4.4660682226211845, + "grad_norm": 1.041193962097168, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 62190 + }, + { + "epoch": 4.466786355475763, + "grad_norm": 0.9686329960823059, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 62200 + }, + { + "epoch": 4.467504488330341, + "grad_norm": 1.028622031211853, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 62210 + }, + { + "epoch": 4.468222621184919, + "grad_norm": 0.9717516899108887, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 62220 + }, + { + "epoch": 4.468940754039497, + "grad_norm": 1.0467450618743896, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 62230 + }, + { + "epoch": 4.469658886894075, + "grad_norm": 0.943717896938324, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 62240 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 0.909429132938385, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 62250 + }, + { + "epoch": 4.471095152603231, + "grad_norm": 1.0294792652130127, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 62260 + }, + { + "epoch": 4.471813285457809, + "grad_norm": 1.1044281721115112, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 62270 + }, + { + "epoch": 4.472531418312387, + "grad_norm": 1.1555784940719604, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 62280 + }, + { + "epoch": 4.473249551166965, + "grad_norm": 0.9441297650337219, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 62290 + }, + { + "epoch": 4.473967684021544, + "grad_norm": 0.9164380431175232, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 62300 + }, + { + "epoch": 4.474685816876122, + "grad_norm": 1.1139159202575684, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 62310 + }, + { + "epoch": 4.4754039497307, + "grad_norm": 1.0201882123947144, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 62320 + }, + { + "epoch": 4.476122082585278, + "grad_norm": 1.1471681594848633, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 62330 + }, + { + "epoch": 4.476840215439856, + "grad_norm": 1.0333549976348877, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 62340 + }, + { + "epoch": 4.477558348294434, + "grad_norm": 0.8929767608642578, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 62350 + }, + { + "epoch": 4.478276481149012, + "grad_norm": 0.9465752840042114, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 62360 + }, + { + "epoch": 4.47899461400359, + "grad_norm": 1.2155033349990845, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 62370 + }, + { + "epoch": 4.479712746858169, + "grad_norm": 0.7181217074394226, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 62380 + }, + { + "epoch": 4.480430879712747, + "grad_norm": 1.0052744150161743, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 62390 + }, + { + "epoch": 4.481149012567325, + "grad_norm": 0.8522219061851501, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 62400 + }, + { + "epoch": 4.481867145421903, + "grad_norm": 0.8844723105430603, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 62410 + }, + { + "epoch": 4.482585278276481, + "grad_norm": 0.9542465209960938, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 62420 + }, + { + "epoch": 4.483303411131059, + "grad_norm": 0.8963674306869507, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 62430 + }, + { + "epoch": 4.484021543985637, + "grad_norm": 0.8105363845825195, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 62440 + }, + { + "epoch": 4.484739676840215, + "grad_norm": 0.9618421196937561, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 62450 + }, + { + "epoch": 4.485457809694793, + "grad_norm": 1.1931076049804688, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 62460 + }, + { + "epoch": 4.486175942549371, + "grad_norm": 0.7406999468803406, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 62470 + }, + { + "epoch": 4.48689407540395, + "grad_norm": 0.7698216438293457, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 62480 + }, + { + "epoch": 4.487612208258528, + "grad_norm": 0.862271249294281, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 62490 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 1.0025171041488647, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 62500 + }, + { + "epoch": 4.489048473967684, + "grad_norm": 0.8474493622779846, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 62510 + }, + { + "epoch": 4.489766606822262, + "grad_norm": 0.8965697884559631, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 62520 + }, + { + "epoch": 4.49048473967684, + "grad_norm": 1.1276488304138184, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 62530 + }, + { + "epoch": 4.491202872531418, + "grad_norm": 1.0253537893295288, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 62540 + }, + { + "epoch": 4.491921005385996, + "grad_norm": 1.1750596761703491, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 62550 + }, + { + "epoch": 4.492639138240574, + "grad_norm": 0.9951794147491455, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 62560 + }, + { + "epoch": 4.493357271095153, + "grad_norm": 1.2510017156600952, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 62570 + }, + { + "epoch": 4.494075403949731, + "grad_norm": 1.4066375494003296, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 62580 + }, + { + "epoch": 4.494793536804309, + "grad_norm": 0.988175094127655, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 62590 + }, + { + "epoch": 4.495511669658887, + "grad_norm": 1.2049115896224976, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 62600 + }, + { + "epoch": 4.496229802513465, + "grad_norm": 0.962464451789856, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 62610 + }, + { + "epoch": 4.496947935368043, + "grad_norm": 0.9324793815612793, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62620 + }, + { + "epoch": 4.497666068222621, + "grad_norm": 0.9174214005470276, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 62630 + }, + { + "epoch": 4.498384201077199, + "grad_norm": 0.9729902148246765, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 62640 + }, + { + "epoch": 4.499102333931777, + "grad_norm": 1.0190484523773193, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 62650 + }, + { + "epoch": 4.499820466786355, + "grad_norm": 1.1473679542541504, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 62660 + }, + { + "epoch": 4.500538599640934, + "grad_norm": 1.0160558223724365, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 62670 + }, + { + "epoch": 4.501256732495512, + "grad_norm": 0.8083887100219727, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 62680 + }, + { + "epoch": 4.50197486535009, + "grad_norm": 0.941933274269104, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 62690 + }, + { + "epoch": 4.502692998204668, + "grad_norm": 0.9962822794914246, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 62700 + }, + { + "epoch": 4.503411131059246, + "grad_norm": 0.8993943333625793, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 62710 + }, + { + "epoch": 4.504129263913824, + "grad_norm": 0.9438319206237793, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 62720 + }, + { + "epoch": 4.504847396768402, + "grad_norm": 0.7951892018318176, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 62730 + }, + { + "epoch": 4.50556552962298, + "grad_norm": 0.8875413537025452, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 62740 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 0.993819534778595, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 62750 + }, + { + "epoch": 4.507001795332137, + "grad_norm": 0.9177559018135071, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 62760 + }, + { + "epoch": 4.507719928186715, + "grad_norm": 0.8632771968841553, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 62770 + }, + { + "epoch": 4.508438061041293, + "grad_norm": 0.943778395652771, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 62780 + }, + { + "epoch": 4.509156193895871, + "grad_norm": 0.8754997849464417, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 62790 + }, + { + "epoch": 4.509874326750449, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 62800 + }, + { + "epoch": 4.510592459605027, + "grad_norm": 1.1156457662582397, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 62810 + }, + { + "epoch": 4.511310592459605, + "grad_norm": 0.9178887009620667, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 62820 + }, + { + "epoch": 4.512028725314183, + "grad_norm": 0.9520689249038696, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 62830 + }, + { + "epoch": 4.512746858168761, + "grad_norm": 0.8880525231361389, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 62840 + }, + { + "epoch": 4.513464991023339, + "grad_norm": 0.9541497826576233, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 62850 + }, + { + "epoch": 4.514183123877918, + "grad_norm": 1.003766417503357, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 62860 + }, + { + "epoch": 4.514901256732496, + "grad_norm": 0.8844705820083618, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 62870 + }, + { + "epoch": 4.515619389587074, + "grad_norm": 1.1870828866958618, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 62880 + }, + { + "epoch": 4.516337522441652, + "grad_norm": 0.863487184047699, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 62890 + }, + { + "epoch": 4.51705565529623, + "grad_norm": 0.997770369052887, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 62900 + }, + { + "epoch": 4.517773788150808, + "grad_norm": 0.9708612561225891, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 62910 + }, + { + "epoch": 4.518491921005386, + "grad_norm": 1.1381206512451172, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62920 + }, + { + "epoch": 4.519210053859964, + "grad_norm": 1.0386693477630615, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 62930 + }, + { + "epoch": 4.519928186714543, + "grad_norm": 1.1711705923080444, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 62940 + }, + { + "epoch": 4.520646319569121, + "grad_norm": 0.8727447390556335, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 62950 + }, + { + "epoch": 4.521364452423699, + "grad_norm": 0.9215193390846252, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 62960 + }, + { + "epoch": 4.522082585278277, + "grad_norm": 1.005467176437378, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 62970 + }, + { + "epoch": 4.522800718132855, + "grad_norm": 0.8761187791824341, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 62980 + }, + { + "epoch": 4.523518850987433, + "grad_norm": 0.957848310470581, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 62990 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 0.8634148836135864, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 63000 + }, + { + "epoch": 4.524955116696589, + "grad_norm": 0.9557477235794067, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 63010 + }, + { + "epoch": 4.525673249551167, + "grad_norm": 1.017720341682434, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 63020 + }, + { + "epoch": 4.526391382405745, + "grad_norm": 1.0281825065612793, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 63030 + }, + { + "epoch": 4.527109515260323, + "grad_norm": 1.253974437713623, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63040 + }, + { + "epoch": 4.527827648114902, + "grad_norm": 0.8489068150520325, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 63050 + }, + { + "epoch": 4.52854578096948, + "grad_norm": 0.9681686162948608, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 63060 + }, + { + "epoch": 4.529263913824058, + "grad_norm": 1.10277259349823, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 63070 + }, + { + "epoch": 4.529982046678636, + "grad_norm": 0.9469163417816162, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 63080 + }, + { + "epoch": 4.530700179533214, + "grad_norm": 1.1228134632110596, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 63090 + }, + { + "epoch": 4.531418312387792, + "grad_norm": 0.9673212170600891, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 63100 + }, + { + "epoch": 4.53213644524237, + "grad_norm": 1.0221107006072998, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 63110 + }, + { + "epoch": 4.532854578096948, + "grad_norm": 0.826372504234314, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 63120 + }, + { + "epoch": 4.5335727109515265, + "grad_norm": 1.1805331707000732, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 63130 + }, + { + "epoch": 4.5342908438061045, + "grad_norm": 0.9645666480064392, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63140 + }, + { + "epoch": 4.5350089766606825, + "grad_norm": 1.0838309526443481, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 63150 + }, + { + "epoch": 4.5357271095152605, + "grad_norm": 1.061414361000061, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 63160 + }, + { + "epoch": 4.5364452423698385, + "grad_norm": 0.841961145401001, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 63170 + }, + { + "epoch": 4.5371633752244165, + "grad_norm": 1.1220186948776245, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 63180 + }, + { + "epoch": 4.5378815080789945, + "grad_norm": 1.036441445350647, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 63190 + }, + { + "epoch": 4.5385996409335725, + "grad_norm": 0.9089716076850891, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 63200 + }, + { + "epoch": 4.5393177737881505, + "grad_norm": 0.8699982762336731, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 63210 + }, + { + "epoch": 4.5400359066427285, + "grad_norm": 0.8489565253257751, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 63220 + }, + { + "epoch": 4.540754039497307, + "grad_norm": 0.7778416275978088, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 63230 + }, + { + "epoch": 4.541472172351885, + "grad_norm": 1.0625852346420288, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 63240 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 0.8515732884407043, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 63250 + }, + { + "epoch": 4.542908438061041, + "grad_norm": 0.7679561376571655, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 63260 + }, + { + "epoch": 4.543626570915619, + "grad_norm": 0.7358446717262268, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 63270 + }, + { + "epoch": 4.544344703770197, + "grad_norm": 1.0866128206253052, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63280 + }, + { + "epoch": 4.545062836624775, + "grad_norm": 1.0870225429534912, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 63290 + }, + { + "epoch": 4.545780969479353, + "grad_norm": 0.951095461845398, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 63300 + }, + { + "epoch": 4.546499102333931, + "grad_norm": 1.0914306640625, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 63310 + }, + { + "epoch": 4.54721723518851, + "grad_norm": 0.8676106333732605, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 63320 + }, + { + "epoch": 4.547935368043088, + "grad_norm": 1.0129096508026123, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63330 + }, + { + "epoch": 4.548653500897666, + "grad_norm": 0.8710526823997498, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 63340 + }, + { + "epoch": 4.549371633752244, + "grad_norm": 0.7014815807342529, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 63350 + }, + { + "epoch": 4.550089766606822, + "grad_norm": 1.1546777486801147, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 63360 + }, + { + "epoch": 4.5508078994614, + "grad_norm": 0.7464957237243652, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 63370 + }, + { + "epoch": 4.551526032315978, + "grad_norm": 0.9976209998130798, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 63380 + }, + { + "epoch": 4.552244165170556, + "grad_norm": 0.9543681740760803, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 63390 + }, + { + "epoch": 4.552962298025134, + "grad_norm": 1.1498578786849976, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 63400 + }, + { + "epoch": 4.553680430879712, + "grad_norm": 1.0162293910980225, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 63410 + }, + { + "epoch": 4.554398563734291, + "grad_norm": 0.9015304446220398, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 63420 + }, + { + "epoch": 4.555116696588869, + "grad_norm": 1.1639831066131592, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 63430 + }, + { + "epoch": 4.555834829443447, + "grad_norm": 0.9494703412055969, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 63440 + }, + { + "epoch": 4.556552962298025, + "grad_norm": 1.0555956363677979, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 63450 + }, + { + "epoch": 4.557271095152603, + "grad_norm": 0.8513827919960022, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 63460 + }, + { + "epoch": 4.557989228007181, + "grad_norm": 1.0614275932312012, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 63470 + }, + { + "epoch": 4.558707360861759, + "grad_norm": 0.8341137766838074, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 63480 + }, + { + "epoch": 4.559425493716337, + "grad_norm": 1.2136222124099731, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 63490 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 0.8806019425392151, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 63500 + }, + { + "epoch": 4.560861759425494, + "grad_norm": 1.2548854351043701, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 63510 + }, + { + "epoch": 4.561579892280072, + "grad_norm": 1.0162668228149414, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 63520 + }, + { + "epoch": 4.56229802513465, + "grad_norm": 1.0487624406814575, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 63530 + }, + { + "epoch": 4.563016157989228, + "grad_norm": 1.2505502700805664, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 63540 + }, + { + "epoch": 4.563734290843806, + "grad_norm": 0.9930511713027954, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 63550 + }, + { + "epoch": 4.564452423698384, + "grad_norm": 0.8132568001747131, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 63560 + }, + { + "epoch": 4.565170556552962, + "grad_norm": 1.0129177570343018, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63570 + }, + { + "epoch": 4.56588868940754, + "grad_norm": 0.9011693596839905, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 63580 + }, + { + "epoch": 4.566606822262118, + "grad_norm": 0.9161545634269714, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 63590 + }, + { + "epoch": 4.567324955116696, + "grad_norm": 0.8852348327636719, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 63600 + }, + { + "epoch": 4.568043087971275, + "grad_norm": 0.8579391837120056, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63610 + }, + { + "epoch": 4.568761220825853, + "grad_norm": 0.9271050095558167, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 63620 + }, + { + "epoch": 4.569479353680431, + "grad_norm": 0.9881834983825684, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 63630 + }, + { + "epoch": 4.570197486535009, + "grad_norm": 1.0255686044692993, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 63640 + }, + { + "epoch": 4.570915619389587, + "grad_norm": 0.8758876919746399, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 63650 + }, + { + "epoch": 4.571633752244165, + "grad_norm": 1.0134185552597046, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 63660 + }, + { + "epoch": 4.572351885098743, + "grad_norm": 0.8535705208778381, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 63670 + }, + { + "epoch": 4.573070017953321, + "grad_norm": 0.9614834785461426, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63680 + }, + { + "epoch": 4.5737881508079, + "grad_norm": 0.9004243612289429, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 63690 + }, + { + "epoch": 4.574506283662478, + "grad_norm": 0.9563080072402954, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 63700 + }, + { + "epoch": 4.575224416517056, + "grad_norm": 1.024857521057129, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 63710 + }, + { + "epoch": 4.575942549371634, + "grad_norm": 0.9345638155937195, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 63720 + }, + { + "epoch": 4.576660682226212, + "grad_norm": 1.27083158493042, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 63730 + }, + { + "epoch": 4.57737881508079, + "grad_norm": 1.0866559743881226, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 63740 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 0.9253925681114197, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 63750 + }, + { + "epoch": 4.578815080789946, + "grad_norm": 0.8127399682998657, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63760 + }, + { + "epoch": 4.579533213644524, + "grad_norm": 1.0453993082046509, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 63770 + }, + { + "epoch": 4.580251346499102, + "grad_norm": 1.2227544784545898, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 63780 + }, + { + "epoch": 4.580969479353681, + "grad_norm": 1.0207865238189697, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 63790 + }, + { + "epoch": 4.581687612208259, + "grad_norm": 1.030447244644165, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 63800 + }, + { + "epoch": 4.582405745062837, + "grad_norm": 1.0855677127838135, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 63810 + }, + { + "epoch": 4.583123877917415, + "grad_norm": 0.9572556018829346, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 63820 + }, + { + "epoch": 4.583842010771993, + "grad_norm": 0.9061040282249451, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 63830 + }, + { + "epoch": 4.584560143626571, + "grad_norm": 0.9267677068710327, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63840 + }, + { + "epoch": 4.585278276481149, + "grad_norm": 1.070076823234558, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 63850 + }, + { + "epoch": 4.585996409335727, + "grad_norm": 1.045881748199463, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63860 + }, + { + "epoch": 4.586714542190305, + "grad_norm": 0.9190576672554016, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 63870 + }, + { + "epoch": 4.587432675044884, + "grad_norm": 0.9263932704925537, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 63880 + }, + { + "epoch": 4.588150807899462, + "grad_norm": 1.0217589139938354, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 63890 + }, + { + "epoch": 4.58886894075404, + "grad_norm": 0.9200088381767273, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 63900 + }, + { + "epoch": 4.589587073608618, + "grad_norm": 0.9877251386642456, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 63910 + }, + { + "epoch": 4.590305206463196, + "grad_norm": 1.0059093236923218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63920 + }, + { + "epoch": 4.591023339317774, + "grad_norm": 1.2618095874786377, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63930 + }, + { + "epoch": 4.591741472172352, + "grad_norm": 1.1779268980026245, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 63940 + }, + { + "epoch": 4.59245960502693, + "grad_norm": 1.2339502573013306, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 63950 + }, + { + "epoch": 4.593177737881508, + "grad_norm": 0.7488788366317749, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 63960 + }, + { + "epoch": 4.593895870736086, + "grad_norm": 0.8366380929946899, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 63970 + }, + { + "epoch": 4.594614003590665, + "grad_norm": 1.0292677879333496, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 63980 + }, + { + "epoch": 4.595332136445243, + "grad_norm": 0.7938551306724548, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 63990 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 0.7958516478538513, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64000 + }, + { + "epoch": 4.596768402154399, + "grad_norm": 0.9613908529281616, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 64010 + }, + { + "epoch": 4.597486535008977, + "grad_norm": 1.0253773927688599, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 64020 + }, + { + "epoch": 4.598204667863555, + "grad_norm": 1.0560888051986694, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 64030 + }, + { + "epoch": 4.598922800718133, + "grad_norm": 1.1093556880950928, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 64040 + }, + { + "epoch": 4.599640933572711, + "grad_norm": 0.8492098450660706, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 64050 + }, + { + "epoch": 4.6003590664272895, + "grad_norm": 1.0070436000823975, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64060 + }, + { + "epoch": 4.6010771992818675, + "grad_norm": 0.9774282574653625, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 64070 + }, + { + "epoch": 4.6017953321364455, + "grad_norm": 1.0744960308074951, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 64080 + }, + { + "epoch": 4.6025134649910235, + "grad_norm": 1.0101491212844849, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 64090 + }, + { + "epoch": 4.6032315978456015, + "grad_norm": 1.2306591272354126, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 64100 + }, + { + "epoch": 4.6039497307001795, + "grad_norm": 0.9187033176422119, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 64110 + }, + { + "epoch": 4.6046678635547575, + "grad_norm": 0.9178676605224609, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 64120 + }, + { + "epoch": 4.6053859964093355, + "grad_norm": 1.006374716758728, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64130 + }, + { + "epoch": 4.6061041292639135, + "grad_norm": 1.0774449110031128, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 64140 + }, + { + "epoch": 4.6068222621184916, + "grad_norm": 1.0360658168792725, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64150 + }, + { + "epoch": 4.6075403949730696, + "grad_norm": 1.1061090230941772, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 64160 + }, + { + "epoch": 4.608258527827648, + "grad_norm": 1.0320971012115479, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 64170 + }, + { + "epoch": 4.6089766606822264, + "grad_norm": 0.8596988916397095, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 64180 + }, + { + "epoch": 4.6096947935368044, + "grad_norm": 1.1665741205215454, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 64190 + }, + { + "epoch": 4.6104129263913824, + "grad_norm": 0.857207715511322, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64200 + }, + { + "epoch": 4.6111310592459605, + "grad_norm": 1.0088987350463867, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 64210 + }, + { + "epoch": 4.6118491921005385, + "grad_norm": 1.0985605716705322, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 64220 + }, + { + "epoch": 4.6125673249551165, + "grad_norm": 0.9504913687705994, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 64230 + }, + { + "epoch": 4.6132854578096945, + "grad_norm": 0.8415018916130066, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 64240 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 0.9857034087181091, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 64250 + }, + { + "epoch": 4.614721723518851, + "grad_norm": 1.0164235830307007, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 64260 + }, + { + "epoch": 4.615439856373429, + "grad_norm": 0.949481725692749, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 64270 + }, + { + "epoch": 4.616157989228007, + "grad_norm": 0.9526455998420715, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 64280 + }, + { + "epoch": 4.616876122082585, + "grad_norm": 1.1121242046356201, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 64290 + }, + { + "epoch": 4.617594254937163, + "grad_norm": 0.9598871469497681, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 64300 + }, + { + "epoch": 4.618312387791741, + "grad_norm": 1.0406304597854614, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 64310 + }, + { + "epoch": 4.619030520646319, + "grad_norm": 1.1816964149475098, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 64320 + }, + { + "epoch": 4.619748653500897, + "grad_norm": 0.9818326830863953, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 64330 + }, + { + "epoch": 4.620466786355475, + "grad_norm": 0.952017605304718, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 64340 + }, + { + "epoch": 4.621184919210053, + "grad_norm": 1.1263453960418701, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 64350 + }, + { + "epoch": 4.621903052064632, + "grad_norm": 1.1158473491668701, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 64360 + }, + { + "epoch": 4.62262118491921, + "grad_norm": 0.9056766033172607, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64370 + }, + { + "epoch": 4.623339317773788, + "grad_norm": 0.8113203048706055, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 64380 + }, + { + "epoch": 4.624057450628366, + "grad_norm": 0.8646712899208069, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 64390 + }, + { + "epoch": 4.624775583482944, + "grad_norm": 1.0064425468444824, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64400 + }, + { + "epoch": 4.625493716337522, + "grad_norm": 0.9867565631866455, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 64410 + }, + { + "epoch": 4.6262118491921, + "grad_norm": 1.018764615058899, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 64420 + }, + { + "epoch": 4.626929982046678, + "grad_norm": 1.0607863664627075, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 64430 + }, + { + "epoch": 4.627648114901257, + "grad_norm": 1.012825846672058, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 64440 + }, + { + "epoch": 4.628366247755835, + "grad_norm": 0.8441653847694397, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64450 + }, + { + "epoch": 4.629084380610413, + "grad_norm": 0.9819194674491882, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 64460 + }, + { + "epoch": 4.629802513464991, + "grad_norm": 0.925519585609436, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 64470 + }, + { + "epoch": 4.630520646319569, + "grad_norm": 0.9409030079841614, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 64480 + }, + { + "epoch": 4.631238779174147, + "grad_norm": 1.148024559020996, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 64490 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 0.8225533962249756, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 64500 + }, + { + "epoch": 4.632675044883303, + "grad_norm": 0.8806734681129456, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 64510 + }, + { + "epoch": 4.633393177737881, + "grad_norm": 0.9656694531440735, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64520 + }, + { + "epoch": 4.634111310592459, + "grad_norm": 0.9977783560752869, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 64530 + }, + { + "epoch": 4.634829443447038, + "grad_norm": 0.9259420037269592, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 64540 + }, + { + "epoch": 4.635547576301616, + "grad_norm": 1.0215885639190674, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 64550 + }, + { + "epoch": 4.636265709156194, + "grad_norm": 1.1082557439804077, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 64560 + }, + { + "epoch": 4.636983842010772, + "grad_norm": 1.1183207035064697, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 64570 + }, + { + "epoch": 4.63770197486535, + "grad_norm": 0.9914339184761047, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 64580 + }, + { + "epoch": 4.638420107719928, + "grad_norm": 0.8065831661224365, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 64590 + }, + { + "epoch": 4.639138240574506, + "grad_norm": 1.1546721458435059, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 64600 + }, + { + "epoch": 4.639856373429084, + "grad_norm": 1.0395900011062622, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64610 + }, + { + "epoch": 4.640574506283663, + "grad_norm": 0.9957455992698669, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 64620 + }, + { + "epoch": 4.641292639138241, + "grad_norm": 1.069557785987854, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 64630 + }, + { + "epoch": 4.642010771992819, + "grad_norm": 1.005236268043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 64640 + }, + { + "epoch": 4.642728904847397, + "grad_norm": 1.0216304063796997, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 64650 + }, + { + "epoch": 4.643447037701975, + "grad_norm": 0.8567317128181458, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 64660 + }, + { + "epoch": 4.644165170556553, + "grad_norm": 1.0386067628860474, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 64670 + }, + { + "epoch": 4.644883303411131, + "grad_norm": 0.9566055536270142, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 64680 + }, + { + "epoch": 4.645601436265709, + "grad_norm": 1.0990564823150635, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 64690 + }, + { + "epoch": 4.646319569120287, + "grad_norm": 0.9962695240974426, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 64700 + }, + { + "epoch": 4.647037701974865, + "grad_norm": 0.9041377305984497, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 64710 + }, + { + "epoch": 4.647755834829443, + "grad_norm": 0.8611233234405518, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 64720 + }, + { + "epoch": 4.648473967684022, + "grad_norm": 1.1569812297821045, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 64730 + }, + { + "epoch": 4.6491921005386, + "grad_norm": 0.7946197390556335, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 64740 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 0.9612061381340027, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 64750 + }, + { + "epoch": 4.650628366247756, + "grad_norm": 0.9669303297996521, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 64760 + }, + { + "epoch": 4.651346499102334, + "grad_norm": 0.8117775321006775, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 64770 + }, + { + "epoch": 4.652064631956912, + "grad_norm": 1.2326241731643677, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 64780 + }, + { + "epoch": 4.65278276481149, + "grad_norm": 0.7494568228721619, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64790 + }, + { + "epoch": 4.653500897666068, + "grad_norm": 0.8145379424095154, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 64800 + }, + { + "epoch": 4.654219030520647, + "grad_norm": 1.0139610767364502, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 64810 + }, + { + "epoch": 4.654937163375225, + "grad_norm": 0.9887115359306335, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 64820 + }, + { + "epoch": 4.655655296229803, + "grad_norm": 0.9565147161483765, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 64830 + }, + { + "epoch": 4.656373429084381, + "grad_norm": 0.9022467136383057, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 64840 + }, + { + "epoch": 4.657091561938959, + "grad_norm": 1.075003981590271, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 64850 + }, + { + "epoch": 4.657809694793537, + "grad_norm": 0.8705733418464661, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64860 + }, + { + "epoch": 4.658527827648115, + "grad_norm": 1.0826832056045532, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 64870 + }, + { + "epoch": 4.659245960502693, + "grad_norm": 1.1056268215179443, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 64880 + }, + { + "epoch": 4.659964093357271, + "grad_norm": 0.8664149641990662, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 64890 + }, + { + "epoch": 4.660682226211849, + "grad_norm": 0.9487230181694031, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 64900 + }, + { + "epoch": 4.661400359066427, + "grad_norm": 1.0357837677001953, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 64910 + }, + { + "epoch": 4.662118491921006, + "grad_norm": 0.8620632290840149, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 64920 + }, + { + "epoch": 4.662836624775584, + "grad_norm": 1.108986735343933, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 64930 + }, + { + "epoch": 4.663554757630162, + "grad_norm": 0.8017674684524536, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 64940 + }, + { + "epoch": 4.66427289048474, + "grad_norm": 0.882347583770752, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 64950 + }, + { + "epoch": 4.664991023339318, + "grad_norm": 0.9466867446899414, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 64960 + }, + { + "epoch": 4.665709156193896, + "grad_norm": 1.1823636293411255, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 64970 + }, + { + "epoch": 4.666427289048474, + "grad_norm": 0.9535016417503357, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 64980 + }, + { + "epoch": 4.667145421903052, + "grad_norm": 0.9456726312637329, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 64990 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 0.7761920690536499, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 65000 + }, + { + "epoch": 4.668581687612209, + "grad_norm": 1.060357689857483, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 65010 + }, + { + "epoch": 4.669299820466787, + "grad_norm": 0.9083862900733948, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 65020 + }, + { + "epoch": 4.670017953321365, + "grad_norm": 0.8745762705802917, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 65030 + }, + { + "epoch": 4.670736086175943, + "grad_norm": 0.8715422749519348, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 65040 + }, + { + "epoch": 4.671454219030521, + "grad_norm": 0.9407707452774048, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 65050 + }, + { + "epoch": 4.672172351885099, + "grad_norm": 0.8998945355415344, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 65060 + }, + { + "epoch": 4.672890484739677, + "grad_norm": 0.9147891998291016, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 65070 + }, + { + "epoch": 4.673608617594255, + "grad_norm": 1.116614580154419, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 65080 + }, + { + "epoch": 4.674326750448833, + "grad_norm": 1.0764213800430298, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 65090 + }, + { + "epoch": 4.6750448833034115, + "grad_norm": 0.9115945100784302, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 65100 + }, + { + "epoch": 4.6757630161579895, + "grad_norm": 1.001251459121704, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 65110 + }, + { + "epoch": 4.6764811490125675, + "grad_norm": 1.0330020189285278, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65120 + }, + { + "epoch": 4.6771992818671455, + "grad_norm": 0.9083197116851807, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 65130 + }, + { + "epoch": 4.6779174147217235, + "grad_norm": 0.9298770427703857, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 65140 + }, + { + "epoch": 4.6786355475763015, + "grad_norm": 1.0009549856185913, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 65150 + }, + { + "epoch": 4.6793536804308795, + "grad_norm": 0.951389729976654, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 65160 + }, + { + "epoch": 4.6800718132854575, + "grad_norm": 1.151870608329773, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 65170 + }, + { + "epoch": 4.680789946140036, + "grad_norm": 1.0074727535247803, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 65180 + }, + { + "epoch": 4.681508078994614, + "grad_norm": 1.0490152835845947, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 65190 + }, + { + "epoch": 4.682226211849192, + "grad_norm": 0.8967363834381104, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 65200 + }, + { + "epoch": 4.68294434470377, + "grad_norm": 1.2314889430999756, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 65210 + }, + { + "epoch": 4.683662477558348, + "grad_norm": 0.7764074802398682, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 65220 + }, + { + "epoch": 4.684380610412926, + "grad_norm": 1.0587822198867798, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 65230 + }, + { + "epoch": 4.685098743267504, + "grad_norm": 0.916114091873169, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 65240 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 0.9117472767829895, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 65250 + }, + { + "epoch": 4.68653500897666, + "grad_norm": 0.8369293212890625, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 65260 + }, + { + "epoch": 4.687253141831238, + "grad_norm": 0.9700121879577637, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 65270 + }, + { + "epoch": 4.687971274685816, + "grad_norm": 1.0008411407470703, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 65280 + }, + { + "epoch": 4.688689407540395, + "grad_norm": 0.9339549541473389, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 65290 + }, + { + "epoch": 4.689407540394973, + "grad_norm": 0.956701934337616, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 65300 + }, + { + "epoch": 4.690125673249551, + "grad_norm": 1.2042720317840576, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65310 + }, + { + "epoch": 4.690843806104129, + "grad_norm": 0.8679144382476807, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 65320 + }, + { + "epoch": 4.691561938958707, + "grad_norm": 1.2320687770843506, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 65330 + }, + { + "epoch": 4.692280071813285, + "grad_norm": 0.8397238850593567, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 65340 + }, + { + "epoch": 4.692998204667863, + "grad_norm": 0.7850362658500671, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 65350 + }, + { + "epoch": 4.693716337522441, + "grad_norm": 0.9281290173530579, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 65360 + }, + { + "epoch": 4.69443447037702, + "grad_norm": 1.1506335735321045, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 65370 + }, + { + "epoch": 4.695152603231598, + "grad_norm": 1.0910584926605225, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 65380 + }, + { + "epoch": 4.695870736086176, + "grad_norm": 0.8937386274337769, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 65390 + }, + { + "epoch": 4.696588868940754, + "grad_norm": 1.0163888931274414, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 65400 + }, + { + "epoch": 4.697307001795332, + "grad_norm": 1.0290007591247559, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 65410 + }, + { + "epoch": 4.69802513464991, + "grad_norm": 0.9046576023101807, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 65420 + }, + { + "epoch": 4.698743267504488, + "grad_norm": 1.0030237436294556, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 65430 + }, + { + "epoch": 4.699461400359066, + "grad_norm": 0.8196740746498108, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65440 + }, + { + "epoch": 4.700179533213644, + "grad_norm": 0.9036651849746704, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65450 + }, + { + "epoch": 4.700897666068222, + "grad_norm": 1.2080141305923462, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 65460 + }, + { + "epoch": 4.7016157989228, + "grad_norm": 0.8743635416030884, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 65470 + }, + { + "epoch": 4.702333931777379, + "grad_norm": 0.9566192030906677, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 65480 + }, + { + "epoch": 4.703052064631957, + "grad_norm": 1.0505144596099854, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 65490 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 0.8797298073768616, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 65500 + }, + { + "epoch": 4.704488330341113, + "grad_norm": 0.9970770478248596, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 65510 + }, + { + "epoch": 4.705206463195691, + "grad_norm": 1.1743851900100708, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 65520 + }, + { + "epoch": 4.705924596050269, + "grad_norm": 0.9534381031990051, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 65530 + }, + { + "epoch": 4.706642728904847, + "grad_norm": 0.9735581278800964, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 65540 + }, + { + "epoch": 4.707360861759425, + "grad_norm": 1.185352087020874, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 65550 + }, + { + "epoch": 4.708078994614004, + "grad_norm": 0.9383901357650757, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65560 + }, + { + "epoch": 4.708797127468582, + "grad_norm": 1.0194662809371948, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 65570 + }, + { + "epoch": 4.70951526032316, + "grad_norm": 0.8448300361633301, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 65580 + }, + { + "epoch": 4.710233393177738, + "grad_norm": 1.1930629014968872, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 65590 + }, + { + "epoch": 4.710951526032316, + "grad_norm": 1.0038636922836304, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 65600 + }, + { + "epoch": 4.711669658886894, + "grad_norm": 0.8206564784049988, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 65610 + }, + { + "epoch": 4.712387791741472, + "grad_norm": 1.0984861850738525, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 65620 + }, + { + "epoch": 4.71310592459605, + "grad_norm": 1.2891547679901123, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65630 + }, + { + "epoch": 4.713824057450628, + "grad_norm": 0.927062451839447, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 65640 + }, + { + "epoch": 4.714542190305206, + "grad_norm": 0.8647334575653076, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 65650 + }, + { + "epoch": 4.715260323159785, + "grad_norm": 1.1017670631408691, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65660 + }, + { + "epoch": 4.715978456014363, + "grad_norm": 0.9589072465896606, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65670 + }, + { + "epoch": 4.716696588868941, + "grad_norm": 0.9496776461601257, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 65680 + }, + { + "epoch": 4.717414721723519, + "grad_norm": 0.9266180396080017, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65690 + }, + { + "epoch": 4.718132854578097, + "grad_norm": 0.8699696063995361, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 65700 + }, + { + "epoch": 4.718850987432675, + "grad_norm": 1.0444015264511108, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 65710 + }, + { + "epoch": 4.719569120287253, + "grad_norm": 1.0100741386413574, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 65720 + }, + { + "epoch": 4.720287253141831, + "grad_norm": 1.1442630290985107, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 65730 + }, + { + "epoch": 4.721005385996409, + "grad_norm": 0.8937877416610718, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 65740 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 1.0718764066696167, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65750 + }, + { + "epoch": 4.722441651705566, + "grad_norm": 0.8838587999343872, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 65760 + }, + { + "epoch": 4.723159784560144, + "grad_norm": 1.1247940063476562, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 65770 + }, + { + "epoch": 4.723877917414722, + "grad_norm": 0.9491105675697327, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 65780 + }, + { + "epoch": 4.7245960502693, + "grad_norm": 1.0896921157836914, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 65790 + }, + { + "epoch": 4.725314183123878, + "grad_norm": 1.0097380876541138, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 65800 + }, + { + "epoch": 4.726032315978456, + "grad_norm": 0.911763608455658, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 65810 + }, + { + "epoch": 4.726750448833034, + "grad_norm": 1.1295124292373657, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 65820 + }, + { + "epoch": 4.727468581687612, + "grad_norm": 0.7637538313865662, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 65830 + }, + { + "epoch": 4.72818671454219, + "grad_norm": 0.9255306720733643, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 65840 + }, + { + "epoch": 4.728904847396769, + "grad_norm": 0.9847530126571655, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 65850 + }, + { + "epoch": 4.729622980251347, + "grad_norm": 0.9036182761192322, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 65860 + }, + { + "epoch": 4.730341113105925, + "grad_norm": 0.8284199833869934, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 65870 + }, + { + "epoch": 4.731059245960503, + "grad_norm": 1.0142838954925537, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 65880 + }, + { + "epoch": 4.731777378815081, + "grad_norm": 0.9389033913612366, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 65890 + }, + { + "epoch": 4.732495511669659, + "grad_norm": 0.8870056867599487, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65900 + }, + { + "epoch": 4.733213644524237, + "grad_norm": 1.1211678981781006, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 65910 + }, + { + "epoch": 4.733931777378815, + "grad_norm": 0.7796614170074463, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 65920 + }, + { + "epoch": 4.734649910233394, + "grad_norm": 1.0360451936721802, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 65930 + }, + { + "epoch": 4.735368043087972, + "grad_norm": 0.8383482098579407, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 65940 + }, + { + "epoch": 4.73608617594255, + "grad_norm": 0.7985122799873352, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 65950 + }, + { + "epoch": 4.736804308797128, + "grad_norm": 1.0314199924468994, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 65960 + }, + { + "epoch": 4.737522441651706, + "grad_norm": 0.9279016852378845, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 65970 + }, + { + "epoch": 4.738240574506284, + "grad_norm": 1.1046063899993896, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65980 + }, + { + "epoch": 4.738958707360862, + "grad_norm": 0.9075793623924255, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 65990 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 1.0945355892181396, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 66000 + }, + { + "epoch": 4.740394973070018, + "grad_norm": 0.8885519504547119, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 66010 + }, + { + "epoch": 4.741113105924596, + "grad_norm": 0.9312083125114441, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 66020 + }, + { + "epoch": 4.741831238779174, + "grad_norm": 1.1574538946151733, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 66030 + }, + { + "epoch": 4.742549371633753, + "grad_norm": 0.9346209168434143, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 66040 + }, + { + "epoch": 4.743267504488331, + "grad_norm": 0.8935149312019348, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66050 + }, + { + "epoch": 4.743985637342909, + "grad_norm": 0.8958369493484497, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 66060 + }, + { + "epoch": 4.744703770197487, + "grad_norm": 0.9383506774902344, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 66070 + }, + { + "epoch": 4.745421903052065, + "grad_norm": 0.9868947863578796, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 66080 + }, + { + "epoch": 4.746140035906643, + "grad_norm": 1.3417645692825317, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 66090 + }, + { + "epoch": 4.746858168761221, + "grad_norm": 1.070693850517273, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 66100 + }, + { + "epoch": 4.747576301615799, + "grad_norm": 0.8841570019721985, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 66110 + }, + { + "epoch": 4.7482944344703775, + "grad_norm": 0.7963120341300964, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 66120 + }, + { + "epoch": 4.7490125673249555, + "grad_norm": 0.8145691156387329, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 66130 + }, + { + "epoch": 4.7497307001795335, + "grad_norm": 0.9074729681015015, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 66140 + }, + { + "epoch": 4.7504488330341115, + "grad_norm": 0.9129886627197266, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 66150 + }, + { + "epoch": 4.7511669658886895, + "grad_norm": 0.91527259349823, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 66160 + }, + { + "epoch": 4.7518850987432675, + "grad_norm": 0.9569419622421265, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 66170 + }, + { + "epoch": 4.7526032315978455, + "grad_norm": 0.8777104616165161, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 66180 + }, + { + "epoch": 4.7533213644524235, + "grad_norm": 0.9673085808753967, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 66190 + }, + { + "epoch": 4.7540394973070015, + "grad_norm": 1.0683966875076294, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 66200 + }, + { + "epoch": 4.7547576301615795, + "grad_norm": 1.1591907739639282, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 66210 + }, + { + "epoch": 4.755475763016158, + "grad_norm": 1.1973309516906738, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 66220 + }, + { + "epoch": 4.756193895870736, + "grad_norm": 0.8472012281417847, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 66230 + }, + { + "epoch": 4.756912028725314, + "grad_norm": 0.9896261692047119, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 66240 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 0.8498432040214539, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 66250 + }, + { + "epoch": 4.75834829443447, + "grad_norm": 0.9624166488647461, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 66260 + }, + { + "epoch": 4.759066427289048, + "grad_norm": 1.0951786041259766, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 66270 + }, + { + "epoch": 4.759784560143626, + "grad_norm": 0.9863157868385315, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 66280 + }, + { + "epoch": 4.760502692998204, + "grad_norm": 1.0062068700790405, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 66290 + }, + { + "epoch": 4.761220825852782, + "grad_norm": 0.8075495958328247, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 66300 + }, + { + "epoch": 4.761938958707361, + "grad_norm": 0.9617878198623657, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66310 + }, + { + "epoch": 4.762657091561939, + "grad_norm": 1.097091555595398, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 66320 + }, + { + "epoch": 4.763375224416517, + "grad_norm": 1.2713453769683838, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 66330 + }, + { + "epoch": 4.764093357271095, + "grad_norm": 0.9473448991775513, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 66340 + }, + { + "epoch": 4.764811490125673, + "grad_norm": 1.0176854133605957, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 66350 + }, + { + "epoch": 4.765529622980251, + "grad_norm": 1.0486242771148682, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 66360 + }, + { + "epoch": 4.766247755834829, + "grad_norm": 1.249985694885254, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 66370 + }, + { + "epoch": 4.766965888689407, + "grad_norm": 1.283875584602356, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 66380 + }, + { + "epoch": 4.767684021543985, + "grad_norm": 1.0009022951126099, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 66390 + }, + { + "epoch": 4.768402154398563, + "grad_norm": 0.9718021750450134, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 66400 + }, + { + "epoch": 4.769120287253142, + "grad_norm": 1.0865732431411743, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 66410 + }, + { + "epoch": 4.76983842010772, + "grad_norm": 0.9273189306259155, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 66420 + }, + { + "epoch": 4.770556552962298, + "grad_norm": 1.067535638809204, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 66430 + }, + { + "epoch": 4.771274685816876, + "grad_norm": 1.0551011562347412, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 66440 + }, + { + "epoch": 4.771992818671454, + "grad_norm": 1.0336146354675293, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66450 + }, + { + "epoch": 4.772710951526032, + "grad_norm": 0.8738380670547485, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 66460 + }, + { + "epoch": 4.77342908438061, + "grad_norm": 1.1048321723937988, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66470 + }, + { + "epoch": 4.774147217235188, + "grad_norm": 0.8471167683601379, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 66480 + }, + { + "epoch": 4.774865350089767, + "grad_norm": 1.2527031898498535, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 66490 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 1.0056052207946777, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 66500 + }, + { + "epoch": 4.776301615798923, + "grad_norm": 1.142456293106079, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 66510 + }, + { + "epoch": 4.777019748653501, + "grad_norm": 1.1813132762908936, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 66520 + }, + { + "epoch": 4.777737881508079, + "grad_norm": 0.8683654069900513, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 66530 + }, + { + "epoch": 4.778456014362657, + "grad_norm": 1.0577980279922485, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 66540 + }, + { + "epoch": 4.779174147217235, + "grad_norm": 1.077438473701477, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66550 + }, + { + "epoch": 4.779892280071813, + "grad_norm": 1.0107938051223755, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 66560 + }, + { + "epoch": 4.780610412926391, + "grad_norm": 0.8071168065071106, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 66570 + }, + { + "epoch": 4.781328545780969, + "grad_norm": 0.8887564539909363, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 66580 + }, + { + "epoch": 4.782046678635547, + "grad_norm": 0.9823092222213745, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 66590 + }, + { + "epoch": 4.782764811490126, + "grad_norm": 0.9026784300804138, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 66600 + }, + { + "epoch": 4.783482944344704, + "grad_norm": 0.8912792205810547, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66610 + }, + { + "epoch": 4.784201077199282, + "grad_norm": 1.0955979824066162, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 66620 + }, + { + "epoch": 4.78491921005386, + "grad_norm": 0.8614793419837952, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 66630 + }, + { + "epoch": 4.785637342908438, + "grad_norm": 0.7247269153594971, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 66640 + }, + { + "epoch": 4.786355475763016, + "grad_norm": 0.9685400724411011, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 66650 + }, + { + "epoch": 4.787073608617594, + "grad_norm": 0.9219905734062195, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 66660 + }, + { + "epoch": 4.787791741472172, + "grad_norm": 0.9217489361763, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 66670 + }, + { + "epoch": 4.788509874326751, + "grad_norm": 1.13791823387146, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66680 + }, + { + "epoch": 4.789228007181329, + "grad_norm": 0.857542872428894, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 66690 + }, + { + "epoch": 4.789946140035907, + "grad_norm": 0.9886694550514221, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 66700 + }, + { + "epoch": 4.790664272890485, + "grad_norm": 0.987952470779419, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 66710 + }, + { + "epoch": 4.791382405745063, + "grad_norm": 1.051612377166748, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 66720 + }, + { + "epoch": 4.792100538599641, + "grad_norm": 0.9816454648971558, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 66730 + }, + { + "epoch": 4.792818671454219, + "grad_norm": 1.0953829288482666, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 66740 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 0.8720369935035706, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 66750 + }, + { + "epoch": 4.794254937163375, + "grad_norm": 0.8910234570503235, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 66760 + }, + { + "epoch": 4.794973070017953, + "grad_norm": 0.8300510048866272, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 66770 + }, + { + "epoch": 4.795691202872531, + "grad_norm": 0.9380533695220947, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 66780 + }, + { + "epoch": 4.79640933572711, + "grad_norm": 0.8361864686012268, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 66790 + }, + { + "epoch": 4.797127468581688, + "grad_norm": 1.051262617111206, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 66800 + }, + { + "epoch": 4.797845601436266, + "grad_norm": 1.1324400901794434, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 66810 + }, + { + "epoch": 4.798563734290844, + "grad_norm": 0.853903591632843, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 66820 + }, + { + "epoch": 4.799281867145422, + "grad_norm": 0.9949867725372314, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66830 + }, + { + "epoch": 4.8, + "grad_norm": 0.9204033017158508, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 66840 + }, + { + "epoch": 4.800718132854578, + "grad_norm": 0.7461584806442261, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 66850 + }, + { + "epoch": 4.801436265709156, + "grad_norm": 1.1019874811172485, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 66860 + }, + { + "epoch": 4.802154398563735, + "grad_norm": 1.1695797443389893, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 66870 + }, + { + "epoch": 4.802872531418313, + "grad_norm": 1.0902758836746216, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 66880 + }, + { + "epoch": 4.803590664272891, + "grad_norm": 0.8778618574142456, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66890 + }, + { + "epoch": 4.804308797127469, + "grad_norm": 0.905505359172821, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 66900 + }, + { + "epoch": 4.805026929982047, + "grad_norm": 1.0802056789398193, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66910 + }, + { + "epoch": 4.805745062836625, + "grad_norm": 0.7899449467658997, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 66920 + }, + { + "epoch": 4.806463195691203, + "grad_norm": 1.1938519477844238, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66930 + }, + { + "epoch": 4.807181328545781, + "grad_norm": 1.0213780403137207, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 66940 + }, + { + "epoch": 4.807899461400359, + "grad_norm": 0.9925506711006165, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 66950 + }, + { + "epoch": 4.808617594254937, + "grad_norm": 1.0174424648284912, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 66960 + }, + { + "epoch": 4.809335727109516, + "grad_norm": 1.0515072345733643, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 66970 + }, + { + "epoch": 4.810053859964094, + "grad_norm": 1.0161492824554443, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66980 + }, + { + "epoch": 4.810771992818672, + "grad_norm": 0.8421840071678162, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 66990 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 1.0493539571762085, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 67000 + }, + { + "epoch": 4.812208258527828, + "grad_norm": 1.1133309602737427, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 67010 + }, + { + "epoch": 4.812926391382406, + "grad_norm": 0.924017071723938, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 67020 + }, + { + "epoch": 4.813644524236984, + "grad_norm": 1.0568689107894897, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 67030 + }, + { + "epoch": 4.814362657091562, + "grad_norm": 0.989414632320404, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 67040 + }, + { + "epoch": 4.8150807899461405, + "grad_norm": 0.9256827235221863, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 67050 + }, + { + "epoch": 4.8157989228007185, + "grad_norm": 0.9538901448249817, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 67060 + }, + { + "epoch": 4.8165170556552965, + "grad_norm": 1.0373849868774414, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 67070 + }, + { + "epoch": 4.8172351885098745, + "grad_norm": 1.0019729137420654, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 67080 + }, + { + "epoch": 4.8179533213644525, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 67090 + }, + { + "epoch": 4.8186714542190305, + "grad_norm": 1.0008453130722046, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 67100 + }, + { + "epoch": 4.8193895870736085, + "grad_norm": 1.0153851509094238, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 67110 + }, + { + "epoch": 4.8201077199281865, + "grad_norm": 1.0193161964416504, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 67120 + }, + { + "epoch": 4.8208258527827645, + "grad_norm": 1.0204501152038574, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 67130 + }, + { + "epoch": 4.8215439856373425, + "grad_norm": 0.9097670316696167, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 67140 + }, + { + "epoch": 4.8222621184919205, + "grad_norm": 0.9288716912269592, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 67150 + }, + { + "epoch": 4.822980251346499, + "grad_norm": 0.9975850582122803, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 67160 + }, + { + "epoch": 4.823698384201077, + "grad_norm": 0.8502511382102966, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 67170 + }, + { + "epoch": 4.824416517055655, + "grad_norm": 1.0129257440567017, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67180 + }, + { + "epoch": 4.825134649910233, + "grad_norm": 1.0009492635726929, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 67190 + }, + { + "epoch": 4.825852782764811, + "grad_norm": 0.9273321032524109, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 67200 + }, + { + "epoch": 4.8265709156193894, + "grad_norm": 1.0438604354858398, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 67210 + }, + { + "epoch": 4.8272890484739674, + "grad_norm": 1.119573712348938, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 67220 + }, + { + "epoch": 4.8280071813285454, + "grad_norm": 0.9607422351837158, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 67230 + }, + { + "epoch": 4.828725314183124, + "grad_norm": 0.9614062905311584, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 67240 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 1.1017652750015259, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 67250 + }, + { + "epoch": 4.83016157989228, + "grad_norm": 1.0521706342697144, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 67260 + }, + { + "epoch": 4.830879712746858, + "grad_norm": 0.7685959339141846, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 67270 + }, + { + "epoch": 4.831597845601436, + "grad_norm": 0.7894896268844604, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 67280 + }, + { + "epoch": 4.832315978456014, + "grad_norm": 1.0882996320724487, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 67290 + }, + { + "epoch": 4.833034111310592, + "grad_norm": 0.9215409755706787, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 67300 + }, + { + "epoch": 4.83375224416517, + "grad_norm": 0.8660635352134705, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 67310 + }, + { + "epoch": 4.834470377019748, + "grad_norm": 0.980879008769989, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 67320 + }, + { + "epoch": 4.835188509874326, + "grad_norm": 1.0356814861297607, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 67330 + }, + { + "epoch": 4.835906642728904, + "grad_norm": 1.0265507698059082, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 67340 + }, + { + "epoch": 4.836624775583483, + "grad_norm": 1.0659137964248657, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 67350 + }, + { + "epoch": 4.837342908438061, + "grad_norm": 0.9485231637954712, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 67360 + }, + { + "epoch": 4.838061041292639, + "grad_norm": 1.0950140953063965, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 67370 + }, + { + "epoch": 4.838779174147217, + "grad_norm": 0.8907382488250732, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 67380 + }, + { + "epoch": 4.839497307001795, + "grad_norm": 0.9777120351791382, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 67390 + }, + { + "epoch": 4.840215439856373, + "grad_norm": 0.8482252955436707, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 67400 + }, + { + "epoch": 4.840933572710951, + "grad_norm": 0.8505899906158447, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 67410 + }, + { + "epoch": 4.841651705565529, + "grad_norm": 0.8574482798576355, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 67420 + }, + { + "epoch": 4.842369838420108, + "grad_norm": 1.092310905456543, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 67430 + }, + { + "epoch": 4.843087971274686, + "grad_norm": 0.9418560266494751, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 67440 + }, + { + "epoch": 4.843806104129264, + "grad_norm": 1.1310782432556152, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 67450 + }, + { + "epoch": 4.844524236983842, + "grad_norm": 0.9993671774864197, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 67460 + }, + { + "epoch": 4.84524236983842, + "grad_norm": 0.8322528600692749, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 67470 + }, + { + "epoch": 4.845960502692998, + "grad_norm": 0.8488435745239258, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 67480 + }, + { + "epoch": 4.846678635547576, + "grad_norm": 0.8070611357688904, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 67490 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 0.8200163245201111, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 67500 + }, + { + "epoch": 4.848114901256732, + "grad_norm": 0.91901034116745, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 67510 + }, + { + "epoch": 4.84883303411131, + "grad_norm": 1.0938435792922974, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67520 + }, + { + "epoch": 4.849551166965889, + "grad_norm": 0.7926174402236938, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 67530 + }, + { + "epoch": 4.850269299820467, + "grad_norm": 0.9914385676383972, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 67540 + }, + { + "epoch": 4.850987432675045, + "grad_norm": 1.033065915107727, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 67550 + }, + { + "epoch": 4.851705565529623, + "grad_norm": 0.9700239300727844, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 67560 + }, + { + "epoch": 4.852423698384201, + "grad_norm": 0.8550103902816772, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 67570 + }, + { + "epoch": 4.853141831238779, + "grad_norm": 1.0009654760360718, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67580 + }, + { + "epoch": 4.853859964093357, + "grad_norm": 1.0766186714172363, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 67590 + }, + { + "epoch": 4.854578096947935, + "grad_norm": 0.9512220621109009, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 67600 + }, + { + "epoch": 4.855296229802514, + "grad_norm": 0.8434456586837769, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 67610 + }, + { + "epoch": 4.856014362657092, + "grad_norm": 1.0276665687561035, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 67620 + }, + { + "epoch": 4.85673249551167, + "grad_norm": 0.9758516550064087, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 67630 + }, + { + "epoch": 4.857450628366248, + "grad_norm": 0.8988076448440552, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 67640 + }, + { + "epoch": 4.858168761220826, + "grad_norm": 1.0038257837295532, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 67650 + }, + { + "epoch": 4.858886894075404, + "grad_norm": 0.9973093867301941, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 67660 + }, + { + "epoch": 4.859605026929982, + "grad_norm": 0.9754974246025085, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 67670 + }, + { + "epoch": 4.86032315978456, + "grad_norm": 1.1829560995101929, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67680 + }, + { + "epoch": 4.861041292639138, + "grad_norm": 1.1077659130096436, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 67690 + }, + { + "epoch": 4.861759425493716, + "grad_norm": 0.9862872958183289, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 67700 + }, + { + "epoch": 4.862477558348294, + "grad_norm": 0.9826052188873291, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 67710 + }, + { + "epoch": 4.863195691202873, + "grad_norm": 0.940082848072052, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 67720 + }, + { + "epoch": 4.863913824057451, + "grad_norm": 0.895434558391571, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 67730 + }, + { + "epoch": 4.864631956912029, + "grad_norm": 1.1194682121276855, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 67740 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 0.9984544515609741, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 67750 + }, + { + "epoch": 4.866068222621185, + "grad_norm": 1.049224615097046, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 67760 + }, + { + "epoch": 4.866786355475763, + "grad_norm": 1.009515643119812, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 67770 + }, + { + "epoch": 4.867504488330341, + "grad_norm": 1.0336902141571045, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 67780 + }, + { + "epoch": 4.868222621184919, + "grad_norm": 0.9310635924339294, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 67790 + }, + { + "epoch": 4.868940754039498, + "grad_norm": 0.934882640838623, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 67800 + }, + { + "epoch": 4.869658886894076, + "grad_norm": 0.8663495779037476, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 67810 + }, + { + "epoch": 4.870377019748654, + "grad_norm": 1.0085018873214722, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 67820 + }, + { + "epoch": 4.871095152603232, + "grad_norm": 0.896507978439331, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 67830 + }, + { + "epoch": 4.87181328545781, + "grad_norm": 0.925809919834137, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 67840 + }, + { + "epoch": 4.872531418312388, + "grad_norm": 0.8044029474258423, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 67850 + }, + { + "epoch": 4.873249551166966, + "grad_norm": 1.0026800632476807, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 67860 + }, + { + "epoch": 4.873967684021544, + "grad_norm": 0.9577589631080627, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 67870 + }, + { + "epoch": 4.874685816876122, + "grad_norm": 0.8225193619728088, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 67880 + }, + { + "epoch": 4.8754039497307, + "grad_norm": 1.0019139051437378, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 67890 + }, + { + "epoch": 4.876122082585278, + "grad_norm": 0.9282827377319336, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 67900 + }, + { + "epoch": 4.876840215439857, + "grad_norm": 0.8204836249351501, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 67910 + }, + { + "epoch": 4.877558348294435, + "grad_norm": 0.907356321811676, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 67920 + }, + { + "epoch": 4.878276481149013, + "grad_norm": 1.12422776222229, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 67930 + }, + { + "epoch": 4.878994614003591, + "grad_norm": 0.8230205178260803, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 67940 + }, + { + "epoch": 4.879712746858169, + "grad_norm": 1.1588479280471802, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 67950 + }, + { + "epoch": 4.880430879712747, + "grad_norm": 1.1064553260803223, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 67960 + }, + { + "epoch": 4.881149012567325, + "grad_norm": 0.9311534762382507, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 67970 + }, + { + "epoch": 4.881867145421903, + "grad_norm": 0.7575639486312866, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 67980 + }, + { + "epoch": 4.882585278276482, + "grad_norm": 0.9201191067695618, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 67990 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 0.8487658500671387, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 68000 + }, + { + "epoch": 4.884021543985638, + "grad_norm": 0.9645208716392517, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 68010 + }, + { + "epoch": 4.884739676840216, + "grad_norm": 0.8594469428062439, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 68020 + }, + { + "epoch": 4.885457809694794, + "grad_norm": 0.9518412947654724, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 68030 + }, + { + "epoch": 4.886175942549372, + "grad_norm": 1.0934258699417114, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 68040 + }, + { + "epoch": 4.88689407540395, + "grad_norm": 0.988761842250824, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 68050 + }, + { + "epoch": 4.887612208258528, + "grad_norm": 0.7572013735771179, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 68060 + }, + { + "epoch": 4.888330341113106, + "grad_norm": 0.8801929950714111, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 68070 + }, + { + "epoch": 4.889048473967684, + "grad_norm": 1.0080658197402954, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 68080 + }, + { + "epoch": 4.8897666068222625, + "grad_norm": 0.9588785171508789, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 68090 + }, + { + "epoch": 4.8904847396768405, + "grad_norm": 1.0994032621383667, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 68100 + }, + { + "epoch": 4.8912028725314185, + "grad_norm": 0.9851962924003601, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68110 + }, + { + "epoch": 4.8919210053859965, + "grad_norm": 0.9566116333007812, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 68120 + }, + { + "epoch": 4.8926391382405745, + "grad_norm": 0.8708083033561707, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 68130 + }, + { + "epoch": 4.8933572710951525, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 68140 + }, + { + "epoch": 4.8940754039497305, + "grad_norm": 1.047988772392273, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 68150 + }, + { + "epoch": 4.8947935368043085, + "grad_norm": 0.8665831685066223, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68160 + }, + { + "epoch": 4.8955116696588865, + "grad_norm": 0.9313908219337463, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 68170 + }, + { + "epoch": 4.896229802513465, + "grad_norm": 0.9568582773208618, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 68180 + }, + { + "epoch": 4.896947935368043, + "grad_norm": 1.0427594184875488, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 68190 + }, + { + "epoch": 4.897666068222621, + "grad_norm": 0.9132021069526672, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68200 + }, + { + "epoch": 4.898384201077199, + "grad_norm": 0.9597318768501282, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 68210 + }, + { + "epoch": 4.899102333931777, + "grad_norm": 1.0736947059631348, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 68220 + }, + { + "epoch": 4.899820466786355, + "grad_norm": 0.9318404793739319, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 68230 + }, + { + "epoch": 4.900538599640933, + "grad_norm": 0.8594326972961426, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 68240 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 1.1437443494796753, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 68250 + }, + { + "epoch": 4.901974865350089, + "grad_norm": 1.1599408388137817, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 68260 + }, + { + "epoch": 4.902692998204667, + "grad_norm": 1.160628080368042, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 68270 + }, + { + "epoch": 4.903411131059246, + "grad_norm": 1.0147801637649536, + "learning_rate": 0.0002, + "loss": 0.613, + "step": 68280 + }, + { + "epoch": 4.904129263913824, + "grad_norm": 0.8622691631317139, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 68290 + }, + { + "epoch": 4.904847396768402, + "grad_norm": 0.7179980874061584, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 68300 + }, + { + "epoch": 4.90556552962298, + "grad_norm": 1.1705092191696167, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 68310 + }, + { + "epoch": 4.906283662477558, + "grad_norm": 1.1687676906585693, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 68320 + }, + { + "epoch": 4.907001795332136, + "grad_norm": 1.1621531248092651, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 68330 + }, + { + "epoch": 4.907719928186714, + "grad_norm": 1.0241422653198242, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 68340 + }, + { + "epoch": 4.908438061041292, + "grad_norm": 0.943354070186615, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 68350 + }, + { + "epoch": 4.909156193895871, + "grad_norm": 0.8091703653335571, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 68360 + }, + { + "epoch": 4.909874326750449, + "grad_norm": 0.8871228694915771, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 68370 + }, + { + "epoch": 4.910592459605027, + "grad_norm": 1.0951069593429565, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 68380 + }, + { + "epoch": 4.911310592459605, + "grad_norm": 1.1355193853378296, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 68390 + }, + { + "epoch": 4.912028725314183, + "grad_norm": 1.0741122961044312, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 68400 + }, + { + "epoch": 4.912746858168761, + "grad_norm": 0.9285269975662231, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68410 + }, + { + "epoch": 4.913464991023339, + "grad_norm": 1.080695390701294, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 68420 + }, + { + "epoch": 4.914183123877917, + "grad_norm": 0.921331524848938, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 68430 + }, + { + "epoch": 4.914901256732495, + "grad_norm": 0.9763174057006836, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 68440 + }, + { + "epoch": 4.915619389587073, + "grad_norm": 1.1133354902267456, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 68450 + }, + { + "epoch": 4.916337522441651, + "grad_norm": 0.8373502492904663, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 68460 + }, + { + "epoch": 4.91705565529623, + "grad_norm": 0.9192346334457397, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 68470 + }, + { + "epoch": 4.917773788150808, + "grad_norm": 1.0724657773971558, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 68480 + }, + { + "epoch": 4.918491921005386, + "grad_norm": 0.9209843873977661, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 68490 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 0.9201577305793762, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 68500 + }, + { + "epoch": 4.919928186714542, + "grad_norm": 0.8086138963699341, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 68510 + }, + { + "epoch": 4.92064631956912, + "grad_norm": 1.0917785167694092, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 68520 + }, + { + "epoch": 4.921364452423698, + "grad_norm": 0.9287897944450378, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 68530 + }, + { + "epoch": 4.922082585278276, + "grad_norm": 0.9830158948898315, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 68540 + }, + { + "epoch": 4.922800718132855, + "grad_norm": 0.8674678802490234, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 68550 + }, + { + "epoch": 4.923518850987433, + "grad_norm": 0.7996176481246948, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 68560 + }, + { + "epoch": 4.924236983842011, + "grad_norm": 1.1284033060073853, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 68570 + }, + { + "epoch": 4.924955116696589, + "grad_norm": 0.894339919090271, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 68580 + }, + { + "epoch": 4.925673249551167, + "grad_norm": 1.1140280961990356, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68590 + }, + { + "epoch": 4.926391382405745, + "grad_norm": 0.9048344492912292, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 68600 + }, + { + "epoch": 4.927109515260323, + "grad_norm": 0.9380471706390381, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 68610 + }, + { + "epoch": 4.927827648114901, + "grad_norm": 0.8598429560661316, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 68620 + }, + { + "epoch": 4.928545780969479, + "grad_norm": 1.0813355445861816, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 68630 + }, + { + "epoch": 4.929263913824057, + "grad_norm": 0.979053795337677, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 68640 + }, + { + "epoch": 4.929982046678636, + "grad_norm": 0.8194574117660522, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 68650 + }, + { + "epoch": 4.930700179533214, + "grad_norm": 0.8593540787696838, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 68660 + }, + { + "epoch": 4.931418312387792, + "grad_norm": 1.0134016275405884, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 68670 + }, + { + "epoch": 4.93213644524237, + "grad_norm": 1.060586929321289, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 68680 + }, + { + "epoch": 4.932854578096948, + "grad_norm": 0.84132319688797, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 68690 + }, + { + "epoch": 4.933572710951526, + "grad_norm": 1.0767526626586914, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 68700 + }, + { + "epoch": 4.934290843806104, + "grad_norm": 0.8858519792556763, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 68710 + }, + { + "epoch": 4.935008976660682, + "grad_norm": 1.194031000137329, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 68720 + }, + { + "epoch": 4.93572710951526, + "grad_norm": 0.8270226120948792, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68730 + }, + { + "epoch": 4.936445242369839, + "grad_norm": 1.0385973453521729, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 68740 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 0.9062243700027466, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 68750 + }, + { + "epoch": 4.937881508078995, + "grad_norm": 1.0526955127716064, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 68760 + }, + { + "epoch": 4.938599640933573, + "grad_norm": 0.930604100227356, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 68770 + }, + { + "epoch": 4.939317773788151, + "grad_norm": 0.9635265469551086, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 68780 + }, + { + "epoch": 4.940035906642729, + "grad_norm": 0.9825171232223511, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 68790 + }, + { + "epoch": 4.940754039497307, + "grad_norm": 0.9621182680130005, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 68800 + }, + { + "epoch": 4.941472172351885, + "grad_norm": 0.9655307531356812, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 68810 + }, + { + "epoch": 4.942190305206463, + "grad_norm": 1.2948180437088013, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 68820 + }, + { + "epoch": 4.942908438061041, + "grad_norm": 0.9206728339195251, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 68830 + }, + { + "epoch": 4.94362657091562, + "grad_norm": 1.0235631465911865, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 68840 + }, + { + "epoch": 4.944344703770198, + "grad_norm": 1.0542538166046143, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 68850 + }, + { + "epoch": 4.945062836624776, + "grad_norm": 0.9787087440490723, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 68860 + }, + { + "epoch": 4.945780969479354, + "grad_norm": 0.9527219533920288, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 68870 + }, + { + "epoch": 4.946499102333932, + "grad_norm": 1.1525826454162598, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 68880 + }, + { + "epoch": 4.94721723518851, + "grad_norm": 0.8610072731971741, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 68890 + }, + { + "epoch": 4.947935368043088, + "grad_norm": 1.1403616666793823, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 68900 + }, + { + "epoch": 4.948653500897666, + "grad_norm": 1.10334312915802, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 68910 + }, + { + "epoch": 4.949371633752245, + "grad_norm": 0.8633760809898376, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 68920 + }, + { + "epoch": 4.950089766606823, + "grad_norm": 1.1291080713272095, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 68930 + }, + { + "epoch": 4.950807899461401, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 68940 + }, + { + "epoch": 4.951526032315979, + "grad_norm": 0.9207960963249207, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 68950 + }, + { + "epoch": 4.952244165170557, + "grad_norm": 0.9815934300422668, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 68960 + }, + { + "epoch": 4.952962298025135, + "grad_norm": 0.9725701808929443, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 68970 + }, + { + "epoch": 4.953680430879713, + "grad_norm": 0.844926655292511, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 68980 + }, + { + "epoch": 4.954398563734291, + "grad_norm": 0.9898511171340942, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 68990 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 1.1311410665512085, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 69000 + }, + { + "epoch": 4.955834829443447, + "grad_norm": 1.218610405921936, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 69010 + }, + { + "epoch": 4.956552962298025, + "grad_norm": 1.1536420583724976, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 69020 + }, + { + "epoch": 4.957271095152604, + "grad_norm": 1.1857786178588867, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 69030 + }, + { + "epoch": 4.957989228007182, + "grad_norm": 0.9969246983528137, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 69040 + }, + { + "epoch": 4.95870736086176, + "grad_norm": 1.138635277748108, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 69050 + }, + { + "epoch": 4.959425493716338, + "grad_norm": 1.110474705696106, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 69060 + }, + { + "epoch": 4.960143626570916, + "grad_norm": 1.0366318225860596, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 69070 + }, + { + "epoch": 4.960861759425494, + "grad_norm": 0.6927996277809143, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 69080 + }, + { + "epoch": 4.961579892280072, + "grad_norm": 1.0368026494979858, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 69090 + }, + { + "epoch": 4.96229802513465, + "grad_norm": 1.0638312101364136, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 69100 + }, + { + "epoch": 4.9630161579892285, + "grad_norm": 1.0372415781021118, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 69110 + }, + { + "epoch": 4.9637342908438065, + "grad_norm": 0.8257387280464172, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69120 + }, + { + "epoch": 4.9644524236983845, + "grad_norm": 1.0046974420547485, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 69130 + }, + { + "epoch": 4.9651705565529625, + "grad_norm": 1.0139652490615845, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69140 + }, + { + "epoch": 4.9658886894075405, + "grad_norm": 1.0214691162109375, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 69150 + }, + { + "epoch": 4.9666068222621185, + "grad_norm": 1.1042424440383911, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 69160 + }, + { + "epoch": 4.9673249551166965, + "grad_norm": 0.8749067783355713, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 69170 + }, + { + "epoch": 4.9680430879712745, + "grad_norm": 0.9894024133682251, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 69180 + }, + { + "epoch": 4.9687612208258525, + "grad_norm": 1.0218034982681274, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 69190 + }, + { + "epoch": 4.9694793536804305, + "grad_norm": 0.9782929420471191, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 69200 + }, + { + "epoch": 4.9701974865350085, + "grad_norm": 0.9373409748077393, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 69210 + }, + { + "epoch": 4.970915619389587, + "grad_norm": 1.0329546928405762, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 69220 + }, + { + "epoch": 4.971633752244165, + "grad_norm": 0.9746108055114746, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 69230 + }, + { + "epoch": 4.972351885098743, + "grad_norm": 0.9202073216438293, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 69240 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 1.078032374382019, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 69250 + }, + { + "epoch": 4.973788150807899, + "grad_norm": 0.8860024809837341, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 69260 + }, + { + "epoch": 4.974506283662477, + "grad_norm": 0.915212094783783, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 69270 + }, + { + "epoch": 4.975224416517055, + "grad_norm": 1.1192166805267334, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69280 + }, + { + "epoch": 4.975942549371633, + "grad_norm": 0.8387445211410522, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69290 + }, + { + "epoch": 4.976660682226212, + "grad_norm": 1.1210044622421265, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 69300 + }, + { + "epoch": 4.97737881508079, + "grad_norm": 1.0051207542419434, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 69310 + }, + { + "epoch": 4.978096947935368, + "grad_norm": 0.9248682856559753, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 69320 + }, + { + "epoch": 4.978815080789946, + "grad_norm": 0.8265128135681152, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 69330 + }, + { + "epoch": 4.979533213644524, + "grad_norm": 0.9432681798934937, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 69340 + }, + { + "epoch": 4.980251346499102, + "grad_norm": 1.0135977268218994, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 69350 + }, + { + "epoch": 4.98096947935368, + "grad_norm": 0.9857245683670044, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 69360 + }, + { + "epoch": 4.981687612208258, + "grad_norm": 0.9215952157974243, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 69370 + }, + { + "epoch": 4.982405745062836, + "grad_norm": 1.1518077850341797, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 69380 + }, + { + "epoch": 4.983123877917414, + "grad_norm": 0.8836095929145813, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 69390 + }, + { + "epoch": 4.983842010771993, + "grad_norm": 0.8082528710365295, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 69400 + }, + { + "epoch": 4.984560143626571, + "grad_norm": 0.9295604825019836, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 69410 + }, + { + "epoch": 4.985278276481149, + "grad_norm": 1.002057433128357, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 69420 + }, + { + "epoch": 4.985996409335727, + "grad_norm": 0.8127216100692749, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 69430 + }, + { + "epoch": 4.986714542190305, + "grad_norm": 1.058138370513916, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 69440 + }, + { + "epoch": 4.987432675044883, + "grad_norm": 0.8451166749000549, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 69450 + }, + { + "epoch": 4.988150807899461, + "grad_norm": 0.9687268137931824, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 69460 + }, + { + "epoch": 4.988868940754039, + "grad_norm": 1.0342036485671997, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 69470 + }, + { + "epoch": 4.989587073608618, + "grad_norm": 0.9042398929595947, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 69480 + }, + { + "epoch": 4.990305206463196, + "grad_norm": 1.0575438737869263, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 69490 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 0.9364935159683228, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 69500 + }, + { + "epoch": 4.991741472172352, + "grad_norm": 1.0327378511428833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 69510 + }, + { + "epoch": 4.99245960502693, + "grad_norm": 0.815592885017395, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 69520 + }, + { + "epoch": 4.993177737881508, + "grad_norm": 1.0813369750976562, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 69530 + }, + { + "epoch": 4.993895870736086, + "grad_norm": 1.0277023315429688, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 69540 + }, + { + "epoch": 4.994614003590664, + "grad_norm": 1.0291162729263306, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 69550 + }, + { + "epoch": 4.995332136445242, + "grad_norm": 0.8435685634613037, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 69560 + }, + { + "epoch": 4.99605026929982, + "grad_norm": 1.1972291469573975, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 69570 + }, + { + "epoch": 4.996768402154398, + "grad_norm": 0.8114907741546631, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 69580 + }, + { + "epoch": 4.997486535008977, + "grad_norm": 0.8296133875846863, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 69590 + }, + { + "epoch": 4.998204667863555, + "grad_norm": 1.1728706359863281, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 69600 + }, + { + "epoch": 4.998922800718133, + "grad_norm": 0.9586578607559204, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 69610 + }, + { + "epoch": 4.999640933572711, + "grad_norm": 0.9725151062011719, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 69620 + }, + { + "epoch": 5.0, + "eval_loss": 1.133581519126892, + "eval_runtime": 55.2151, + "eval_samples_per_second": 13.275, + "eval_steps_per_second": 1.666, + "step": 69625 + }, + { + "epoch": 5.000359066427289, + "grad_norm": 0.9312055706977844, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 69630 + }, + { + "epoch": 5.001077199281867, + "grad_norm": 1.0534896850585938, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 69640 + }, + { + "epoch": 5.001795332136445, + "grad_norm": 0.8891698718070984, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 69650 + }, + { + "epoch": 5.002513464991023, + "grad_norm": 0.7791097164154053, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 69660 + }, + { + "epoch": 5.003231597845601, + "grad_norm": 1.2891173362731934, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 69670 + }, + { + "epoch": 5.00394973070018, + "grad_norm": 0.7909513711929321, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 69680 + }, + { + "epoch": 5.004667863554758, + "grad_norm": 0.988648533821106, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 69690 + }, + { + "epoch": 5.005385996409336, + "grad_norm": 0.9669296741485596, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 69700 + }, + { + "epoch": 5.006104129263914, + "grad_norm": 1.2393349409103394, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 69710 + }, + { + "epoch": 5.006822262118492, + "grad_norm": 1.2420750856399536, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 69720 + }, + { + "epoch": 5.00754039497307, + "grad_norm": 1.1698096990585327, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 69730 + }, + { + "epoch": 5.008258527827648, + "grad_norm": 1.2228301763534546, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 69740 + }, + { + "epoch": 5.008976660682226, + "grad_norm": 0.9350621104240417, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 69750 + }, + { + "epoch": 5.009694793536804, + "grad_norm": 0.9828507304191589, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 69760 + }, + { + "epoch": 5.010412926391383, + "grad_norm": 0.9372149109840393, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 69770 + }, + { + "epoch": 5.011131059245961, + "grad_norm": 0.8098477125167847, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 69780 + }, + { + "epoch": 5.011849192100539, + "grad_norm": 1.0418338775634766, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69790 + }, + { + "epoch": 5.012567324955117, + "grad_norm": 1.0175801515579224, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 69800 + }, + { + "epoch": 5.013285457809695, + "grad_norm": 1.2128081321716309, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 69810 + }, + { + "epoch": 5.014003590664273, + "grad_norm": 1.001805067062378, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69820 + }, + { + "epoch": 5.014721723518851, + "grad_norm": 0.8957470059394836, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69830 + }, + { + "epoch": 5.015439856373429, + "grad_norm": 0.9344548583030701, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 69840 + }, + { + "epoch": 5.016157989228007, + "grad_norm": 0.8545927405357361, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 69850 + }, + { + "epoch": 5.016876122082586, + "grad_norm": 1.3907777070999146, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 69860 + }, + { + "epoch": 5.017594254937164, + "grad_norm": 0.8112093806266785, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 69870 + }, + { + "epoch": 5.018312387791742, + "grad_norm": 1.0151532888412476, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 69880 + }, + { + "epoch": 5.01903052064632, + "grad_norm": 1.249021053314209, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 69890 + }, + { + "epoch": 5.019748653500898, + "grad_norm": 0.9310314059257507, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 69900 + }, + { + "epoch": 5.020466786355476, + "grad_norm": 0.9444572925567627, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 69910 + }, + { + "epoch": 5.021184919210054, + "grad_norm": 1.0952081680297852, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 69920 + }, + { + "epoch": 5.021903052064632, + "grad_norm": 1.2106375694274902, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 69930 + }, + { + "epoch": 5.02262118491921, + "grad_norm": 1.0179580450057983, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69940 + }, + { + "epoch": 5.023339317773788, + "grad_norm": 1.0865367650985718, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 69950 + }, + { + "epoch": 5.024057450628367, + "grad_norm": 1.0965075492858887, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 69960 + }, + { + "epoch": 5.024775583482945, + "grad_norm": 0.8879445791244507, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 69970 + }, + { + "epoch": 5.025493716337523, + "grad_norm": 1.2588363885879517, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 69980 + }, + { + "epoch": 5.026211849192101, + "grad_norm": 0.935705304145813, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 69990 + }, + { + "epoch": 5.026929982046679, + "grad_norm": 1.072012186050415, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 70000 + }, + { + "epoch": 5.027648114901257, + "grad_norm": 1.286438226699829, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 70010 + }, + { + "epoch": 5.028366247755835, + "grad_norm": 1.1165392398834229, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 70020 + }, + { + "epoch": 5.029084380610413, + "grad_norm": 0.7998424172401428, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 70030 + }, + { + "epoch": 5.029802513464991, + "grad_norm": 1.5669852495193481, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 70040 + }, + { + "epoch": 5.0305206463195695, + "grad_norm": 0.9780290722846985, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70050 + }, + { + "epoch": 5.0312387791741475, + "grad_norm": 0.9837628602981567, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 70060 + }, + { + "epoch": 5.0319569120287255, + "grad_norm": 0.9558916091918945, + "learning_rate": 0.0002, + "loss": 0.5369, + "step": 70070 + }, + { + "epoch": 5.0326750448833035, + "grad_norm": 0.8893155455589294, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 70080 + }, + { + "epoch": 5.0333931777378815, + "grad_norm": 1.1403675079345703, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 70090 + }, + { + "epoch": 5.0341113105924595, + "grad_norm": 1.0453649759292603, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 70100 + }, + { + "epoch": 5.0348294434470375, + "grad_norm": 0.8127498030662537, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 70110 + }, + { + "epoch": 5.0355475763016155, + "grad_norm": 0.9344680309295654, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70120 + }, + { + "epoch": 5.0362657091561935, + "grad_norm": 1.0302079916000366, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 70130 + }, + { + "epoch": 5.036983842010772, + "grad_norm": 1.0549713373184204, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 70140 + }, + { + "epoch": 5.03770197486535, + "grad_norm": 0.8916767835617065, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 70150 + }, + { + "epoch": 5.038420107719928, + "grad_norm": 0.9799798130989075, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 70160 + }, + { + "epoch": 5.039138240574506, + "grad_norm": 1.15560781955719, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 70170 + }, + { + "epoch": 5.039856373429084, + "grad_norm": 1.0577017068862915, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 70180 + }, + { + "epoch": 5.040574506283662, + "grad_norm": 1.027990698814392, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 70190 + }, + { + "epoch": 5.04129263913824, + "grad_norm": 1.0818232297897339, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 70200 + }, + { + "epoch": 5.042010771992818, + "grad_norm": 1.0287196636199951, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 70210 + }, + { + "epoch": 5.042728904847396, + "grad_norm": 1.1569273471832275, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 70220 + }, + { + "epoch": 5.0434470377019744, + "grad_norm": 1.0485484600067139, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70230 + }, + { + "epoch": 5.044165170556553, + "grad_norm": 0.9244540333747864, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 70240 + }, + { + "epoch": 5.044883303411131, + "grad_norm": 0.9576422572135925, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 70250 + }, + { + "epoch": 5.045601436265709, + "grad_norm": 0.8719421625137329, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 70260 + }, + { + "epoch": 5.046319569120287, + "grad_norm": 0.8685409426689148, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 70270 + }, + { + "epoch": 5.047037701974865, + "grad_norm": 1.2735247611999512, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 70280 + }, + { + "epoch": 5.047755834829443, + "grad_norm": 0.9082128405570984, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 70290 + }, + { + "epoch": 5.048473967684021, + "grad_norm": 1.0626471042633057, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 70300 + }, + { + "epoch": 5.049192100538599, + "grad_norm": 1.1463991403579712, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 70310 + }, + { + "epoch": 5.049910233393177, + "grad_norm": 0.8825355172157288, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 70320 + }, + { + "epoch": 5.050628366247756, + "grad_norm": 1.0549408197402954, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 70330 + }, + { + "epoch": 5.051346499102334, + "grad_norm": 1.3740944862365723, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 70340 + }, + { + "epoch": 5.052064631956912, + "grad_norm": 1.4197895526885986, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 70350 + }, + { + "epoch": 5.05278276481149, + "grad_norm": 1.1764925718307495, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 70360 + }, + { + "epoch": 5.053500897666068, + "grad_norm": 1.0443403720855713, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 70370 + }, + { + "epoch": 5.054219030520646, + "grad_norm": 1.1807527542114258, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 70380 + }, + { + "epoch": 5.054937163375224, + "grad_norm": 1.4032433032989502, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 70390 + }, + { + "epoch": 5.055655296229802, + "grad_norm": 0.9815662503242493, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 70400 + }, + { + "epoch": 5.05637342908438, + "grad_norm": 0.9368446469306946, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 70410 + }, + { + "epoch": 5.057091561938959, + "grad_norm": 1.1156736612319946, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 70420 + }, + { + "epoch": 5.057809694793537, + "grad_norm": 1.01651132106781, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 70430 + }, + { + "epoch": 5.058527827648115, + "grad_norm": 0.9906342029571533, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 70440 + }, + { + "epoch": 5.059245960502693, + "grad_norm": 0.8666667938232422, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 70450 + }, + { + "epoch": 5.059964093357271, + "grad_norm": 1.0508924722671509, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 70460 + }, + { + "epoch": 5.060682226211849, + "grad_norm": 1.2472858428955078, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 70470 + }, + { + "epoch": 5.061400359066427, + "grad_norm": 1.019073724746704, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 70480 + }, + { + "epoch": 5.062118491921005, + "grad_norm": 0.9745403528213501, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 70490 + }, + { + "epoch": 5.062836624775583, + "grad_norm": 1.121208906173706, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 70500 + }, + { + "epoch": 5.063554757630161, + "grad_norm": 1.0535147190093994, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 70510 + }, + { + "epoch": 5.06427289048474, + "grad_norm": 1.0368950366973877, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 70520 + }, + { + "epoch": 5.064991023339318, + "grad_norm": 0.948964536190033, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 70530 + }, + { + "epoch": 5.065709156193896, + "grad_norm": 1.0289826393127441, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70540 + }, + { + "epoch": 5.066427289048474, + "grad_norm": 1.118374228477478, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 70550 + }, + { + "epoch": 5.067145421903052, + "grad_norm": 0.8712816834449768, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 70560 + }, + { + "epoch": 5.06786355475763, + "grad_norm": 0.9057969450950623, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 70570 + }, + { + "epoch": 5.068581687612208, + "grad_norm": 0.9292685985565186, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 70580 + }, + { + "epoch": 5.069299820466786, + "grad_norm": 0.9159911274909973, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70590 + }, + { + "epoch": 5.070017953321364, + "grad_norm": 0.973848819732666, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 70600 + }, + { + "epoch": 5.070736086175943, + "grad_norm": 0.7892279028892517, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 70610 + }, + { + "epoch": 5.071454219030521, + "grad_norm": 0.9943311214447021, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 70620 + }, + { + "epoch": 5.072172351885099, + "grad_norm": 1.1457926034927368, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 70630 + }, + { + "epoch": 5.072890484739677, + "grad_norm": 0.9307738542556763, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 70640 + }, + { + "epoch": 5.073608617594255, + "grad_norm": 1.0899816751480103, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 70650 + }, + { + "epoch": 5.074326750448833, + "grad_norm": 0.8357672691345215, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70660 + }, + { + "epoch": 5.075044883303411, + "grad_norm": 0.8889468312263489, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 70670 + }, + { + "epoch": 5.075763016157989, + "grad_norm": 0.9152118563652039, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70680 + }, + { + "epoch": 5.076481149012567, + "grad_norm": 1.106160044670105, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 70690 + }, + { + "epoch": 5.077199281867145, + "grad_norm": 0.8519207835197449, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 70700 + }, + { + "epoch": 5.077917414721724, + "grad_norm": 0.9754986763000488, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 70710 + }, + { + "epoch": 5.078635547576302, + "grad_norm": 1.167883276939392, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 70720 + }, + { + "epoch": 5.07935368043088, + "grad_norm": 0.987622082233429, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70730 + }, + { + "epoch": 5.080071813285458, + "grad_norm": 1.0008184909820557, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 70740 + }, + { + "epoch": 5.080789946140036, + "grad_norm": 0.6318819522857666, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 70750 + }, + { + "epoch": 5.081508078994614, + "grad_norm": 0.984886884689331, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 70760 + }, + { + "epoch": 5.082226211849192, + "grad_norm": 1.0583622455596924, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 70770 + }, + { + "epoch": 5.08294434470377, + "grad_norm": 0.9730119705200195, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 70780 + }, + { + "epoch": 5.083662477558348, + "grad_norm": 1.0201330184936523, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 70790 + }, + { + "epoch": 5.084380610412927, + "grad_norm": 1.0479248762130737, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 70800 + }, + { + "epoch": 5.085098743267505, + "grad_norm": 0.9185113906860352, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 70810 + }, + { + "epoch": 5.085816876122083, + "grad_norm": 0.9326799511909485, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70820 + }, + { + "epoch": 5.086535008976661, + "grad_norm": 0.958739697933197, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 70830 + }, + { + "epoch": 5.087253141831239, + "grad_norm": 0.9643770456314087, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 70840 + }, + { + "epoch": 5.087971274685817, + "grad_norm": 0.8650234341621399, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70850 + }, + { + "epoch": 5.088689407540395, + "grad_norm": 0.9354105591773987, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 70860 + }, + { + "epoch": 5.089407540394973, + "grad_norm": 0.8736345171928406, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 70870 + }, + { + "epoch": 5.090125673249551, + "grad_norm": 0.9172632098197937, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 70880 + }, + { + "epoch": 5.09084380610413, + "grad_norm": 0.9495565295219421, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 70890 + }, + { + "epoch": 5.091561938958708, + "grad_norm": 1.0328829288482666, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 70900 + }, + { + "epoch": 5.092280071813286, + "grad_norm": 0.9335703253746033, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 70910 + }, + { + "epoch": 5.092998204667864, + "grad_norm": 1.0919437408447266, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 70920 + }, + { + "epoch": 5.093716337522442, + "grad_norm": 1.03340744972229, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 70930 + }, + { + "epoch": 5.09443447037702, + "grad_norm": 1.0501604080200195, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 70940 + }, + { + "epoch": 5.095152603231598, + "grad_norm": 0.9442012310028076, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 70950 + }, + { + "epoch": 5.095870736086176, + "grad_norm": 1.2592464685440063, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 70960 + }, + { + "epoch": 5.096588868940754, + "grad_norm": 1.0961427688598633, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 70970 + }, + { + "epoch": 5.097307001795333, + "grad_norm": 1.0472424030303955, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 70980 + }, + { + "epoch": 5.098025134649911, + "grad_norm": 0.9489352107048035, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 70990 + }, + { + "epoch": 5.098743267504489, + "grad_norm": 1.0499446392059326, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 71000 + }, + { + "epoch": 5.099461400359067, + "grad_norm": 1.013005018234253, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 71010 + }, + { + "epoch": 5.100179533213645, + "grad_norm": 0.9594261050224304, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 71020 + }, + { + "epoch": 5.100897666068223, + "grad_norm": 1.2016123533248901, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 71030 + }, + { + "epoch": 5.101615798922801, + "grad_norm": 1.0389765501022339, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 71040 + }, + { + "epoch": 5.102333931777379, + "grad_norm": 1.053534746170044, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 71050 + }, + { + "epoch": 5.103052064631957, + "grad_norm": 1.1379448175430298, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 71060 + }, + { + "epoch": 5.103770197486535, + "grad_norm": 0.8796491622924805, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 71070 + }, + { + "epoch": 5.1044883303411135, + "grad_norm": 1.0591254234313965, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 71080 + }, + { + "epoch": 5.1052064631956915, + "grad_norm": 0.9622171521186829, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71090 + }, + { + "epoch": 5.1059245960502695, + "grad_norm": 0.9173060059547424, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 71100 + }, + { + "epoch": 5.1066427289048475, + "grad_norm": 0.8363444805145264, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 71110 + }, + { + "epoch": 5.1073608617594255, + "grad_norm": 1.1006172895431519, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 71120 + }, + { + "epoch": 5.1080789946140035, + "grad_norm": 1.0720574855804443, + "learning_rate": 0.0002, + "loss": 0.5753, + "step": 71130 + }, + { + "epoch": 5.1087971274685815, + "grad_norm": 1.0560680627822876, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 71140 + }, + { + "epoch": 5.1095152603231595, + "grad_norm": 0.8485415577888489, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 71150 + }, + { + "epoch": 5.1102333931777375, + "grad_norm": 1.109383225440979, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 71160 + }, + { + "epoch": 5.110951526032316, + "grad_norm": 0.9296035766601562, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 71170 + }, + { + "epoch": 5.111669658886894, + "grad_norm": 1.2855182886123657, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 71180 + }, + { + "epoch": 5.112387791741472, + "grad_norm": 1.0313524007797241, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 71190 + }, + { + "epoch": 5.11310592459605, + "grad_norm": 1.0436697006225586, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 71200 + }, + { + "epoch": 5.113824057450628, + "grad_norm": 0.901333212852478, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 71210 + }, + { + "epoch": 5.114542190305206, + "grad_norm": 1.2170051336288452, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 71220 + }, + { + "epoch": 5.115260323159784, + "grad_norm": 0.8850961327552795, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71230 + }, + { + "epoch": 5.115978456014362, + "grad_norm": 1.0147113800048828, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 71240 + }, + { + "epoch": 5.11669658886894, + "grad_norm": 1.0043506622314453, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 71250 + }, + { + "epoch": 5.117414721723518, + "grad_norm": 0.9887113571166992, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 71260 + }, + { + "epoch": 5.118132854578097, + "grad_norm": 1.1013392210006714, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 71270 + }, + { + "epoch": 5.118850987432675, + "grad_norm": 0.9213799238204956, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71280 + }, + { + "epoch": 5.119569120287253, + "grad_norm": 1.047400712966919, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 71290 + }, + { + "epoch": 5.120287253141831, + "grad_norm": 1.030534029006958, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71300 + }, + { + "epoch": 5.121005385996409, + "grad_norm": 0.9464976191520691, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 71310 + }, + { + "epoch": 5.121723518850987, + "grad_norm": 0.8610315918922424, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 71320 + }, + { + "epoch": 5.122441651705565, + "grad_norm": 1.0824426412582397, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71330 + }, + { + "epoch": 5.123159784560143, + "grad_norm": 0.9382733106613159, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 71340 + }, + { + "epoch": 5.123877917414721, + "grad_norm": 0.9364684224128723, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 71350 + }, + { + "epoch": 5.1245960502693, + "grad_norm": 0.9583013653755188, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 71360 + }, + { + "epoch": 5.125314183123878, + "grad_norm": 1.287533164024353, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 71370 + }, + { + "epoch": 5.126032315978456, + "grad_norm": 1.5031169652938843, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 71380 + }, + { + "epoch": 5.126750448833034, + "grad_norm": 0.9891406297683716, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 71390 + }, + { + "epoch": 5.127468581687612, + "grad_norm": 1.1851537227630615, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 71400 + }, + { + "epoch": 5.12818671454219, + "grad_norm": 0.9869971871376038, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 71410 + }, + { + "epoch": 5.128904847396768, + "grad_norm": 0.961662769317627, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 71420 + }, + { + "epoch": 5.129622980251346, + "grad_norm": 1.1036419868469238, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 71430 + }, + { + "epoch": 5.130341113105924, + "grad_norm": 1.175361156463623, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 71440 + }, + { + "epoch": 5.131059245960503, + "grad_norm": 0.9801875948905945, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 71450 + }, + { + "epoch": 5.131777378815081, + "grad_norm": 0.9424611330032349, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 71460 + }, + { + "epoch": 5.132495511669659, + "grad_norm": 1.11662757396698, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 71470 + }, + { + "epoch": 5.133213644524237, + "grad_norm": 0.9969366192817688, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71480 + }, + { + "epoch": 5.133931777378815, + "grad_norm": 1.278640866279602, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 71490 + }, + { + "epoch": 5.134649910233393, + "grad_norm": 1.1090457439422607, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 71500 + }, + { + "epoch": 5.135368043087971, + "grad_norm": 1.01808500289917, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 71510 + }, + { + "epoch": 5.136086175942549, + "grad_norm": 1.029135823249817, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 71520 + }, + { + "epoch": 5.136804308797127, + "grad_norm": 1.1207175254821777, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 71530 + }, + { + "epoch": 5.137522441651706, + "grad_norm": 1.0327218770980835, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 71540 + }, + { + "epoch": 5.138240574506284, + "grad_norm": 1.042490839958191, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 71550 + }, + { + "epoch": 5.138958707360862, + "grad_norm": 1.1800413131713867, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 71560 + }, + { + "epoch": 5.13967684021544, + "grad_norm": 1.0748766660690308, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 71570 + }, + { + "epoch": 5.140394973070018, + "grad_norm": 0.9983090758323669, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 71580 + }, + { + "epoch": 5.141113105924596, + "grad_norm": 1.30636727809906, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 71590 + }, + { + "epoch": 5.141831238779174, + "grad_norm": 0.9960222840309143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 71600 + }, + { + "epoch": 5.142549371633752, + "grad_norm": 1.237027645111084, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 71610 + }, + { + "epoch": 5.14326750448833, + "grad_norm": 1.0913307666778564, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 71620 + }, + { + "epoch": 5.143985637342908, + "grad_norm": 0.940657913684845, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 71630 + }, + { + "epoch": 5.144703770197487, + "grad_norm": 1.093796730041504, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 71640 + }, + { + "epoch": 5.145421903052065, + "grad_norm": 0.9703856110572815, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 71650 + }, + { + "epoch": 5.146140035906643, + "grad_norm": 0.9874776005744934, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71660 + }, + { + "epoch": 5.146858168761221, + "grad_norm": 0.9723859429359436, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 71670 + }, + { + "epoch": 5.147576301615799, + "grad_norm": 0.997107207775116, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71680 + }, + { + "epoch": 5.148294434470377, + "grad_norm": 1.0261175632476807, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 71690 + }, + { + "epoch": 5.149012567324955, + "grad_norm": 0.9093905687332153, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 71700 + }, + { + "epoch": 5.149730700179533, + "grad_norm": 0.9909888505935669, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 71710 + }, + { + "epoch": 5.150448833034111, + "grad_norm": 0.9111971259117126, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 71720 + }, + { + "epoch": 5.15116696588869, + "grad_norm": 0.9319643974304199, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 71730 + }, + { + "epoch": 5.151885098743268, + "grad_norm": 1.0744104385375977, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 71740 + }, + { + "epoch": 5.152603231597846, + "grad_norm": 1.1555477380752563, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 71750 + }, + { + "epoch": 5.153321364452424, + "grad_norm": 0.9809171557426453, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71760 + }, + { + "epoch": 5.154039497307002, + "grad_norm": 0.7937686443328857, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71770 + }, + { + "epoch": 5.15475763016158, + "grad_norm": 1.1925430297851562, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 71780 + }, + { + "epoch": 5.155475763016158, + "grad_norm": 1.077412486076355, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 71790 + }, + { + "epoch": 5.156193895870736, + "grad_norm": 0.7992808222770691, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 71800 + }, + { + "epoch": 5.156912028725314, + "grad_norm": 1.0938535928726196, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71810 + }, + { + "epoch": 5.157630161579892, + "grad_norm": 0.9458112120628357, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 71820 + }, + { + "epoch": 5.158348294434471, + "grad_norm": 0.984940230846405, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 71830 + }, + { + "epoch": 5.159066427289049, + "grad_norm": 0.9242565035820007, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 71840 + }, + { + "epoch": 5.159784560143627, + "grad_norm": 0.8386720418930054, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71850 + }, + { + "epoch": 5.160502692998205, + "grad_norm": 0.9627357721328735, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 71860 + }, + { + "epoch": 5.161220825852783, + "grad_norm": 1.0118762254714966, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 71870 + }, + { + "epoch": 5.161938958707361, + "grad_norm": 1.1552608013153076, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 71880 + }, + { + "epoch": 5.162657091561939, + "grad_norm": 1.0910389423370361, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 71890 + }, + { + "epoch": 5.163375224416517, + "grad_norm": 1.046639084815979, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 71900 + }, + { + "epoch": 5.164093357271095, + "grad_norm": 1.0087649822235107, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 71910 + }, + { + "epoch": 5.164811490125674, + "grad_norm": 0.9418644309043884, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71920 + }, + { + "epoch": 5.165529622980252, + "grad_norm": 1.1213915348052979, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 71930 + }, + { + "epoch": 5.16624775583483, + "grad_norm": 1.043786644935608, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 71940 + }, + { + "epoch": 5.166965888689408, + "grad_norm": 1.2150449752807617, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 71950 + }, + { + "epoch": 5.167684021543986, + "grad_norm": 1.1214520931243896, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 71960 + }, + { + "epoch": 5.168402154398564, + "grad_norm": 0.9235218167304993, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 71970 + }, + { + "epoch": 5.169120287253142, + "grad_norm": 0.8736480474472046, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 71980 + }, + { + "epoch": 5.16983842010772, + "grad_norm": 0.8723195195198059, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71990 + }, + { + "epoch": 5.170556552962298, + "grad_norm": 1.0873022079467773, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 72000 + }, + { + "epoch": 5.1712746858168765, + "grad_norm": 0.9196295142173767, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72010 + }, + { + "epoch": 5.1719928186714546, + "grad_norm": 0.9244471192359924, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 72020 + }, + { + "epoch": 5.1727109515260326, + "grad_norm": 1.0555505752563477, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 72030 + }, + { + "epoch": 5.1734290843806106, + "grad_norm": 1.1527929306030273, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 72040 + }, + { + "epoch": 5.174147217235189, + "grad_norm": 0.9069058895111084, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 72050 + }, + { + "epoch": 5.174865350089767, + "grad_norm": 1.1047141551971436, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 72060 + }, + { + "epoch": 5.175583482944345, + "grad_norm": 0.9805511832237244, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 72070 + }, + { + "epoch": 5.176301615798923, + "grad_norm": 1.1636970043182373, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 72080 + }, + { + "epoch": 5.177019748653501, + "grad_norm": 1.0193538665771484, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 72090 + }, + { + "epoch": 5.177737881508079, + "grad_norm": 0.8850618600845337, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 72100 + }, + { + "epoch": 5.1784560143626575, + "grad_norm": 1.042271614074707, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 72110 + }, + { + "epoch": 5.1791741472172355, + "grad_norm": 1.1405227184295654, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72120 + }, + { + "epoch": 5.1798922800718135, + "grad_norm": 1.0013195276260376, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 72130 + }, + { + "epoch": 5.1806104129263915, + "grad_norm": 1.0474903583526611, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 72140 + }, + { + "epoch": 5.1813285457809695, + "grad_norm": 1.0384612083435059, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 72150 + }, + { + "epoch": 5.1820466786355475, + "grad_norm": 1.145086646080017, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 72160 + }, + { + "epoch": 5.1827648114901255, + "grad_norm": 1.0845173597335815, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 72170 + }, + { + "epoch": 5.1834829443447035, + "grad_norm": 0.9870346188545227, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72180 + }, + { + "epoch": 5.1842010771992815, + "grad_norm": 1.1098768711090088, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 72190 + }, + { + "epoch": 5.18491921005386, + "grad_norm": 0.9397785067558289, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 72200 + }, + { + "epoch": 5.185637342908438, + "grad_norm": 1.0817532539367676, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 72210 + }, + { + "epoch": 5.186355475763016, + "grad_norm": 1.0027309656143188, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 72220 + }, + { + "epoch": 5.187073608617594, + "grad_norm": 0.8262016773223877, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 72230 + }, + { + "epoch": 5.187791741472172, + "grad_norm": 0.9968137741088867, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 72240 + }, + { + "epoch": 5.18850987432675, + "grad_norm": 0.9072695970535278, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 72250 + }, + { + "epoch": 5.189228007181328, + "grad_norm": 1.0388357639312744, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 72260 + }, + { + "epoch": 5.189946140035906, + "grad_norm": 0.8883537650108337, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72270 + }, + { + "epoch": 5.190664272890484, + "grad_norm": 1.0161921977996826, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 72280 + }, + { + "epoch": 5.191382405745063, + "grad_norm": 0.964936375617981, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72290 + }, + { + "epoch": 5.192100538599641, + "grad_norm": 0.9728496670722961, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 72300 + }, + { + "epoch": 5.192818671454219, + "grad_norm": 1.2411649227142334, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 72310 + }, + { + "epoch": 5.193536804308797, + "grad_norm": 0.9430946111679077, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 72320 + }, + { + "epoch": 5.194254937163375, + "grad_norm": 1.1522886753082275, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 72330 + }, + { + "epoch": 5.194973070017953, + "grad_norm": 1.0727189779281616, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 72340 + }, + { + "epoch": 5.195691202872531, + "grad_norm": 1.2506077289581299, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 72350 + }, + { + "epoch": 5.196409335727109, + "grad_norm": 1.0949938297271729, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 72360 + }, + { + "epoch": 5.197127468581687, + "grad_norm": 1.191125750541687, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 72370 + }, + { + "epoch": 5.197845601436265, + "grad_norm": 1.1154223680496216, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 72380 + }, + { + "epoch": 5.198563734290844, + "grad_norm": 0.9623886942863464, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 72390 + }, + { + "epoch": 5.199281867145422, + "grad_norm": 0.9432680010795593, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 72400 + }, + { + "epoch": 5.2, + "grad_norm": 1.035905122756958, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 72410 + }, + { + "epoch": 5.200718132854578, + "grad_norm": 0.9044913053512573, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 72420 + }, + { + "epoch": 5.201436265709156, + "grad_norm": 1.082187533378601, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 72430 + }, + { + "epoch": 5.202154398563734, + "grad_norm": 0.9368400573730469, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 72440 + }, + { + "epoch": 5.202872531418312, + "grad_norm": 1.1515194177627563, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 72450 + }, + { + "epoch": 5.20359066427289, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 72460 + }, + { + "epoch": 5.204308797127468, + "grad_norm": 1.0885688066482544, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 72470 + }, + { + "epoch": 5.205026929982047, + "grad_norm": 0.8189428448677063, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 72480 + }, + { + "epoch": 5.205745062836625, + "grad_norm": 1.0145429372787476, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 72490 + }, + { + "epoch": 5.206463195691203, + "grad_norm": 1.132490634918213, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 72500 + }, + { + "epoch": 5.207181328545781, + "grad_norm": 0.8866808414459229, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 72510 + }, + { + "epoch": 5.207899461400359, + "grad_norm": 0.9681518077850342, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 72520 + }, + { + "epoch": 5.208617594254937, + "grad_norm": 0.9992330074310303, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 72530 + }, + { + "epoch": 5.209335727109515, + "grad_norm": 1.0767436027526855, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 72540 + }, + { + "epoch": 5.210053859964093, + "grad_norm": 1.1362388134002686, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 72550 + }, + { + "epoch": 5.210771992818671, + "grad_norm": 0.9741758704185486, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 72560 + }, + { + "epoch": 5.211490125673249, + "grad_norm": 0.8216298818588257, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 72570 + }, + { + "epoch": 5.212208258527828, + "grad_norm": 0.7500724792480469, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 72580 + }, + { + "epoch": 5.212926391382406, + "grad_norm": 0.9152594804763794, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 72590 + }, + { + "epoch": 5.213644524236984, + "grad_norm": 1.014940857887268, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 72600 + }, + { + "epoch": 5.214362657091562, + "grad_norm": 0.9333099722862244, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 72610 + }, + { + "epoch": 5.21508078994614, + "grad_norm": 0.7940610647201538, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 72620 + }, + { + "epoch": 5.215798922800718, + "grad_norm": 1.0365521907806396, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 72630 + }, + { + "epoch": 5.216517055655296, + "grad_norm": 1.37727952003479, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 72640 + }, + { + "epoch": 5.217235188509874, + "grad_norm": 1.2019168138504028, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 72650 + }, + { + "epoch": 5.217953321364452, + "grad_norm": 1.1696226596832275, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 72660 + }, + { + "epoch": 5.218671454219031, + "grad_norm": 0.9608798623085022, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72670 + }, + { + "epoch": 5.219389587073609, + "grad_norm": 0.9139777421951294, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 72680 + }, + { + "epoch": 5.220107719928187, + "grad_norm": 0.9937016367912292, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 72690 + }, + { + "epoch": 5.220825852782765, + "grad_norm": 1.2787059545516968, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 72700 + }, + { + "epoch": 5.221543985637343, + "grad_norm": 1.0757197141647339, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 72710 + }, + { + "epoch": 5.222262118491921, + "grad_norm": 0.8053579926490784, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 72720 + }, + { + "epoch": 5.222980251346499, + "grad_norm": 1.0239759683609009, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 72730 + }, + { + "epoch": 5.223698384201077, + "grad_norm": 0.9972975850105286, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 72740 + }, + { + "epoch": 5.224416517055655, + "grad_norm": 1.0504519939422607, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72750 + }, + { + "epoch": 5.225134649910234, + "grad_norm": 1.1793010234832764, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 72760 + }, + { + "epoch": 5.225852782764812, + "grad_norm": 1.1098815202713013, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 72770 + }, + { + "epoch": 5.22657091561939, + "grad_norm": 1.1078516244888306, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 72780 + }, + { + "epoch": 5.227289048473968, + "grad_norm": 0.8684433698654175, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 72790 + }, + { + "epoch": 5.228007181328546, + "grad_norm": 1.159390926361084, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 72800 + }, + { + "epoch": 5.228725314183124, + "grad_norm": 1.0468506813049316, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 72810 + }, + { + "epoch": 5.229443447037702, + "grad_norm": 0.8684625029563904, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 72820 + }, + { + "epoch": 5.23016157989228, + "grad_norm": 1.0117321014404297, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 72830 + }, + { + "epoch": 5.230879712746858, + "grad_norm": 1.0513219833374023, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 72840 + }, + { + "epoch": 5.231597845601437, + "grad_norm": 1.0659555196762085, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72850 + }, + { + "epoch": 5.232315978456015, + "grad_norm": 0.7726831436157227, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 72860 + }, + { + "epoch": 5.233034111310593, + "grad_norm": 1.0346935987472534, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 72870 + }, + { + "epoch": 5.233752244165171, + "grad_norm": 0.9112410545349121, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 72880 + }, + { + "epoch": 5.234470377019749, + "grad_norm": 1.2933332920074463, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 72890 + }, + { + "epoch": 5.235188509874327, + "grad_norm": 0.9740806221961975, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 72900 + }, + { + "epoch": 5.235906642728905, + "grad_norm": 0.8041712641716003, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 72910 + }, + { + "epoch": 5.236624775583483, + "grad_norm": 0.9510180950164795, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 72920 + }, + { + "epoch": 5.237342908438061, + "grad_norm": 0.9103419780731201, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 72930 + }, + { + "epoch": 5.238061041292639, + "grad_norm": 0.8317763805389404, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 72940 + }, + { + "epoch": 5.238779174147218, + "grad_norm": 1.0269867181777954, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 72950 + }, + { + "epoch": 5.239497307001796, + "grad_norm": 1.0599713325500488, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 72960 + }, + { + "epoch": 5.240215439856374, + "grad_norm": 0.9341228008270264, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 72970 + }, + { + "epoch": 5.240933572710952, + "grad_norm": 1.1216323375701904, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 72980 + }, + { + "epoch": 5.24165170556553, + "grad_norm": 0.9396152496337891, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 72990 + }, + { + "epoch": 5.242369838420108, + "grad_norm": 1.1474549770355225, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 73000 + }, + { + "epoch": 5.243087971274686, + "grad_norm": 1.2160102128982544, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 73010 + }, + { + "epoch": 5.243806104129264, + "grad_norm": 1.0755409002304077, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 73020 + }, + { + "epoch": 5.244524236983842, + "grad_norm": 1.0645225048065186, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73030 + }, + { + "epoch": 5.2452423698384205, + "grad_norm": 1.1155469417572021, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 73040 + }, + { + "epoch": 5.2459605026929985, + "grad_norm": 1.1631708145141602, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 73050 + }, + { + "epoch": 5.2466786355475765, + "grad_norm": 0.8747480511665344, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 73060 + }, + { + "epoch": 5.2473967684021545, + "grad_norm": 0.9174497723579407, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 73070 + }, + { + "epoch": 5.2481149012567325, + "grad_norm": 1.334018349647522, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 73080 + }, + { + "epoch": 5.2488330341113105, + "grad_norm": 1.0842393636703491, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73090 + }, + { + "epoch": 5.2495511669658885, + "grad_norm": 1.0531692504882812, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 73100 + }, + { + "epoch": 5.2502692998204665, + "grad_norm": 0.9069980978965759, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 73110 + }, + { + "epoch": 5.2509874326750445, + "grad_norm": 1.1319832801818848, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 73120 + }, + { + "epoch": 5.2517055655296225, + "grad_norm": 1.0468456745147705, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 73130 + }, + { + "epoch": 5.252423698384201, + "grad_norm": 1.1752768754959106, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 73140 + }, + { + "epoch": 5.253141831238779, + "grad_norm": 1.0697909593582153, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 73150 + }, + { + "epoch": 5.253859964093357, + "grad_norm": 1.1179429292678833, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 73160 + }, + { + "epoch": 5.254578096947935, + "grad_norm": 0.9088113903999329, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 73170 + }, + { + "epoch": 5.255296229802513, + "grad_norm": 0.8814208507537842, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 73180 + }, + { + "epoch": 5.256014362657091, + "grad_norm": 1.026688814163208, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 73190 + }, + { + "epoch": 5.256732495511669, + "grad_norm": 0.9974902868270874, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 73200 + }, + { + "epoch": 5.257450628366247, + "grad_norm": 0.948743999004364, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 73210 + }, + { + "epoch": 5.258168761220825, + "grad_norm": 0.9069591164588928, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 73220 + }, + { + "epoch": 5.258886894075404, + "grad_norm": 1.0574030876159668, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 73230 + }, + { + "epoch": 5.259605026929982, + "grad_norm": 0.9299649596214294, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 73240 + }, + { + "epoch": 5.26032315978456, + "grad_norm": 0.9888820648193359, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 73250 + }, + { + "epoch": 5.261041292639138, + "grad_norm": 1.0164920091629028, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 73260 + }, + { + "epoch": 5.261759425493716, + "grad_norm": 0.933210551738739, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 73270 + }, + { + "epoch": 5.262477558348294, + "grad_norm": 1.1754034757614136, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 73280 + }, + { + "epoch": 5.263195691202872, + "grad_norm": 1.1599570512771606, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 73290 + }, + { + "epoch": 5.26391382405745, + "grad_norm": 1.0497905015945435, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 73300 + }, + { + "epoch": 5.264631956912028, + "grad_norm": 1.3603366613388062, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 73310 + }, + { + "epoch": 5.265350089766607, + "grad_norm": 1.0283215045928955, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 73320 + }, + { + "epoch": 5.266068222621185, + "grad_norm": 1.1043906211853027, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 73330 + }, + { + "epoch": 5.266786355475763, + "grad_norm": 0.9386111497879028, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 73340 + }, + { + "epoch": 5.267504488330341, + "grad_norm": 1.3586112260818481, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73350 + }, + { + "epoch": 5.268222621184919, + "grad_norm": 1.034179449081421, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 73360 + }, + { + "epoch": 5.268940754039497, + "grad_norm": 0.9645284414291382, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 73370 + }, + { + "epoch": 5.269658886894075, + "grad_norm": 1.1078046560287476, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 73380 + }, + { + "epoch": 5.270377019748653, + "grad_norm": 0.9737151265144348, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 73390 + }, + { + "epoch": 5.271095152603231, + "grad_norm": 1.1911388635635376, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 73400 + }, + { + "epoch": 5.27181328545781, + "grad_norm": 0.9089180827140808, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73410 + }, + { + "epoch": 5.272531418312388, + "grad_norm": 1.094515085220337, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 73420 + }, + { + "epoch": 5.273249551166966, + "grad_norm": 1.2531700134277344, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 73430 + }, + { + "epoch": 5.273967684021544, + "grad_norm": 0.9279667139053345, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 73440 + }, + { + "epoch": 5.274685816876122, + "grad_norm": 0.9872317314147949, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 73450 + }, + { + "epoch": 5.2754039497307, + "grad_norm": 1.0645262002944946, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 73460 + }, + { + "epoch": 5.276122082585278, + "grad_norm": 0.9505489468574524, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 73470 + }, + { + "epoch": 5.276840215439856, + "grad_norm": 1.0444035530090332, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73480 + }, + { + "epoch": 5.277558348294434, + "grad_norm": 1.1813455820083618, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 73490 + }, + { + "epoch": 5.278276481149012, + "grad_norm": 0.782117486000061, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 73500 + }, + { + "epoch": 5.278994614003591, + "grad_norm": 0.8837172389030457, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 73510 + }, + { + "epoch": 5.279712746858169, + "grad_norm": 0.8320443630218506, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 73520 + }, + { + "epoch": 5.280430879712747, + "grad_norm": 1.111466407775879, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 73530 + }, + { + "epoch": 5.281149012567325, + "grad_norm": 1.0448017120361328, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 73540 + }, + { + "epoch": 5.281867145421903, + "grad_norm": 1.2046639919281006, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 73550 + }, + { + "epoch": 5.282585278276481, + "grad_norm": 1.084886074066162, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73560 + }, + { + "epoch": 5.283303411131059, + "grad_norm": 0.8321937918663025, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 73570 + }, + { + "epoch": 5.284021543985637, + "grad_norm": 1.172440767288208, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 73580 + }, + { + "epoch": 5.284739676840215, + "grad_norm": 0.937133252620697, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73590 + }, + { + "epoch": 5.285457809694794, + "grad_norm": 1.0996583700180054, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 73600 + }, + { + "epoch": 5.286175942549372, + "grad_norm": 1.2459958791732788, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 73610 + }, + { + "epoch": 5.28689407540395, + "grad_norm": 0.8362332582473755, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73620 + }, + { + "epoch": 5.287612208258528, + "grad_norm": 0.9784061312675476, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 73630 + }, + { + "epoch": 5.288330341113106, + "grad_norm": 1.087041974067688, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73640 + }, + { + "epoch": 5.289048473967684, + "grad_norm": 0.8641281723976135, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 73650 + }, + { + "epoch": 5.289766606822262, + "grad_norm": 1.030386209487915, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 73660 + }, + { + "epoch": 5.29048473967684, + "grad_norm": 1.0551509857177734, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 73670 + }, + { + "epoch": 5.291202872531418, + "grad_norm": 0.9969013333320618, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 73680 + }, + { + "epoch": 5.291921005385996, + "grad_norm": 0.9566490054130554, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 73690 + }, + { + "epoch": 5.292639138240575, + "grad_norm": 1.1376742124557495, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 73700 + }, + { + "epoch": 5.293357271095153, + "grad_norm": 1.0127843618392944, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73710 + }, + { + "epoch": 5.294075403949731, + "grad_norm": 0.9500759243965149, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 73720 + }, + { + "epoch": 5.294793536804309, + "grad_norm": 0.9597342610359192, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 73730 + }, + { + "epoch": 5.295511669658887, + "grad_norm": 1.0982595682144165, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 73740 + }, + { + "epoch": 5.296229802513465, + "grad_norm": 0.9007689952850342, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 73750 + }, + { + "epoch": 5.296947935368043, + "grad_norm": 0.9329614639282227, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 73760 + }, + { + "epoch": 5.297666068222621, + "grad_norm": 1.235142469406128, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73770 + }, + { + "epoch": 5.298384201077199, + "grad_norm": 1.0875943899154663, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73780 + }, + { + "epoch": 5.299102333931778, + "grad_norm": 1.0499054193496704, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73790 + }, + { + "epoch": 5.299820466786356, + "grad_norm": 1.117954969406128, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 73800 + }, + { + "epoch": 5.300538599640934, + "grad_norm": 0.800291121006012, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 73810 + }, + { + "epoch": 5.301256732495512, + "grad_norm": 1.1461842060089111, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 73820 + }, + { + "epoch": 5.30197486535009, + "grad_norm": 1.0084760189056396, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 73830 + }, + { + "epoch": 5.302692998204668, + "grad_norm": 1.1249386072158813, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 73840 + }, + { + "epoch": 5.303411131059246, + "grad_norm": 1.0846004486083984, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 73850 + }, + { + "epoch": 5.304129263913824, + "grad_norm": 1.1557925939559937, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 73860 + }, + { + "epoch": 5.304847396768402, + "grad_norm": 1.2287988662719727, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 73870 + }, + { + "epoch": 5.30556552962298, + "grad_norm": 0.9618542194366455, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 73880 + }, + { + "epoch": 5.306283662477559, + "grad_norm": 0.9429472088813782, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 73890 + }, + { + "epoch": 5.307001795332137, + "grad_norm": 0.9032631516456604, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 73900 + }, + { + "epoch": 5.307719928186715, + "grad_norm": 1.0008580684661865, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 73910 + }, + { + "epoch": 5.308438061041293, + "grad_norm": 0.9795624017715454, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 73920 + }, + { + "epoch": 5.309156193895871, + "grad_norm": 1.1194090843200684, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 73930 + }, + { + "epoch": 5.309874326750449, + "grad_norm": 1.1057528257369995, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73940 + }, + { + "epoch": 5.310592459605027, + "grad_norm": 0.7807615995407104, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 73950 + }, + { + "epoch": 5.311310592459605, + "grad_norm": 0.9465593099594116, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 73960 + }, + { + "epoch": 5.312028725314184, + "grad_norm": 1.104210615158081, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 73970 + }, + { + "epoch": 5.312746858168762, + "grad_norm": 1.0452964305877686, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 73980 + }, + { + "epoch": 5.31346499102334, + "grad_norm": 1.0314992666244507, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 73990 + }, + { + "epoch": 5.314183123877918, + "grad_norm": 0.9187130928039551, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 74000 + }, + { + "epoch": 5.314901256732496, + "grad_norm": 0.8660678267478943, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 74010 + }, + { + "epoch": 5.315619389587074, + "grad_norm": 0.9470953345298767, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 74020 + }, + { + "epoch": 5.316337522441652, + "grad_norm": 1.0028631687164307, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 74030 + }, + { + "epoch": 5.31705565529623, + "grad_norm": 1.0237356424331665, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 74040 + }, + { + "epoch": 5.317773788150808, + "grad_norm": 1.0299798250198364, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 74050 + }, + { + "epoch": 5.318491921005386, + "grad_norm": 1.0326799154281616, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 74060 + }, + { + "epoch": 5.3192100538599645, + "grad_norm": 1.156346082687378, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 74070 + }, + { + "epoch": 5.3199281867145425, + "grad_norm": 1.1542664766311646, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 74080 + }, + { + "epoch": 5.3206463195691205, + "grad_norm": 1.0503013134002686, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 74090 + }, + { + "epoch": 5.3213644524236985, + "grad_norm": 1.1088979244232178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 74100 + }, + { + "epoch": 5.3220825852782765, + "grad_norm": 0.9314014911651611, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 74110 + }, + { + "epoch": 5.3228007181328545, + "grad_norm": 1.0813525915145874, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 74120 + }, + { + "epoch": 5.3235188509874325, + "grad_norm": 0.7824062705039978, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 74130 + }, + { + "epoch": 5.3242369838420105, + "grad_norm": 1.0552699565887451, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 74140 + }, + { + "epoch": 5.3249551166965885, + "grad_norm": 1.0916554927825928, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74150 + }, + { + "epoch": 5.325673249551167, + "grad_norm": 1.205618143081665, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 74160 + }, + { + "epoch": 5.326391382405745, + "grad_norm": 1.2551230192184448, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 74170 + }, + { + "epoch": 5.327109515260323, + "grad_norm": 0.7715005278587341, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 74180 + }, + { + "epoch": 5.327827648114901, + "grad_norm": 1.1059352159500122, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 74190 + }, + { + "epoch": 5.328545780969479, + "grad_norm": 0.9441812634468079, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 74200 + }, + { + "epoch": 5.329263913824057, + "grad_norm": 1.0012084245681763, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 74210 + }, + { + "epoch": 5.329982046678635, + "grad_norm": 0.8594073057174683, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 74220 + }, + { + "epoch": 5.330700179533213, + "grad_norm": 0.8931775093078613, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 74230 + }, + { + "epoch": 5.331418312387791, + "grad_norm": 0.967250406742096, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 74240 + }, + { + "epoch": 5.332136445242369, + "grad_norm": 0.9776269793510437, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74250 + }, + { + "epoch": 5.332854578096948, + "grad_norm": 0.9393186569213867, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 74260 + }, + { + "epoch": 5.333572710951526, + "grad_norm": 1.0081093311309814, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 74270 + }, + { + "epoch": 5.334290843806104, + "grad_norm": 0.9002147316932678, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 74280 + }, + { + "epoch": 5.335008976660682, + "grad_norm": 0.9237701296806335, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 74290 + }, + { + "epoch": 5.33572710951526, + "grad_norm": 1.070694923400879, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 74300 + }, + { + "epoch": 5.336445242369838, + "grad_norm": 1.0134668350219727, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 74310 + }, + { + "epoch": 5.337163375224416, + "grad_norm": 1.0903294086456299, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 74320 + }, + { + "epoch": 5.337881508078994, + "grad_norm": 0.9000239372253418, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 74330 + }, + { + "epoch": 5.338599640933572, + "grad_norm": 1.0584321022033691, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 74340 + }, + { + "epoch": 5.339317773788151, + "grad_norm": 1.046420931816101, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 74350 + }, + { + "epoch": 5.340035906642729, + "grad_norm": 0.8862320184707642, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 74360 + }, + { + "epoch": 5.340754039497307, + "grad_norm": 0.8197309970855713, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 74370 + }, + { + "epoch": 5.341472172351885, + "grad_norm": 0.9539661407470703, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 74380 + }, + { + "epoch": 5.342190305206463, + "grad_norm": 1.481026530265808, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 74390 + }, + { + "epoch": 5.342908438061041, + "grad_norm": 1.0685169696807861, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 74400 + }, + { + "epoch": 5.343626570915619, + "grad_norm": 1.1468359231948853, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 74410 + }, + { + "epoch": 5.344344703770197, + "grad_norm": 0.9982373714447021, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 74420 + }, + { + "epoch": 5.345062836624775, + "grad_norm": 0.9273471236228943, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 74430 + }, + { + "epoch": 5.345780969479353, + "grad_norm": 1.058828592300415, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 74440 + }, + { + "epoch": 5.346499102333932, + "grad_norm": 1.0442006587982178, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 74450 + }, + { + "epoch": 5.34721723518851, + "grad_norm": 1.0955053567886353, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 74460 + }, + { + "epoch": 5.347935368043088, + "grad_norm": 0.9326002597808838, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74470 + }, + { + "epoch": 5.348653500897666, + "grad_norm": 0.9496979117393494, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 74480 + }, + { + "epoch": 5.349371633752244, + "grad_norm": 1.1995937824249268, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74490 + }, + { + "epoch": 5.350089766606822, + "grad_norm": 0.8761899471282959, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 74500 + }, + { + "epoch": 5.3508078994614, + "grad_norm": 1.2390170097351074, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 74510 + }, + { + "epoch": 5.351526032315978, + "grad_norm": 0.9101138114929199, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 74520 + }, + { + "epoch": 5.352244165170557, + "grad_norm": 0.925466001033783, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 74530 + }, + { + "epoch": 5.352962298025135, + "grad_norm": 0.9483969807624817, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74540 + }, + { + "epoch": 5.353680430879713, + "grad_norm": 1.0530859231948853, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 74550 + }, + { + "epoch": 5.354398563734291, + "grad_norm": 1.209647536277771, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 74560 + }, + { + "epoch": 5.355116696588869, + "grad_norm": 0.9849331378936768, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 74570 + }, + { + "epoch": 5.355834829443447, + "grad_norm": 1.0822848081588745, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 74580 + }, + { + "epoch": 5.356552962298025, + "grad_norm": 1.1460528373718262, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 74590 + }, + { + "epoch": 5.357271095152603, + "grad_norm": 0.9509134292602539, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 74600 + }, + { + "epoch": 5.357989228007181, + "grad_norm": 0.9884999394416809, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 74610 + }, + { + "epoch": 5.358707360861759, + "grad_norm": 0.9619579911231995, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 74620 + }, + { + "epoch": 5.359425493716338, + "grad_norm": 0.8596125245094299, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 74630 + }, + { + "epoch": 5.360143626570916, + "grad_norm": 1.16913640499115, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 74640 + }, + { + "epoch": 5.360861759425494, + "grad_norm": 0.99276202917099, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 74650 + }, + { + "epoch": 5.361579892280072, + "grad_norm": 1.1293696165084839, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74660 + }, + { + "epoch": 5.36229802513465, + "grad_norm": 1.187947154045105, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 74670 + }, + { + "epoch": 5.363016157989228, + "grad_norm": 0.8637247681617737, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 74680 + }, + { + "epoch": 5.363734290843806, + "grad_norm": 1.1049476861953735, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 74690 + }, + { + "epoch": 5.364452423698384, + "grad_norm": 1.1736515760421753, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 74700 + }, + { + "epoch": 5.365170556552962, + "grad_norm": 1.0203301906585693, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 74710 + }, + { + "epoch": 5.365888689407541, + "grad_norm": 1.15559720993042, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 74720 + }, + { + "epoch": 5.366606822262119, + "grad_norm": 1.2008144855499268, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74730 + }, + { + "epoch": 5.367324955116697, + "grad_norm": 1.0385756492614746, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 74740 + }, + { + "epoch": 5.368043087971275, + "grad_norm": 0.8964240550994873, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 74750 + }, + { + "epoch": 5.368761220825853, + "grad_norm": 0.9824761748313904, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 74760 + }, + { + "epoch": 5.369479353680431, + "grad_norm": 0.8815994262695312, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74770 + }, + { + "epoch": 5.370197486535009, + "grad_norm": 0.9729493856430054, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 74780 + }, + { + "epoch": 5.370915619389587, + "grad_norm": 1.1032123565673828, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 74790 + }, + { + "epoch": 5.371633752244165, + "grad_norm": 1.039591908454895, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 74800 + }, + { + "epoch": 5.372351885098743, + "grad_norm": 0.9741610884666443, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 74810 + }, + { + "epoch": 5.373070017953322, + "grad_norm": 0.9789814949035645, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 74820 + }, + { + "epoch": 5.3737881508079, + "grad_norm": 1.0777033567428589, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 74830 + }, + { + "epoch": 5.374506283662478, + "grad_norm": 0.9058641195297241, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 74840 + }, + { + "epoch": 5.375224416517056, + "grad_norm": 1.2161815166473389, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 74850 + }, + { + "epoch": 5.375942549371634, + "grad_norm": 1.1079481840133667, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 74860 + }, + { + "epoch": 5.376660682226212, + "grad_norm": 0.9494470357894897, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 74870 + }, + { + "epoch": 5.37737881508079, + "grad_norm": 1.0116358995437622, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 74880 + }, + { + "epoch": 5.378096947935368, + "grad_norm": 0.9382423162460327, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 74890 + }, + { + "epoch": 5.378815080789946, + "grad_norm": 1.036151647567749, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74900 + }, + { + "epoch": 5.379533213644525, + "grad_norm": 0.9436623454093933, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74910 + }, + { + "epoch": 5.380251346499103, + "grad_norm": 1.0149152278900146, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 74920 + }, + { + "epoch": 5.380969479353681, + "grad_norm": 1.1645641326904297, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 74930 + }, + { + "epoch": 5.381687612208259, + "grad_norm": 1.002287745475769, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 74940 + }, + { + "epoch": 5.382405745062837, + "grad_norm": 1.1176437139511108, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 74950 + }, + { + "epoch": 5.383123877917415, + "grad_norm": 0.9210802912712097, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 74960 + }, + { + "epoch": 5.383842010771993, + "grad_norm": 1.1873447895050049, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 74970 + }, + { + "epoch": 5.384560143626571, + "grad_norm": 0.8372976779937744, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 74980 + }, + { + "epoch": 5.385278276481149, + "grad_norm": 0.9220532178878784, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 74990 + }, + { + "epoch": 5.385996409335727, + "grad_norm": 0.9196901917457581, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 75000 + }, + { + "epoch": 5.3867145421903055, + "grad_norm": 0.9325235486030579, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 75010 + }, + { + "epoch": 5.3874326750448835, + "grad_norm": 1.0902531147003174, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 75020 + }, + { + "epoch": 5.3881508078994615, + "grad_norm": 1.049468755722046, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 75030 + }, + { + "epoch": 5.3888689407540395, + "grad_norm": 0.9372574687004089, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 75040 + }, + { + "epoch": 5.3895870736086176, + "grad_norm": 0.9013437628746033, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 75050 + }, + { + "epoch": 5.3903052064631956, + "grad_norm": 1.2111071348190308, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 75060 + }, + { + "epoch": 5.3910233393177736, + "grad_norm": 1.0006011724472046, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 75070 + }, + { + "epoch": 5.391741472172352, + "grad_norm": 0.9180546402931213, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 75080 + }, + { + "epoch": 5.3924596050269304, + "grad_norm": 1.096113920211792, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 75090 + }, + { + "epoch": 5.3931777378815084, + "grad_norm": 0.9041603207588196, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 75100 + }, + { + "epoch": 5.3938958707360865, + "grad_norm": 0.9675783514976501, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 75110 + }, + { + "epoch": 5.3946140035906645, + "grad_norm": 1.0952513217926025, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 75120 + }, + { + "epoch": 5.3953321364452425, + "grad_norm": 1.0166294574737549, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75130 + }, + { + "epoch": 5.3960502692998205, + "grad_norm": 1.0892874002456665, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 75140 + }, + { + "epoch": 5.3967684021543985, + "grad_norm": 0.9894046187400818, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75150 + }, + { + "epoch": 5.3974865350089765, + "grad_norm": 0.9991754293441772, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 75160 + }, + { + "epoch": 5.3982046678635545, + "grad_norm": 1.1027519702911377, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 75170 + }, + { + "epoch": 5.3989228007181325, + "grad_norm": 1.0579880475997925, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 75180 + }, + { + "epoch": 5.399640933572711, + "grad_norm": 1.1149101257324219, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 75190 + }, + { + "epoch": 5.400359066427289, + "grad_norm": 0.8802945017814636, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 75200 + }, + { + "epoch": 5.401077199281867, + "grad_norm": 0.9168137907981873, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 75210 + }, + { + "epoch": 5.401795332136445, + "grad_norm": 1.232630968093872, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 75220 + }, + { + "epoch": 5.402513464991023, + "grad_norm": 1.1038591861724854, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 75230 + }, + { + "epoch": 5.403231597845601, + "grad_norm": 0.8985993266105652, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 75240 + }, + { + "epoch": 5.403949730700179, + "grad_norm": 1.1096316576004028, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 75250 + }, + { + "epoch": 5.404667863554757, + "grad_norm": 0.8516051173210144, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 75260 + }, + { + "epoch": 5.405385996409335, + "grad_norm": 0.9967356324195862, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 75270 + }, + { + "epoch": 5.406104129263914, + "grad_norm": 1.0092874765396118, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 75280 + }, + { + "epoch": 5.406822262118492, + "grad_norm": 1.049838662147522, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 75290 + }, + { + "epoch": 5.40754039497307, + "grad_norm": 1.1491070985794067, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 75300 + }, + { + "epoch": 5.408258527827648, + "grad_norm": 0.9348118901252747, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 75310 + }, + { + "epoch": 5.408976660682226, + "grad_norm": 1.1226147413253784, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 75320 + }, + { + "epoch": 5.409694793536804, + "grad_norm": 0.9042587876319885, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 75330 + }, + { + "epoch": 5.410412926391382, + "grad_norm": 1.1212877035140991, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 75340 + }, + { + "epoch": 5.41113105924596, + "grad_norm": 0.9805570840835571, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 75350 + }, + { + "epoch": 5.411849192100538, + "grad_norm": 0.9803917407989502, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 75360 + }, + { + "epoch": 5.412567324955116, + "grad_norm": 1.2139064073562622, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75370 + }, + { + "epoch": 5.413285457809695, + "grad_norm": 0.9510865211486816, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 75380 + }, + { + "epoch": 5.414003590664273, + "grad_norm": 1.0752202272415161, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 75390 + }, + { + "epoch": 5.414721723518851, + "grad_norm": 1.1144053936004639, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 75400 + }, + { + "epoch": 5.415439856373429, + "grad_norm": 1.128998875617981, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 75410 + }, + { + "epoch": 5.416157989228007, + "grad_norm": 1.2901849746704102, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 75420 + }, + { + "epoch": 5.416876122082585, + "grad_norm": 1.2822786569595337, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 75430 + }, + { + "epoch": 5.417594254937163, + "grad_norm": 0.8724783658981323, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 75440 + }, + { + "epoch": 5.418312387791741, + "grad_norm": 1.1321152448654175, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 75450 + }, + { + "epoch": 5.419030520646319, + "grad_norm": 1.1211779117584229, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 75460 + }, + { + "epoch": 5.419748653500898, + "grad_norm": 1.0542290210723877, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 75470 + }, + { + "epoch": 5.420466786355476, + "grad_norm": 0.9432206153869629, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 75480 + }, + { + "epoch": 5.421184919210054, + "grad_norm": 1.2051608562469482, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 75490 + }, + { + "epoch": 5.421903052064632, + "grad_norm": 1.188256859779358, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 75500 + }, + { + "epoch": 5.42262118491921, + "grad_norm": 1.2768784761428833, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 75510 + }, + { + "epoch": 5.423339317773788, + "grad_norm": 0.8228567242622375, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75520 + }, + { + "epoch": 5.424057450628366, + "grad_norm": 1.235684871673584, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 75530 + }, + { + "epoch": 5.424775583482944, + "grad_norm": 0.8361109495162964, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75540 + }, + { + "epoch": 5.425493716337522, + "grad_norm": 1.0450727939605713, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 75550 + }, + { + "epoch": 5.4262118491921, + "grad_norm": 0.9942979216575623, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 75560 + }, + { + "epoch": 5.426929982046679, + "grad_norm": 0.8162592053413391, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 75570 + }, + { + "epoch": 5.427648114901257, + "grad_norm": 0.9193033576011658, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 75580 + }, + { + "epoch": 5.428366247755835, + "grad_norm": 1.095130443572998, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75590 + }, + { + "epoch": 5.429084380610413, + "grad_norm": 1.1752824783325195, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75600 + }, + { + "epoch": 5.429802513464991, + "grad_norm": 1.2007960081100464, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75610 + }, + { + "epoch": 5.430520646319569, + "grad_norm": 0.997347354888916, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 75620 + }, + { + "epoch": 5.431238779174147, + "grad_norm": 1.3878827095031738, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 75630 + }, + { + "epoch": 5.431956912028725, + "grad_norm": 1.1839812994003296, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 75640 + }, + { + "epoch": 5.432675044883303, + "grad_norm": 0.9912546873092651, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 75650 + }, + { + "epoch": 5.433393177737882, + "grad_norm": 0.9305517673492432, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 75660 + }, + { + "epoch": 5.43411131059246, + "grad_norm": 1.0036604404449463, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 75670 + }, + { + "epoch": 5.434829443447038, + "grad_norm": 1.2500226497650146, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 75680 + }, + { + "epoch": 5.435547576301616, + "grad_norm": 0.9476167559623718, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75690 + }, + { + "epoch": 5.436265709156194, + "grad_norm": 0.9769760370254517, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 75700 + }, + { + "epoch": 5.436983842010772, + "grad_norm": 1.1001025438308716, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 75710 + }, + { + "epoch": 5.43770197486535, + "grad_norm": 1.1783069372177124, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 75720 + }, + { + "epoch": 5.438420107719928, + "grad_norm": 0.887438952922821, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75730 + }, + { + "epoch": 5.439138240574506, + "grad_norm": 0.9631154537200928, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 75740 + }, + { + "epoch": 5.439856373429085, + "grad_norm": 1.0824158191680908, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75750 + }, + { + "epoch": 5.440574506283663, + "grad_norm": 1.0108296871185303, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 75760 + }, + { + "epoch": 5.441292639138241, + "grad_norm": 1.1728253364562988, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75770 + }, + { + "epoch": 5.442010771992819, + "grad_norm": 1.0904773473739624, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 75780 + }, + { + "epoch": 5.442728904847397, + "grad_norm": 0.8982957601547241, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 75790 + }, + { + "epoch": 5.443447037701975, + "grad_norm": 1.0233404636383057, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 75800 + }, + { + "epoch": 5.444165170556553, + "grad_norm": 1.0092064142227173, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 75810 + }, + { + "epoch": 5.444883303411131, + "grad_norm": 1.2747842073440552, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 75820 + }, + { + "epoch": 5.445601436265709, + "grad_norm": 1.0365403890609741, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 75830 + }, + { + "epoch": 5.446319569120288, + "grad_norm": 1.0413976907730103, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 75840 + }, + { + "epoch": 5.447037701974866, + "grad_norm": 0.8858456015586853, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 75850 + }, + { + "epoch": 5.447755834829444, + "grad_norm": 0.9823445677757263, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 75860 + }, + { + "epoch": 5.448473967684022, + "grad_norm": 0.8515284061431885, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 75870 + }, + { + "epoch": 5.4491921005386, + "grad_norm": 1.130850911140442, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 75880 + }, + { + "epoch": 5.449910233393178, + "grad_norm": 0.984725832939148, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 75890 + }, + { + "epoch": 5.450628366247756, + "grad_norm": 1.1701595783233643, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 75900 + }, + { + "epoch": 5.451346499102334, + "grad_norm": 0.8988107442855835, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 75910 + }, + { + "epoch": 5.452064631956912, + "grad_norm": 0.9909947514533997, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 75920 + }, + { + "epoch": 5.45278276481149, + "grad_norm": 0.8861672282218933, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 75930 + }, + { + "epoch": 5.453500897666069, + "grad_norm": 0.9513981938362122, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 75940 + }, + { + "epoch": 5.454219030520647, + "grad_norm": 1.0320760011672974, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75950 + }, + { + "epoch": 5.454937163375225, + "grad_norm": 0.9830206632614136, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 75960 + }, + { + "epoch": 5.455655296229803, + "grad_norm": 0.9816349148750305, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 75970 + }, + { + "epoch": 5.456373429084381, + "grad_norm": 0.9741218090057373, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 75980 + }, + { + "epoch": 5.457091561938959, + "grad_norm": 1.1291148662567139, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 75990 + }, + { + "epoch": 5.457809694793537, + "grad_norm": 0.9770109057426453, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 76000 + }, + { + "epoch": 5.458527827648115, + "grad_norm": 1.0204377174377441, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76010 + }, + { + "epoch": 5.459245960502693, + "grad_norm": 1.0453336238861084, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76020 + }, + { + "epoch": 5.4599640933572715, + "grad_norm": 1.1595505475997925, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 76030 + }, + { + "epoch": 5.4606822262118495, + "grad_norm": 1.1686701774597168, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 76040 + }, + { + "epoch": 5.4614003590664275, + "grad_norm": 1.14364755153656, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 76050 + }, + { + "epoch": 5.4621184919210055, + "grad_norm": 0.9742125868797302, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 76060 + }, + { + "epoch": 5.4628366247755835, + "grad_norm": 0.8235608339309692, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 76070 + }, + { + "epoch": 5.4635547576301615, + "grad_norm": 0.9801425337791443, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 76080 + }, + { + "epoch": 5.4642728904847395, + "grad_norm": 0.9001221060752869, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 76090 + }, + { + "epoch": 5.4649910233393175, + "grad_norm": 0.9292157888412476, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 76100 + }, + { + "epoch": 5.4657091561938955, + "grad_norm": 1.0024322271347046, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76110 + }, + { + "epoch": 5.4664272890484735, + "grad_norm": 0.8057159781455994, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 76120 + }, + { + "epoch": 5.467145421903052, + "grad_norm": 1.0617927312850952, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76130 + }, + { + "epoch": 5.46786355475763, + "grad_norm": 1.003967046737671, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 76140 + }, + { + "epoch": 5.468581687612208, + "grad_norm": 0.903408944606781, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76150 + }, + { + "epoch": 5.469299820466786, + "grad_norm": 0.8173895478248596, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 76160 + }, + { + "epoch": 5.470017953321364, + "grad_norm": 1.0187482833862305, + "learning_rate": 0.0002, + "loss": 0.5526, + "step": 76170 + }, + { + "epoch": 5.470736086175942, + "grad_norm": 1.0418041944503784, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 76180 + }, + { + "epoch": 5.47145421903052, + "grad_norm": 0.9768357872962952, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 76190 + }, + { + "epoch": 5.472172351885098, + "grad_norm": 1.0834382772445679, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 76200 + }, + { + "epoch": 5.472890484739676, + "grad_norm": 0.8447439670562744, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76210 + }, + { + "epoch": 5.473608617594255, + "grad_norm": 0.9379050135612488, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76220 + }, + { + "epoch": 5.474326750448833, + "grad_norm": 1.0395485162734985, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 76230 + }, + { + "epoch": 5.475044883303411, + "grad_norm": 1.2082624435424805, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 76240 + }, + { + "epoch": 5.475763016157989, + "grad_norm": 1.0714443922042847, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 76250 + }, + { + "epoch": 5.476481149012567, + "grad_norm": 0.945319414138794, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 76260 + }, + { + "epoch": 5.477199281867145, + "grad_norm": 1.1415241956710815, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 76270 + }, + { + "epoch": 5.477917414721723, + "grad_norm": 0.9221673011779785, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 76280 + }, + { + "epoch": 5.478635547576301, + "grad_norm": 1.0118398666381836, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 76290 + }, + { + "epoch": 5.479353680430879, + "grad_norm": 1.396807312965393, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 76300 + }, + { + "epoch": 5.480071813285457, + "grad_norm": 1.0437991619110107, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 76310 + }, + { + "epoch": 5.480789946140036, + "grad_norm": 1.5910401344299316, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 76320 + }, + { + "epoch": 5.481508078994614, + "grad_norm": 0.9262010455131531, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 76330 + }, + { + "epoch": 5.482226211849192, + "grad_norm": 1.2534247636795044, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 76340 + }, + { + "epoch": 5.48294434470377, + "grad_norm": 1.186294674873352, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 76350 + }, + { + "epoch": 5.483662477558348, + "grad_norm": 0.9822857975959778, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 76360 + }, + { + "epoch": 5.484380610412926, + "grad_norm": 1.0006381273269653, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76370 + }, + { + "epoch": 5.485098743267504, + "grad_norm": 0.8960304260253906, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 76380 + }, + { + "epoch": 5.485816876122082, + "grad_norm": 0.7309539914131165, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 76390 + }, + { + "epoch": 5.486535008976661, + "grad_norm": 0.9747139811515808, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 76400 + }, + { + "epoch": 5.487253141831239, + "grad_norm": 0.9586864113807678, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76410 + }, + { + "epoch": 5.487971274685817, + "grad_norm": 1.0815327167510986, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 76420 + }, + { + "epoch": 5.488689407540395, + "grad_norm": 1.1324117183685303, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 76430 + }, + { + "epoch": 5.489407540394973, + "grad_norm": 0.8575648069381714, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 76440 + }, + { + "epoch": 5.490125673249551, + "grad_norm": 0.9821682572364807, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 76450 + }, + { + "epoch": 5.490843806104129, + "grad_norm": 1.1611464023590088, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 76460 + }, + { + "epoch": 5.491561938958707, + "grad_norm": 1.0340297222137451, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 76470 + }, + { + "epoch": 5.492280071813285, + "grad_norm": 1.0116628408432007, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 76480 + }, + { + "epoch": 5.492998204667863, + "grad_norm": 0.9619752764701843, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 76490 + }, + { + "epoch": 5.493716337522442, + "grad_norm": 0.9924456477165222, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76500 + }, + { + "epoch": 5.49443447037702, + "grad_norm": 0.9449224472045898, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 76510 + }, + { + "epoch": 5.495152603231598, + "grad_norm": 0.9075009822845459, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 76520 + }, + { + "epoch": 5.495870736086176, + "grad_norm": 1.3078763484954834, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 76530 + }, + { + "epoch": 5.496588868940754, + "grad_norm": 1.3162729740142822, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 76540 + }, + { + "epoch": 5.497307001795332, + "grad_norm": 1.144333839416504, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 76550 + }, + { + "epoch": 5.49802513464991, + "grad_norm": 0.9332208633422852, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 76560 + }, + { + "epoch": 5.498743267504488, + "grad_norm": 0.9660165309906006, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76570 + }, + { + "epoch": 5.499461400359066, + "grad_norm": 1.0954749584197998, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 76580 + }, + { + "epoch": 5.500179533213645, + "grad_norm": 1.0537810325622559, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 76590 + }, + { + "epoch": 5.500897666068223, + "grad_norm": 0.9944321513175964, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 76600 + }, + { + "epoch": 5.501615798922801, + "grad_norm": 1.094462513923645, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 76610 + }, + { + "epoch": 5.502333931777379, + "grad_norm": 1.0246481895446777, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 76620 + }, + { + "epoch": 5.503052064631957, + "grad_norm": 0.9705453515052795, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 76630 + }, + { + "epoch": 5.503770197486535, + "grad_norm": 1.5252249240875244, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 76640 + }, + { + "epoch": 5.504488330341113, + "grad_norm": 0.8469606637954712, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 76650 + }, + { + "epoch": 5.505206463195691, + "grad_norm": 1.1882504224777222, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 76660 + }, + { + "epoch": 5.505924596050269, + "grad_norm": 0.8447994589805603, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 76670 + }, + { + "epoch": 5.506642728904847, + "grad_norm": 0.9340696930885315, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 76680 + }, + { + "epoch": 5.507360861759426, + "grad_norm": 0.9622383713722229, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 76690 + }, + { + "epoch": 5.508078994614004, + "grad_norm": 1.1516523361206055, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 76700 + }, + { + "epoch": 5.508797127468582, + "grad_norm": 1.207190990447998, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 76710 + }, + { + "epoch": 5.50951526032316, + "grad_norm": 1.1244179010391235, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 76720 + }, + { + "epoch": 5.510233393177738, + "grad_norm": 1.052288293838501, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 76730 + }, + { + "epoch": 5.510951526032316, + "grad_norm": 0.9571291208267212, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 76740 + }, + { + "epoch": 5.511669658886894, + "grad_norm": 0.9449458122253418, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 76750 + }, + { + "epoch": 5.512387791741472, + "grad_norm": 1.0140511989593506, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 76760 + }, + { + "epoch": 5.513105924596051, + "grad_norm": 1.057715654373169, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 76770 + }, + { + "epoch": 5.513824057450629, + "grad_norm": 0.930642306804657, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 76780 + }, + { + "epoch": 5.514542190305207, + "grad_norm": 1.1213828325271606, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76790 + }, + { + "epoch": 5.515260323159785, + "grad_norm": 0.9147387742996216, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 76800 + }, + { + "epoch": 5.515978456014363, + "grad_norm": 1.1786983013153076, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 76810 + }, + { + "epoch": 5.516696588868941, + "grad_norm": 1.1022626161575317, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 76820 + }, + { + "epoch": 5.517414721723519, + "grad_norm": 1.0389000177383423, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76830 + }, + { + "epoch": 5.518132854578097, + "grad_norm": 1.0750621557235718, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 76840 + }, + { + "epoch": 5.518850987432675, + "grad_norm": 1.0372626781463623, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 76850 + }, + { + "epoch": 5.519569120287253, + "grad_norm": 1.0989108085632324, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 76860 + }, + { + "epoch": 5.520287253141831, + "grad_norm": 1.030346155166626, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 76870 + }, + { + "epoch": 5.52100538599641, + "grad_norm": 1.1362419128417969, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 76880 + }, + { + "epoch": 5.521723518850988, + "grad_norm": 0.9110873937606812, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 76890 + }, + { + "epoch": 5.522441651705566, + "grad_norm": 1.0214358568191528, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 76900 + }, + { + "epoch": 5.523159784560144, + "grad_norm": 1.3764830827713013, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 76910 + }, + { + "epoch": 5.523877917414722, + "grad_norm": 1.0396335124969482, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 76920 + }, + { + "epoch": 5.5245960502693, + "grad_norm": 1.1942898035049438, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 76930 + }, + { + "epoch": 5.525314183123878, + "grad_norm": 0.8795760869979858, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 76940 + }, + { + "epoch": 5.526032315978456, + "grad_norm": 1.1081048250198364, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 76950 + }, + { + "epoch": 5.526750448833035, + "grad_norm": 0.9652274250984192, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 76960 + }, + { + "epoch": 5.527468581687613, + "grad_norm": 0.96559739112854, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 76970 + }, + { + "epoch": 5.528186714542191, + "grad_norm": 1.0416076183319092, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76980 + }, + { + "epoch": 5.528904847396769, + "grad_norm": 0.9854229092597961, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 76990 + }, + { + "epoch": 5.529622980251347, + "grad_norm": 1.0515462160110474, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 77000 + }, + { + "epoch": 5.530341113105925, + "grad_norm": 1.0287327766418457, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 77010 + }, + { + "epoch": 5.531059245960503, + "grad_norm": 0.9579883217811584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 77020 + }, + { + "epoch": 5.531777378815081, + "grad_norm": 1.0365805625915527, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 77030 + }, + { + "epoch": 5.532495511669659, + "grad_norm": 1.1600725650787354, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 77040 + }, + { + "epoch": 5.533213644524237, + "grad_norm": 0.8598031401634216, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 77050 + }, + { + "epoch": 5.533931777378815, + "grad_norm": 0.8884791731834412, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 77060 + }, + { + "epoch": 5.5346499102333935, + "grad_norm": 0.900223433971405, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 77070 + }, + { + "epoch": 5.5353680430879715, + "grad_norm": 1.0212652683258057, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 77080 + }, + { + "epoch": 5.5360861759425495, + "grad_norm": 1.0924701690673828, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 77090 + }, + { + "epoch": 5.5368043087971275, + "grad_norm": 1.1955485343933105, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 77100 + }, + { + "epoch": 5.5375224416517055, + "grad_norm": 1.2157706022262573, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 77110 + }, + { + "epoch": 5.5382405745062835, + "grad_norm": 1.1118255853652954, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 77120 + }, + { + "epoch": 5.5389587073608615, + "grad_norm": 1.0146820545196533, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 77130 + }, + { + "epoch": 5.5396768402154395, + "grad_norm": 1.0876632928848267, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 77140 + }, + { + "epoch": 5.540394973070018, + "grad_norm": 0.7914495468139648, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77150 + }, + { + "epoch": 5.541113105924596, + "grad_norm": 1.0584027767181396, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 77160 + }, + { + "epoch": 5.541831238779174, + "grad_norm": 0.9816845059394836, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 77170 + }, + { + "epoch": 5.542549371633752, + "grad_norm": 1.219076156616211, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 77180 + }, + { + "epoch": 5.54326750448833, + "grad_norm": 0.9526635408401489, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 77190 + }, + { + "epoch": 5.543985637342908, + "grad_norm": 0.8437230587005615, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 77200 + }, + { + "epoch": 5.544703770197486, + "grad_norm": 0.9670451283454895, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 77210 + }, + { + "epoch": 5.545421903052064, + "grad_norm": 1.015687346458435, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77220 + }, + { + "epoch": 5.546140035906642, + "grad_norm": 0.8280553817749023, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 77230 + }, + { + "epoch": 5.54685816876122, + "grad_norm": 1.1320816278457642, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 77240 + }, + { + "epoch": 5.547576301615799, + "grad_norm": 1.3338711261749268, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 77250 + }, + { + "epoch": 5.548294434470377, + "grad_norm": 0.9553194642066956, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 77260 + }, + { + "epoch": 5.549012567324955, + "grad_norm": 1.0604912042617798, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 77270 + }, + { + "epoch": 5.549730700179533, + "grad_norm": 1.1037590503692627, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 77280 + }, + { + "epoch": 5.550448833034111, + "grad_norm": 1.166212558746338, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 77290 + }, + { + "epoch": 5.551166965888689, + "grad_norm": 1.0189802646636963, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 77300 + }, + { + "epoch": 5.551885098743267, + "grad_norm": 0.9592387080192566, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 77310 + }, + { + "epoch": 5.552603231597845, + "grad_norm": 0.9533785581588745, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 77320 + }, + { + "epoch": 5.553321364452424, + "grad_norm": 0.9666807055473328, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 77330 + }, + { + "epoch": 5.554039497307002, + "grad_norm": 0.8827478289604187, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 77340 + }, + { + "epoch": 5.55475763016158, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 77350 + }, + { + "epoch": 5.555475763016158, + "grad_norm": 1.14597487449646, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 77360 + }, + { + "epoch": 5.556193895870736, + "grad_norm": 1.009392499923706, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77370 + }, + { + "epoch": 5.556912028725314, + "grad_norm": 1.115757942199707, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 77380 + }, + { + "epoch": 5.557630161579892, + "grad_norm": 0.9907452464103699, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 77390 + }, + { + "epoch": 5.55834829443447, + "grad_norm": 1.0667012929916382, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 77400 + }, + { + "epoch": 5.559066427289048, + "grad_norm": 0.9301251173019409, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 77410 + }, + { + "epoch": 5.559784560143626, + "grad_norm": 1.090384602546692, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 77420 + }, + { + "epoch": 5.560502692998204, + "grad_norm": 0.8073469996452332, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 77430 + }, + { + "epoch": 5.561220825852783, + "grad_norm": 1.1003652811050415, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 77440 + }, + { + "epoch": 5.561938958707361, + "grad_norm": 0.9493791460990906, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 77450 + }, + { + "epoch": 5.562657091561939, + "grad_norm": 0.925388514995575, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 77460 + }, + { + "epoch": 5.563375224416517, + "grad_norm": 1.0946427583694458, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 77470 + }, + { + "epoch": 5.564093357271095, + "grad_norm": 0.9791404008865356, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 77480 + }, + { + "epoch": 5.564811490125673, + "grad_norm": 1.0534733533859253, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 77490 + }, + { + "epoch": 5.565529622980251, + "grad_norm": 0.9351776242256165, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 77500 + }, + { + "epoch": 5.566247755834829, + "grad_norm": 1.004448413848877, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 77510 + }, + { + "epoch": 5.566965888689408, + "grad_norm": 1.0199403762817383, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 77520 + }, + { + "epoch": 5.567684021543986, + "grad_norm": 1.0693204402923584, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 77530 + }, + { + "epoch": 5.568402154398564, + "grad_norm": 1.0635178089141846, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 77540 + }, + { + "epoch": 5.569120287253142, + "grad_norm": 1.1154648065567017, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 77550 + }, + { + "epoch": 5.56983842010772, + "grad_norm": 0.999116837978363, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 77560 + }, + { + "epoch": 5.570556552962298, + "grad_norm": 0.9967397451400757, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 77570 + }, + { + "epoch": 5.571274685816876, + "grad_norm": 0.9684699773788452, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 77580 + }, + { + "epoch": 5.571992818671454, + "grad_norm": 1.027213454246521, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 77590 + }, + { + "epoch": 5.572710951526032, + "grad_norm": 1.0571194887161255, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 77600 + }, + { + "epoch": 5.57342908438061, + "grad_norm": 1.2010499238967896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 77610 + }, + { + "epoch": 5.574147217235188, + "grad_norm": 1.1033680438995361, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 77620 + }, + { + "epoch": 5.574865350089767, + "grad_norm": 0.9394578337669373, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 77630 + }, + { + "epoch": 5.575583482944345, + "grad_norm": 1.379382610321045, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 77640 + }, + { + "epoch": 5.576301615798923, + "grad_norm": 0.9787197709083557, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 77650 + }, + { + "epoch": 5.577019748653501, + "grad_norm": 0.9680284261703491, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 77660 + }, + { + "epoch": 5.577737881508079, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 77670 + }, + { + "epoch": 5.578456014362657, + "grad_norm": 1.1243085861206055, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 77680 + }, + { + "epoch": 5.579174147217235, + "grad_norm": 0.9228966236114502, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 77690 + }, + { + "epoch": 5.579892280071813, + "grad_norm": 1.1349890232086182, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 77700 + }, + { + "epoch": 5.580610412926392, + "grad_norm": 1.2248499393463135, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 77710 + }, + { + "epoch": 5.58132854578097, + "grad_norm": 1.0066324472427368, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 77720 + }, + { + "epoch": 5.582046678635548, + "grad_norm": 1.2642878293991089, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 77730 + }, + { + "epoch": 5.582764811490126, + "grad_norm": 1.031591534614563, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 77740 + }, + { + "epoch": 5.583482944344704, + "grad_norm": 1.0925929546356201, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 77750 + }, + { + "epoch": 5.584201077199282, + "grad_norm": 1.0567110776901245, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 77760 + }, + { + "epoch": 5.58491921005386, + "grad_norm": 1.246246099472046, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 77770 + }, + { + "epoch": 5.585637342908438, + "grad_norm": 1.2467739582061768, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77780 + }, + { + "epoch": 5.586355475763016, + "grad_norm": 1.2695211172103882, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 77790 + }, + { + "epoch": 5.587073608617594, + "grad_norm": 1.0498571395874023, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 77800 + }, + { + "epoch": 5.587791741472173, + "grad_norm": 1.0078339576721191, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 77810 + }, + { + "epoch": 5.588509874326751, + "grad_norm": 1.108199954032898, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 77820 + }, + { + "epoch": 5.589228007181329, + "grad_norm": 1.0577641725540161, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 77830 + }, + { + "epoch": 5.589946140035907, + "grad_norm": 1.2169439792633057, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 77840 + }, + { + "epoch": 5.590664272890485, + "grad_norm": 0.8310868740081787, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 77850 + }, + { + "epoch": 5.591382405745063, + "grad_norm": 0.9794082045555115, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 77860 + }, + { + "epoch": 5.592100538599641, + "grad_norm": 0.8867404460906982, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77870 + }, + { + "epoch": 5.592818671454219, + "grad_norm": 0.9204208254814148, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 77880 + }, + { + "epoch": 5.593536804308797, + "grad_norm": 0.9801714420318604, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 77890 + }, + { + "epoch": 5.594254937163376, + "grad_norm": 0.9383925199508667, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 77900 + }, + { + "epoch": 5.594973070017954, + "grad_norm": 0.9124664068222046, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 77910 + }, + { + "epoch": 5.595691202872532, + "grad_norm": 0.9618783593177795, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77920 + }, + { + "epoch": 5.59640933572711, + "grad_norm": 0.9575216770172119, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 77930 + }, + { + "epoch": 5.597127468581688, + "grad_norm": 1.1223464012145996, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 77940 + }, + { + "epoch": 5.597845601436266, + "grad_norm": 0.9947475790977478, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 77950 + }, + { + "epoch": 5.598563734290844, + "grad_norm": 1.141959309577942, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 77960 + }, + { + "epoch": 5.599281867145422, + "grad_norm": 1.095525860786438, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 77970 + }, + { + "epoch": 5.6, + "grad_norm": 0.9396624565124512, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 77980 + }, + { + "epoch": 5.600718132854578, + "grad_norm": 0.8162274956703186, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 77990 + }, + { + "epoch": 5.6014362657091565, + "grad_norm": 1.0130535364151, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 78000 + }, + { + "epoch": 5.6021543985637345, + "grad_norm": 1.0016634464263916, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 78010 + }, + { + "epoch": 5.6028725314183125, + "grad_norm": 0.8936169743537903, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 78020 + }, + { + "epoch": 5.6035906642728905, + "grad_norm": 1.169625163078308, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78030 + }, + { + "epoch": 5.6043087971274685, + "grad_norm": 0.8896323442459106, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 78040 + }, + { + "epoch": 5.6050269299820465, + "grad_norm": 1.0939475297927856, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 78050 + }, + { + "epoch": 5.6057450628366245, + "grad_norm": 1.0880711078643799, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78060 + }, + { + "epoch": 5.6064631956912026, + "grad_norm": 1.1426655054092407, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 78070 + }, + { + "epoch": 5.607181328545781, + "grad_norm": 1.118586540222168, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 78080 + }, + { + "epoch": 5.607899461400359, + "grad_norm": 0.8784464597702026, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 78090 + }, + { + "epoch": 5.608617594254937, + "grad_norm": 1.137229561805725, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 78100 + }, + { + "epoch": 5.6093357271095154, + "grad_norm": 1.1041932106018066, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 78110 + }, + { + "epoch": 5.6100538599640934, + "grad_norm": 1.0170503854751587, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 78120 + }, + { + "epoch": 5.6107719928186714, + "grad_norm": 1.298754334449768, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 78130 + }, + { + "epoch": 5.6114901256732495, + "grad_norm": 0.9344905018806458, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78140 + }, + { + "epoch": 5.6122082585278275, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 78150 + }, + { + "epoch": 5.6129263913824055, + "grad_norm": 1.0617443323135376, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 78160 + }, + { + "epoch": 5.6136445242369835, + "grad_norm": 0.9017760753631592, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 78170 + }, + { + "epoch": 5.6143626570915615, + "grad_norm": 1.152601957321167, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 78180 + }, + { + "epoch": 5.61508078994614, + "grad_norm": 0.9889463186264038, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78190 + }, + { + "epoch": 5.615798922800718, + "grad_norm": 1.0367393493652344, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 78200 + }, + { + "epoch": 5.616517055655296, + "grad_norm": 0.8466457724571228, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 78210 + }, + { + "epoch": 5.617235188509874, + "grad_norm": 0.936083197593689, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 78220 + }, + { + "epoch": 5.617953321364452, + "grad_norm": 1.018784999847412, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 78230 + }, + { + "epoch": 5.61867145421903, + "grad_norm": 0.8527804017066956, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 78240 + }, + { + "epoch": 5.619389587073608, + "grad_norm": 1.1873106956481934, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78250 + }, + { + "epoch": 5.620107719928186, + "grad_norm": 0.9401728510856628, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 78260 + }, + { + "epoch": 5.620825852782765, + "grad_norm": 1.0801159143447876, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 78270 + }, + { + "epoch": 5.621543985637343, + "grad_norm": 1.0053739547729492, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 78280 + }, + { + "epoch": 5.622262118491921, + "grad_norm": 0.8599331378936768, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 78290 + }, + { + "epoch": 5.622980251346499, + "grad_norm": 2.3157296180725098, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 78300 + }, + { + "epoch": 5.623698384201077, + "grad_norm": 1.0027490854263306, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 78310 + }, + { + "epoch": 5.624416517055655, + "grad_norm": 0.996688961982727, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 78320 + }, + { + "epoch": 5.625134649910233, + "grad_norm": 1.0462113618850708, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 78330 + }, + { + "epoch": 5.625852782764811, + "grad_norm": 0.8750988245010376, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 78340 + }, + { + "epoch": 5.626570915619389, + "grad_norm": 0.8078145384788513, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 78350 + }, + { + "epoch": 5.627289048473967, + "grad_norm": 0.9047532081604004, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 78360 + }, + { + "epoch": 5.628007181328546, + "grad_norm": 0.9784479737281799, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 78370 + }, + { + "epoch": 5.628725314183124, + "grad_norm": 0.9529541730880737, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 78380 + }, + { + "epoch": 5.629443447037702, + "grad_norm": 0.8264740109443665, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 78390 + }, + { + "epoch": 5.63016157989228, + "grad_norm": 1.049724817276001, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 78400 + }, + { + "epoch": 5.630879712746858, + "grad_norm": 0.9866746068000793, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 78410 + }, + { + "epoch": 5.631597845601436, + "grad_norm": 0.897155225276947, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 78420 + }, + { + "epoch": 5.632315978456014, + "grad_norm": 1.225464940071106, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 78430 + }, + { + "epoch": 5.633034111310592, + "grad_norm": 0.8793753981590271, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 78440 + }, + { + "epoch": 5.63375224416517, + "grad_norm": 1.082482099533081, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78450 + }, + { + "epoch": 5.634470377019749, + "grad_norm": 1.054064393043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 78460 + }, + { + "epoch": 5.635188509874327, + "grad_norm": 1.0032247304916382, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 78470 + }, + { + "epoch": 5.635906642728905, + "grad_norm": 0.8544651865959167, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 78480 + }, + { + "epoch": 5.636624775583483, + "grad_norm": 0.9475075602531433, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 78490 + }, + { + "epoch": 5.637342908438061, + "grad_norm": 1.0814138650894165, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 78500 + }, + { + "epoch": 5.638061041292639, + "grad_norm": 1.0813153982162476, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 78510 + }, + { + "epoch": 5.638779174147217, + "grad_norm": 1.0225616693496704, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 78520 + }, + { + "epoch": 5.639497307001795, + "grad_norm": 1.0777465105056763, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 78530 + }, + { + "epoch": 5.640215439856373, + "grad_norm": 1.156148910522461, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 78540 + }, + { + "epoch": 5.640933572710951, + "grad_norm": 1.0147465467453003, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 78550 + }, + { + "epoch": 5.64165170556553, + "grad_norm": 0.9606683850288391, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 78560 + }, + { + "epoch": 5.642369838420108, + "grad_norm": 0.9478723406791687, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 78570 + }, + { + "epoch": 5.643087971274686, + "grad_norm": 1.0653880834579468, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 78580 + }, + { + "epoch": 5.643806104129264, + "grad_norm": 1.7519923448562622, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 78590 + }, + { + "epoch": 5.644524236983842, + "grad_norm": 1.0567299127578735, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 78600 + }, + { + "epoch": 5.64524236983842, + "grad_norm": 0.8980287909507751, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 78610 + }, + { + "epoch": 5.645960502692998, + "grad_norm": 0.8792264461517334, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78620 + }, + { + "epoch": 5.646678635547576, + "grad_norm": 1.2306275367736816, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 78630 + }, + { + "epoch": 5.647396768402155, + "grad_norm": 0.8259932398796082, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 78640 + }, + { + "epoch": 5.648114901256733, + "grad_norm": 0.9605076313018799, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 78650 + }, + { + "epoch": 5.648833034111311, + "grad_norm": 0.9967419505119324, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 78660 + }, + { + "epoch": 5.649551166965889, + "grad_norm": 0.9774024486541748, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 78670 + }, + { + "epoch": 5.650269299820467, + "grad_norm": 0.9838066697120667, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 78680 + }, + { + "epoch": 5.650987432675045, + "grad_norm": 1.1617798805236816, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 78690 + }, + { + "epoch": 5.651705565529623, + "grad_norm": 1.075006365776062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 78700 + }, + { + "epoch": 5.652423698384201, + "grad_norm": 0.8859893679618835, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 78710 + }, + { + "epoch": 5.653141831238779, + "grad_norm": 1.0774717330932617, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 78720 + }, + { + "epoch": 5.653859964093357, + "grad_norm": 1.147273302078247, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 78730 + }, + { + "epoch": 5.654578096947935, + "grad_norm": 1.1403213739395142, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 78740 + }, + { + "epoch": 5.655296229802514, + "grad_norm": 0.9115353226661682, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78750 + }, + { + "epoch": 5.656014362657092, + "grad_norm": 0.9303002953529358, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 78760 + }, + { + "epoch": 5.65673249551167, + "grad_norm": 0.9324957728385925, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 78770 + }, + { + "epoch": 5.657450628366248, + "grad_norm": 0.9688063859939575, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 78780 + }, + { + "epoch": 5.658168761220826, + "grad_norm": 0.9019638299942017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 78790 + }, + { + "epoch": 5.658886894075404, + "grad_norm": 0.8236798048019409, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 78800 + }, + { + "epoch": 5.659605026929982, + "grad_norm": 1.2702386379241943, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78810 + }, + { + "epoch": 5.66032315978456, + "grad_norm": 1.041077971458435, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 78820 + }, + { + "epoch": 5.661041292639139, + "grad_norm": 0.9028838276863098, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 78830 + }, + { + "epoch": 5.661759425493717, + "grad_norm": 0.9874144196510315, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 78840 + }, + { + "epoch": 5.662477558348295, + "grad_norm": 0.9633761048316956, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 78850 + }, + { + "epoch": 5.663195691202873, + "grad_norm": 0.9069564342498779, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 78860 + }, + { + "epoch": 5.663913824057451, + "grad_norm": 0.9560621976852417, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 78870 + }, + { + "epoch": 5.664631956912029, + "grad_norm": 0.9941161870956421, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 78880 + }, + { + "epoch": 5.665350089766607, + "grad_norm": 0.920407235622406, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 78890 + }, + { + "epoch": 5.666068222621185, + "grad_norm": 0.9909250140190125, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 78900 + }, + { + "epoch": 5.666786355475763, + "grad_norm": 0.9528568983078003, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 78910 + }, + { + "epoch": 5.667504488330341, + "grad_norm": 1.041440725326538, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 78920 + }, + { + "epoch": 5.66822262118492, + "grad_norm": 1.0072191953659058, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 78930 + }, + { + "epoch": 5.668940754039498, + "grad_norm": 1.0740574598312378, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 78940 + }, + { + "epoch": 5.669658886894076, + "grad_norm": 0.9168822169303894, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 78950 + }, + { + "epoch": 5.670377019748654, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 78960 + }, + { + "epoch": 5.671095152603232, + "grad_norm": 1.1925201416015625, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 78970 + }, + { + "epoch": 5.67181328545781, + "grad_norm": 0.879940390586853, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78980 + }, + { + "epoch": 5.672531418312388, + "grad_norm": 1.0998331308364868, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 78990 + }, + { + "epoch": 5.673249551166966, + "grad_norm": 1.076637625694275, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 79000 + }, + { + "epoch": 5.673967684021544, + "grad_norm": 1.076864242553711, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 79010 + }, + { + "epoch": 5.6746858168761225, + "grad_norm": 1.0206586122512817, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 79020 + }, + { + "epoch": 5.6754039497307005, + "grad_norm": 0.8242515325546265, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79030 + }, + { + "epoch": 5.6761220825852785, + "grad_norm": 1.1180634498596191, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 79040 + }, + { + "epoch": 5.6768402154398565, + "grad_norm": 1.0155152082443237, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 79050 + }, + { + "epoch": 5.6775583482944345, + "grad_norm": 1.0445241928100586, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 79060 + }, + { + "epoch": 5.6782764811490125, + "grad_norm": 0.9851725697517395, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 79070 + }, + { + "epoch": 5.6789946140035905, + "grad_norm": 0.9979640245437622, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79080 + }, + { + "epoch": 5.6797127468581685, + "grad_norm": 1.0398952960968018, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 79090 + }, + { + "epoch": 5.6804308797127465, + "grad_norm": 1.094164252281189, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 79100 + }, + { + "epoch": 5.6811490125673245, + "grad_norm": 0.9546816945075989, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 79110 + }, + { + "epoch": 5.681867145421903, + "grad_norm": 1.1635938882827759, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79120 + }, + { + "epoch": 5.682585278276481, + "grad_norm": 1.0260306596755981, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 79130 + }, + { + "epoch": 5.683303411131059, + "grad_norm": 0.9900122284889221, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 79140 + }, + { + "epoch": 5.684021543985637, + "grad_norm": 1.049688458442688, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 79150 + }, + { + "epoch": 5.684739676840215, + "grad_norm": 1.124272108078003, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 79160 + }, + { + "epoch": 5.685457809694793, + "grad_norm": 1.1109849214553833, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 79170 + }, + { + "epoch": 5.686175942549371, + "grad_norm": 0.739007830619812, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 79180 + }, + { + "epoch": 5.686894075403949, + "grad_norm": 1.2063007354736328, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79190 + }, + { + "epoch": 5.687612208258528, + "grad_norm": 1.223317265510559, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 79200 + }, + { + "epoch": 5.688330341113106, + "grad_norm": 0.8042855858802795, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 79210 + }, + { + "epoch": 5.689048473967684, + "grad_norm": 0.9294175505638123, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 79220 + }, + { + "epoch": 5.689766606822262, + "grad_norm": 0.978084146976471, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 79230 + }, + { + "epoch": 5.69048473967684, + "grad_norm": 0.9271620512008667, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79240 + }, + { + "epoch": 5.691202872531418, + "grad_norm": 1.158677339553833, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 79250 + }, + { + "epoch": 5.691921005385996, + "grad_norm": 0.9468576312065125, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 79260 + }, + { + "epoch": 5.692639138240574, + "grad_norm": 1.2025824785232544, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79270 + }, + { + "epoch": 5.693357271095152, + "grad_norm": 1.0167860984802246, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79280 + }, + { + "epoch": 5.69407540394973, + "grad_norm": 0.971199631690979, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 79290 + }, + { + "epoch": 5.694793536804308, + "grad_norm": 1.1757864952087402, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 79300 + }, + { + "epoch": 5.695511669658887, + "grad_norm": 1.0199662446975708, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 79310 + }, + { + "epoch": 5.696229802513465, + "grad_norm": 0.9662485122680664, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 79320 + }, + { + "epoch": 5.696947935368043, + "grad_norm": 0.9324414134025574, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 79330 + }, + { + "epoch": 5.697666068222621, + "grad_norm": 0.855752170085907, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 79340 + }, + { + "epoch": 5.698384201077199, + "grad_norm": 1.2723703384399414, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 79350 + }, + { + "epoch": 5.699102333931777, + "grad_norm": 1.0254011154174805, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 79360 + }, + { + "epoch": 5.699820466786355, + "grad_norm": 1.0958263874053955, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 79370 + }, + { + "epoch": 5.700538599640933, + "grad_norm": 1.0214145183563232, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 79380 + }, + { + "epoch": 5.701256732495512, + "grad_norm": 1.1087455749511719, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 79390 + }, + { + "epoch": 5.70197486535009, + "grad_norm": 0.8885074853897095, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 79400 + }, + { + "epoch": 5.702692998204668, + "grad_norm": 0.9854450821876526, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 79410 + }, + { + "epoch": 5.703411131059246, + "grad_norm": 0.858744204044342, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 79420 + }, + { + "epoch": 5.704129263913824, + "grad_norm": 0.9434788823127747, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 79430 + }, + { + "epoch": 5.704847396768402, + "grad_norm": 1.1388801336288452, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 79440 + }, + { + "epoch": 5.70556552962298, + "grad_norm": 1.0701899528503418, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 79450 + }, + { + "epoch": 5.706283662477558, + "grad_norm": 0.9147594571113586, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 79460 + }, + { + "epoch": 5.707001795332136, + "grad_norm": 1.055008053779602, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 79470 + }, + { + "epoch": 5.707719928186714, + "grad_norm": 0.7841609716415405, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 79480 + }, + { + "epoch": 5.708438061041292, + "grad_norm": 1.0334571599960327, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 79490 + }, + { + "epoch": 5.709156193895871, + "grad_norm": 1.2841367721557617, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 79500 + }, + { + "epoch": 5.709874326750449, + "grad_norm": 1.0296638011932373, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 79510 + }, + { + "epoch": 5.710592459605027, + "grad_norm": 0.9161922931671143, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 79520 + }, + { + "epoch": 5.711310592459605, + "grad_norm": 1.056856632232666, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 79530 + }, + { + "epoch": 5.712028725314183, + "grad_norm": 0.9919893145561218, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 79540 + }, + { + "epoch": 5.712746858168761, + "grad_norm": 1.1128891706466675, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 79550 + }, + { + "epoch": 5.713464991023339, + "grad_norm": 1.1171997785568237, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 79560 + }, + { + "epoch": 5.714183123877917, + "grad_norm": 0.9389346837997437, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 79570 + }, + { + "epoch": 5.714901256732496, + "grad_norm": 0.9869245886802673, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 79580 + }, + { + "epoch": 5.715619389587074, + "grad_norm": 0.9019966721534729, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 79590 + }, + { + "epoch": 5.716337522441652, + "grad_norm": 0.9791252017021179, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 79600 + }, + { + "epoch": 5.71705565529623, + "grad_norm": 1.0269849300384521, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 79610 + }, + { + "epoch": 5.717773788150808, + "grad_norm": 1.0340129137039185, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 79620 + }, + { + "epoch": 5.718491921005386, + "grad_norm": 0.9742604494094849, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 79630 + }, + { + "epoch": 5.719210053859964, + "grad_norm": 1.126868724822998, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 79640 + }, + { + "epoch": 5.719928186714542, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 79650 + }, + { + "epoch": 5.72064631956912, + "grad_norm": 0.8300277590751648, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 79660 + }, + { + "epoch": 5.721364452423698, + "grad_norm": 0.8482570052146912, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 79670 + }, + { + "epoch": 5.722082585278277, + "grad_norm": 1.0777807235717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 79680 + }, + { + "epoch": 5.722800718132855, + "grad_norm": 1.2682723999023438, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 79690 + }, + { + "epoch": 5.723518850987433, + "grad_norm": 0.8742772340774536, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 79700 + }, + { + "epoch": 5.724236983842011, + "grad_norm": 0.9218387603759766, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 79710 + }, + { + "epoch": 5.724955116696589, + "grad_norm": 0.8977975845336914, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 79720 + }, + { + "epoch": 5.725673249551167, + "grad_norm": 1.0873085260391235, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 79730 + }, + { + "epoch": 5.726391382405745, + "grad_norm": 0.9811807870864868, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 79740 + }, + { + "epoch": 5.727109515260323, + "grad_norm": 0.926764965057373, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 79750 + }, + { + "epoch": 5.727827648114902, + "grad_norm": 1.0103713274002075, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 79760 + }, + { + "epoch": 5.72854578096948, + "grad_norm": 1.1389189958572388, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79770 + }, + { + "epoch": 5.729263913824058, + "grad_norm": 1.1654961109161377, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 79780 + }, + { + "epoch": 5.729982046678636, + "grad_norm": 0.7925996780395508, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 79790 + }, + { + "epoch": 5.730700179533214, + "grad_norm": 1.3329131603240967, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 79800 + }, + { + "epoch": 5.731418312387792, + "grad_norm": 1.158328890800476, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 79810 + }, + { + "epoch": 5.73213644524237, + "grad_norm": 0.9904412031173706, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 79820 + }, + { + "epoch": 5.732854578096948, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 79830 + }, + { + "epoch": 5.733572710951526, + "grad_norm": 1.0224473476409912, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 79840 + }, + { + "epoch": 5.734290843806104, + "grad_norm": 1.0482215881347656, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 79850 + }, + { + "epoch": 5.735008976660682, + "grad_norm": 0.9790018200874329, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 79860 + }, + { + "epoch": 5.735727109515261, + "grad_norm": 1.034548044204712, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 79870 + }, + { + "epoch": 5.736445242369839, + "grad_norm": 0.799286961555481, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 79880 + }, + { + "epoch": 5.737163375224417, + "grad_norm": 1.0119048357009888, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 79890 + }, + { + "epoch": 5.737881508078995, + "grad_norm": 0.9742264151573181, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 79900 + }, + { + "epoch": 5.738599640933573, + "grad_norm": 1.0408239364624023, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 79910 + }, + { + "epoch": 5.739317773788151, + "grad_norm": 0.9165748953819275, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 79920 + }, + { + "epoch": 5.740035906642729, + "grad_norm": 1.1859451532363892, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 79930 + }, + { + "epoch": 5.740754039497307, + "grad_norm": 0.8772084712982178, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 79940 + }, + { + "epoch": 5.741472172351886, + "grad_norm": 1.0123273134231567, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 79950 + }, + { + "epoch": 5.742190305206464, + "grad_norm": 1.1873936653137207, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 79960 + }, + { + "epoch": 5.742908438061042, + "grad_norm": 0.9065699577331543, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 79970 + }, + { + "epoch": 5.74362657091562, + "grad_norm": 1.1626464128494263, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 79980 + }, + { + "epoch": 5.744344703770198, + "grad_norm": 1.0311716794967651, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 79990 + }, + { + "epoch": 5.745062836624776, + "grad_norm": 1.0865558385849, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 80000 + }, + { + "epoch": 5.745780969479354, + "grad_norm": 1.0257176160812378, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 80010 + }, + { + "epoch": 5.746499102333932, + "grad_norm": 0.9805439710617065, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 80020 + }, + { + "epoch": 5.74721723518851, + "grad_norm": 0.9744977355003357, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 80030 + }, + { + "epoch": 5.747935368043088, + "grad_norm": 1.302816390991211, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 80040 + }, + { + "epoch": 5.748653500897666, + "grad_norm": 0.8866990208625793, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 80050 + }, + { + "epoch": 5.7493716337522445, + "grad_norm": 1.0133726596832275, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 80060 + }, + { + "epoch": 5.7500897666068225, + "grad_norm": 1.0043569803237915, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 80070 + }, + { + "epoch": 5.7508078994614005, + "grad_norm": 0.9100040197372437, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 80080 + }, + { + "epoch": 5.7515260323159785, + "grad_norm": 0.7994180917739868, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 80090 + }, + { + "epoch": 5.7522441651705565, + "grad_norm": 1.120188593864441, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 80100 + }, + { + "epoch": 5.7529622980251345, + "grad_norm": 0.9555420279502869, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 80110 + }, + { + "epoch": 5.7536804308797125, + "grad_norm": 1.0305951833724976, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 80120 + }, + { + "epoch": 5.7543985637342905, + "grad_norm": 0.9632731676101685, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 80130 + }, + { + "epoch": 5.755116696588869, + "grad_norm": 1.2654297351837158, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 80140 + }, + { + "epoch": 5.755834829443447, + "grad_norm": 1.027190089225769, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 80150 + }, + { + "epoch": 5.756552962298025, + "grad_norm": 0.9829175472259521, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80160 + }, + { + "epoch": 5.757271095152603, + "grad_norm": 1.083803653717041, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 80170 + }, + { + "epoch": 5.757989228007181, + "grad_norm": 0.9353913068771362, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 80180 + }, + { + "epoch": 5.758707360861759, + "grad_norm": 1.1824370622634888, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 80190 + }, + { + "epoch": 5.759425493716337, + "grad_norm": 1.0901048183441162, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 80200 + }, + { + "epoch": 5.760143626570915, + "grad_norm": 1.0389254093170166, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 80210 + }, + { + "epoch": 5.760861759425493, + "grad_norm": 0.9746400117874146, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 80220 + }, + { + "epoch": 5.761579892280071, + "grad_norm": 0.9319248795509338, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 80230 + }, + { + "epoch": 5.76229802513465, + "grad_norm": 1.152784824371338, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 80240 + }, + { + "epoch": 5.763016157989228, + "grad_norm": 0.9462733864784241, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 80250 + }, + { + "epoch": 5.763734290843806, + "grad_norm": 0.8884182572364807, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 80260 + }, + { + "epoch": 5.764452423698384, + "grad_norm": 0.8755964636802673, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80270 + }, + { + "epoch": 5.765170556552962, + "grad_norm": 0.8983452320098877, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 80280 + }, + { + "epoch": 5.76588868940754, + "grad_norm": 0.8565991520881653, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 80290 + }, + { + "epoch": 5.766606822262118, + "grad_norm": 1.0557159185409546, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 80300 + }, + { + "epoch": 5.767324955116696, + "grad_norm": 1.057214379310608, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 80310 + }, + { + "epoch": 5.768043087971275, + "grad_norm": 0.9852516055107117, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 80320 + }, + { + "epoch": 5.768761220825853, + "grad_norm": 1.0339698791503906, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 80330 + }, + { + "epoch": 5.769479353680431, + "grad_norm": 1.0056889057159424, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 80340 + }, + { + "epoch": 5.770197486535009, + "grad_norm": 1.0941663980484009, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 80350 + }, + { + "epoch": 5.770915619389587, + "grad_norm": 1.2145589590072632, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80360 + }, + { + "epoch": 5.771633752244165, + "grad_norm": 0.9609606862068176, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 80370 + }, + { + "epoch": 5.772351885098743, + "grad_norm": 0.8815773129463196, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 80380 + }, + { + "epoch": 5.773070017953321, + "grad_norm": 1.2630987167358398, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 80390 + }, + { + "epoch": 5.773788150807899, + "grad_norm": 1.0605450868606567, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 80400 + }, + { + "epoch": 5.774506283662477, + "grad_norm": 1.165069341659546, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 80410 + }, + { + "epoch": 5.775224416517055, + "grad_norm": 0.9038028717041016, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 80420 + }, + { + "epoch": 5.775942549371634, + "grad_norm": 1.0571858882904053, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 80430 + }, + { + "epoch": 5.776660682226212, + "grad_norm": 1.0388168096542358, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 80440 + }, + { + "epoch": 5.77737881508079, + "grad_norm": 1.0552119016647339, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 80450 + }, + { + "epoch": 5.778096947935368, + "grad_norm": 1.0610109567642212, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 80460 + }, + { + "epoch": 5.778815080789946, + "grad_norm": 0.9906430244445801, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 80470 + }, + { + "epoch": 5.779533213644524, + "grad_norm": 1.1511857509613037, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 80480 + }, + { + "epoch": 5.780251346499102, + "grad_norm": 1.2738412618637085, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 80490 + }, + { + "epoch": 5.78096947935368, + "grad_norm": 0.8945937752723694, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 80500 + }, + { + "epoch": 5.781687612208259, + "grad_norm": 1.1105149984359741, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 80510 + }, + { + "epoch": 5.782405745062837, + "grad_norm": 0.8432297110557556, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 80520 + }, + { + "epoch": 5.783123877917415, + "grad_norm": 0.9257984757423401, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 80530 + }, + { + "epoch": 5.783842010771993, + "grad_norm": 1.1708799600601196, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 80540 + }, + { + "epoch": 5.784560143626571, + "grad_norm": 0.9969521164894104, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 80550 + }, + { + "epoch": 5.785278276481149, + "grad_norm": 1.0361413955688477, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 80560 + }, + { + "epoch": 5.785996409335727, + "grad_norm": 0.9876393675804138, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80570 + }, + { + "epoch": 5.786714542190305, + "grad_norm": 1.0356241464614868, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 80580 + }, + { + "epoch": 5.787432675044883, + "grad_norm": 1.178865671157837, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 80590 + }, + { + "epoch": 5.788150807899461, + "grad_norm": 0.8614338636398315, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 80600 + }, + { + "epoch": 5.788868940754039, + "grad_norm": 1.020734429359436, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 80610 + }, + { + "epoch": 5.789587073608618, + "grad_norm": 1.035951852798462, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 80620 + }, + { + "epoch": 5.790305206463196, + "grad_norm": 0.898637592792511, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 80630 + }, + { + "epoch": 5.791023339317774, + "grad_norm": 0.9803016781806946, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 80640 + }, + { + "epoch": 5.791741472172352, + "grad_norm": 1.2902555465698242, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 80650 + }, + { + "epoch": 5.79245960502693, + "grad_norm": 1.3364112377166748, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 80660 + }, + { + "epoch": 5.793177737881508, + "grad_norm": 0.8553985953330994, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 80670 + }, + { + "epoch": 5.793895870736086, + "grad_norm": 0.8211889863014221, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 80680 + }, + { + "epoch": 5.794614003590664, + "grad_norm": 0.9288306832313538, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 80690 + }, + { + "epoch": 5.795332136445243, + "grad_norm": 1.0716029405593872, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 80700 + }, + { + "epoch": 5.796050269299821, + "grad_norm": 0.9957329034805298, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 80710 + }, + { + "epoch": 5.796768402154399, + "grad_norm": 0.9691376090049744, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 80720 + }, + { + "epoch": 5.797486535008977, + "grad_norm": 1.0590804815292358, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 80730 + }, + { + "epoch": 5.798204667863555, + "grad_norm": 1.0408968925476074, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 80740 + }, + { + "epoch": 5.798922800718133, + "grad_norm": 1.0249526500701904, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 80750 + }, + { + "epoch": 5.799640933572711, + "grad_norm": 1.3658806085586548, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 80760 + }, + { + "epoch": 5.800359066427289, + "grad_norm": 0.9562603831291199, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 80770 + }, + { + "epoch": 5.801077199281867, + "grad_norm": 0.8790915012359619, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 80780 + }, + { + "epoch": 5.801795332136445, + "grad_norm": 0.8351004123687744, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80790 + }, + { + "epoch": 5.802513464991024, + "grad_norm": 0.964562714099884, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 80800 + }, + { + "epoch": 5.803231597845602, + "grad_norm": 1.0873116254806519, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 80810 + }, + { + "epoch": 5.80394973070018, + "grad_norm": 0.9821216464042664, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 80820 + }, + { + "epoch": 5.804667863554758, + "grad_norm": 1.1158807277679443, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 80830 + }, + { + "epoch": 5.805385996409336, + "grad_norm": 1.0098856687545776, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 80840 + }, + { + "epoch": 5.806104129263914, + "grad_norm": 0.9628035426139832, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 80850 + }, + { + "epoch": 5.806822262118492, + "grad_norm": 1.133800983428955, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 80860 + }, + { + "epoch": 5.80754039497307, + "grad_norm": 0.9423992037773132, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 80870 + }, + { + "epoch": 5.808258527827648, + "grad_norm": 1.0758612155914307, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80880 + }, + { + "epoch": 5.808976660682227, + "grad_norm": 1.232029914855957, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 80890 + }, + { + "epoch": 5.809694793536805, + "grad_norm": 1.1063108444213867, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 80900 + }, + { + "epoch": 5.810412926391383, + "grad_norm": 0.9759877920150757, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 80910 + }, + { + "epoch": 5.811131059245961, + "grad_norm": 0.9180193543434143, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 80920 + }, + { + "epoch": 5.811849192100539, + "grad_norm": 1.0818052291870117, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 80930 + }, + { + "epoch": 5.812567324955117, + "grad_norm": 0.998986542224884, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 80940 + }, + { + "epoch": 5.813285457809695, + "grad_norm": 1.1549060344696045, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 80950 + }, + { + "epoch": 5.814003590664273, + "grad_norm": 1.1900213956832886, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 80960 + }, + { + "epoch": 5.814721723518851, + "grad_norm": 0.8114368915557861, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 80970 + }, + { + "epoch": 5.815439856373429, + "grad_norm": 1.0296406745910645, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 80980 + }, + { + "epoch": 5.8161579892280075, + "grad_norm": 1.0466746091842651, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 80990 + }, + { + "epoch": 5.8168761220825855, + "grad_norm": 1.0524508953094482, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 81000 + }, + { + "epoch": 5.8175942549371635, + "grad_norm": 1.1588358879089355, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 81010 + }, + { + "epoch": 5.8183123877917415, + "grad_norm": 0.9378601908683777, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 81020 + }, + { + "epoch": 5.8190305206463195, + "grad_norm": 0.9486441612243652, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 81030 + }, + { + "epoch": 5.8197486535008975, + "grad_norm": 0.9805227518081665, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 81040 + }, + { + "epoch": 5.8204667863554755, + "grad_norm": 1.1627717018127441, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81050 + }, + { + "epoch": 5.8211849192100535, + "grad_norm": 1.0716841220855713, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 81060 + }, + { + "epoch": 5.821903052064632, + "grad_norm": 1.2398899793624878, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 81070 + }, + { + "epoch": 5.82262118491921, + "grad_norm": 1.0934730768203735, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 81080 + }, + { + "epoch": 5.823339317773788, + "grad_norm": 0.9701796174049377, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 81090 + }, + { + "epoch": 5.824057450628366, + "grad_norm": 1.0218969583511353, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 81100 + }, + { + "epoch": 5.824775583482944, + "grad_norm": 1.3066465854644775, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 81110 + }, + { + "epoch": 5.825493716337522, + "grad_norm": 1.1067441701889038, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 81120 + }, + { + "epoch": 5.8262118491921004, + "grad_norm": 0.9750344753265381, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81130 + }, + { + "epoch": 5.8269299820466784, + "grad_norm": 1.129191279411316, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 81140 + }, + { + "epoch": 5.8276481149012564, + "grad_norm": 1.05964195728302, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 81150 + }, + { + "epoch": 5.8283662477558345, + "grad_norm": 1.1094872951507568, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 81160 + }, + { + "epoch": 5.8290843806104125, + "grad_norm": 0.9163196086883545, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 81170 + }, + { + "epoch": 5.829802513464991, + "grad_norm": 1.0035687685012817, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 81180 + }, + { + "epoch": 5.830520646319569, + "grad_norm": 1.0353461503982544, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 81190 + }, + { + "epoch": 5.831238779174147, + "grad_norm": 1.0566555261611938, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81200 + }, + { + "epoch": 5.831956912028725, + "grad_norm": 1.2373290061950684, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 81210 + }, + { + "epoch": 5.832675044883303, + "grad_norm": 0.8818837404251099, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 81220 + }, + { + "epoch": 5.833393177737881, + "grad_norm": 1.1024713516235352, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 81230 + }, + { + "epoch": 5.834111310592459, + "grad_norm": 1.2478809356689453, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 81240 + }, + { + "epoch": 5.834829443447037, + "grad_norm": 0.8647364377975464, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 81250 + }, + { + "epoch": 5.835547576301616, + "grad_norm": 1.1106358766555786, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 81260 + }, + { + "epoch": 5.836265709156194, + "grad_norm": 0.9432938694953918, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 81270 + }, + { + "epoch": 5.836983842010772, + "grad_norm": 1.0283797979354858, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 81280 + }, + { + "epoch": 5.83770197486535, + "grad_norm": 1.158918857574463, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 81290 + }, + { + "epoch": 5.838420107719928, + "grad_norm": 0.9700069427490234, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 81300 + }, + { + "epoch": 5.839138240574506, + "grad_norm": 1.08310866355896, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 81310 + }, + { + "epoch": 5.839856373429084, + "grad_norm": 1.05460524559021, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 81320 + }, + { + "epoch": 5.840574506283662, + "grad_norm": 0.9849268794059753, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 81330 + }, + { + "epoch": 5.84129263913824, + "grad_norm": 0.888306736946106, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 81340 + }, + { + "epoch": 5.842010771992818, + "grad_norm": 1.0337001085281372, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81350 + }, + { + "epoch": 5.842728904847397, + "grad_norm": 1.0778567790985107, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 81360 + }, + { + "epoch": 5.843447037701975, + "grad_norm": 1.1484156847000122, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 81370 + }, + { + "epoch": 5.844165170556553, + "grad_norm": 1.0948245525360107, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 81380 + }, + { + "epoch": 5.844883303411131, + "grad_norm": 0.9363969564437866, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 81390 + }, + { + "epoch": 5.845601436265709, + "grad_norm": 1.0151013135910034, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 81400 + }, + { + "epoch": 5.846319569120287, + "grad_norm": 0.9925733804702759, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 81410 + }, + { + "epoch": 5.847037701974865, + "grad_norm": 1.0356744527816772, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 81420 + }, + { + "epoch": 5.847755834829443, + "grad_norm": 1.0633001327514648, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 81430 + }, + { + "epoch": 5.848473967684021, + "grad_norm": 0.9900460839271545, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 81440 + }, + { + "epoch": 5.8491921005386, + "grad_norm": 1.2677979469299316, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 81450 + }, + { + "epoch": 5.849910233393178, + "grad_norm": 0.8174138069152832, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 81460 + }, + { + "epoch": 5.850628366247756, + "grad_norm": 1.1986393928527832, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 81470 + }, + { + "epoch": 5.851346499102334, + "grad_norm": 1.1009358167648315, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 81480 + }, + { + "epoch": 5.852064631956912, + "grad_norm": 0.966446578502655, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81490 + }, + { + "epoch": 5.85278276481149, + "grad_norm": 0.9657767415046692, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 81500 + }, + { + "epoch": 5.853500897666068, + "grad_norm": 1.0480058193206787, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 81510 + }, + { + "epoch": 5.854219030520646, + "grad_norm": 1.2003830671310425, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 81520 + }, + { + "epoch": 5.854937163375224, + "grad_norm": 0.8683754205703735, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81530 + }, + { + "epoch": 5.855655296229802, + "grad_norm": 1.0860967636108398, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 81540 + }, + { + "epoch": 5.856373429084381, + "grad_norm": 1.0415282249450684, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81550 + }, + { + "epoch": 5.857091561938959, + "grad_norm": 0.9897454380989075, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 81560 + }, + { + "epoch": 5.857809694793537, + "grad_norm": 1.173884630203247, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 81570 + }, + { + "epoch": 5.858527827648115, + "grad_norm": 1.2426209449768066, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 81580 + }, + { + "epoch": 5.859245960502693, + "grad_norm": 0.9390465021133423, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 81590 + }, + { + "epoch": 5.859964093357271, + "grad_norm": 1.1387195587158203, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 81600 + }, + { + "epoch": 5.860682226211849, + "grad_norm": 0.9902143478393555, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81610 + }, + { + "epoch": 5.861400359066427, + "grad_norm": 0.8328776359558105, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 81620 + }, + { + "epoch": 5.862118491921006, + "grad_norm": 0.9837837815284729, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 81630 + }, + { + "epoch": 5.862836624775584, + "grad_norm": 1.0013370513916016, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 81640 + }, + { + "epoch": 5.863554757630162, + "grad_norm": 0.9408028721809387, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 81650 + }, + { + "epoch": 5.86427289048474, + "grad_norm": 1.093140959739685, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 81660 + }, + { + "epoch": 5.864991023339318, + "grad_norm": 0.9554300904273987, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 81670 + }, + { + "epoch": 5.865709156193896, + "grad_norm": 1.1276485919952393, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 81680 + }, + { + "epoch": 5.866427289048474, + "grad_norm": 0.9628785252571106, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 81690 + }, + { + "epoch": 5.867145421903052, + "grad_norm": 0.9844689965248108, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 81700 + }, + { + "epoch": 5.86786355475763, + "grad_norm": 0.9679856896400452, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 81710 + }, + { + "epoch": 5.868581687612208, + "grad_norm": 1.0225571393966675, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81720 + }, + { + "epoch": 5.869299820466786, + "grad_norm": 0.9330390691757202, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81730 + }, + { + "epoch": 5.870017953321365, + "grad_norm": 1.0584566593170166, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 81740 + }, + { + "epoch": 5.870736086175943, + "grad_norm": 0.781548023223877, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 81750 + }, + { + "epoch": 5.871454219030521, + "grad_norm": 0.8906106352806091, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 81760 + }, + { + "epoch": 5.872172351885099, + "grad_norm": 1.1402281522750854, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 81770 + }, + { + "epoch": 5.872890484739677, + "grad_norm": 0.9991076588630676, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 81780 + }, + { + "epoch": 5.873608617594255, + "grad_norm": 1.0120140314102173, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 81790 + }, + { + "epoch": 5.874326750448833, + "grad_norm": 0.8857715725898743, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 81800 + }, + { + "epoch": 5.875044883303411, + "grad_norm": 0.8531954288482666, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 81810 + }, + { + "epoch": 5.87576301615799, + "grad_norm": 1.1601015329360962, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 81820 + }, + { + "epoch": 5.876481149012568, + "grad_norm": 1.1435350179672241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 81830 + }, + { + "epoch": 5.877199281867146, + "grad_norm": 0.9526153802871704, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 81840 + }, + { + "epoch": 5.877917414721724, + "grad_norm": 1.06845223903656, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 81850 + }, + { + "epoch": 5.878635547576302, + "grad_norm": 0.9239344596862793, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 81860 + }, + { + "epoch": 5.87935368043088, + "grad_norm": 0.8632398247718811, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 81870 + }, + { + "epoch": 5.880071813285458, + "grad_norm": 0.9148443341255188, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 81880 + }, + { + "epoch": 5.880789946140036, + "grad_norm": 0.9910652041435242, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 81890 + }, + { + "epoch": 5.881508078994614, + "grad_norm": 0.8335179090499878, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81900 + }, + { + "epoch": 5.882226211849192, + "grad_norm": 0.9921387434005737, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81910 + }, + { + "epoch": 5.88294434470377, + "grad_norm": 1.0532517433166504, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 81920 + }, + { + "epoch": 5.883662477558349, + "grad_norm": 1.026400089263916, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 81930 + }, + { + "epoch": 5.884380610412927, + "grad_norm": 1.019195318222046, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 81940 + }, + { + "epoch": 5.885098743267505, + "grad_norm": 0.987238347530365, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 81950 + }, + { + "epoch": 5.885816876122083, + "grad_norm": 1.1714487075805664, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 81960 + }, + { + "epoch": 5.886535008976661, + "grad_norm": 1.0854483842849731, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 81970 + }, + { + "epoch": 5.887253141831239, + "grad_norm": 1.0678396224975586, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 81980 + }, + { + "epoch": 5.887971274685817, + "grad_norm": 1.1009471416473389, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 81990 + }, + { + "epoch": 5.888689407540395, + "grad_norm": 1.2056844234466553, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 82000 + }, + { + "epoch": 5.8894075403949735, + "grad_norm": 1.131302833557129, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 82010 + }, + { + "epoch": 5.8901256732495515, + "grad_norm": 1.4466036558151245, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 82020 + }, + { + "epoch": 5.8908438061041295, + "grad_norm": 1.051228404045105, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 82030 + }, + { + "epoch": 5.8915619389587075, + "grad_norm": 1.0010617971420288, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 82040 + }, + { + "epoch": 5.8922800718132855, + "grad_norm": 0.9095138311386108, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 82050 + }, + { + "epoch": 5.8929982046678635, + "grad_norm": 1.0237005949020386, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 82060 + }, + { + "epoch": 5.8937163375224415, + "grad_norm": 1.035122036933899, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 82070 + }, + { + "epoch": 5.8944344703770195, + "grad_norm": 1.0271964073181152, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82080 + }, + { + "epoch": 5.8951526032315975, + "grad_norm": 1.2044503688812256, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 82090 + }, + { + "epoch": 5.8958707360861755, + "grad_norm": 1.0275284051895142, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 82100 + }, + { + "epoch": 5.896588868940754, + "grad_norm": 0.9974840879440308, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 82110 + }, + { + "epoch": 5.897307001795332, + "grad_norm": 1.009968638420105, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 82120 + }, + { + "epoch": 5.89802513464991, + "grad_norm": 0.8396142721176147, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 82130 + }, + { + "epoch": 5.898743267504488, + "grad_norm": 1.002354621887207, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 82140 + }, + { + "epoch": 5.899461400359066, + "grad_norm": 0.9998893737792969, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82150 + }, + { + "epoch": 5.900179533213644, + "grad_norm": 1.1027010679244995, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 82160 + }, + { + "epoch": 5.900897666068222, + "grad_norm": 1.2028530836105347, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 82170 + }, + { + "epoch": 5.9016157989228, + "grad_norm": 1.0018759965896606, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 82180 + }, + { + "epoch": 5.902333931777379, + "grad_norm": 0.8911277055740356, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82190 + }, + { + "epoch": 5.903052064631957, + "grad_norm": 1.0172009468078613, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 82200 + }, + { + "epoch": 5.903770197486535, + "grad_norm": 1.1664029359817505, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 82210 + }, + { + "epoch": 5.904488330341113, + "grad_norm": 1.0620089769363403, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 82220 + }, + { + "epoch": 5.905206463195691, + "grad_norm": 1.0756114721298218, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 82230 + }, + { + "epoch": 5.905924596050269, + "grad_norm": 1.1727497577667236, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 82240 + }, + { + "epoch": 5.906642728904847, + "grad_norm": 0.9833515882492065, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 82250 + }, + { + "epoch": 5.907360861759425, + "grad_norm": 0.9236368536949158, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 82260 + }, + { + "epoch": 5.908078994614003, + "grad_norm": 0.9773947596549988, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 82270 + }, + { + "epoch": 5.908797127468581, + "grad_norm": 1.1427783966064453, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 82280 + }, + { + "epoch": 5.909515260323159, + "grad_norm": 1.0215164422988892, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 82290 + }, + { + "epoch": 5.910233393177738, + "grad_norm": 1.1157845258712769, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 82300 + }, + { + "epoch": 5.910951526032316, + "grad_norm": 1.1490662097930908, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 82310 + }, + { + "epoch": 5.911669658886894, + "grad_norm": 0.7233976125717163, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 82320 + }, + { + "epoch": 5.912387791741472, + "grad_norm": 1.0053865909576416, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 82330 + }, + { + "epoch": 5.91310592459605, + "grad_norm": 0.9764766097068787, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 82340 + }, + { + "epoch": 5.913824057450628, + "grad_norm": 0.9492928385734558, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 82350 + }, + { + "epoch": 5.914542190305206, + "grad_norm": 0.9538891315460205, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82360 + }, + { + "epoch": 5.915260323159784, + "grad_norm": 1.2620314359664917, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 82370 + }, + { + "epoch": 5.915978456014363, + "grad_norm": 0.9913349151611328, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 82380 + }, + { + "epoch": 5.916696588868941, + "grad_norm": 0.9712074995040894, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 82390 + }, + { + "epoch": 5.917414721723519, + "grad_norm": 1.1554654836654663, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 82400 + }, + { + "epoch": 5.918132854578097, + "grad_norm": 1.1418904066085815, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 82410 + }, + { + "epoch": 5.918850987432675, + "grad_norm": 0.9405845999717712, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 82420 + }, + { + "epoch": 5.919569120287253, + "grad_norm": 1.0801819562911987, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 82430 + }, + { + "epoch": 5.920287253141831, + "grad_norm": 0.8643896579742432, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 82440 + }, + { + "epoch": 5.921005385996409, + "grad_norm": 1.106025218963623, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 82450 + }, + { + "epoch": 5.921723518850987, + "grad_norm": 1.0338234901428223, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 82460 + }, + { + "epoch": 5.922441651705565, + "grad_norm": 1.0648493766784668, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 82470 + }, + { + "epoch": 5.923159784560143, + "grad_norm": 1.1950433254241943, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 82480 + }, + { + "epoch": 5.923877917414722, + "grad_norm": 0.8730897903442383, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 82490 + }, + { + "epoch": 5.9245960502693, + "grad_norm": 1.2262312173843384, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 82500 + }, + { + "epoch": 5.925314183123878, + "grad_norm": 0.9526116251945496, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 82510 + }, + { + "epoch": 5.926032315978456, + "grad_norm": 1.0540224313735962, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 82520 + }, + { + "epoch": 5.926750448833034, + "grad_norm": 1.0537306070327759, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 82530 + }, + { + "epoch": 5.927468581687612, + "grad_norm": 1.134207844734192, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 82540 + }, + { + "epoch": 5.92818671454219, + "grad_norm": 0.9042250514030457, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 82550 + }, + { + "epoch": 5.928904847396768, + "grad_norm": 1.0424834489822388, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82560 + }, + { + "epoch": 5.929622980251347, + "grad_norm": 1.1571602821350098, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 82570 + }, + { + "epoch": 5.930341113105925, + "grad_norm": 1.1033377647399902, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 82580 + }, + { + "epoch": 5.931059245960503, + "grad_norm": 0.9211772680282593, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 82590 + }, + { + "epoch": 5.931777378815081, + "grad_norm": 1.0566459894180298, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 82600 + }, + { + "epoch": 5.932495511669659, + "grad_norm": 1.1773834228515625, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 82610 + }, + { + "epoch": 5.933213644524237, + "grad_norm": 1.193396806716919, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82620 + }, + { + "epoch": 5.933931777378815, + "grad_norm": 1.1101785898208618, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 82630 + }, + { + "epoch": 5.934649910233393, + "grad_norm": 0.6988118886947632, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 82640 + }, + { + "epoch": 5.935368043087971, + "grad_norm": 0.9590985774993896, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 82650 + }, + { + "epoch": 5.936086175942549, + "grad_norm": 0.8512062430381775, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 82660 + }, + { + "epoch": 5.936804308797128, + "grad_norm": 1.0381710529327393, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 82670 + }, + { + "epoch": 5.937522441651706, + "grad_norm": 1.0816296339035034, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 82680 + }, + { + "epoch": 5.938240574506284, + "grad_norm": 1.0592364072799683, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 82690 + }, + { + "epoch": 5.938958707360862, + "grad_norm": 0.737452507019043, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 82700 + }, + { + "epoch": 5.93967684021544, + "grad_norm": 0.9019039869308472, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82710 + }, + { + "epoch": 5.940394973070018, + "grad_norm": 1.0049666166305542, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 82720 + }, + { + "epoch": 5.941113105924596, + "grad_norm": 1.0016309022903442, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 82730 + }, + { + "epoch": 5.941831238779174, + "grad_norm": 0.7967594861984253, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 82740 + }, + { + "epoch": 5.942549371633753, + "grad_norm": 0.8978520631790161, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 82750 + }, + { + "epoch": 5.943267504488331, + "grad_norm": 1.0101654529571533, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82760 + }, + { + "epoch": 5.943985637342909, + "grad_norm": 1.1515586376190186, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 82770 + }, + { + "epoch": 5.944703770197487, + "grad_norm": 0.8666134476661682, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82780 + }, + { + "epoch": 5.945421903052065, + "grad_norm": 1.1365231275558472, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82790 + }, + { + "epoch": 5.946140035906643, + "grad_norm": 1.211229920387268, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 82800 + }, + { + "epoch": 5.946858168761221, + "grad_norm": 0.9900869727134705, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 82810 + }, + { + "epoch": 5.947576301615799, + "grad_norm": 0.9555928111076355, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 82820 + }, + { + "epoch": 5.948294434470377, + "grad_norm": 0.8468470573425293, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 82830 + }, + { + "epoch": 5.949012567324955, + "grad_norm": 1.0280319452285767, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 82840 + }, + { + "epoch": 5.949730700179533, + "grad_norm": 0.930145800113678, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 82850 + }, + { + "epoch": 5.950448833034112, + "grad_norm": 1.0677028894424438, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 82860 + }, + { + "epoch": 5.95116696588869, + "grad_norm": 1.2035255432128906, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 82870 + }, + { + "epoch": 5.951885098743268, + "grad_norm": 0.897537887096405, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82880 + }, + { + "epoch": 5.952603231597846, + "grad_norm": 1.2858690023422241, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 82890 + }, + { + "epoch": 5.953321364452424, + "grad_norm": 1.0300413370132446, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 82900 + }, + { + "epoch": 5.954039497307002, + "grad_norm": 0.9873301982879639, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 82910 + }, + { + "epoch": 5.95475763016158, + "grad_norm": 1.0315600633621216, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 82920 + }, + { + "epoch": 5.955475763016158, + "grad_norm": 1.0631790161132812, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 82930 + }, + { + "epoch": 5.9561938958707366, + "grad_norm": 1.035544514656067, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82940 + }, + { + "epoch": 5.956912028725315, + "grad_norm": 1.0162041187286377, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 82950 + }, + { + "epoch": 5.957630161579893, + "grad_norm": 0.7858892679214478, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 82960 + }, + { + "epoch": 5.958348294434471, + "grad_norm": 1.0359784364700317, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 82970 + }, + { + "epoch": 5.959066427289049, + "grad_norm": 1.057173252105713, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 82980 + }, + { + "epoch": 5.959784560143627, + "grad_norm": 1.1017464399337769, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 82990 + }, + { + "epoch": 5.960502692998205, + "grad_norm": 1.0688945055007935, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 83000 + }, + { + "epoch": 5.961220825852783, + "grad_norm": 1.048864483833313, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 83010 + }, + { + "epoch": 5.961938958707361, + "grad_norm": 1.057308316230774, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83020 + }, + { + "epoch": 5.962657091561939, + "grad_norm": 0.9014604687690735, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 83030 + }, + { + "epoch": 5.963375224416517, + "grad_norm": 0.9899709224700928, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 83040 + }, + { + "epoch": 5.9640933572710955, + "grad_norm": 1.0675519704818726, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 83050 + }, + { + "epoch": 5.9648114901256735, + "grad_norm": 0.9497889876365662, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 83060 + }, + { + "epoch": 5.9655296229802515, + "grad_norm": 0.9149549603462219, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 83070 + }, + { + "epoch": 5.9662477558348295, + "grad_norm": 1.329373836517334, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 83080 + }, + { + "epoch": 5.9669658886894075, + "grad_norm": 1.0731712579727173, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 83090 + }, + { + "epoch": 5.9676840215439855, + "grad_norm": 0.9498835802078247, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 83100 + }, + { + "epoch": 5.9684021543985635, + "grad_norm": 1.1222829818725586, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 83110 + }, + { + "epoch": 5.9691202872531415, + "grad_norm": 0.9923429489135742, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 83120 + }, + { + "epoch": 5.96983842010772, + "grad_norm": 0.9046645164489746, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 83130 + }, + { + "epoch": 5.970556552962298, + "grad_norm": 0.9259500503540039, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 83140 + }, + { + "epoch": 5.971274685816876, + "grad_norm": 1.0604174137115479, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 83150 + }, + { + "epoch": 5.971992818671454, + "grad_norm": 1.0391676425933838, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 83160 + }, + { + "epoch": 5.972710951526032, + "grad_norm": 0.8825796246528625, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 83170 + }, + { + "epoch": 5.97342908438061, + "grad_norm": 0.9687952399253845, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 83180 + }, + { + "epoch": 5.974147217235188, + "grad_norm": 0.9401392340660095, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 83190 + }, + { + "epoch": 5.974865350089766, + "grad_norm": 1.0526834726333618, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 83200 + }, + { + "epoch": 5.975583482944344, + "grad_norm": 1.1882060766220093, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 83210 + }, + { + "epoch": 5.976301615798922, + "grad_norm": 0.9182824492454529, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 83220 + }, + { + "epoch": 5.977019748653501, + "grad_norm": 1.344875454902649, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 83230 + }, + { + "epoch": 5.977737881508079, + "grad_norm": 1.3868434429168701, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 83240 + }, + { + "epoch": 5.978456014362657, + "grad_norm": 1.2702280282974243, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 83250 + }, + { + "epoch": 5.979174147217235, + "grad_norm": 0.9808234572410583, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 83260 + }, + { + "epoch": 5.979892280071813, + "grad_norm": 0.9225142598152161, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 83270 + }, + { + "epoch": 5.980610412926391, + "grad_norm": 1.1095874309539795, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 83280 + }, + { + "epoch": 5.981328545780969, + "grad_norm": 1.2650344371795654, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 83290 + }, + { + "epoch": 5.982046678635547, + "grad_norm": 0.8230084180831909, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 83300 + }, + { + "epoch": 5.982764811490125, + "grad_norm": 1.171427607536316, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 83310 + }, + { + "epoch": 5.983482944344704, + "grad_norm": 0.7458868026733398, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 83320 + }, + { + "epoch": 5.984201077199282, + "grad_norm": 0.9238616228103638, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 83330 + }, + { + "epoch": 5.98491921005386, + "grad_norm": 1.027495265007019, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 83340 + }, + { + "epoch": 5.985637342908438, + "grad_norm": 1.0694037675857544, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 83350 + }, + { + "epoch": 5.986355475763016, + "grad_norm": 0.9498767256736755, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 83360 + }, + { + "epoch": 5.987073608617594, + "grad_norm": 1.0524284839630127, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 83370 + }, + { + "epoch": 5.987791741472172, + "grad_norm": 1.07961905002594, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 83380 + }, + { + "epoch": 5.98850987432675, + "grad_norm": 1.1436965465545654, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83390 + }, + { + "epoch": 5.989228007181328, + "grad_norm": 1.2610782384872437, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83400 + }, + { + "epoch": 5.989946140035906, + "grad_norm": 1.1105682849884033, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 83410 + }, + { + "epoch": 5.990664272890485, + "grad_norm": 0.9900349378585815, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 83420 + }, + { + "epoch": 5.991382405745063, + "grad_norm": 0.8766723275184631, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 83430 + }, + { + "epoch": 5.992100538599641, + "grad_norm": 0.9532597661018372, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 83440 + }, + { + "epoch": 5.992818671454219, + "grad_norm": 1.016831398010254, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 83450 + }, + { + "epoch": 5.993536804308797, + "grad_norm": 0.9884716272354126, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 83460 + }, + { + "epoch": 5.994254937163375, + "grad_norm": 0.9415417909622192, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83470 + }, + { + "epoch": 5.994973070017953, + "grad_norm": 0.8629752397537231, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 83480 + }, + { + "epoch": 5.995691202872531, + "grad_norm": 1.061378002166748, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 83490 + }, + { + "epoch": 5.99640933572711, + "grad_norm": 0.907195508480072, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 83500 + }, + { + "epoch": 5.997127468581688, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 83510 + }, + { + "epoch": 5.997845601436266, + "grad_norm": 0.9893278479576111, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 83520 + }, + { + "epoch": 5.998563734290844, + "grad_norm": 1.1909127235412598, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 83530 + }, + { + "epoch": 5.999281867145422, + "grad_norm": 1.1800892353057861, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 83540 + }, + { + "epoch": 6.0, + "grad_norm": 1.0822563171386719, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 83550 + }, + { + "epoch": 6.0, + "eval_loss": 1.1494214534759521, + "eval_runtime": 55.1809, + "eval_samples_per_second": 13.284, + "eval_steps_per_second": 1.667, + "step": 83550 + }, + { + "epoch": 6.000718132854578, + "grad_norm": 0.8760911226272583, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 83560 + }, + { + "epoch": 6.001436265709156, + "grad_norm": 1.0037305355072021, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 83570 + }, + { + "epoch": 6.002154398563734, + "grad_norm": 1.0550320148468018, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 83580 + }, + { + "epoch": 6.002872531418312, + "grad_norm": 0.7841113805770874, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 83590 + }, + { + "epoch": 6.003590664272891, + "grad_norm": 1.1221094131469727, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 83600 + }, + { + "epoch": 6.004308797127469, + "grad_norm": 1.174143671989441, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 83610 + }, + { + "epoch": 6.005026929982047, + "grad_norm": 1.1316391229629517, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 83620 + }, + { + "epoch": 6.005745062836625, + "grad_norm": 0.9318140745162964, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 83630 + }, + { + "epoch": 6.006463195691203, + "grad_norm": 1.1589723825454712, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 83640 + }, + { + "epoch": 6.007181328545781, + "grad_norm": 0.7452214360237122, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 83650 + }, + { + "epoch": 6.007899461400359, + "grad_norm": 1.205767035484314, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 83660 + }, + { + "epoch": 6.008617594254937, + "grad_norm": 0.8741596341133118, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 83670 + }, + { + "epoch": 6.009335727109515, + "grad_norm": 1.152982234954834, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 83680 + }, + { + "epoch": 6.010053859964093, + "grad_norm": 1.2438874244689941, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 83690 + }, + { + "epoch": 6.010771992818672, + "grad_norm": 1.142795443534851, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 83700 + }, + { + "epoch": 6.01149012567325, + "grad_norm": 1.1999919414520264, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 83710 + }, + { + "epoch": 6.012208258527828, + "grad_norm": 1.1839698553085327, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 83720 + }, + { + "epoch": 6.012926391382406, + "grad_norm": 1.1131623983383179, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 83730 + }, + { + "epoch": 6.013644524236984, + "grad_norm": 0.8436203598976135, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 83740 + }, + { + "epoch": 6.014362657091562, + "grad_norm": 0.9938826560974121, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 83750 + }, + { + "epoch": 6.01508078994614, + "grad_norm": 1.1624900102615356, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 83760 + }, + { + "epoch": 6.015798922800718, + "grad_norm": 1.0212476253509521, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 83770 + }, + { + "epoch": 6.016517055655296, + "grad_norm": 0.8108501434326172, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 83780 + }, + { + "epoch": 6.017235188509875, + "grad_norm": 1.3106935024261475, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 83790 + }, + { + "epoch": 6.017953321364453, + "grad_norm": 1.3103147745132446, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 83800 + }, + { + "epoch": 6.018671454219031, + "grad_norm": 0.7501855492591858, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 83810 + }, + { + "epoch": 6.019389587073609, + "grad_norm": 0.9246482253074646, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 83820 + }, + { + "epoch": 6.020107719928187, + "grad_norm": 1.0305052995681763, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 83830 + }, + { + "epoch": 6.020825852782765, + "grad_norm": 1.0912569761276245, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 83840 + }, + { + "epoch": 6.021543985637343, + "grad_norm": 0.9320057034492493, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 83850 + }, + { + "epoch": 6.022262118491921, + "grad_norm": 1.160483479499817, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 83860 + }, + { + "epoch": 6.022980251346499, + "grad_norm": 1.0211237668991089, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 83870 + }, + { + "epoch": 6.023698384201078, + "grad_norm": 0.8101710081100464, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 83880 + }, + { + "epoch": 6.024416517055656, + "grad_norm": 1.0671406984329224, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 83890 + }, + { + "epoch": 6.025134649910234, + "grad_norm": 1.3084125518798828, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 83900 + }, + { + "epoch": 6.025852782764812, + "grad_norm": 1.0144813060760498, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 83910 + }, + { + "epoch": 6.02657091561939, + "grad_norm": 1.134848952293396, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 83920 + }, + { + "epoch": 6.027289048473968, + "grad_norm": 1.183115005493164, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 83930 + }, + { + "epoch": 6.028007181328546, + "grad_norm": 0.961912989616394, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 83940 + }, + { + "epoch": 6.028725314183124, + "grad_norm": 0.9033881425857544, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 83950 + }, + { + "epoch": 6.029443447037702, + "grad_norm": 1.0272901058197021, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 83960 + }, + { + "epoch": 6.03016157989228, + "grad_norm": 1.0007939338684082, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 83970 + }, + { + "epoch": 6.0308797127468585, + "grad_norm": 1.0941389799118042, + "learning_rate": 0.0002, + "loss": 0.5215, + "step": 83980 + }, + { + "epoch": 6.0315978456014365, + "grad_norm": 0.9068517088890076, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 83990 + }, + { + "epoch": 6.0323159784560145, + "grad_norm": 0.8636500835418701, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 84000 + }, + { + "epoch": 6.0330341113105925, + "grad_norm": 1.352675437927246, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 84010 + }, + { + "epoch": 6.0337522441651705, + "grad_norm": 1.0889637470245361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 84020 + }, + { + "epoch": 6.0344703770197485, + "grad_norm": 0.9063141345977783, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 84030 + }, + { + "epoch": 6.0351885098743265, + "grad_norm": 1.317254900932312, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 84040 + }, + { + "epoch": 6.0359066427289045, + "grad_norm": 1.1001603603363037, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 84050 + }, + { + "epoch": 6.0366247755834825, + "grad_norm": 0.8041839003562927, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 84060 + }, + { + "epoch": 6.037342908438061, + "grad_norm": 1.125082015991211, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 84070 + }, + { + "epoch": 6.038061041292639, + "grad_norm": 0.8926277160644531, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 84080 + }, + { + "epoch": 6.038779174147217, + "grad_norm": 1.0548304319381714, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 84090 + }, + { + "epoch": 6.039497307001795, + "grad_norm": 1.2299435138702393, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 84100 + }, + { + "epoch": 6.040215439856373, + "grad_norm": 0.7348281741142273, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 84110 + }, + { + "epoch": 6.040933572710951, + "grad_norm": 1.032209873199463, + "learning_rate": 0.0002, + "loss": 0.5598, + "step": 84120 + }, + { + "epoch": 6.041651705565529, + "grad_norm": 0.925134003162384, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 84130 + }, + { + "epoch": 6.042369838420107, + "grad_norm": 1.1078300476074219, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 84140 + }, + { + "epoch": 6.043087971274685, + "grad_norm": 0.9045702815055847, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 84150 + }, + { + "epoch": 6.043806104129264, + "grad_norm": 0.8836823105812073, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 84160 + }, + { + "epoch": 6.044524236983842, + "grad_norm": 0.8083572387695312, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 84170 + }, + { + "epoch": 6.04524236983842, + "grad_norm": 0.8744190335273743, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 84180 + }, + { + "epoch": 6.045960502692998, + "grad_norm": 1.1944562196731567, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 84190 + }, + { + "epoch": 6.046678635547576, + "grad_norm": 1.3782621622085571, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 84200 + }, + { + "epoch": 6.047396768402154, + "grad_norm": 1.2800641059875488, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 84210 + }, + { + "epoch": 6.048114901256732, + "grad_norm": 1.1035456657409668, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 84220 + }, + { + "epoch": 6.04883303411131, + "grad_norm": 1.243274211883545, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 84230 + }, + { + "epoch": 6.049551166965888, + "grad_norm": 0.8821795582771301, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 84240 + }, + { + "epoch": 6.050269299820466, + "grad_norm": 0.8730825185775757, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 84250 + }, + { + "epoch": 6.050987432675045, + "grad_norm": 0.9874304533004761, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 84260 + }, + { + "epoch": 6.051705565529623, + "grad_norm": 1.3245618343353271, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 84270 + }, + { + "epoch": 6.052423698384201, + "grad_norm": 1.04741370677948, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 84280 + }, + { + "epoch": 6.053141831238779, + "grad_norm": 1.1984949111938477, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 84290 + }, + { + "epoch": 6.053859964093357, + "grad_norm": 0.9603039622306824, + "learning_rate": 0.0002, + "loss": 0.5148, + "step": 84300 + }, + { + "epoch": 6.054578096947935, + "grad_norm": 1.178102731704712, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 84310 + }, + { + "epoch": 6.055296229802513, + "grad_norm": 1.135046124458313, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84320 + }, + { + "epoch": 6.056014362657091, + "grad_norm": 0.9682887196540833, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84330 + }, + { + "epoch": 6.056732495511669, + "grad_norm": 0.9676550030708313, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 84340 + }, + { + "epoch": 6.057450628366248, + "grad_norm": 1.0987977981567383, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 84350 + }, + { + "epoch": 6.058168761220826, + "grad_norm": 0.9808574914932251, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 84360 + }, + { + "epoch": 6.058886894075404, + "grad_norm": 1.0585200786590576, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 84370 + }, + { + "epoch": 6.059605026929982, + "grad_norm": 0.9592017531394958, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 84380 + }, + { + "epoch": 6.06032315978456, + "grad_norm": 0.9652285575866699, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 84390 + }, + { + "epoch": 6.061041292639138, + "grad_norm": 1.1223928928375244, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 84400 + }, + { + "epoch": 6.061759425493716, + "grad_norm": 1.0554455518722534, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 84410 + }, + { + "epoch": 6.062477558348294, + "grad_norm": 1.4566363096237183, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 84420 + }, + { + "epoch": 6.063195691202872, + "grad_norm": 1.0793368816375732, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 84430 + }, + { + "epoch": 6.063913824057451, + "grad_norm": 1.1032981872558594, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 84440 + }, + { + "epoch": 6.064631956912029, + "grad_norm": 1.0701037645339966, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 84450 + }, + { + "epoch": 6.065350089766607, + "grad_norm": 0.9359426498413086, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 84460 + }, + { + "epoch": 6.066068222621185, + "grad_norm": 1.0277773141860962, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 84470 + }, + { + "epoch": 6.066786355475763, + "grad_norm": 1.029319405555725, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 84480 + }, + { + "epoch": 6.067504488330341, + "grad_norm": 1.3563756942749023, + "learning_rate": 0.0002, + "loss": 0.4949, + "step": 84490 + }, + { + "epoch": 6.068222621184919, + "grad_norm": 0.9577816128730774, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 84500 + }, + { + "epoch": 6.068940754039497, + "grad_norm": 0.9856799840927124, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 84510 + }, + { + "epoch": 6.069658886894075, + "grad_norm": 1.3285183906555176, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 84520 + }, + { + "epoch": 6.070377019748653, + "grad_norm": 1.0407335758209229, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84530 + }, + { + "epoch": 6.071095152603232, + "grad_norm": 1.3125360012054443, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 84540 + }, + { + "epoch": 6.07181328545781, + "grad_norm": 1.0198888778686523, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 84550 + }, + { + "epoch": 6.072531418312388, + "grad_norm": 1.198135256767273, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 84560 + }, + { + "epoch": 6.073249551166966, + "grad_norm": 1.1547776460647583, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 84570 + }, + { + "epoch": 6.073967684021544, + "grad_norm": 1.1667766571044922, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 84580 + }, + { + "epoch": 6.074685816876122, + "grad_norm": 0.945159375667572, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 84590 + }, + { + "epoch": 6.0754039497307, + "grad_norm": 1.0362721681594849, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 84600 + }, + { + "epoch": 6.076122082585278, + "grad_norm": 1.1442973613739014, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 84610 + }, + { + "epoch": 6.076840215439856, + "grad_norm": 1.2077388763427734, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 84620 + }, + { + "epoch": 6.077558348294435, + "grad_norm": 1.1404398679733276, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 84630 + }, + { + "epoch": 6.078276481149013, + "grad_norm": 1.0291249752044678, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 84640 + }, + { + "epoch": 6.078994614003591, + "grad_norm": 1.2045460939407349, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 84650 + }, + { + "epoch": 6.079712746858169, + "grad_norm": 0.9492267966270447, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 84660 + }, + { + "epoch": 6.080430879712747, + "grad_norm": 0.9108620285987854, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 84670 + }, + { + "epoch": 6.081149012567325, + "grad_norm": 1.0403251647949219, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84680 + }, + { + "epoch": 6.081867145421903, + "grad_norm": 0.8537648916244507, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 84690 + }, + { + "epoch": 6.082585278276481, + "grad_norm": 0.8450568914413452, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 84700 + }, + { + "epoch": 6.083303411131059, + "grad_norm": 0.9770439267158508, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 84710 + }, + { + "epoch": 6.084021543985638, + "grad_norm": 0.7480165958404541, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 84720 + }, + { + "epoch": 6.084739676840216, + "grad_norm": 1.0038665533065796, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 84730 + }, + { + "epoch": 6.085457809694794, + "grad_norm": 1.2631266117095947, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 84740 + }, + { + "epoch": 6.086175942549372, + "grad_norm": 1.0285290479660034, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 84750 + }, + { + "epoch": 6.08689407540395, + "grad_norm": 0.8775458335876465, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 84760 + }, + { + "epoch": 6.087612208258528, + "grad_norm": 1.105391263961792, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 84770 + }, + { + "epoch": 6.088330341113106, + "grad_norm": 0.9214589595794678, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 84780 + }, + { + "epoch": 6.089048473967684, + "grad_norm": 1.1920515298843384, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 84790 + }, + { + "epoch": 6.089766606822262, + "grad_norm": 1.0314369201660156, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 84800 + }, + { + "epoch": 6.09048473967684, + "grad_norm": 1.1323022842407227, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 84810 + }, + { + "epoch": 6.091202872531419, + "grad_norm": 0.9882907271385193, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84820 + }, + { + "epoch": 6.091921005385997, + "grad_norm": 0.9372309446334839, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 84830 + }, + { + "epoch": 6.092639138240575, + "grad_norm": 0.9904384016990662, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 84840 + }, + { + "epoch": 6.093357271095153, + "grad_norm": 1.1983239650726318, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 84850 + }, + { + "epoch": 6.094075403949731, + "grad_norm": 1.0157414674758911, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 84860 + }, + { + "epoch": 6.094793536804309, + "grad_norm": 1.1213963031768799, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 84870 + }, + { + "epoch": 6.095511669658887, + "grad_norm": 0.9863889813423157, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 84880 + }, + { + "epoch": 6.096229802513465, + "grad_norm": 1.2265585660934448, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 84890 + }, + { + "epoch": 6.096947935368043, + "grad_norm": 0.9000206589698792, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 84900 + }, + { + "epoch": 6.097666068222622, + "grad_norm": 0.9284350872039795, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 84910 + }, + { + "epoch": 6.0983842010772, + "grad_norm": 0.8180069923400879, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 84920 + }, + { + "epoch": 6.099102333931778, + "grad_norm": 1.0313721895217896, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 84930 + }, + { + "epoch": 6.099820466786356, + "grad_norm": 0.9959180355072021, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 84940 + }, + { + "epoch": 6.100538599640934, + "grad_norm": 1.1720712184906006, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84950 + }, + { + "epoch": 6.101256732495512, + "grad_norm": 1.1033729314804077, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 84960 + }, + { + "epoch": 6.10197486535009, + "grad_norm": 1.2325657606124878, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 84970 + }, + { + "epoch": 6.102692998204668, + "grad_norm": 1.204935073852539, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 84980 + }, + { + "epoch": 6.103411131059246, + "grad_norm": 0.9543479084968567, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 84990 + }, + { + "epoch": 6.1041292639138245, + "grad_norm": 1.0036866664886475, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 85000 + }, + { + "epoch": 6.1048473967684025, + "grad_norm": 1.0862882137298584, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 85010 + }, + { + "epoch": 6.1055655296229805, + "grad_norm": 1.052764892578125, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 85020 + }, + { + "epoch": 6.1062836624775585, + "grad_norm": 1.1948769092559814, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 85030 + }, + { + "epoch": 6.1070017953321365, + "grad_norm": 1.0291588306427002, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 85040 + }, + { + "epoch": 6.1077199281867145, + "grad_norm": 1.2162322998046875, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 85050 + }, + { + "epoch": 6.1084380610412925, + "grad_norm": 1.2867375612258911, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 85060 + }, + { + "epoch": 6.1091561938958705, + "grad_norm": 0.9639427661895752, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 85070 + }, + { + "epoch": 6.1098743267504485, + "grad_norm": 1.0775039196014404, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 85080 + }, + { + "epoch": 6.1105924596050265, + "grad_norm": 1.0423188209533691, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 85090 + }, + { + "epoch": 6.111310592459605, + "grad_norm": 0.9388473033905029, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 85100 + }, + { + "epoch": 6.112028725314183, + "grad_norm": 1.0761773586273193, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 85110 + }, + { + "epoch": 6.112746858168761, + "grad_norm": 1.0886104106903076, + "learning_rate": 0.0002, + "loss": 0.5144, + "step": 85120 + }, + { + "epoch": 6.113464991023339, + "grad_norm": 0.8716141581535339, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 85130 + }, + { + "epoch": 6.114183123877917, + "grad_norm": 1.5060595273971558, + "learning_rate": 0.0002, + "loss": 0.5598, + "step": 85140 + }, + { + "epoch": 6.114901256732495, + "grad_norm": 1.2417129278182983, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 85150 + }, + { + "epoch": 6.115619389587073, + "grad_norm": 1.063604712486267, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 85160 + }, + { + "epoch": 6.116337522441651, + "grad_norm": 1.1341352462768555, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 85170 + }, + { + "epoch": 6.117055655296229, + "grad_norm": 1.011865258216858, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 85180 + }, + { + "epoch": 6.117773788150808, + "grad_norm": 1.0746972560882568, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 85190 + }, + { + "epoch": 6.118491921005386, + "grad_norm": 0.9522349238395691, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 85200 + }, + { + "epoch": 6.119210053859964, + "grad_norm": 1.091785192489624, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 85210 + }, + { + "epoch": 6.119928186714542, + "grad_norm": 1.1013420820236206, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 85220 + }, + { + "epoch": 6.12064631956912, + "grad_norm": 0.9477053880691528, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 85230 + }, + { + "epoch": 6.121364452423698, + "grad_norm": 1.1278045177459717, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 85240 + }, + { + "epoch": 6.122082585278276, + "grad_norm": 1.0343154668807983, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 85250 + }, + { + "epoch": 6.122800718132854, + "grad_norm": 0.9023236036300659, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 85260 + }, + { + "epoch": 6.123518850987432, + "grad_norm": 1.1085705757141113, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 85270 + }, + { + "epoch": 6.124236983842011, + "grad_norm": 1.2945729494094849, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 85280 + }, + { + "epoch": 6.124955116696589, + "grad_norm": 1.0367915630340576, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 85290 + }, + { + "epoch": 6.125673249551167, + "grad_norm": 0.9990636706352234, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 85300 + }, + { + "epoch": 6.126391382405745, + "grad_norm": 0.9737518429756165, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 85310 + }, + { + "epoch": 6.127109515260323, + "grad_norm": 1.0211181640625, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 85320 + }, + { + "epoch": 6.127827648114901, + "grad_norm": 0.9609670042991638, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 85330 + }, + { + "epoch": 6.128545780969479, + "grad_norm": 1.124629259109497, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 85340 + }, + { + "epoch": 6.129263913824057, + "grad_norm": 0.9436500072479248, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 85350 + }, + { + "epoch": 6.129982046678635, + "grad_norm": 1.3075382709503174, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 85360 + }, + { + "epoch": 6.130700179533213, + "grad_norm": 0.9185589551925659, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 85370 + }, + { + "epoch": 6.131418312387792, + "grad_norm": 1.1051443815231323, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 85380 + }, + { + "epoch": 6.13213644524237, + "grad_norm": 1.185263752937317, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 85390 + }, + { + "epoch": 6.132854578096948, + "grad_norm": 1.0959895849227905, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 85400 + }, + { + "epoch": 6.133572710951526, + "grad_norm": 0.9279834032058716, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 85410 + }, + { + "epoch": 6.134290843806104, + "grad_norm": 1.36788010597229, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 85420 + }, + { + "epoch": 6.135008976660682, + "grad_norm": 1.0156842470169067, + "learning_rate": 0.0002, + "loss": 0.5122, + "step": 85430 + }, + { + "epoch": 6.13572710951526, + "grad_norm": 0.9998385906219482, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 85440 + }, + { + "epoch": 6.136445242369838, + "grad_norm": 1.21120285987854, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 85450 + }, + { + "epoch": 6.137163375224416, + "grad_norm": 1.1198976039886475, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 85460 + }, + { + "epoch": 6.137881508078995, + "grad_norm": 0.8551197648048401, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 85470 + }, + { + "epoch": 6.138599640933573, + "grad_norm": 1.378423810005188, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 85480 + }, + { + "epoch": 6.139317773788151, + "grad_norm": 1.0602139234542847, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 85490 + }, + { + "epoch": 6.140035906642729, + "grad_norm": 0.9416277408599854, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 85500 + }, + { + "epoch": 6.140754039497307, + "grad_norm": 0.9356902241706848, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 85510 + }, + { + "epoch": 6.141472172351885, + "grad_norm": 1.1635851860046387, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 85520 + }, + { + "epoch": 6.142190305206463, + "grad_norm": 0.7880265712738037, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 85530 + }, + { + "epoch": 6.142908438061041, + "grad_norm": 1.0618375539779663, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 85540 + }, + { + "epoch": 6.143626570915619, + "grad_norm": 0.8438394665718079, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 85550 + }, + { + "epoch": 6.144344703770198, + "grad_norm": 1.0630128383636475, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 85560 + }, + { + "epoch": 6.145062836624776, + "grad_norm": 1.027308464050293, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 85570 + }, + { + "epoch": 6.145780969479354, + "grad_norm": 1.0832568407058716, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 85580 + }, + { + "epoch": 6.146499102333932, + "grad_norm": 0.9134858250617981, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 85590 + }, + { + "epoch": 6.14721723518851, + "grad_norm": 1.2738041877746582, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 85600 + }, + { + "epoch": 6.147935368043088, + "grad_norm": 0.9961518049240112, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 85610 + }, + { + "epoch": 6.148653500897666, + "grad_norm": 0.8851816654205322, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 85620 + }, + { + "epoch": 6.149371633752244, + "grad_norm": 0.96479731798172, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 85630 + }, + { + "epoch": 6.150089766606822, + "grad_norm": 0.903256893157959, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 85640 + }, + { + "epoch": 6.1508078994614, + "grad_norm": 1.065151333808899, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 85650 + }, + { + "epoch": 6.151526032315979, + "grad_norm": 0.9824285507202148, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 85660 + }, + { + "epoch": 6.152244165170557, + "grad_norm": 1.1620386838912964, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 85670 + }, + { + "epoch": 6.152962298025135, + "grad_norm": 1.134757161140442, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 85680 + }, + { + "epoch": 6.153680430879713, + "grad_norm": 1.165537714958191, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 85690 + }, + { + "epoch": 6.154398563734291, + "grad_norm": 0.9486454129219055, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 85700 + }, + { + "epoch": 6.155116696588869, + "grad_norm": 0.9379110932350159, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 85710 + }, + { + "epoch": 6.155834829443447, + "grad_norm": 1.0051493644714355, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 85720 + }, + { + "epoch": 6.156552962298025, + "grad_norm": 0.9311991333961487, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 85730 + }, + { + "epoch": 6.157271095152603, + "grad_norm": 1.2071181535720825, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 85740 + }, + { + "epoch": 6.157989228007182, + "grad_norm": 1.2609243392944336, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 85750 + }, + { + "epoch": 6.15870736086176, + "grad_norm": 1.0485966205596924, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 85760 + }, + { + "epoch": 6.159425493716338, + "grad_norm": 0.9949250817298889, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 85770 + }, + { + "epoch": 6.160143626570916, + "grad_norm": 0.8191118836402893, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 85780 + }, + { + "epoch": 6.160861759425494, + "grad_norm": 0.96427983045578, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 85790 + }, + { + "epoch": 6.161579892280072, + "grad_norm": 1.0336496829986572, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 85800 + }, + { + "epoch": 6.16229802513465, + "grad_norm": 1.0699222087860107, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 85810 + }, + { + "epoch": 6.163016157989228, + "grad_norm": 1.2340054512023926, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 85820 + }, + { + "epoch": 6.163734290843806, + "grad_norm": 0.981848955154419, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 85830 + }, + { + "epoch": 6.164452423698384, + "grad_norm": 1.2059850692749023, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 85840 + }, + { + "epoch": 6.165170556552963, + "grad_norm": 1.0239924192428589, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 85850 + }, + { + "epoch": 6.165888689407541, + "grad_norm": 0.8601624369621277, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 85860 + }, + { + "epoch": 6.166606822262119, + "grad_norm": 1.1900125741958618, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 85870 + }, + { + "epoch": 6.167324955116697, + "grad_norm": 0.9747354388237, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 85880 + }, + { + "epoch": 6.168043087971275, + "grad_norm": 1.1277778148651123, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 85890 + }, + { + "epoch": 6.168761220825853, + "grad_norm": 1.1270111799240112, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 85900 + }, + { + "epoch": 6.169479353680431, + "grad_norm": 1.1610701084136963, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 85910 + }, + { + "epoch": 6.170197486535009, + "grad_norm": 0.873607873916626, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 85920 + }, + { + "epoch": 6.170915619389587, + "grad_norm": 1.040145993232727, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 85930 + }, + { + "epoch": 6.1716337522441655, + "grad_norm": 1.0139122009277344, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 85940 + }, + { + "epoch": 6.1723518850987436, + "grad_norm": 1.0575451850891113, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 85950 + }, + { + "epoch": 6.1730700179533216, + "grad_norm": 1.100884199142456, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 85960 + }, + { + "epoch": 6.1737881508078996, + "grad_norm": 1.1741244792938232, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 85970 + }, + { + "epoch": 6.174506283662478, + "grad_norm": 0.9446555376052856, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 85980 + }, + { + "epoch": 6.175224416517056, + "grad_norm": 0.9297952055931091, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 85990 + }, + { + "epoch": 6.175942549371634, + "grad_norm": 1.196361780166626, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 86000 + }, + { + "epoch": 6.176660682226212, + "grad_norm": 1.0719913244247437, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 86010 + }, + { + "epoch": 6.17737881508079, + "grad_norm": 1.0942085981369019, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 86020 + }, + { + "epoch": 6.1780969479353685, + "grad_norm": 0.8989787697792053, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 86030 + }, + { + "epoch": 6.1788150807899465, + "grad_norm": 1.071344017982483, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 86040 + }, + { + "epoch": 6.1795332136445245, + "grad_norm": 0.9686782360076904, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 86050 + }, + { + "epoch": 6.1802513464991025, + "grad_norm": 1.0769884586334229, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 86060 + }, + { + "epoch": 6.1809694793536805, + "grad_norm": 0.9761241674423218, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 86070 + }, + { + "epoch": 6.1816876122082585, + "grad_norm": 1.0531808137893677, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 86080 + }, + { + "epoch": 6.1824057450628365, + "grad_norm": 1.0523570775985718, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 86090 + }, + { + "epoch": 6.1831238779174145, + "grad_norm": 1.2155946493148804, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 86100 + }, + { + "epoch": 6.1838420107719925, + "grad_norm": 1.1012920141220093, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 86110 + }, + { + "epoch": 6.184560143626571, + "grad_norm": 0.8764983415603638, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 86120 + }, + { + "epoch": 6.185278276481149, + "grad_norm": 0.950320303440094, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 86130 + }, + { + "epoch": 6.185996409335727, + "grad_norm": 1.1183594465255737, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 86140 + }, + { + "epoch": 6.186714542190305, + "grad_norm": 1.1919164657592773, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 86150 + }, + { + "epoch": 6.187432675044883, + "grad_norm": 1.1478904485702515, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 86160 + }, + { + "epoch": 6.188150807899461, + "grad_norm": 1.0764135122299194, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 86170 + }, + { + "epoch": 6.188868940754039, + "grad_norm": 1.195090889930725, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 86180 + }, + { + "epoch": 6.189587073608617, + "grad_norm": 1.089442253112793, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 86190 + }, + { + "epoch": 6.190305206463195, + "grad_norm": 0.9705546498298645, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 86200 + }, + { + "epoch": 6.191023339317773, + "grad_norm": 1.164642333984375, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 86210 + }, + { + "epoch": 6.191741472172352, + "grad_norm": 0.9551387429237366, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 86220 + }, + { + "epoch": 6.19245960502693, + "grad_norm": 1.0483227968215942, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 86230 + }, + { + "epoch": 6.193177737881508, + "grad_norm": 1.0068920850753784, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 86240 + }, + { + "epoch": 6.193895870736086, + "grad_norm": 1.142656683921814, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 86250 + }, + { + "epoch": 6.194614003590664, + "grad_norm": 1.1186467409133911, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 86260 + }, + { + "epoch": 6.195332136445242, + "grad_norm": 1.1664706468582153, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 86270 + }, + { + "epoch": 6.19605026929982, + "grad_norm": 1.2658511400222778, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 86280 + }, + { + "epoch": 6.196768402154398, + "grad_norm": 1.122759222984314, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 86290 + }, + { + "epoch": 6.197486535008976, + "grad_norm": 1.1611319780349731, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 86300 + }, + { + "epoch": 6.198204667863555, + "grad_norm": 1.0476176738739014, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 86310 + }, + { + "epoch": 6.198922800718133, + "grad_norm": 1.2284801006317139, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 86320 + }, + { + "epoch": 6.199640933572711, + "grad_norm": 1.1340757608413696, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 86330 + }, + { + "epoch": 6.200359066427289, + "grad_norm": 1.045088768005371, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 86340 + }, + { + "epoch": 6.201077199281867, + "grad_norm": 1.1200770139694214, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 86350 + }, + { + "epoch": 6.201795332136445, + "grad_norm": 1.1879554986953735, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 86360 + }, + { + "epoch": 6.202513464991023, + "grad_norm": 1.1146271228790283, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 86370 + }, + { + "epoch": 6.203231597845601, + "grad_norm": 0.8934822678565979, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 86380 + }, + { + "epoch": 6.203949730700179, + "grad_norm": 1.21973717212677, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 86390 + }, + { + "epoch": 6.204667863554757, + "grad_norm": 0.9424970746040344, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 86400 + }, + { + "epoch": 6.205385996409336, + "grad_norm": 1.0036219358444214, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 86410 + }, + { + "epoch": 6.206104129263914, + "grad_norm": 0.9319575428962708, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 86420 + }, + { + "epoch": 6.206822262118492, + "grad_norm": 1.0548789501190186, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 86430 + }, + { + "epoch": 6.20754039497307, + "grad_norm": 0.9361019730567932, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 86440 + }, + { + "epoch": 6.208258527827648, + "grad_norm": 0.9350554347038269, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 86450 + }, + { + "epoch": 6.208976660682226, + "grad_norm": 1.291595458984375, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 86460 + }, + { + "epoch": 6.209694793536804, + "grad_norm": 1.0414642095565796, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 86470 + }, + { + "epoch": 6.210412926391382, + "grad_norm": 1.1983444690704346, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 86480 + }, + { + "epoch": 6.21113105924596, + "grad_norm": 0.9444540739059448, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 86490 + }, + { + "epoch": 6.211849192100539, + "grad_norm": 1.072526216506958, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 86500 + }, + { + "epoch": 6.212567324955117, + "grad_norm": 1.0109381675720215, + "learning_rate": 0.0002, + "loss": 0.5509, + "step": 86510 + }, + { + "epoch": 6.213285457809695, + "grad_norm": 1.1661816835403442, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 86520 + }, + { + "epoch": 6.214003590664273, + "grad_norm": 1.0434976816177368, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 86530 + }, + { + "epoch": 6.214721723518851, + "grad_norm": 1.1290796995162964, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 86540 + }, + { + "epoch": 6.215439856373429, + "grad_norm": 0.746512234210968, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 86550 + }, + { + "epoch": 6.216157989228007, + "grad_norm": 1.0346291065216064, + "learning_rate": 0.0002, + "loss": 0.5412, + "step": 86560 + }, + { + "epoch": 6.216876122082585, + "grad_norm": 1.2428497076034546, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 86570 + }, + { + "epoch": 6.217594254937163, + "grad_norm": 1.0040535926818848, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 86580 + }, + { + "epoch": 6.218312387791742, + "grad_norm": 0.9300616383552551, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 86590 + }, + { + "epoch": 6.21903052064632, + "grad_norm": 1.0006635189056396, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 86600 + }, + { + "epoch": 6.219748653500898, + "grad_norm": 1.1402281522750854, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 86610 + }, + { + "epoch": 6.220466786355476, + "grad_norm": 1.1543347835540771, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 86620 + }, + { + "epoch": 6.221184919210054, + "grad_norm": 1.1074384450912476, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 86630 + }, + { + "epoch": 6.221903052064632, + "grad_norm": 0.9032864570617676, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 86640 + }, + { + "epoch": 6.22262118491921, + "grad_norm": 1.094516396522522, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 86650 + }, + { + "epoch": 6.223339317773788, + "grad_norm": 1.2248685359954834, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 86660 + }, + { + "epoch": 6.224057450628366, + "grad_norm": 1.0211371183395386, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 86670 + }, + { + "epoch": 6.224775583482945, + "grad_norm": 1.0956611633300781, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 86680 + }, + { + "epoch": 6.225493716337523, + "grad_norm": 1.1494320631027222, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 86690 + }, + { + "epoch": 6.226211849192101, + "grad_norm": 0.968108594417572, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 86700 + }, + { + "epoch": 6.226929982046679, + "grad_norm": 1.376665711402893, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 86710 + }, + { + "epoch": 6.227648114901257, + "grad_norm": 1.2121574878692627, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 86720 + }, + { + "epoch": 6.228366247755835, + "grad_norm": 1.001272439956665, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 86730 + }, + { + "epoch": 6.229084380610413, + "grad_norm": 0.9023162722587585, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 86740 + }, + { + "epoch": 6.229802513464991, + "grad_norm": 1.2660632133483887, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 86750 + }, + { + "epoch": 6.230520646319569, + "grad_norm": 1.0549668073654175, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 86760 + }, + { + "epoch": 6.231238779174147, + "grad_norm": 1.0364645719528198, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 86770 + }, + { + "epoch": 6.231956912028726, + "grad_norm": 1.2197567224502563, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 86780 + }, + { + "epoch": 6.232675044883304, + "grad_norm": 0.8866947889328003, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 86790 + }, + { + "epoch": 6.233393177737882, + "grad_norm": 1.1795434951782227, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 86800 + }, + { + "epoch": 6.23411131059246, + "grad_norm": 1.0882378816604614, + "learning_rate": 0.0002, + "loss": 0.5309, + "step": 86810 + }, + { + "epoch": 6.234829443447038, + "grad_norm": 1.181888222694397, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 86820 + }, + { + "epoch": 6.235547576301616, + "grad_norm": 1.031209111213684, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 86830 + }, + { + "epoch": 6.236265709156194, + "grad_norm": 1.2889492511749268, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 86840 + }, + { + "epoch": 6.236983842010772, + "grad_norm": 0.874086856842041, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 86850 + }, + { + "epoch": 6.23770197486535, + "grad_norm": 1.1912312507629395, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 86860 + }, + { + "epoch": 6.238420107719929, + "grad_norm": 1.0963071584701538, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 86870 + }, + { + "epoch": 6.239138240574507, + "grad_norm": 1.028746485710144, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 86880 + }, + { + "epoch": 6.239856373429085, + "grad_norm": 1.0736430883407593, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 86890 + }, + { + "epoch": 6.240574506283663, + "grad_norm": 0.9559927582740784, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 86900 + }, + { + "epoch": 6.241292639138241, + "grad_norm": 0.9696667790412903, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 86910 + }, + { + "epoch": 6.242010771992819, + "grad_norm": 1.0710713863372803, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 86920 + }, + { + "epoch": 6.242728904847397, + "grad_norm": 1.0459970235824585, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 86930 + }, + { + "epoch": 6.243447037701975, + "grad_norm": 1.212083339691162, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 86940 + }, + { + "epoch": 6.244165170556553, + "grad_norm": 1.0369303226470947, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 86950 + }, + { + "epoch": 6.244883303411131, + "grad_norm": 1.180519700050354, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 86960 + }, + { + "epoch": 6.2456014362657095, + "grad_norm": 1.0670114755630493, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 86970 + }, + { + "epoch": 6.2463195691202875, + "grad_norm": 1.072209119796753, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 86980 + }, + { + "epoch": 6.2470377019748655, + "grad_norm": 0.9642090201377869, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 86990 + }, + { + "epoch": 6.2477558348294435, + "grad_norm": 1.077467918395996, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 87000 + }, + { + "epoch": 6.2484739676840215, + "grad_norm": 1.1081476211547852, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 87010 + }, + { + "epoch": 6.2491921005385995, + "grad_norm": 0.8815084099769592, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 87020 + }, + { + "epoch": 6.2499102333931775, + "grad_norm": 0.8562555313110352, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 87030 + }, + { + "epoch": 6.2506283662477555, + "grad_norm": 0.8729159235954285, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 87040 + }, + { + "epoch": 6.2513464991023335, + "grad_norm": 1.005082368850708, + "learning_rate": 0.0002, + "loss": 0.5179, + "step": 87050 + }, + { + "epoch": 6.252064631956912, + "grad_norm": 1.3991386890411377, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 87060 + }, + { + "epoch": 6.25278276481149, + "grad_norm": 1.090180516242981, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 87070 + }, + { + "epoch": 6.253500897666068, + "grad_norm": 1.08149254322052, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 87080 + }, + { + "epoch": 6.254219030520646, + "grad_norm": 1.1021103858947754, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 87090 + }, + { + "epoch": 6.254937163375224, + "grad_norm": 1.2393771409988403, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 87100 + }, + { + "epoch": 6.255655296229802, + "grad_norm": 0.9702037572860718, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 87110 + }, + { + "epoch": 6.25637342908438, + "grad_norm": 1.203088641166687, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 87120 + }, + { + "epoch": 6.257091561938958, + "grad_norm": 0.9722330570220947, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 87130 + }, + { + "epoch": 6.257809694793536, + "grad_norm": 0.9802384376525879, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 87140 + }, + { + "epoch": 6.258527827648114, + "grad_norm": 0.9991751909255981, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 87150 + }, + { + "epoch": 6.259245960502693, + "grad_norm": 1.1102324724197388, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 87160 + }, + { + "epoch": 6.259964093357271, + "grad_norm": 1.1357909440994263, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 87170 + }, + { + "epoch": 6.260682226211849, + "grad_norm": 1.1128548383712769, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 87180 + }, + { + "epoch": 6.261400359066427, + "grad_norm": 1.1135061979293823, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 87190 + }, + { + "epoch": 6.262118491921005, + "grad_norm": 0.9545563459396362, + "learning_rate": 0.0002, + "loss": 0.4923, + "step": 87200 + }, + { + "epoch": 6.262836624775583, + "grad_norm": 1.3011159896850586, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 87210 + }, + { + "epoch": 6.263554757630161, + "grad_norm": 1.217691421508789, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 87220 + }, + { + "epoch": 6.264272890484739, + "grad_norm": 0.9615218043327332, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 87230 + }, + { + "epoch": 6.264991023339318, + "grad_norm": 0.9935932159423828, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 87240 + }, + { + "epoch": 6.265709156193896, + "grad_norm": 1.01247239112854, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 87250 + }, + { + "epoch": 6.266427289048474, + "grad_norm": 1.1960358619689941, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 87260 + }, + { + "epoch": 6.267145421903052, + "grad_norm": 1.053942322731018, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 87270 + }, + { + "epoch": 6.26786355475763, + "grad_norm": 1.2450612783432007, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 87280 + }, + { + "epoch": 6.268581687612208, + "grad_norm": 0.7816058397293091, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 87290 + }, + { + "epoch": 6.269299820466786, + "grad_norm": 1.014817237854004, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 87300 + }, + { + "epoch": 6.270017953321364, + "grad_norm": 1.1871070861816406, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 87310 + }, + { + "epoch": 6.270736086175942, + "grad_norm": 1.0170562267303467, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 87320 + }, + { + "epoch": 6.27145421903052, + "grad_norm": 1.216288685798645, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 87330 + }, + { + "epoch": 6.272172351885099, + "grad_norm": 0.8846057653427124, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 87340 + }, + { + "epoch": 6.272890484739677, + "grad_norm": 1.181233286857605, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 87350 + }, + { + "epoch": 6.273608617594255, + "grad_norm": 1.0051873922348022, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 87360 + }, + { + "epoch": 6.274326750448833, + "grad_norm": 1.1179516315460205, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 87370 + }, + { + "epoch": 6.275044883303411, + "grad_norm": 1.0118002891540527, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 87380 + }, + { + "epoch": 6.275763016157989, + "grad_norm": 1.0948026180267334, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 87390 + }, + { + "epoch": 6.276481149012567, + "grad_norm": 1.0836515426635742, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 87400 + }, + { + "epoch": 6.277199281867145, + "grad_norm": 0.9548853039741516, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 87410 + }, + { + "epoch": 6.277917414721723, + "grad_norm": 1.2531564235687256, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 87420 + }, + { + "epoch": 6.278635547576302, + "grad_norm": 1.010250449180603, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 87430 + }, + { + "epoch": 6.27935368043088, + "grad_norm": 1.3306254148483276, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 87440 + }, + { + "epoch": 6.280071813285458, + "grad_norm": 0.9485062956809998, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 87450 + }, + { + "epoch": 6.280789946140036, + "grad_norm": 0.9938563704490662, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 87460 + }, + { + "epoch": 6.281508078994614, + "grad_norm": 1.1747362613677979, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 87470 + }, + { + "epoch": 6.282226211849192, + "grad_norm": 1.1712254285812378, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 87480 + }, + { + "epoch": 6.28294434470377, + "grad_norm": 1.1453865766525269, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 87490 + }, + { + "epoch": 6.283662477558348, + "grad_norm": 0.974902331829071, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 87500 + }, + { + "epoch": 6.284380610412926, + "grad_norm": 1.1181912422180176, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 87510 + }, + { + "epoch": 6.285098743267504, + "grad_norm": 1.047453761100769, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 87520 + }, + { + "epoch": 6.285816876122083, + "grad_norm": 1.185815453529358, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 87530 + }, + { + "epoch": 6.286535008976661, + "grad_norm": 1.1126786470413208, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 87540 + }, + { + "epoch": 6.287253141831239, + "grad_norm": 1.0931676626205444, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 87550 + }, + { + "epoch": 6.287971274685817, + "grad_norm": 0.9930597543716431, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 87560 + }, + { + "epoch": 6.288689407540395, + "grad_norm": 0.9909583926200867, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 87570 + }, + { + "epoch": 6.289407540394973, + "grad_norm": 1.3766822814941406, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 87580 + }, + { + "epoch": 6.290125673249551, + "grad_norm": 1.0137864351272583, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 87590 + }, + { + "epoch": 6.290843806104129, + "grad_norm": 0.8761594295501709, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 87600 + }, + { + "epoch": 6.291561938958707, + "grad_norm": 1.155881404876709, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 87610 + }, + { + "epoch": 6.292280071813286, + "grad_norm": 0.9972963333129883, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 87620 + }, + { + "epoch": 6.292998204667864, + "grad_norm": 1.195021152496338, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 87630 + }, + { + "epoch": 6.293716337522442, + "grad_norm": 0.9872829914093018, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 87640 + }, + { + "epoch": 6.29443447037702, + "grad_norm": 1.3643794059753418, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 87650 + }, + { + "epoch": 6.295152603231598, + "grad_norm": 0.9389668703079224, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 87660 + }, + { + "epoch": 6.295870736086176, + "grad_norm": 1.379319429397583, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 87670 + }, + { + "epoch": 6.296588868940754, + "grad_norm": 1.1253849267959595, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 87680 + }, + { + "epoch": 6.297307001795332, + "grad_norm": 1.2402328252792358, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 87690 + }, + { + "epoch": 6.29802513464991, + "grad_norm": 1.085004210472107, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 87700 + }, + { + "epoch": 6.298743267504488, + "grad_norm": 1.0939021110534668, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 87710 + }, + { + "epoch": 6.299461400359067, + "grad_norm": 1.0350301265716553, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 87720 + }, + { + "epoch": 6.300179533213645, + "grad_norm": 0.9862944483757019, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 87730 + }, + { + "epoch": 6.300897666068223, + "grad_norm": 0.990942656993866, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 87740 + }, + { + "epoch": 6.301615798922801, + "grad_norm": 0.9287887215614319, + "learning_rate": 0.0002, + "loss": 0.4843, + "step": 87750 + }, + { + "epoch": 6.302333931777379, + "grad_norm": 1.225714087486267, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 87760 + }, + { + "epoch": 6.303052064631957, + "grad_norm": 1.0181951522827148, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 87770 + }, + { + "epoch": 6.303770197486535, + "grad_norm": 0.9808282256126404, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 87780 + }, + { + "epoch": 6.304488330341113, + "grad_norm": 1.1413379907608032, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 87790 + }, + { + "epoch": 6.305206463195692, + "grad_norm": 1.1188091039657593, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 87800 + }, + { + "epoch": 6.30592459605027, + "grad_norm": 1.297154188156128, + "learning_rate": 0.0002, + "loss": 0.497, + "step": 87810 + }, + { + "epoch": 6.306642728904848, + "grad_norm": 1.0723271369934082, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 87820 + }, + { + "epoch": 6.307360861759426, + "grad_norm": 1.067265510559082, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 87830 + }, + { + "epoch": 6.308078994614004, + "grad_norm": 1.01328444480896, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 87840 + }, + { + "epoch": 6.308797127468582, + "grad_norm": 1.092671513557434, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 87850 + }, + { + "epoch": 6.30951526032316, + "grad_norm": 1.168721079826355, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 87860 + }, + { + "epoch": 6.310233393177738, + "grad_norm": 1.165495753288269, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 87870 + }, + { + "epoch": 6.310951526032316, + "grad_norm": 1.10816490650177, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 87880 + }, + { + "epoch": 6.311669658886894, + "grad_norm": 0.9667611718177795, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 87890 + }, + { + "epoch": 6.312387791741473, + "grad_norm": 1.22564697265625, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 87900 + }, + { + "epoch": 6.313105924596051, + "grad_norm": 1.1156506538391113, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 87910 + }, + { + "epoch": 6.313824057450629, + "grad_norm": 1.03804349899292, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 87920 + }, + { + "epoch": 6.314542190305207, + "grad_norm": 0.9424136281013489, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 87930 + }, + { + "epoch": 6.315260323159785, + "grad_norm": 1.2243257761001587, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 87940 + }, + { + "epoch": 6.315978456014363, + "grad_norm": 1.0930471420288086, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 87950 + }, + { + "epoch": 6.316696588868941, + "grad_norm": 1.096875548362732, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 87960 + }, + { + "epoch": 6.317414721723519, + "grad_norm": 1.0606242418289185, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 87970 + }, + { + "epoch": 6.318132854578097, + "grad_norm": 0.8657089471817017, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 87980 + }, + { + "epoch": 6.3188509874326755, + "grad_norm": 0.9751629829406738, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 87990 + }, + { + "epoch": 6.3195691202872535, + "grad_norm": 1.0751961469650269, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 88000 + }, + { + "epoch": 6.3202872531418315, + "grad_norm": 1.0679874420166016, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 88010 + }, + { + "epoch": 6.3210053859964095, + "grad_norm": 1.4102588891983032, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 88020 + }, + { + "epoch": 6.3217235188509875, + "grad_norm": 0.8747799396514893, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 88030 + }, + { + "epoch": 6.3224416517055655, + "grad_norm": 1.0866155624389648, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 88040 + }, + { + "epoch": 6.3231597845601435, + "grad_norm": 1.2255747318267822, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 88050 + }, + { + "epoch": 6.3238779174147215, + "grad_norm": 1.031588077545166, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 88060 + }, + { + "epoch": 6.3245960502692995, + "grad_norm": 1.1994154453277588, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 88070 + }, + { + "epoch": 6.3253141831238775, + "grad_norm": 0.9172461032867432, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88080 + }, + { + "epoch": 6.326032315978456, + "grad_norm": 0.8762667775154114, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 88090 + }, + { + "epoch": 6.326750448833034, + "grad_norm": 1.166225790977478, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 88100 + }, + { + "epoch": 6.327468581687612, + "grad_norm": 1.014858365058899, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 88110 + }, + { + "epoch": 6.32818671454219, + "grad_norm": 1.1080266237258911, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 88120 + }, + { + "epoch": 6.328904847396768, + "grad_norm": 0.9775443077087402, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 88130 + }, + { + "epoch": 6.329622980251346, + "grad_norm": 0.9032314419746399, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 88140 + }, + { + "epoch": 6.330341113105924, + "grad_norm": 1.0170091390609741, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 88150 + }, + { + "epoch": 6.331059245960502, + "grad_norm": 0.9412024617195129, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 88160 + }, + { + "epoch": 6.33177737881508, + "grad_norm": 0.9090259671211243, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 88170 + }, + { + "epoch": 6.332495511669659, + "grad_norm": 0.8896998167037964, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 88180 + }, + { + "epoch": 6.333213644524237, + "grad_norm": 1.1648571491241455, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 88190 + }, + { + "epoch": 6.333931777378815, + "grad_norm": 1.13261878490448, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 88200 + }, + { + "epoch": 6.334649910233393, + "grad_norm": 0.9561943411827087, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 88210 + }, + { + "epoch": 6.335368043087971, + "grad_norm": 1.3076379299163818, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 88220 + }, + { + "epoch": 6.336086175942549, + "grad_norm": 0.9788665175437927, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 88230 + }, + { + "epoch": 6.336804308797127, + "grad_norm": 1.2843645811080933, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 88240 + }, + { + "epoch": 6.337522441651705, + "grad_norm": 1.1531981229782104, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 88250 + }, + { + "epoch": 6.338240574506283, + "grad_norm": 1.1946183443069458, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 88260 + }, + { + "epoch": 6.338958707360861, + "grad_norm": 1.1190218925476074, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 88270 + }, + { + "epoch": 6.33967684021544, + "grad_norm": 1.0605140924453735, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 88280 + }, + { + "epoch": 6.340394973070018, + "grad_norm": 1.0237314701080322, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 88290 + }, + { + "epoch": 6.341113105924596, + "grad_norm": 1.1268457174301147, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 88300 + }, + { + "epoch": 6.341831238779174, + "grad_norm": 1.0750062465667725, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 88310 + }, + { + "epoch": 6.342549371633752, + "grad_norm": 1.2356536388397217, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 88320 + }, + { + "epoch": 6.34326750448833, + "grad_norm": 1.0375114679336548, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 88330 + }, + { + "epoch": 6.343985637342908, + "grad_norm": 1.063388705253601, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 88340 + }, + { + "epoch": 6.344703770197486, + "grad_norm": 0.9182760715484619, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 88350 + }, + { + "epoch": 6.345421903052064, + "grad_norm": 0.9787414073944092, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 88360 + }, + { + "epoch": 6.346140035906643, + "grad_norm": 1.295432448387146, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 88370 + }, + { + "epoch": 6.346858168761221, + "grad_norm": 0.9269146919250488, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 88380 + }, + { + "epoch": 6.347576301615799, + "grad_norm": 0.9076777696609497, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 88390 + }, + { + "epoch": 6.348294434470377, + "grad_norm": 1.1186468601226807, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 88400 + }, + { + "epoch": 6.349012567324955, + "grad_norm": 1.1021504402160645, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 88410 + }, + { + "epoch": 6.349730700179533, + "grad_norm": 1.2439358234405518, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 88420 + }, + { + "epoch": 6.350448833034111, + "grad_norm": 1.1228888034820557, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 88430 + }, + { + "epoch": 6.351166965888689, + "grad_norm": 1.226587176322937, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 88440 + }, + { + "epoch": 6.351885098743267, + "grad_norm": 1.2813525199890137, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 88450 + }, + { + "epoch": 6.352603231597846, + "grad_norm": 1.411405086517334, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 88460 + }, + { + "epoch": 6.353321364452424, + "grad_norm": 1.3659696578979492, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 88470 + }, + { + "epoch": 6.354039497307002, + "grad_norm": 1.1398485898971558, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 88480 + }, + { + "epoch": 6.35475763016158, + "grad_norm": 1.2088590860366821, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 88490 + }, + { + "epoch": 6.355475763016158, + "grad_norm": 0.9191108345985413, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 88500 + }, + { + "epoch": 6.356193895870736, + "grad_norm": 0.9855144619941711, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 88510 + }, + { + "epoch": 6.356912028725314, + "grad_norm": 1.0576577186584473, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 88520 + }, + { + "epoch": 6.357630161579892, + "grad_norm": 1.0213230848312378, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 88530 + }, + { + "epoch": 6.35834829443447, + "grad_norm": 1.2086849212646484, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 88540 + }, + { + "epoch": 6.359066427289049, + "grad_norm": 1.05294930934906, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 88550 + }, + { + "epoch": 6.359784560143627, + "grad_norm": 1.1798300743103027, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 88560 + }, + { + "epoch": 6.360502692998205, + "grad_norm": 1.088749885559082, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 88570 + }, + { + "epoch": 6.361220825852783, + "grad_norm": 1.0071386098861694, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 88580 + }, + { + "epoch": 6.361938958707361, + "grad_norm": 1.2080132961273193, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 88590 + }, + { + "epoch": 6.362657091561939, + "grad_norm": 0.9784366488456726, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 88600 + }, + { + "epoch": 6.363375224416517, + "grad_norm": 0.9475322961807251, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 88610 + }, + { + "epoch": 6.364093357271095, + "grad_norm": 0.8267584443092346, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 88620 + }, + { + "epoch": 6.364811490125673, + "grad_norm": 1.05606210231781, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 88630 + }, + { + "epoch": 6.365529622980251, + "grad_norm": 1.2059335708618164, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 88640 + }, + { + "epoch": 6.36624775583483, + "grad_norm": 1.1900845766067505, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 88650 + }, + { + "epoch": 6.366965888689408, + "grad_norm": 1.0271358489990234, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 88660 + }, + { + "epoch": 6.367684021543986, + "grad_norm": 1.1839162111282349, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 88670 + }, + { + "epoch": 6.368402154398564, + "grad_norm": 0.9042913317680359, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 88680 + }, + { + "epoch": 6.369120287253142, + "grad_norm": 1.079893946647644, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 88690 + }, + { + "epoch": 6.36983842010772, + "grad_norm": 1.0999629497528076, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 88700 + }, + { + "epoch": 6.370556552962298, + "grad_norm": 1.0618157386779785, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 88710 + }, + { + "epoch": 6.371274685816876, + "grad_norm": 0.9567645788192749, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 88720 + }, + { + "epoch": 6.371992818671454, + "grad_norm": 1.0342025756835938, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 88730 + }, + { + "epoch": 6.372710951526033, + "grad_norm": 1.0789190530776978, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 88740 + }, + { + "epoch": 6.373429084380611, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 88750 + }, + { + "epoch": 6.374147217235189, + "grad_norm": 0.9103280305862427, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 88760 + }, + { + "epoch": 6.374865350089767, + "grad_norm": 0.9856002330780029, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 88770 + }, + { + "epoch": 6.375583482944345, + "grad_norm": 1.1801226139068604, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 88780 + }, + { + "epoch": 6.376301615798923, + "grad_norm": 0.9876776933670044, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 88790 + }, + { + "epoch": 6.377019748653501, + "grad_norm": 1.0169886350631714, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 88800 + }, + { + "epoch": 6.377737881508079, + "grad_norm": 1.0118076801300049, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 88810 + }, + { + "epoch": 6.378456014362657, + "grad_norm": 1.0641456842422485, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 88820 + }, + { + "epoch": 6.379174147217235, + "grad_norm": 1.1138534545898438, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 88830 + }, + { + "epoch": 6.379892280071814, + "grad_norm": 1.1518962383270264, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 88840 + }, + { + "epoch": 6.380610412926392, + "grad_norm": 1.3662128448486328, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88850 + }, + { + "epoch": 6.38132854578097, + "grad_norm": 0.9544311761856079, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 88860 + }, + { + "epoch": 6.382046678635548, + "grad_norm": 0.9747556447982788, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 88870 + }, + { + "epoch": 6.382764811490126, + "grad_norm": 1.1651948690414429, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 88880 + }, + { + "epoch": 6.383482944344704, + "grad_norm": 1.4048396348953247, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88890 + }, + { + "epoch": 6.384201077199282, + "grad_norm": 1.1144068241119385, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 88900 + }, + { + "epoch": 6.38491921005386, + "grad_norm": 1.2978034019470215, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 88910 + }, + { + "epoch": 6.385637342908438, + "grad_norm": 1.1776132583618164, + "learning_rate": 0.0002, + "loss": 0.5279, + "step": 88920 + }, + { + "epoch": 6.3863554757630165, + "grad_norm": 0.8849034905433655, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 88930 + }, + { + "epoch": 6.3870736086175945, + "grad_norm": 1.1207057237625122, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 88940 + }, + { + "epoch": 6.3877917414721725, + "grad_norm": 0.9364172220230103, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 88950 + }, + { + "epoch": 6.3885098743267505, + "grad_norm": 1.1731317043304443, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 88960 + }, + { + "epoch": 6.3892280071813286, + "grad_norm": 1.0411573648452759, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 88970 + }, + { + "epoch": 6.3899461400359066, + "grad_norm": 1.0817447900772095, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 88980 + }, + { + "epoch": 6.3906642728904846, + "grad_norm": 1.0037593841552734, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 88990 + }, + { + "epoch": 6.391382405745063, + "grad_norm": 1.1684437990188599, + "learning_rate": 0.0002, + "loss": 0.562, + "step": 89000 + }, + { + "epoch": 6.392100538599641, + "grad_norm": 1.0237388610839844, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 89010 + }, + { + "epoch": 6.392818671454219, + "grad_norm": 1.24791419506073, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 89020 + }, + { + "epoch": 6.3935368043087974, + "grad_norm": 0.842664897441864, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 89030 + }, + { + "epoch": 6.3942549371633755, + "grad_norm": 1.1692326068878174, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 89040 + }, + { + "epoch": 6.3949730700179535, + "grad_norm": 1.0786939859390259, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 89050 + }, + { + "epoch": 6.3956912028725315, + "grad_norm": 1.1315077543258667, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 89060 + }, + { + "epoch": 6.3964093357271095, + "grad_norm": 0.9949214458465576, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 89070 + }, + { + "epoch": 6.3971274685816875, + "grad_norm": 1.0302025079727173, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 89080 + }, + { + "epoch": 6.3978456014362655, + "grad_norm": 0.9664030075073242, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 89090 + }, + { + "epoch": 6.3985637342908435, + "grad_norm": 1.1251037120819092, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 89100 + }, + { + "epoch": 6.399281867145422, + "grad_norm": 1.1103272438049316, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 89110 + }, + { + "epoch": 6.4, + "grad_norm": 0.9192888736724854, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 89120 + }, + { + "epoch": 6.400718132854578, + "grad_norm": 1.027806043624878, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 89130 + }, + { + "epoch": 6.401436265709156, + "grad_norm": 1.1219452619552612, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 89140 + }, + { + "epoch": 6.402154398563734, + "grad_norm": 1.1703979969024658, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 89150 + }, + { + "epoch": 6.402872531418312, + "grad_norm": 1.025874376296997, + "learning_rate": 0.0002, + "loss": 0.5251, + "step": 89160 + }, + { + "epoch": 6.40359066427289, + "grad_norm": 1.070225715637207, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 89170 + }, + { + "epoch": 6.404308797127468, + "grad_norm": 1.1915208101272583, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 89180 + }, + { + "epoch": 6.405026929982046, + "grad_norm": 1.1954079866409302, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 89190 + }, + { + "epoch": 6.405745062836624, + "grad_norm": 1.035910964012146, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 89200 + }, + { + "epoch": 6.406463195691203, + "grad_norm": 1.1363351345062256, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 89210 + }, + { + "epoch": 6.407181328545781, + "grad_norm": 1.2086843252182007, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 89220 + }, + { + "epoch": 6.407899461400359, + "grad_norm": 1.3492387533187866, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 89230 + }, + { + "epoch": 6.408617594254937, + "grad_norm": 0.8746330738067627, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 89240 + }, + { + "epoch": 6.409335727109515, + "grad_norm": 1.0165427923202515, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 89250 + }, + { + "epoch": 6.410053859964093, + "grad_norm": 1.0314675569534302, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 89260 + }, + { + "epoch": 6.410771992818671, + "grad_norm": 1.2128242254257202, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 89270 + }, + { + "epoch": 6.411490125673249, + "grad_norm": 0.9496060013771057, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 89280 + }, + { + "epoch": 6.412208258527827, + "grad_norm": 1.1838264465332031, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 89290 + }, + { + "epoch": 6.412926391382406, + "grad_norm": 1.1700918674468994, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 89300 + }, + { + "epoch": 6.413644524236984, + "grad_norm": 1.2102051973342896, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 89310 + }, + { + "epoch": 6.414362657091562, + "grad_norm": 0.9485594630241394, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 89320 + }, + { + "epoch": 6.41508078994614, + "grad_norm": 1.041496753692627, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 89330 + }, + { + "epoch": 6.415798922800718, + "grad_norm": 1.0785019397735596, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 89340 + }, + { + "epoch": 6.416517055655296, + "grad_norm": 0.9527593851089478, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 89350 + }, + { + "epoch": 6.417235188509874, + "grad_norm": 0.9879035353660583, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89360 + }, + { + "epoch": 6.417953321364452, + "grad_norm": 0.9143751263618469, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 89370 + }, + { + "epoch": 6.41867145421903, + "grad_norm": 0.9145408272743225, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 89380 + }, + { + "epoch": 6.419389587073608, + "grad_norm": 1.0128624439239502, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 89390 + }, + { + "epoch": 6.420107719928187, + "grad_norm": 0.9454543590545654, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 89400 + }, + { + "epoch": 6.420825852782765, + "grad_norm": 1.0659215450286865, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 89410 + }, + { + "epoch": 6.421543985637343, + "grad_norm": 1.1622642278671265, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 89420 + }, + { + "epoch": 6.422262118491921, + "grad_norm": 0.9805575013160706, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 89430 + }, + { + "epoch": 6.422980251346499, + "grad_norm": 0.871903121471405, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89440 + }, + { + "epoch": 6.423698384201077, + "grad_norm": 0.992355227470398, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 89450 + }, + { + "epoch": 6.424416517055655, + "grad_norm": 1.4055765867233276, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 89460 + }, + { + "epoch": 6.425134649910233, + "grad_norm": 1.0447325706481934, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 89470 + }, + { + "epoch": 6.425852782764811, + "grad_norm": 1.1162594556808472, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 89480 + }, + { + "epoch": 6.42657091561939, + "grad_norm": 1.0767697095870972, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 89490 + }, + { + "epoch": 6.427289048473968, + "grad_norm": 1.2253819704055786, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 89500 + }, + { + "epoch": 6.428007181328546, + "grad_norm": 1.0623136758804321, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 89510 + }, + { + "epoch": 6.428725314183124, + "grad_norm": 1.3238742351531982, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 89520 + }, + { + "epoch": 6.429443447037702, + "grad_norm": 1.2376916408538818, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 89530 + }, + { + "epoch": 6.43016157989228, + "grad_norm": 1.197453260421753, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 89540 + }, + { + "epoch": 6.430879712746858, + "grad_norm": 1.0539700984954834, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89550 + }, + { + "epoch": 6.431597845601436, + "grad_norm": 1.0659761428833008, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 89560 + }, + { + "epoch": 6.432315978456014, + "grad_norm": 1.0186322927474976, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 89570 + }, + { + "epoch": 6.433034111310592, + "grad_norm": 1.232337474822998, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 89580 + }, + { + "epoch": 6.433752244165171, + "grad_norm": 1.1512500047683716, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 89590 + }, + { + "epoch": 6.434470377019749, + "grad_norm": 1.0068955421447754, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 89600 + }, + { + "epoch": 6.435188509874327, + "grad_norm": 1.1359424591064453, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 89610 + }, + { + "epoch": 6.435906642728905, + "grad_norm": 1.4369128942489624, + "learning_rate": 0.0002, + "loss": 0.553, + "step": 89620 + }, + { + "epoch": 6.436624775583483, + "grad_norm": 0.9382445216178894, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 89630 + }, + { + "epoch": 6.437342908438061, + "grad_norm": 0.8607977628707886, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 89640 + }, + { + "epoch": 6.438061041292639, + "grad_norm": 0.9498276114463806, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 89650 + }, + { + "epoch": 6.438779174147217, + "grad_norm": 1.4109948873519897, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 89660 + }, + { + "epoch": 6.439497307001796, + "grad_norm": 1.106134295463562, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 89670 + }, + { + "epoch": 6.440215439856374, + "grad_norm": 1.128963589668274, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 89680 + }, + { + "epoch": 6.440933572710952, + "grad_norm": 1.1370604038238525, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 89690 + }, + { + "epoch": 6.44165170556553, + "grad_norm": 1.380922794342041, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 89700 + }, + { + "epoch": 6.442369838420108, + "grad_norm": 0.9597383737564087, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 89710 + }, + { + "epoch": 6.443087971274686, + "grad_norm": 1.1491756439208984, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 89720 + }, + { + "epoch": 6.443806104129264, + "grad_norm": 1.1313573122024536, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 89730 + }, + { + "epoch": 6.444524236983842, + "grad_norm": 1.1081135272979736, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89740 + }, + { + "epoch": 6.44524236983842, + "grad_norm": 1.0297505855560303, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 89750 + }, + { + "epoch": 6.445960502692998, + "grad_norm": 1.0534520149230957, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89760 + }, + { + "epoch": 6.446678635547577, + "grad_norm": 1.218485951423645, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 89770 + }, + { + "epoch": 6.447396768402155, + "grad_norm": 0.9336987137794495, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 89780 + }, + { + "epoch": 6.448114901256733, + "grad_norm": 0.9854478240013123, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 89790 + }, + { + "epoch": 6.448833034111311, + "grad_norm": 1.1036708354949951, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 89800 + }, + { + "epoch": 6.449551166965889, + "grad_norm": 1.2220509052276611, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 89810 + }, + { + "epoch": 6.450269299820467, + "grad_norm": 0.9955567121505737, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 89820 + }, + { + "epoch": 6.450987432675045, + "grad_norm": 1.0350912809371948, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 89830 + }, + { + "epoch": 6.451705565529623, + "grad_norm": 1.156080722808838, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 89840 + }, + { + "epoch": 6.452423698384201, + "grad_norm": 0.8922389149665833, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 89850 + }, + { + "epoch": 6.45314183123878, + "grad_norm": 0.9318913221359253, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 89860 + }, + { + "epoch": 6.453859964093358, + "grad_norm": 0.9420756101608276, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 89870 + }, + { + "epoch": 6.454578096947936, + "grad_norm": 1.0303646326065063, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89880 + }, + { + "epoch": 6.455296229802514, + "grad_norm": 1.070806860923767, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 89890 + }, + { + "epoch": 6.456014362657092, + "grad_norm": 0.9890686869621277, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 89900 + }, + { + "epoch": 6.45673249551167, + "grad_norm": 1.1254929304122925, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 89910 + }, + { + "epoch": 6.457450628366248, + "grad_norm": 1.0023183822631836, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 89920 + }, + { + "epoch": 6.458168761220826, + "grad_norm": 1.118721604347229, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89930 + }, + { + "epoch": 6.458886894075404, + "grad_norm": 1.2170203924179077, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 89940 + }, + { + "epoch": 6.459605026929982, + "grad_norm": 1.0662257671356201, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 89950 + }, + { + "epoch": 6.4603231597845605, + "grad_norm": 0.8912546634674072, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 89960 + }, + { + "epoch": 6.4610412926391385, + "grad_norm": 1.0346225500106812, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 89970 + }, + { + "epoch": 6.4617594254937165, + "grad_norm": 1.239388346672058, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 89980 + }, + { + "epoch": 6.4624775583482945, + "grad_norm": 1.0100152492523193, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 89990 + }, + { + "epoch": 6.4631956912028725, + "grad_norm": 1.1496137380599976, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 90000 + }, + { + "epoch": 6.4639138240574505, + "grad_norm": 0.9652666449546814, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 90010 + }, + { + "epoch": 6.4646319569120285, + "grad_norm": 1.459730863571167, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 90020 + }, + { + "epoch": 6.4653500897666065, + "grad_norm": 0.9096665978431702, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 90030 + }, + { + "epoch": 6.4660682226211845, + "grad_norm": 1.1356233358383179, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 90040 + }, + { + "epoch": 6.466786355475763, + "grad_norm": 1.0192385911941528, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 90050 + }, + { + "epoch": 6.467504488330341, + "grad_norm": 0.9494831562042236, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 90060 + }, + { + "epoch": 6.468222621184919, + "grad_norm": 0.9784388542175293, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 90070 + }, + { + "epoch": 6.468940754039497, + "grad_norm": 1.0754846334457397, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 90080 + }, + { + "epoch": 6.469658886894075, + "grad_norm": 0.9019646644592285, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 90090 + }, + { + "epoch": 6.470377019748653, + "grad_norm": 1.1848793029785156, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 90100 + }, + { + "epoch": 6.471095152603231, + "grad_norm": 1.1312837600708008, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 90110 + }, + { + "epoch": 6.471813285457809, + "grad_norm": 0.9868128299713135, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 90120 + }, + { + "epoch": 6.472531418312387, + "grad_norm": 0.894279956817627, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 90130 + }, + { + "epoch": 6.473249551166965, + "grad_norm": 1.1206544637680054, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 90140 + }, + { + "epoch": 6.473967684021544, + "grad_norm": 1.048126220703125, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 90150 + }, + { + "epoch": 6.474685816876122, + "grad_norm": 0.9624786972999573, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 90160 + }, + { + "epoch": 6.4754039497307, + "grad_norm": 1.3301671743392944, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 90170 + }, + { + "epoch": 6.476122082585278, + "grad_norm": 1.1016923189163208, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 90180 + }, + { + "epoch": 6.476840215439856, + "grad_norm": 1.084158182144165, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 90190 + }, + { + "epoch": 6.477558348294434, + "grad_norm": 1.0704890489578247, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 90200 + }, + { + "epoch": 6.478276481149012, + "grad_norm": 1.0849730968475342, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 90210 + }, + { + "epoch": 6.47899461400359, + "grad_norm": 1.0671768188476562, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 90220 + }, + { + "epoch": 6.479712746858169, + "grad_norm": 1.1208873987197876, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 90230 + }, + { + "epoch": 6.480430879712747, + "grad_norm": 1.1958850622177124, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 90240 + }, + { + "epoch": 6.481149012567325, + "grad_norm": 1.2102761268615723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 90250 + }, + { + "epoch": 6.481867145421903, + "grad_norm": 1.0813510417938232, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 90260 + }, + { + "epoch": 6.482585278276481, + "grad_norm": 0.8553891777992249, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 90270 + }, + { + "epoch": 6.483303411131059, + "grad_norm": 1.0855463743209839, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 90280 + }, + { + "epoch": 6.484021543985637, + "grad_norm": 1.1179498434066772, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 90290 + }, + { + "epoch": 6.484739676840215, + "grad_norm": 1.1268035173416138, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 90300 + }, + { + "epoch": 6.485457809694793, + "grad_norm": 1.0755188465118408, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 90310 + }, + { + "epoch": 6.486175942549371, + "grad_norm": 1.0469547510147095, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 90320 + }, + { + "epoch": 6.48689407540395, + "grad_norm": 0.8739270567893982, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 90330 + }, + { + "epoch": 6.487612208258528, + "grad_norm": 1.2452377080917358, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 90340 + }, + { + "epoch": 6.488330341113106, + "grad_norm": 1.1576505899429321, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 90350 + }, + { + "epoch": 6.489048473967684, + "grad_norm": 1.0247524976730347, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 90360 + }, + { + "epoch": 6.489766606822262, + "grad_norm": 1.1306205987930298, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 90370 + }, + { + "epoch": 6.49048473967684, + "grad_norm": 1.0545839071273804, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 90380 + }, + { + "epoch": 6.491202872531418, + "grad_norm": 1.281407117843628, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 90390 + }, + { + "epoch": 6.491921005385996, + "grad_norm": 1.2330801486968994, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 90400 + }, + { + "epoch": 6.492639138240574, + "grad_norm": 0.8966873288154602, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 90410 + }, + { + "epoch": 6.493357271095153, + "grad_norm": 0.9748067259788513, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 90420 + }, + { + "epoch": 6.494075403949731, + "grad_norm": 0.9285972118377686, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 90430 + }, + { + "epoch": 6.494793536804309, + "grad_norm": 1.123449444770813, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 90440 + }, + { + "epoch": 6.495511669658887, + "grad_norm": 1.4190359115600586, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 90450 + }, + { + "epoch": 6.496229802513465, + "grad_norm": 0.9877263307571411, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 90460 + }, + { + "epoch": 6.496947935368043, + "grad_norm": 0.9850174188613892, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 90470 + }, + { + "epoch": 6.497666068222621, + "grad_norm": 1.3609496355056763, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 90480 + }, + { + "epoch": 6.498384201077199, + "grad_norm": 0.8299460411071777, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 90490 + }, + { + "epoch": 6.499102333931777, + "grad_norm": 1.3359589576721191, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 90500 + }, + { + "epoch": 6.499820466786355, + "grad_norm": 1.1211248636245728, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 90510 + }, + { + "epoch": 6.500538599640934, + "grad_norm": 1.1070419549942017, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 90520 + }, + { + "epoch": 6.501256732495512, + "grad_norm": 1.1590572595596313, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 90530 + }, + { + "epoch": 6.50197486535009, + "grad_norm": 0.9865858554840088, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 90540 + }, + { + "epoch": 6.502692998204668, + "grad_norm": 0.9752925634384155, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 90550 + }, + { + "epoch": 6.503411131059246, + "grad_norm": 1.2411525249481201, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 90560 + }, + { + "epoch": 6.504129263913824, + "grad_norm": 1.1538971662521362, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 90570 + }, + { + "epoch": 6.504847396768402, + "grad_norm": 1.2818700075149536, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 90580 + }, + { + "epoch": 6.50556552962298, + "grad_norm": 1.2787950038909912, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 90590 + }, + { + "epoch": 6.506283662477558, + "grad_norm": 1.1357126235961914, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 90600 + }, + { + "epoch": 6.507001795332137, + "grad_norm": 1.0781097412109375, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 90610 + }, + { + "epoch": 6.507719928186715, + "grad_norm": 0.9754705429077148, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 90620 + }, + { + "epoch": 6.508438061041293, + "grad_norm": 1.018410563468933, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 90630 + }, + { + "epoch": 6.509156193895871, + "grad_norm": 1.0382000207901, + "learning_rate": 0.0002, + "loss": 0.562, + "step": 90640 + }, + { + "epoch": 6.509874326750449, + "grad_norm": 0.9059327840805054, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 90650 + }, + { + "epoch": 6.510592459605027, + "grad_norm": 1.2049181461334229, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 90660 + }, + { + "epoch": 6.511310592459605, + "grad_norm": 1.1005393266677856, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 90670 + }, + { + "epoch": 6.512028725314183, + "grad_norm": 1.0504072904586792, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 90680 + }, + { + "epoch": 6.512746858168761, + "grad_norm": 1.2491340637207031, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 90690 + }, + { + "epoch": 6.513464991023339, + "grad_norm": 0.9971826672554016, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 90700 + }, + { + "epoch": 6.514183123877918, + "grad_norm": 1.0228981971740723, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 90710 + }, + { + "epoch": 6.514901256732496, + "grad_norm": 1.1531293392181396, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 90720 + }, + { + "epoch": 6.515619389587074, + "grad_norm": 0.9401963949203491, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 90730 + }, + { + "epoch": 6.516337522441652, + "grad_norm": 1.3876653909683228, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 90740 + }, + { + "epoch": 6.51705565529623, + "grad_norm": 1.3111445903778076, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 90750 + }, + { + "epoch": 6.517773788150808, + "grad_norm": 0.8705055713653564, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 90760 + }, + { + "epoch": 6.518491921005386, + "grad_norm": 1.213295340538025, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 90770 + }, + { + "epoch": 6.519210053859964, + "grad_norm": 1.2075343132019043, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 90780 + }, + { + "epoch": 6.519928186714543, + "grad_norm": 0.9814115166664124, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 90790 + }, + { + "epoch": 6.520646319569121, + "grad_norm": 1.0937272310256958, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 90800 + }, + { + "epoch": 6.521364452423699, + "grad_norm": 1.0839916467666626, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 90810 + }, + { + "epoch": 6.522082585278277, + "grad_norm": 1.1918399333953857, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 90820 + }, + { + "epoch": 6.522800718132855, + "grad_norm": 1.1677868366241455, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 90830 + }, + { + "epoch": 6.523518850987433, + "grad_norm": 1.0840870141983032, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 90840 + }, + { + "epoch": 6.524236983842011, + "grad_norm": 1.10408353805542, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 90850 + }, + { + "epoch": 6.524955116696589, + "grad_norm": 1.056705355644226, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 90860 + }, + { + "epoch": 6.525673249551167, + "grad_norm": 1.0552406311035156, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 90870 + }, + { + "epoch": 6.526391382405745, + "grad_norm": 1.000816822052002, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 90880 + }, + { + "epoch": 6.527109515260323, + "grad_norm": 1.1465239524841309, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 90890 + }, + { + "epoch": 6.527827648114902, + "grad_norm": 0.9380449652671814, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 90900 + }, + { + "epoch": 6.52854578096948, + "grad_norm": 0.9572200179100037, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 90910 + }, + { + "epoch": 6.529263913824058, + "grad_norm": 1.0058002471923828, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 90920 + }, + { + "epoch": 6.529982046678636, + "grad_norm": 1.0932626724243164, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 90930 + }, + { + "epoch": 6.530700179533214, + "grad_norm": 0.9283126592636108, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 90940 + }, + { + "epoch": 6.531418312387792, + "grad_norm": 1.1347819566726685, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 90950 + }, + { + "epoch": 6.53213644524237, + "grad_norm": 1.4964616298675537, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 90960 + }, + { + "epoch": 6.532854578096948, + "grad_norm": 1.1725877523422241, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 90970 + }, + { + "epoch": 6.5335727109515265, + "grad_norm": 1.185640811920166, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 90980 + }, + { + "epoch": 6.5342908438061045, + "grad_norm": 1.0598312616348267, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 90990 + }, + { + "epoch": 6.5350089766606825, + "grad_norm": 1.389320731163025, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 91000 + }, + { + "epoch": 6.5357271095152605, + "grad_norm": 1.102960467338562, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 91010 + }, + { + "epoch": 6.5364452423698385, + "grad_norm": 1.2482284307479858, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 91020 + }, + { + "epoch": 6.5371633752244165, + "grad_norm": 1.213861346244812, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 91030 + }, + { + "epoch": 6.5378815080789945, + "grad_norm": 1.1872318983078003, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 91040 + }, + { + "epoch": 6.5385996409335725, + "grad_norm": 1.0767916440963745, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 91050 + }, + { + "epoch": 6.5393177737881505, + "grad_norm": 1.0610442161560059, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 91060 + }, + { + "epoch": 6.5400359066427285, + "grad_norm": 1.0161356925964355, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 91070 + }, + { + "epoch": 6.540754039497307, + "grad_norm": 1.373284101486206, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 91080 + }, + { + "epoch": 6.541472172351885, + "grad_norm": 1.1611387729644775, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 91090 + }, + { + "epoch": 6.542190305206463, + "grad_norm": 1.1980092525482178, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 91100 + }, + { + "epoch": 6.542908438061041, + "grad_norm": 1.1174312829971313, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 91110 + }, + { + "epoch": 6.543626570915619, + "grad_norm": 1.1376914978027344, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 91120 + }, + { + "epoch": 6.544344703770197, + "grad_norm": 1.0551620721817017, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 91130 + }, + { + "epoch": 6.545062836624775, + "grad_norm": 1.2839815616607666, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 91140 + }, + { + "epoch": 6.545780969479353, + "grad_norm": 0.7656933665275574, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 91150 + }, + { + "epoch": 6.546499102333931, + "grad_norm": 1.1079483032226562, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 91160 + }, + { + "epoch": 6.54721723518851, + "grad_norm": 1.4870734214782715, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 91170 + }, + { + "epoch": 6.547935368043088, + "grad_norm": 1.1784024238586426, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 91180 + }, + { + "epoch": 6.548653500897666, + "grad_norm": 1.3510793447494507, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 91190 + }, + { + "epoch": 6.549371633752244, + "grad_norm": 1.0237789154052734, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 91200 + }, + { + "epoch": 6.550089766606822, + "grad_norm": 1.0721405744552612, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 91210 + }, + { + "epoch": 6.5508078994614, + "grad_norm": 0.9794955253601074, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 91220 + }, + { + "epoch": 6.551526032315978, + "grad_norm": 1.1046847105026245, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 91230 + }, + { + "epoch": 6.552244165170556, + "grad_norm": 0.9706982374191284, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 91240 + }, + { + "epoch": 6.552962298025134, + "grad_norm": 0.9466179609298706, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 91250 + }, + { + "epoch": 6.553680430879712, + "grad_norm": 1.126806616783142, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 91260 + }, + { + "epoch": 6.554398563734291, + "grad_norm": 0.9713812470436096, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 91270 + }, + { + "epoch": 6.555116696588869, + "grad_norm": 0.8955506682395935, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 91280 + }, + { + "epoch": 6.555834829443447, + "grad_norm": 1.2066279649734497, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 91290 + }, + { + "epoch": 6.556552962298025, + "grad_norm": 0.957999587059021, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 91300 + }, + { + "epoch": 6.557271095152603, + "grad_norm": 1.253709077835083, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 91310 + }, + { + "epoch": 6.557989228007181, + "grad_norm": 1.0075397491455078, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 91320 + }, + { + "epoch": 6.558707360861759, + "grad_norm": 0.9356904029846191, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 91330 + }, + { + "epoch": 6.559425493716337, + "grad_norm": 1.1555782556533813, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 91340 + }, + { + "epoch": 6.560143626570916, + "grad_norm": 0.9786396026611328, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 91350 + }, + { + "epoch": 6.560861759425494, + "grad_norm": 1.156374454498291, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 91360 + }, + { + "epoch": 6.561579892280072, + "grad_norm": 1.0572668313980103, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 91370 + }, + { + "epoch": 6.56229802513465, + "grad_norm": 1.4248497486114502, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 91380 + }, + { + "epoch": 6.563016157989228, + "grad_norm": 1.1191383600234985, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 91390 + }, + { + "epoch": 6.563734290843806, + "grad_norm": 0.9622306227684021, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 91400 + }, + { + "epoch": 6.564452423698384, + "grad_norm": 1.3683338165283203, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 91410 + }, + { + "epoch": 6.565170556552962, + "grad_norm": 1.0363010168075562, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 91420 + }, + { + "epoch": 6.56588868940754, + "grad_norm": 1.2861888408660889, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 91430 + }, + { + "epoch": 6.566606822262118, + "grad_norm": 1.0330547094345093, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 91440 + }, + { + "epoch": 6.567324955116696, + "grad_norm": 1.044992446899414, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 91450 + }, + { + "epoch": 6.568043087971275, + "grad_norm": 1.0722706317901611, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 91460 + }, + { + "epoch": 6.568761220825853, + "grad_norm": 1.1327447891235352, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 91470 + }, + { + "epoch": 6.569479353680431, + "grad_norm": 1.2709840536117554, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 91480 + }, + { + "epoch": 6.570197486535009, + "grad_norm": 1.0964101552963257, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 91490 + }, + { + "epoch": 6.570915619389587, + "grad_norm": 0.9897898435592651, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 91500 + }, + { + "epoch": 6.571633752244165, + "grad_norm": 1.0143952369689941, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 91510 + }, + { + "epoch": 6.572351885098743, + "grad_norm": 0.923865020275116, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 91520 + }, + { + "epoch": 6.573070017953321, + "grad_norm": 1.144390344619751, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 91530 + }, + { + "epoch": 6.5737881508079, + "grad_norm": 1.0636180639266968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 91540 + }, + { + "epoch": 6.574506283662478, + "grad_norm": 1.0699774026870728, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 91550 + }, + { + "epoch": 6.575224416517056, + "grad_norm": 1.2139345407485962, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 91560 + }, + { + "epoch": 6.575942549371634, + "grad_norm": 1.4551644325256348, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 91570 + }, + { + "epoch": 6.576660682226212, + "grad_norm": 1.2388415336608887, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 91580 + }, + { + "epoch": 6.57737881508079, + "grad_norm": 0.9303404688835144, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 91590 + }, + { + "epoch": 6.578096947935368, + "grad_norm": 0.932905912399292, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 91600 + }, + { + "epoch": 6.578815080789946, + "grad_norm": 1.0726542472839355, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 91610 + }, + { + "epoch": 6.579533213644524, + "grad_norm": 1.138890266418457, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 91620 + }, + { + "epoch": 6.580251346499102, + "grad_norm": 1.087165355682373, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 91630 + }, + { + "epoch": 6.580969479353681, + "grad_norm": 1.0526753664016724, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 91640 + }, + { + "epoch": 6.581687612208259, + "grad_norm": 1.068217158317566, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 91650 + }, + { + "epoch": 6.582405745062837, + "grad_norm": 1.09737229347229, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 91660 + }, + { + "epoch": 6.583123877917415, + "grad_norm": 0.9466586112976074, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 91670 + }, + { + "epoch": 6.583842010771993, + "grad_norm": 1.2311620712280273, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 91680 + }, + { + "epoch": 6.584560143626571, + "grad_norm": 1.2385680675506592, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 91690 + }, + { + "epoch": 6.585278276481149, + "grad_norm": 0.947889506816864, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 91700 + }, + { + "epoch": 6.585996409335727, + "grad_norm": 0.9600529670715332, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 91710 + }, + { + "epoch": 6.586714542190305, + "grad_norm": 1.3595638275146484, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 91720 + }, + { + "epoch": 6.587432675044884, + "grad_norm": 1.0087260007858276, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 91730 + }, + { + "epoch": 6.588150807899462, + "grad_norm": 1.0008373260498047, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 91740 + }, + { + "epoch": 6.58886894075404, + "grad_norm": 1.0367980003356934, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 91750 + }, + { + "epoch": 6.589587073608618, + "grad_norm": 1.1934503316879272, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 91760 + }, + { + "epoch": 6.590305206463196, + "grad_norm": 1.0295839309692383, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 91770 + }, + { + "epoch": 6.591023339317774, + "grad_norm": 0.926913857460022, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 91780 + }, + { + "epoch": 6.591741472172352, + "grad_norm": 1.055837631225586, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 91790 + }, + { + "epoch": 6.59245960502693, + "grad_norm": 1.006401777267456, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 91800 + }, + { + "epoch": 6.593177737881508, + "grad_norm": 1.1368589401245117, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 91810 + }, + { + "epoch": 6.593895870736086, + "grad_norm": 0.8494837880134583, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 91820 + }, + { + "epoch": 6.594614003590665, + "grad_norm": 1.3219822645187378, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 91830 + }, + { + "epoch": 6.595332136445243, + "grad_norm": 1.0583800077438354, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 91840 + }, + { + "epoch": 6.596050269299821, + "grad_norm": 1.0579098463058472, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 91850 + }, + { + "epoch": 6.596768402154399, + "grad_norm": 1.0618008375167847, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 91860 + }, + { + "epoch": 6.597486535008977, + "grad_norm": 0.9425104260444641, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 91870 + }, + { + "epoch": 6.598204667863555, + "grad_norm": 0.9130632281303406, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 91880 + }, + { + "epoch": 6.598922800718133, + "grad_norm": 1.126438856124878, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 91890 + }, + { + "epoch": 6.599640933572711, + "grad_norm": 0.9135168194770813, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 91900 + }, + { + "epoch": 6.6003590664272895, + "grad_norm": 1.1640992164611816, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 91910 + }, + { + "epoch": 6.6010771992818675, + "grad_norm": 1.2641936540603638, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 91920 + }, + { + "epoch": 6.6017953321364455, + "grad_norm": 1.1252738237380981, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 91930 + }, + { + "epoch": 6.6025134649910235, + "grad_norm": 1.0307750701904297, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 91940 + }, + { + "epoch": 6.6032315978456015, + "grad_norm": 0.978972315788269, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 91950 + }, + { + "epoch": 6.6039497307001795, + "grad_norm": 1.1350890398025513, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 91960 + }, + { + "epoch": 6.6046678635547575, + "grad_norm": 0.9177488088607788, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 91970 + }, + { + "epoch": 6.6053859964093355, + "grad_norm": 1.0381031036376953, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 91980 + }, + { + "epoch": 6.6061041292639135, + "grad_norm": 1.1706395149230957, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 91990 + }, + { + "epoch": 6.6068222621184916, + "grad_norm": 1.1102650165557861, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 92000 + }, + { + "epoch": 6.6075403949730696, + "grad_norm": 0.9234306812286377, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 92010 + }, + { + "epoch": 6.608258527827648, + "grad_norm": 1.2014371156692505, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 92020 + }, + { + "epoch": 6.6089766606822264, + "grad_norm": 0.9392209053039551, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 92030 + }, + { + "epoch": 6.6096947935368044, + "grad_norm": 1.0882072448730469, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 92040 + }, + { + "epoch": 6.6104129263913824, + "grad_norm": 1.032155156135559, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 92050 + }, + { + "epoch": 6.6111310592459605, + "grad_norm": 0.913979172706604, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 92060 + }, + { + "epoch": 6.6118491921005385, + "grad_norm": 1.205101490020752, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 92070 + }, + { + "epoch": 6.6125673249551165, + "grad_norm": 1.0713984966278076, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 92080 + }, + { + "epoch": 6.6132854578096945, + "grad_norm": 0.9191082715988159, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 92090 + }, + { + "epoch": 6.614003590664273, + "grad_norm": 0.9553678631782532, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 92100 + }, + { + "epoch": 6.614721723518851, + "grad_norm": 1.333262324333191, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 92110 + }, + { + "epoch": 6.615439856373429, + "grad_norm": 1.030739426612854, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 92120 + }, + { + "epoch": 6.616157989228007, + "grad_norm": 0.8777900338172913, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 92130 + }, + { + "epoch": 6.616876122082585, + "grad_norm": 1.071578860282898, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 92140 + }, + { + "epoch": 6.617594254937163, + "grad_norm": 1.1931039094924927, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 92150 + }, + { + "epoch": 6.618312387791741, + "grad_norm": 1.2041425704956055, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 92160 + }, + { + "epoch": 6.619030520646319, + "grad_norm": 0.8523036241531372, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 92170 + }, + { + "epoch": 6.619748653500897, + "grad_norm": 1.1914807558059692, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 92180 + }, + { + "epoch": 6.620466786355475, + "grad_norm": 1.1336464881896973, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 92190 + }, + { + "epoch": 6.621184919210053, + "grad_norm": 1.2282923460006714, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 92200 + }, + { + "epoch": 6.621903052064632, + "grad_norm": 1.1887043714523315, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 92210 + }, + { + "epoch": 6.62262118491921, + "grad_norm": 0.9654178619384766, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 92220 + }, + { + "epoch": 6.623339317773788, + "grad_norm": 0.7957702875137329, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 92230 + }, + { + "epoch": 6.624057450628366, + "grad_norm": 0.8697461485862732, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 92240 + }, + { + "epoch": 6.624775583482944, + "grad_norm": 1.0392963886260986, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 92250 + }, + { + "epoch": 6.625493716337522, + "grad_norm": 1.1502392292022705, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 92260 + }, + { + "epoch": 6.6262118491921, + "grad_norm": 1.2818870544433594, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 92270 + }, + { + "epoch": 6.626929982046678, + "grad_norm": 0.8769828081130981, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 92280 + }, + { + "epoch": 6.627648114901257, + "grad_norm": 1.2273039817810059, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 92290 + }, + { + "epoch": 6.628366247755835, + "grad_norm": 0.8619378805160522, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 92300 + }, + { + "epoch": 6.629084380610413, + "grad_norm": 0.9501098990440369, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 92310 + }, + { + "epoch": 6.629802513464991, + "grad_norm": 1.0698163509368896, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 92320 + }, + { + "epoch": 6.630520646319569, + "grad_norm": 1.0689377784729004, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 92330 + }, + { + "epoch": 6.631238779174147, + "grad_norm": 1.2086275815963745, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 92340 + }, + { + "epoch": 6.631956912028725, + "grad_norm": 1.1256859302520752, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 92350 + }, + { + "epoch": 6.632675044883303, + "grad_norm": 0.9717738032341003, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 92360 + }, + { + "epoch": 6.633393177737881, + "grad_norm": 0.9784330725669861, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 92370 + }, + { + "epoch": 6.634111310592459, + "grad_norm": 1.2600007057189941, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 92380 + }, + { + "epoch": 6.634829443447038, + "grad_norm": 0.889910101890564, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 92390 + }, + { + "epoch": 6.635547576301616, + "grad_norm": 1.010524868965149, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 92400 + }, + { + "epoch": 6.636265709156194, + "grad_norm": 1.325664758682251, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 92410 + }, + { + "epoch": 6.636983842010772, + "grad_norm": 1.3910914659500122, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 92420 + }, + { + "epoch": 6.63770197486535, + "grad_norm": 0.8858863115310669, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 92430 + }, + { + "epoch": 6.638420107719928, + "grad_norm": 1.1841683387756348, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 92440 + }, + { + "epoch": 6.639138240574506, + "grad_norm": 1.2783559560775757, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 92450 + }, + { + "epoch": 6.639856373429084, + "grad_norm": 0.9154769778251648, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 92460 + }, + { + "epoch": 6.640574506283663, + "grad_norm": 1.003371000289917, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 92470 + }, + { + "epoch": 6.641292639138241, + "grad_norm": 0.9700522422790527, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 92480 + }, + { + "epoch": 6.642010771992819, + "grad_norm": 1.273629069328308, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 92490 + }, + { + "epoch": 6.642728904847397, + "grad_norm": 1.2746435403823853, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 92500 + }, + { + "epoch": 6.643447037701975, + "grad_norm": 1.0184870958328247, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 92510 + }, + { + "epoch": 6.644165170556553, + "grad_norm": 0.9988235831260681, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 92520 + }, + { + "epoch": 6.644883303411131, + "grad_norm": 1.075997233390808, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 92530 + }, + { + "epoch": 6.645601436265709, + "grad_norm": 1.180784821510315, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 92540 + }, + { + "epoch": 6.646319569120287, + "grad_norm": 1.0889579057693481, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 92550 + }, + { + "epoch": 6.647037701974865, + "grad_norm": 1.0069187879562378, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 92560 + }, + { + "epoch": 6.647755834829443, + "grad_norm": 1.110495686531067, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 92570 + }, + { + "epoch": 6.648473967684022, + "grad_norm": 1.0540684461593628, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 92580 + }, + { + "epoch": 6.6491921005386, + "grad_norm": 1.0917930603027344, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 92590 + }, + { + "epoch": 6.649910233393178, + "grad_norm": 1.225898027420044, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 92600 + }, + { + "epoch": 6.650628366247756, + "grad_norm": 0.9372484087944031, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 92610 + }, + { + "epoch": 6.651346499102334, + "grad_norm": 0.98685622215271, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 92620 + }, + { + "epoch": 6.652064631956912, + "grad_norm": 1.1148556470870972, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 92630 + }, + { + "epoch": 6.65278276481149, + "grad_norm": 1.1483707427978516, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 92640 + }, + { + "epoch": 6.653500897666068, + "grad_norm": 1.092708706855774, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 92650 + }, + { + "epoch": 6.654219030520647, + "grad_norm": 1.0641281604766846, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 92660 + }, + { + "epoch": 6.654937163375225, + "grad_norm": 0.9953374862670898, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 92670 + }, + { + "epoch": 6.655655296229803, + "grad_norm": 0.9792306423187256, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 92680 + }, + { + "epoch": 6.656373429084381, + "grad_norm": 1.1209690570831299, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 92690 + }, + { + "epoch": 6.657091561938959, + "grad_norm": 0.8281117677688599, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 92700 + }, + { + "epoch": 6.657809694793537, + "grad_norm": 0.9189280867576599, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 92710 + }, + { + "epoch": 6.658527827648115, + "grad_norm": 1.1859153509140015, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 92720 + }, + { + "epoch": 6.659245960502693, + "grad_norm": 0.9750476479530334, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 92730 + }, + { + "epoch": 6.659964093357271, + "grad_norm": 0.9973570704460144, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 92740 + }, + { + "epoch": 6.660682226211849, + "grad_norm": 1.0170378684997559, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 92750 + }, + { + "epoch": 6.661400359066427, + "grad_norm": 1.352283239364624, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 92760 + }, + { + "epoch": 6.662118491921006, + "grad_norm": 1.1020066738128662, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 92770 + }, + { + "epoch": 6.662836624775584, + "grad_norm": 1.0750092267990112, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 92780 + }, + { + "epoch": 6.663554757630162, + "grad_norm": 1.1006640195846558, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 92790 + }, + { + "epoch": 6.66427289048474, + "grad_norm": 1.2372384071350098, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 92800 + }, + { + "epoch": 6.664991023339318, + "grad_norm": 1.084846019744873, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 92810 + }, + { + "epoch": 6.665709156193896, + "grad_norm": 1.1738693714141846, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 92820 + }, + { + "epoch": 6.666427289048474, + "grad_norm": 1.159678339958191, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 92830 + }, + { + "epoch": 6.667145421903052, + "grad_norm": 0.9957766532897949, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 92840 + }, + { + "epoch": 6.667863554757631, + "grad_norm": 1.1403744220733643, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 92850 + }, + { + "epoch": 6.668581687612209, + "grad_norm": 1.0120519399642944, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 92860 + }, + { + "epoch": 6.669299820466787, + "grad_norm": 1.0876718759536743, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 92870 + }, + { + "epoch": 6.670017953321365, + "grad_norm": 1.175749659538269, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 92880 + }, + { + "epoch": 6.670736086175943, + "grad_norm": 0.9808473587036133, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 92890 + }, + { + "epoch": 6.671454219030521, + "grad_norm": 1.121573805809021, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 92900 + }, + { + "epoch": 6.672172351885099, + "grad_norm": 0.9749727249145508, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 92910 + }, + { + "epoch": 6.672890484739677, + "grad_norm": 1.0969820022583008, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 92920 + }, + { + "epoch": 6.673608617594255, + "grad_norm": 1.0777957439422607, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 92930 + }, + { + "epoch": 6.674326750448833, + "grad_norm": 1.2342437505722046, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 92940 + }, + { + "epoch": 6.6750448833034115, + "grad_norm": 1.18901789188385, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 92950 + }, + { + "epoch": 6.6757630161579895, + "grad_norm": 1.2212412357330322, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 92960 + }, + { + "epoch": 6.6764811490125675, + "grad_norm": 1.0007524490356445, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 92970 + }, + { + "epoch": 6.6771992818671455, + "grad_norm": 1.1012821197509766, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 92980 + }, + { + "epoch": 6.6779174147217235, + "grad_norm": 0.9446989893913269, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 92990 + }, + { + "epoch": 6.6786355475763015, + "grad_norm": 1.5307164192199707, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 93000 + }, + { + "epoch": 6.6793536804308795, + "grad_norm": 1.4290575981140137, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 93010 + }, + { + "epoch": 6.6800718132854575, + "grad_norm": 1.2367054224014282, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 93020 + }, + { + "epoch": 6.680789946140036, + "grad_norm": 0.874568521976471, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 93030 + }, + { + "epoch": 6.681508078994614, + "grad_norm": 1.152861475944519, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 93040 + }, + { + "epoch": 6.682226211849192, + "grad_norm": 0.9524891972541809, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 93050 + }, + { + "epoch": 6.68294434470377, + "grad_norm": 0.8084558844566345, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 93060 + }, + { + "epoch": 6.683662477558348, + "grad_norm": 1.1458806991577148, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 93070 + }, + { + "epoch": 6.684380610412926, + "grad_norm": 1.1427397727966309, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 93080 + }, + { + "epoch": 6.685098743267504, + "grad_norm": 1.1136237382888794, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 93090 + }, + { + "epoch": 6.685816876122082, + "grad_norm": 1.0270767211914062, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 93100 + }, + { + "epoch": 6.68653500897666, + "grad_norm": 0.9473410844802856, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 93110 + }, + { + "epoch": 6.687253141831238, + "grad_norm": 1.011011004447937, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 93120 + }, + { + "epoch": 6.687971274685816, + "grad_norm": 0.9286965131759644, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 93130 + }, + { + "epoch": 6.688689407540395, + "grad_norm": 1.226515293121338, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 93140 + }, + { + "epoch": 6.689407540394973, + "grad_norm": 0.9131909608840942, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 93150 + }, + { + "epoch": 6.690125673249551, + "grad_norm": 1.2111890316009521, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 93160 + }, + { + "epoch": 6.690843806104129, + "grad_norm": 0.9296384453773499, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 93170 + }, + { + "epoch": 6.691561938958707, + "grad_norm": 0.9636726975440979, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 93180 + }, + { + "epoch": 6.692280071813285, + "grad_norm": 1.0116214752197266, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 93190 + }, + { + "epoch": 6.692998204667863, + "grad_norm": 1.2671175003051758, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 93200 + }, + { + "epoch": 6.693716337522441, + "grad_norm": 1.0676039457321167, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 93210 + }, + { + "epoch": 6.69443447037702, + "grad_norm": 1.3277634382247925, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 93220 + }, + { + "epoch": 6.695152603231598, + "grad_norm": 0.9312936663627625, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 93230 + }, + { + "epoch": 6.695870736086176, + "grad_norm": 1.410414457321167, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 93240 + }, + { + "epoch": 6.696588868940754, + "grad_norm": 1.014519453048706, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 93250 + }, + { + "epoch": 6.697307001795332, + "grad_norm": 0.9211319088935852, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 93260 + }, + { + "epoch": 6.69802513464991, + "grad_norm": 1.1027755737304688, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 93270 + }, + { + "epoch": 6.698743267504488, + "grad_norm": 1.0538618564605713, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 93280 + }, + { + "epoch": 6.699461400359066, + "grad_norm": 1.159927248954773, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 93290 + }, + { + "epoch": 6.700179533213644, + "grad_norm": 1.1329137086868286, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 93300 + }, + { + "epoch": 6.700897666068222, + "grad_norm": 0.9797694683074951, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 93310 + }, + { + "epoch": 6.7016157989228, + "grad_norm": 1.0968587398529053, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 93320 + }, + { + "epoch": 6.702333931777379, + "grad_norm": 0.9620516896247864, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 93330 + }, + { + "epoch": 6.703052064631957, + "grad_norm": 1.048879623413086, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 93340 + }, + { + "epoch": 6.703770197486535, + "grad_norm": 1.086421012878418, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 93350 + }, + { + "epoch": 6.704488330341113, + "grad_norm": 1.1045429706573486, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 93360 + }, + { + "epoch": 6.705206463195691, + "grad_norm": 1.081629991531372, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 93370 + }, + { + "epoch": 6.705924596050269, + "grad_norm": 0.9947898387908936, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 93380 + }, + { + "epoch": 6.706642728904847, + "grad_norm": 0.8837184309959412, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 93390 + }, + { + "epoch": 6.707360861759425, + "grad_norm": 1.1838666200637817, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 93400 + }, + { + "epoch": 6.708078994614004, + "grad_norm": 0.9221062064170837, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 93410 + }, + { + "epoch": 6.708797127468582, + "grad_norm": 1.0049937963485718, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 93420 + }, + { + "epoch": 6.70951526032316, + "grad_norm": 0.8895014524459839, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 93430 + }, + { + "epoch": 6.710233393177738, + "grad_norm": 1.2572799921035767, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 93440 + }, + { + "epoch": 6.710951526032316, + "grad_norm": 1.082982063293457, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 93450 + }, + { + "epoch": 6.711669658886894, + "grad_norm": 1.1520570516586304, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 93460 + }, + { + "epoch": 6.712387791741472, + "grad_norm": 1.0604512691497803, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 93470 + }, + { + "epoch": 6.71310592459605, + "grad_norm": 0.9887481331825256, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 93480 + }, + { + "epoch": 6.713824057450628, + "grad_norm": 1.0163664817810059, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 93490 + }, + { + "epoch": 6.714542190305206, + "grad_norm": 1.187687873840332, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 93500 + }, + { + "epoch": 6.715260323159785, + "grad_norm": 0.8770190477371216, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 93510 + }, + { + "epoch": 6.715978456014363, + "grad_norm": 1.1552737951278687, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 93520 + }, + { + "epoch": 6.716696588868941, + "grad_norm": 1.168770432472229, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 93530 + }, + { + "epoch": 6.717414721723519, + "grad_norm": 1.1071383953094482, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 93540 + }, + { + "epoch": 6.718132854578097, + "grad_norm": 0.8549296259880066, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 93550 + }, + { + "epoch": 6.718850987432675, + "grad_norm": 1.1576329469680786, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 93560 + }, + { + "epoch": 6.719569120287253, + "grad_norm": 1.1610777378082275, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 93570 + }, + { + "epoch": 6.720287253141831, + "grad_norm": 1.0316133499145508, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 93580 + }, + { + "epoch": 6.721005385996409, + "grad_norm": 1.1048495769500732, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 93590 + }, + { + "epoch": 6.721723518850988, + "grad_norm": 1.1212984323501587, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 93600 + }, + { + "epoch": 6.722441651705566, + "grad_norm": 1.1465938091278076, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 93610 + }, + { + "epoch": 6.723159784560144, + "grad_norm": 0.8978183269500732, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 93620 + }, + { + "epoch": 6.723877917414722, + "grad_norm": 1.0475369691848755, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 93630 + }, + { + "epoch": 6.7245960502693, + "grad_norm": 1.0717675685882568, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 93640 + }, + { + "epoch": 6.725314183123878, + "grad_norm": 1.2429792881011963, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 93650 + }, + { + "epoch": 6.726032315978456, + "grad_norm": 1.0333678722381592, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 93660 + }, + { + "epoch": 6.726750448833034, + "grad_norm": 1.211590051651001, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 93670 + }, + { + "epoch": 6.727468581687612, + "grad_norm": 1.0022165775299072, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 93680 + }, + { + "epoch": 6.72818671454219, + "grad_norm": 1.0192183256149292, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 93690 + }, + { + "epoch": 6.728904847396769, + "grad_norm": 0.9370006322860718, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 93700 + }, + { + "epoch": 6.729622980251347, + "grad_norm": 0.7869033813476562, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 93710 + }, + { + "epoch": 6.730341113105925, + "grad_norm": 0.899703860282898, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 93720 + }, + { + "epoch": 6.731059245960503, + "grad_norm": 1.1216487884521484, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 93730 + }, + { + "epoch": 6.731777378815081, + "grad_norm": 0.9117740988731384, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 93740 + }, + { + "epoch": 6.732495511669659, + "grad_norm": 1.070947289466858, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 93750 + }, + { + "epoch": 6.733213644524237, + "grad_norm": 1.0529371500015259, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 93760 + }, + { + "epoch": 6.733931777378815, + "grad_norm": 0.7950748801231384, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 93770 + }, + { + "epoch": 6.734649910233394, + "grad_norm": 1.0469520092010498, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 93780 + }, + { + "epoch": 6.735368043087972, + "grad_norm": 1.4734543561935425, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 93790 + }, + { + "epoch": 6.73608617594255, + "grad_norm": 0.8239574432373047, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 93800 + }, + { + "epoch": 6.736804308797128, + "grad_norm": 1.1228505373001099, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 93810 + }, + { + "epoch": 6.737522441651706, + "grad_norm": 1.0902183055877686, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 93820 + }, + { + "epoch": 6.738240574506284, + "grad_norm": 1.220467209815979, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 93830 + }, + { + "epoch": 6.738958707360862, + "grad_norm": 1.199582815170288, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 93840 + }, + { + "epoch": 6.73967684021544, + "grad_norm": 1.1008597612380981, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 93850 + }, + { + "epoch": 6.740394973070018, + "grad_norm": 0.8596068620681763, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 93860 + }, + { + "epoch": 6.741113105924596, + "grad_norm": 1.220947027206421, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 93870 + }, + { + "epoch": 6.741831238779174, + "grad_norm": 1.2840452194213867, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 93880 + }, + { + "epoch": 6.742549371633753, + "grad_norm": 1.1923094987869263, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 93890 + }, + { + "epoch": 6.743267504488331, + "grad_norm": 1.1287206411361694, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 93900 + }, + { + "epoch": 6.743985637342909, + "grad_norm": 0.9465082287788391, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 93910 + }, + { + "epoch": 6.744703770197487, + "grad_norm": 0.9888480305671692, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 93920 + }, + { + "epoch": 6.745421903052065, + "grad_norm": 1.1438485383987427, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 93930 + }, + { + "epoch": 6.746140035906643, + "grad_norm": 0.8203039169311523, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 93940 + }, + { + "epoch": 6.746858168761221, + "grad_norm": 1.217855453491211, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 93950 + }, + { + "epoch": 6.747576301615799, + "grad_norm": 1.245977520942688, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 93960 + }, + { + "epoch": 6.7482944344703775, + "grad_norm": 1.240097165107727, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 93970 + }, + { + "epoch": 6.7490125673249555, + "grad_norm": 0.9436663389205933, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 93980 + }, + { + "epoch": 6.7497307001795335, + "grad_norm": 0.9331963062286377, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 93990 + }, + { + "epoch": 6.7504488330341115, + "grad_norm": 0.9809562563896179, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 94000 + }, + { + "epoch": 6.7511669658886895, + "grad_norm": 1.1596009731292725, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 94010 + }, + { + "epoch": 6.7518850987432675, + "grad_norm": 1.082684874534607, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 94020 + }, + { + "epoch": 6.7526032315978455, + "grad_norm": 0.9931458234786987, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 94030 + }, + { + "epoch": 6.7533213644524235, + "grad_norm": 0.8717518448829651, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 94040 + }, + { + "epoch": 6.7540394973070015, + "grad_norm": 0.9379602074623108, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 94050 + }, + { + "epoch": 6.7547576301615795, + "grad_norm": 0.8819605708122253, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 94060 + }, + { + "epoch": 6.755475763016158, + "grad_norm": 1.111547589302063, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 94070 + }, + { + "epoch": 6.756193895870736, + "grad_norm": 1.0755881071090698, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 94080 + }, + { + "epoch": 6.756912028725314, + "grad_norm": 1.0734093189239502, + "learning_rate": 0.0002, + "loss": 0.5494, + "step": 94090 + }, + { + "epoch": 6.757630161579892, + "grad_norm": 1.0390300750732422, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 94100 + }, + { + "epoch": 6.75834829443447, + "grad_norm": 0.9557124972343445, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 94110 + }, + { + "epoch": 6.759066427289048, + "grad_norm": 1.0970680713653564, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 94120 + }, + { + "epoch": 6.759784560143626, + "grad_norm": 1.0715644359588623, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 94130 + }, + { + "epoch": 6.760502692998204, + "grad_norm": 1.1311662197113037, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 94140 + }, + { + "epoch": 6.761220825852782, + "grad_norm": 0.9891370534896851, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 94150 + }, + { + "epoch": 6.761938958707361, + "grad_norm": 0.9472686648368835, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 94160 + }, + { + "epoch": 6.762657091561939, + "grad_norm": 1.1044381856918335, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 94170 + }, + { + "epoch": 6.763375224416517, + "grad_norm": 1.2088780403137207, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 94180 + }, + { + "epoch": 6.764093357271095, + "grad_norm": 0.9210726618766785, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 94190 + }, + { + "epoch": 6.764811490125673, + "grad_norm": 1.0969771146774292, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 94200 + }, + { + "epoch": 6.765529622980251, + "grad_norm": 1.1030265092849731, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 94210 + }, + { + "epoch": 6.766247755834829, + "grad_norm": 0.9451745748519897, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 94220 + }, + { + "epoch": 6.766965888689407, + "grad_norm": 1.0216296911239624, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 94230 + }, + { + "epoch": 6.767684021543985, + "grad_norm": 1.4021092653274536, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 94240 + }, + { + "epoch": 6.768402154398563, + "grad_norm": 1.2341269254684448, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 94250 + }, + { + "epoch": 6.769120287253142, + "grad_norm": 1.1086686849594116, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 94260 + }, + { + "epoch": 6.76983842010772, + "grad_norm": 0.8565682172775269, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 94270 + }, + { + "epoch": 6.770556552962298, + "grad_norm": 0.9314411878585815, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 94280 + }, + { + "epoch": 6.771274685816876, + "grad_norm": 1.0592315196990967, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 94290 + }, + { + "epoch": 6.771992818671454, + "grad_norm": 1.086379885673523, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 94300 + }, + { + "epoch": 6.772710951526032, + "grad_norm": 1.13401198387146, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 94310 + }, + { + "epoch": 6.77342908438061, + "grad_norm": 1.0137985944747925, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 94320 + }, + { + "epoch": 6.774147217235188, + "grad_norm": 1.0459709167480469, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 94330 + }, + { + "epoch": 6.774865350089767, + "grad_norm": 1.2213165760040283, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 94340 + }, + { + "epoch": 6.775583482944345, + "grad_norm": 1.099478006362915, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 94350 + }, + { + "epoch": 6.776301615798923, + "grad_norm": 1.124526858329773, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 94360 + }, + { + "epoch": 6.777019748653501, + "grad_norm": 1.0199998617172241, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 94370 + }, + { + "epoch": 6.777737881508079, + "grad_norm": 1.1849408149719238, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 94380 + }, + { + "epoch": 6.778456014362657, + "grad_norm": 1.2265552282333374, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 94390 + }, + { + "epoch": 6.779174147217235, + "grad_norm": 0.7576864361763, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 94400 + }, + { + "epoch": 6.779892280071813, + "grad_norm": 0.8172970414161682, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 94410 + }, + { + "epoch": 6.780610412926391, + "grad_norm": 1.1105220317840576, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 94420 + }, + { + "epoch": 6.781328545780969, + "grad_norm": 1.0542421340942383, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 94430 + }, + { + "epoch": 6.782046678635547, + "grad_norm": 1.0088121891021729, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 94440 + }, + { + "epoch": 6.782764811490126, + "grad_norm": 0.9872488379478455, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 94450 + }, + { + "epoch": 6.783482944344704, + "grad_norm": 1.2545148134231567, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 94460 + }, + { + "epoch": 6.784201077199282, + "grad_norm": 0.8847712278366089, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 94470 + }, + { + "epoch": 6.78491921005386, + "grad_norm": 0.7758765816688538, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 94480 + }, + { + "epoch": 6.785637342908438, + "grad_norm": 1.0454037189483643, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 94490 + }, + { + "epoch": 6.786355475763016, + "grad_norm": 1.1336725950241089, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 94500 + }, + { + "epoch": 6.787073608617594, + "grad_norm": 1.081356406211853, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 94510 + }, + { + "epoch": 6.787791741472172, + "grad_norm": 1.126288890838623, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 94520 + }, + { + "epoch": 6.788509874326751, + "grad_norm": 1.1156792640686035, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 94530 + }, + { + "epoch": 6.789228007181329, + "grad_norm": 1.0243451595306396, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 94540 + }, + { + "epoch": 6.789946140035907, + "grad_norm": 0.9778338670730591, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 94550 + }, + { + "epoch": 6.790664272890485, + "grad_norm": 0.9668094515800476, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 94560 + }, + { + "epoch": 6.791382405745063, + "grad_norm": 1.121848464012146, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 94570 + }, + { + "epoch": 6.792100538599641, + "grad_norm": 1.105825662612915, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 94580 + }, + { + "epoch": 6.792818671454219, + "grad_norm": 1.1236833333969116, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 94590 + }, + { + "epoch": 6.793536804308797, + "grad_norm": 1.0655126571655273, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 94600 + }, + { + "epoch": 6.794254937163375, + "grad_norm": 0.9249289631843567, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 94610 + }, + { + "epoch": 6.794973070017953, + "grad_norm": 1.0177690982818604, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 94620 + }, + { + "epoch": 6.795691202872531, + "grad_norm": 1.1961153745651245, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 94630 + }, + { + "epoch": 6.79640933572711, + "grad_norm": 1.0987505912780762, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 94640 + }, + { + "epoch": 6.797127468581688, + "grad_norm": 1.0165259838104248, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 94650 + }, + { + "epoch": 6.797845601436266, + "grad_norm": 1.1336601972579956, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 94660 + }, + { + "epoch": 6.798563734290844, + "grad_norm": 1.0786010026931763, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 94670 + }, + { + "epoch": 6.799281867145422, + "grad_norm": 1.2896602153778076, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 94680 + }, + { + "epoch": 6.8, + "grad_norm": 1.0934168100357056, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 94690 + }, + { + "epoch": 6.800718132854578, + "grad_norm": 1.1080414056777954, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 94700 + }, + { + "epoch": 6.801436265709156, + "grad_norm": 1.1141704320907593, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 94710 + }, + { + "epoch": 6.802154398563735, + "grad_norm": 0.9571144580841064, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 94720 + }, + { + "epoch": 6.802872531418313, + "grad_norm": 0.8907591700553894, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 94730 + }, + { + "epoch": 6.803590664272891, + "grad_norm": 1.0547759532928467, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 94740 + }, + { + "epoch": 6.804308797127469, + "grad_norm": 0.973573625087738, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 94750 + }, + { + "epoch": 6.805026929982047, + "grad_norm": 0.7889130711555481, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 94760 + }, + { + "epoch": 6.805745062836625, + "grad_norm": 0.9414647221565247, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 94770 + }, + { + "epoch": 6.806463195691203, + "grad_norm": 0.9452534317970276, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 94780 + }, + { + "epoch": 6.807181328545781, + "grad_norm": 1.2215145826339722, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 94790 + }, + { + "epoch": 6.807899461400359, + "grad_norm": 1.116302490234375, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 94800 + }, + { + "epoch": 6.808617594254937, + "grad_norm": 0.850916862487793, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 94810 + }, + { + "epoch": 6.809335727109516, + "grad_norm": 0.8699719905853271, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 94820 + }, + { + "epoch": 6.810053859964094, + "grad_norm": 1.0958143472671509, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 94830 + }, + { + "epoch": 6.810771992818672, + "grad_norm": 1.128580927848816, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 94840 + }, + { + "epoch": 6.81149012567325, + "grad_norm": 0.9490674138069153, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 94850 + }, + { + "epoch": 6.812208258527828, + "grad_norm": 0.9294022917747498, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 94860 + }, + { + "epoch": 6.812926391382406, + "grad_norm": 1.048378348350525, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 94870 + }, + { + "epoch": 6.813644524236984, + "grad_norm": 1.1972805261611938, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 94880 + }, + { + "epoch": 6.814362657091562, + "grad_norm": 0.7709503769874573, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 94890 + }, + { + "epoch": 6.8150807899461405, + "grad_norm": 1.0244873762130737, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 94900 + }, + { + "epoch": 6.8157989228007185, + "grad_norm": 1.0576984882354736, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 94910 + }, + { + "epoch": 6.8165170556552965, + "grad_norm": 1.3478775024414062, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 94920 + }, + { + "epoch": 6.8172351885098745, + "grad_norm": 0.982311487197876, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 94930 + }, + { + "epoch": 6.8179533213644525, + "grad_norm": 1.1846535205841064, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 94940 + }, + { + "epoch": 6.8186714542190305, + "grad_norm": 0.9255896210670471, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 94950 + }, + { + "epoch": 6.8193895870736085, + "grad_norm": 0.9418646693229675, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 94960 + }, + { + "epoch": 6.8201077199281865, + "grad_norm": 1.189335584640503, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 94970 + }, + { + "epoch": 6.8208258527827645, + "grad_norm": 1.1003406047821045, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 94980 + }, + { + "epoch": 6.8215439856373425, + "grad_norm": 0.9203724265098572, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 94990 + }, + { + "epoch": 6.8222621184919205, + "grad_norm": 1.093252182006836, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 95000 + }, + { + "epoch": 6.822980251346499, + "grad_norm": 1.2737812995910645, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 95010 + }, + { + "epoch": 6.823698384201077, + "grad_norm": 1.1859848499298096, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 95020 + }, + { + "epoch": 6.824416517055655, + "grad_norm": 0.9591164588928223, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 95030 + }, + { + "epoch": 6.825134649910233, + "grad_norm": 1.0144239664077759, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 95040 + }, + { + "epoch": 6.825852782764811, + "grad_norm": 1.2520356178283691, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 95050 + }, + { + "epoch": 6.8265709156193894, + "grad_norm": 1.003438115119934, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 95060 + }, + { + "epoch": 6.8272890484739674, + "grad_norm": 0.9512312412261963, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 95070 + }, + { + "epoch": 6.8280071813285454, + "grad_norm": 0.9984938502311707, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 95080 + }, + { + "epoch": 6.828725314183124, + "grad_norm": 0.9630827307701111, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 95090 + }, + { + "epoch": 6.829443447037702, + "grad_norm": 0.8859394192695618, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 95100 + }, + { + "epoch": 6.83016157989228, + "grad_norm": 0.9082155227661133, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 95110 + }, + { + "epoch": 6.830879712746858, + "grad_norm": 1.0707300901412964, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 95120 + }, + { + "epoch": 6.831597845601436, + "grad_norm": 1.2023502588272095, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 95130 + }, + { + "epoch": 6.832315978456014, + "grad_norm": 1.0189216136932373, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 95140 + }, + { + "epoch": 6.833034111310592, + "grad_norm": 1.1216851472854614, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 95150 + }, + { + "epoch": 6.83375224416517, + "grad_norm": 1.124589204788208, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 95160 + }, + { + "epoch": 6.834470377019748, + "grad_norm": 1.1183217763900757, + "learning_rate": 0.0002, + "loss": 0.5374, + "step": 95170 + }, + { + "epoch": 6.835188509874326, + "grad_norm": 1.0307188034057617, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 95180 + }, + { + "epoch": 6.835906642728904, + "grad_norm": 1.2438706159591675, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 95190 + }, + { + "epoch": 6.836624775583483, + "grad_norm": 1.117887258529663, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 95200 + }, + { + "epoch": 6.837342908438061, + "grad_norm": 0.8934445381164551, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 95210 + }, + { + "epoch": 6.838061041292639, + "grad_norm": 1.097379207611084, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 95220 + }, + { + "epoch": 6.838779174147217, + "grad_norm": 1.1034258604049683, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 95230 + }, + { + "epoch": 6.839497307001795, + "grad_norm": 1.052120327949524, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 95240 + }, + { + "epoch": 6.840215439856373, + "grad_norm": 1.0844687223434448, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 95250 + }, + { + "epoch": 6.840933572710951, + "grad_norm": 1.1553566455841064, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 95260 + }, + { + "epoch": 6.841651705565529, + "grad_norm": 1.1977533102035522, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 95270 + }, + { + "epoch": 6.842369838420108, + "grad_norm": 0.9635998010635376, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 95280 + }, + { + "epoch": 6.843087971274686, + "grad_norm": 1.0867844820022583, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 95290 + }, + { + "epoch": 6.843806104129264, + "grad_norm": 1.1252882480621338, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 95300 + }, + { + "epoch": 6.844524236983842, + "grad_norm": 1.1130266189575195, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 95310 + }, + { + "epoch": 6.84524236983842, + "grad_norm": 1.058863878250122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 95320 + }, + { + "epoch": 6.845960502692998, + "grad_norm": 1.173840880393982, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 95330 + }, + { + "epoch": 6.846678635547576, + "grad_norm": 1.09446120262146, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 95340 + }, + { + "epoch": 6.847396768402154, + "grad_norm": 1.0762465000152588, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 95350 + }, + { + "epoch": 6.848114901256732, + "grad_norm": 1.0056897401809692, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 95360 + }, + { + "epoch": 6.84883303411131, + "grad_norm": 0.929190456867218, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 95370 + }, + { + "epoch": 6.849551166965889, + "grad_norm": 1.1152058839797974, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 95380 + }, + { + "epoch": 6.850269299820467, + "grad_norm": 1.0163987874984741, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 95390 + }, + { + "epoch": 6.850987432675045, + "grad_norm": 1.1169452667236328, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 95400 + }, + { + "epoch": 6.851705565529623, + "grad_norm": 1.2225226163864136, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 95410 + }, + { + "epoch": 6.852423698384201, + "grad_norm": 1.0833172798156738, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 95420 + }, + { + "epoch": 6.853141831238779, + "grad_norm": 1.0159578323364258, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 95430 + }, + { + "epoch": 6.853859964093357, + "grad_norm": 1.1164990663528442, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 95440 + }, + { + "epoch": 6.854578096947935, + "grad_norm": 1.1340656280517578, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 95450 + }, + { + "epoch": 6.855296229802514, + "grad_norm": 1.1228697299957275, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 95460 + }, + { + "epoch": 6.856014362657092, + "grad_norm": 1.0189276933670044, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 95470 + }, + { + "epoch": 6.85673249551167, + "grad_norm": 1.1692779064178467, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 95480 + }, + { + "epoch": 6.857450628366248, + "grad_norm": 1.0779703855514526, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 95490 + }, + { + "epoch": 6.858168761220826, + "grad_norm": 1.0127906799316406, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 95500 + }, + { + "epoch": 6.858886894075404, + "grad_norm": 1.2124756574630737, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 95510 + }, + { + "epoch": 6.859605026929982, + "grad_norm": 1.0948219299316406, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 95520 + }, + { + "epoch": 6.86032315978456, + "grad_norm": 0.8796268701553345, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 95530 + }, + { + "epoch": 6.861041292639138, + "grad_norm": 1.0725175142288208, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 95540 + }, + { + "epoch": 6.861759425493716, + "grad_norm": 0.9067171812057495, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 95550 + }, + { + "epoch": 6.862477558348294, + "grad_norm": 1.0576670169830322, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 95560 + }, + { + "epoch": 6.863195691202873, + "grad_norm": 0.9622264504432678, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 95570 + }, + { + "epoch": 6.863913824057451, + "grad_norm": 1.0197248458862305, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 95580 + }, + { + "epoch": 6.864631956912029, + "grad_norm": 0.9197335243225098, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 95590 + }, + { + "epoch": 6.865350089766607, + "grad_norm": 1.0169627666473389, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 95600 + }, + { + "epoch": 6.866068222621185, + "grad_norm": 0.9868543744087219, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 95610 + }, + { + "epoch": 6.866786355475763, + "grad_norm": 0.9861942529678345, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 95620 + }, + { + "epoch": 6.867504488330341, + "grad_norm": 1.0906847715377808, + "learning_rate": 0.0002, + "loss": 0.5753, + "step": 95630 + }, + { + "epoch": 6.868222621184919, + "grad_norm": 1.2462674379348755, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 95640 + }, + { + "epoch": 6.868940754039498, + "grad_norm": 0.9801536202430725, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 95650 + }, + { + "epoch": 6.869658886894076, + "grad_norm": 1.0568761825561523, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 95660 + }, + { + "epoch": 6.870377019748654, + "grad_norm": 0.8431015014648438, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 95670 + }, + { + "epoch": 6.871095152603232, + "grad_norm": 1.2253447771072388, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 95680 + }, + { + "epoch": 6.87181328545781, + "grad_norm": 0.8862479329109192, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 95690 + }, + { + "epoch": 6.872531418312388, + "grad_norm": 1.0733704566955566, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 95700 + }, + { + "epoch": 6.873249551166966, + "grad_norm": 0.9327288269996643, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 95710 + }, + { + "epoch": 6.873967684021544, + "grad_norm": 0.9877831339836121, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 95720 + }, + { + "epoch": 6.874685816876122, + "grad_norm": 0.9772239327430725, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 95730 + }, + { + "epoch": 6.8754039497307, + "grad_norm": 0.9799681901931763, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 95740 + }, + { + "epoch": 6.876122082585278, + "grad_norm": 1.0650758743286133, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 95750 + }, + { + "epoch": 6.876840215439857, + "grad_norm": 1.068557858467102, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 95760 + }, + { + "epoch": 6.877558348294435, + "grad_norm": 1.1335437297821045, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 95770 + }, + { + "epoch": 6.878276481149013, + "grad_norm": 0.8993158936500549, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 95780 + }, + { + "epoch": 6.878994614003591, + "grad_norm": 1.0593502521514893, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 95790 + }, + { + "epoch": 6.879712746858169, + "grad_norm": 1.2181397676467896, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 95800 + }, + { + "epoch": 6.880430879712747, + "grad_norm": 0.9614198207855225, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 95810 + }, + { + "epoch": 6.881149012567325, + "grad_norm": 1.021591067314148, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 95820 + }, + { + "epoch": 6.881867145421903, + "grad_norm": 1.3752840757369995, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 95830 + }, + { + "epoch": 6.882585278276482, + "grad_norm": 1.236355185508728, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 95840 + }, + { + "epoch": 6.88330341113106, + "grad_norm": 1.1957523822784424, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 95850 + }, + { + "epoch": 6.884021543985638, + "grad_norm": 0.8793587684631348, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 95860 + }, + { + "epoch": 6.884739676840216, + "grad_norm": 1.202054738998413, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 95870 + }, + { + "epoch": 6.885457809694794, + "grad_norm": 0.8061116337776184, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 95880 + }, + { + "epoch": 6.886175942549372, + "grad_norm": 1.0037956237792969, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 95890 + }, + { + "epoch": 6.88689407540395, + "grad_norm": 1.006435751914978, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 95900 + }, + { + "epoch": 6.887612208258528, + "grad_norm": 1.141200304031372, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 95910 + }, + { + "epoch": 6.888330341113106, + "grad_norm": 0.9017927050590515, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 95920 + }, + { + "epoch": 6.889048473967684, + "grad_norm": 0.9288154244422913, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 95930 + }, + { + "epoch": 6.8897666068222625, + "grad_norm": 1.2263801097869873, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 95940 + }, + { + "epoch": 6.8904847396768405, + "grad_norm": 1.2005410194396973, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 95950 + }, + { + "epoch": 6.8912028725314185, + "grad_norm": 1.0801531076431274, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 95960 + }, + { + "epoch": 6.8919210053859965, + "grad_norm": 1.1115456819534302, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 95970 + }, + { + "epoch": 6.8926391382405745, + "grad_norm": 1.062920093536377, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 95980 + }, + { + "epoch": 6.8933572710951525, + "grad_norm": 0.9343897700309753, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 95990 + }, + { + "epoch": 6.8940754039497305, + "grad_norm": 1.0236390829086304, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 96000 + }, + { + "epoch": 6.8947935368043085, + "grad_norm": 1.0680996179580688, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 96010 + }, + { + "epoch": 6.8955116696588865, + "grad_norm": 1.1796760559082031, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 96020 + }, + { + "epoch": 6.896229802513465, + "grad_norm": 0.9805570840835571, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 96030 + }, + { + "epoch": 6.896947935368043, + "grad_norm": 1.245386004447937, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 96040 + }, + { + "epoch": 6.897666068222621, + "grad_norm": 1.0306174755096436, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 96050 + }, + { + "epoch": 6.898384201077199, + "grad_norm": 1.0599836111068726, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 96060 + }, + { + "epoch": 6.899102333931777, + "grad_norm": 1.1438795328140259, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 96070 + }, + { + "epoch": 6.899820466786355, + "grad_norm": 0.9044751524925232, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 96080 + }, + { + "epoch": 6.900538599640933, + "grad_norm": 0.9689591526985168, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 96090 + }, + { + "epoch": 6.901256732495511, + "grad_norm": 1.003217339515686, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 96100 + }, + { + "epoch": 6.901974865350089, + "grad_norm": 1.1630250215530396, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 96110 + }, + { + "epoch": 6.902692998204667, + "grad_norm": 1.0304425954818726, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 96120 + }, + { + "epoch": 6.903411131059246, + "grad_norm": 1.0148587226867676, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 96130 + }, + { + "epoch": 6.904129263913824, + "grad_norm": 1.3722255229949951, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 96140 + }, + { + "epoch": 6.904847396768402, + "grad_norm": 1.1518549919128418, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 96150 + }, + { + "epoch": 6.90556552962298, + "grad_norm": 1.0342949628829956, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 96160 + }, + { + "epoch": 6.906283662477558, + "grad_norm": 1.0178996324539185, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 96170 + }, + { + "epoch": 6.907001795332136, + "grad_norm": 1.3429099321365356, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 96180 + }, + { + "epoch": 6.907719928186714, + "grad_norm": 1.2281367778778076, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 96190 + }, + { + "epoch": 6.908438061041292, + "grad_norm": 0.8190469145774841, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 96200 + }, + { + "epoch": 6.909156193895871, + "grad_norm": 1.1344635486602783, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 96210 + }, + { + "epoch": 6.909874326750449, + "grad_norm": 1.0540097951889038, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 96220 + }, + { + "epoch": 6.910592459605027, + "grad_norm": 1.044974446296692, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 96230 + }, + { + "epoch": 6.911310592459605, + "grad_norm": 0.6890087723731995, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 96240 + }, + { + "epoch": 6.912028725314183, + "grad_norm": 1.1266905069351196, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 96250 + }, + { + "epoch": 6.912746858168761, + "grad_norm": 1.3173121213912964, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 96260 + }, + { + "epoch": 6.913464991023339, + "grad_norm": 1.0043895244598389, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 96270 + }, + { + "epoch": 6.914183123877917, + "grad_norm": 1.0634605884552002, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 96280 + }, + { + "epoch": 6.914901256732495, + "grad_norm": 1.234516978263855, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 96290 + }, + { + "epoch": 6.915619389587073, + "grad_norm": 1.042026162147522, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 96300 + }, + { + "epoch": 6.916337522441651, + "grad_norm": 1.063632845878601, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 96310 + }, + { + "epoch": 6.91705565529623, + "grad_norm": 1.0733225345611572, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 96320 + }, + { + "epoch": 6.917773788150808, + "grad_norm": 1.4382662773132324, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 96330 + }, + { + "epoch": 6.918491921005386, + "grad_norm": 1.19964599609375, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 96340 + }, + { + "epoch": 6.919210053859964, + "grad_norm": 0.9012235403060913, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 96350 + }, + { + "epoch": 6.919928186714542, + "grad_norm": 0.8663099408149719, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 96360 + }, + { + "epoch": 6.92064631956912, + "grad_norm": 0.8944193124771118, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 96370 + }, + { + "epoch": 6.921364452423698, + "grad_norm": 1.1201437711715698, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 96380 + }, + { + "epoch": 6.922082585278276, + "grad_norm": 1.0434664487838745, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 96390 + }, + { + "epoch": 6.922800718132855, + "grad_norm": 1.2666915655136108, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96400 + }, + { + "epoch": 6.923518850987433, + "grad_norm": 0.9610332250595093, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 96410 + }, + { + "epoch": 6.924236983842011, + "grad_norm": 1.1521750688552856, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 96420 + }, + { + "epoch": 6.924955116696589, + "grad_norm": 0.921970546245575, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 96430 + }, + { + "epoch": 6.925673249551167, + "grad_norm": 1.1277226209640503, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 96440 + }, + { + "epoch": 6.926391382405745, + "grad_norm": 1.147425889968872, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96450 + }, + { + "epoch": 6.927109515260323, + "grad_norm": 1.0128270387649536, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 96460 + }, + { + "epoch": 6.927827648114901, + "grad_norm": 1.0726343393325806, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 96470 + }, + { + "epoch": 6.928545780969479, + "grad_norm": 0.9902656078338623, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 96480 + }, + { + "epoch": 6.929263913824057, + "grad_norm": 0.9662004709243774, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 96490 + }, + { + "epoch": 6.929982046678636, + "grad_norm": 0.9595714807510376, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 96500 + }, + { + "epoch": 6.930700179533214, + "grad_norm": 1.0666614770889282, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 96510 + }, + { + "epoch": 6.931418312387792, + "grad_norm": 0.8744403123855591, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 96520 + }, + { + "epoch": 6.93213644524237, + "grad_norm": 1.0382628440856934, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 96530 + }, + { + "epoch": 6.932854578096948, + "grad_norm": 0.9165884256362915, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 96540 + }, + { + "epoch": 6.933572710951526, + "grad_norm": 0.9073842763900757, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 96550 + }, + { + "epoch": 6.934290843806104, + "grad_norm": 1.100635051727295, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 96560 + }, + { + "epoch": 6.935008976660682, + "grad_norm": 1.1503266096115112, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 96570 + }, + { + "epoch": 6.93572710951526, + "grad_norm": 0.9526805281639099, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 96580 + }, + { + "epoch": 6.936445242369839, + "grad_norm": 1.115716814994812, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 96590 + }, + { + "epoch": 6.937163375224417, + "grad_norm": 1.0669193267822266, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 96600 + }, + { + "epoch": 6.937881508078995, + "grad_norm": 1.0191189050674438, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 96610 + }, + { + "epoch": 6.938599640933573, + "grad_norm": 1.1885946989059448, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 96620 + }, + { + "epoch": 6.939317773788151, + "grad_norm": 0.9806031584739685, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 96630 + }, + { + "epoch": 6.940035906642729, + "grad_norm": 0.9700000286102295, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 96640 + }, + { + "epoch": 6.940754039497307, + "grad_norm": 1.0870105028152466, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 96650 + }, + { + "epoch": 6.941472172351885, + "grad_norm": 0.7441867589950562, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 96660 + }, + { + "epoch": 6.942190305206463, + "grad_norm": 0.8631957173347473, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 96670 + }, + { + "epoch": 6.942908438061041, + "grad_norm": 1.0538444519042969, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 96680 + }, + { + "epoch": 6.94362657091562, + "grad_norm": 1.0235437154769897, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 96690 + }, + { + "epoch": 6.944344703770198, + "grad_norm": 1.069114089012146, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96700 + }, + { + "epoch": 6.945062836624776, + "grad_norm": 1.0421861410140991, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 96710 + }, + { + "epoch": 6.945780969479354, + "grad_norm": 0.9244136810302734, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 96720 + }, + { + "epoch": 6.946499102333932, + "grad_norm": 0.962041437625885, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 96730 + }, + { + "epoch": 6.94721723518851, + "grad_norm": 1.049677848815918, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 96740 + }, + { + "epoch": 6.947935368043088, + "grad_norm": 1.0276710987091064, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 96750 + }, + { + "epoch": 6.948653500897666, + "grad_norm": 1.036650538444519, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 96760 + }, + { + "epoch": 6.949371633752245, + "grad_norm": 1.0379945039749146, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 96770 + }, + { + "epoch": 6.950089766606823, + "grad_norm": 0.9768070578575134, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 96780 + }, + { + "epoch": 6.950807899461401, + "grad_norm": 1.0515118837356567, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 96790 + }, + { + "epoch": 6.951526032315979, + "grad_norm": 0.9186223149299622, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 96800 + }, + { + "epoch": 6.952244165170557, + "grad_norm": 1.0430902242660522, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 96810 + }, + { + "epoch": 6.952962298025135, + "grad_norm": 0.7750678658485413, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 96820 + }, + { + "epoch": 6.953680430879713, + "grad_norm": 1.1721138954162598, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 96830 + }, + { + "epoch": 6.954398563734291, + "grad_norm": 1.2088165283203125, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 96840 + }, + { + "epoch": 6.955116696588869, + "grad_norm": 0.9956802129745483, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 96850 + }, + { + "epoch": 6.955834829443447, + "grad_norm": 1.0444421768188477, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 96860 + }, + { + "epoch": 6.956552962298025, + "grad_norm": 1.2420955896377563, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 96870 + }, + { + "epoch": 6.957271095152604, + "grad_norm": 1.0187203884124756, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 96880 + }, + { + "epoch": 6.957989228007182, + "grad_norm": 1.0883756875991821, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 96890 + }, + { + "epoch": 6.95870736086176, + "grad_norm": 1.1869568824768066, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 96900 + }, + { + "epoch": 6.959425493716338, + "grad_norm": 1.242119312286377, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 96910 + }, + { + "epoch": 6.960143626570916, + "grad_norm": 1.0262869596481323, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 96920 + }, + { + "epoch": 6.960861759425494, + "grad_norm": 0.9577149152755737, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 96930 + }, + { + "epoch": 6.961579892280072, + "grad_norm": 0.9224622249603271, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 96940 + }, + { + "epoch": 6.96229802513465, + "grad_norm": 1.0761854648590088, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 96950 + }, + { + "epoch": 6.9630161579892285, + "grad_norm": 1.1029279232025146, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 96960 + }, + { + "epoch": 6.9637342908438065, + "grad_norm": 1.1132091283798218, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 96970 + }, + { + "epoch": 6.9644524236983845, + "grad_norm": 0.9723706245422363, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 96980 + }, + { + "epoch": 6.9651705565529625, + "grad_norm": 1.0453037023544312, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 96990 + }, + { + "epoch": 6.9658886894075405, + "grad_norm": 1.16423499584198, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 97000 + }, + { + "epoch": 6.9666068222621185, + "grad_norm": 1.1522771120071411, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 97010 + }, + { + "epoch": 6.9673249551166965, + "grad_norm": 1.020828127861023, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 97020 + }, + { + "epoch": 6.9680430879712745, + "grad_norm": 1.0301889181137085, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 97030 + }, + { + "epoch": 6.9687612208258525, + "grad_norm": 1.0615862607955933, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 97040 + }, + { + "epoch": 6.9694793536804305, + "grad_norm": 1.1750848293304443, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 97050 + }, + { + "epoch": 6.9701974865350085, + "grad_norm": 0.916283905506134, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 97060 + }, + { + "epoch": 6.970915619389587, + "grad_norm": 1.0715203285217285, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 97070 + }, + { + "epoch": 6.971633752244165, + "grad_norm": 1.1171340942382812, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 97080 + }, + { + "epoch": 6.972351885098743, + "grad_norm": 0.886015772819519, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 97090 + }, + { + "epoch": 6.973070017953321, + "grad_norm": 0.9498746991157532, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 97100 + }, + { + "epoch": 6.973788150807899, + "grad_norm": 1.1563011407852173, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 97110 + }, + { + "epoch": 6.974506283662477, + "grad_norm": 0.9086321592330933, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 97120 + }, + { + "epoch": 6.975224416517055, + "grad_norm": 0.9804864525794983, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 97130 + }, + { + "epoch": 6.975942549371633, + "grad_norm": 1.5005993843078613, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 97140 + }, + { + "epoch": 6.976660682226212, + "grad_norm": 1.1720819473266602, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 97150 + }, + { + "epoch": 6.97737881508079, + "grad_norm": 1.095572590827942, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 97160 + }, + { + "epoch": 6.978096947935368, + "grad_norm": 1.1880861520767212, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 97170 + }, + { + "epoch": 6.978815080789946, + "grad_norm": 1.0959832668304443, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 97180 + }, + { + "epoch": 6.979533213644524, + "grad_norm": 1.2158745527267456, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 97190 + }, + { + "epoch": 6.980251346499102, + "grad_norm": 1.0073821544647217, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 97200 + }, + { + "epoch": 6.98096947935368, + "grad_norm": 0.8503464460372925, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 97210 + }, + { + "epoch": 6.981687612208258, + "grad_norm": 0.9399861097335815, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 97220 + }, + { + "epoch": 6.982405745062836, + "grad_norm": 1.1167447566986084, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 97230 + }, + { + "epoch": 6.983123877917414, + "grad_norm": 1.2710384130477905, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 97240 + }, + { + "epoch": 6.983842010771993, + "grad_norm": 0.8514767289161682, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 97250 + }, + { + "epoch": 6.984560143626571, + "grad_norm": 0.9983348846435547, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 97260 + }, + { + "epoch": 6.985278276481149, + "grad_norm": 1.1713277101516724, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 97270 + }, + { + "epoch": 6.985996409335727, + "grad_norm": 1.346272349357605, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 97280 + }, + { + "epoch": 6.986714542190305, + "grad_norm": 1.0687556266784668, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 97290 + }, + { + "epoch": 6.987432675044883, + "grad_norm": 1.035805106163025, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 97300 + }, + { + "epoch": 6.988150807899461, + "grad_norm": 1.149027705192566, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 97310 + }, + { + "epoch": 6.988868940754039, + "grad_norm": 0.9672921895980835, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 97320 + }, + { + "epoch": 6.989587073608618, + "grad_norm": 1.0306763648986816, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 97330 + }, + { + "epoch": 6.990305206463196, + "grad_norm": 1.1457809209823608, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 97340 + }, + { + "epoch": 6.991023339317774, + "grad_norm": 0.9718224406242371, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 97350 + }, + { + "epoch": 6.991741472172352, + "grad_norm": 0.9872630834579468, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 97360 + }, + { + "epoch": 6.99245960502693, + "grad_norm": 1.0302132368087769, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 97370 + }, + { + "epoch": 6.993177737881508, + "grad_norm": 1.001103162765503, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 97380 + }, + { + "epoch": 6.993895870736086, + "grad_norm": 0.9207047820091248, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 97390 + }, + { + "epoch": 6.994614003590664, + "grad_norm": 1.1986219882965088, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 97400 + }, + { + "epoch": 6.995332136445242, + "grad_norm": 1.343885064125061, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 97410 + }, + { + "epoch": 6.99605026929982, + "grad_norm": 1.0611628293991089, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 97420 + }, + { + "epoch": 6.996768402154398, + "grad_norm": 0.9514605402946472, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 97430 + }, + { + "epoch": 6.997486535008977, + "grad_norm": 1.0259917974472046, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 97440 + }, + { + "epoch": 6.998204667863555, + "grad_norm": 1.0735033750534058, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 97450 + }, + { + "epoch": 6.998922800718133, + "grad_norm": 1.053984522819519, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 97460 + }, + { + "epoch": 6.999640933572711, + "grad_norm": 1.0285807847976685, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 97470 + }, + { + "epoch": 7.0, + "eval_loss": 1.168665885925293, + "eval_runtime": 55.1686, + "eval_samples_per_second": 13.287, + "eval_steps_per_second": 1.668, + "step": 97475 + }, + { + "epoch": 7.000359066427289, + "grad_norm": 1.0394084453582764, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 97480 + }, + { + "epoch": 7.001077199281867, + "grad_norm": 1.0377404689788818, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 97490 + }, + { + "epoch": 7.001795332136445, + "grad_norm": 1.143609642982483, + "learning_rate": 0.0002, + "loss": 0.502, + "step": 97500 + }, + { + "epoch": 7.002513464991023, + "grad_norm": 0.9544180035591125, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 97510 + }, + { + "epoch": 7.003231597845601, + "grad_norm": 1.1849734783172607, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 97520 + }, + { + "epoch": 7.00394973070018, + "grad_norm": 1.0769017934799194, + "learning_rate": 0.0002, + "loss": 0.5095, + "step": 97530 + }, + { + "epoch": 7.004667863554758, + "grad_norm": 1.2054177522659302, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 97540 + }, + { + "epoch": 7.005385996409336, + "grad_norm": 0.800378680229187, + "learning_rate": 0.0002, + "loss": 0.4639, + "step": 97550 + }, + { + "epoch": 7.006104129263914, + "grad_norm": 1.0197957754135132, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 97560 + }, + { + "epoch": 7.006822262118492, + "grad_norm": 1.1266579627990723, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 97570 + }, + { + "epoch": 7.00754039497307, + "grad_norm": 0.9955291152000427, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 97580 + }, + { + "epoch": 7.008258527827648, + "grad_norm": 1.1531357765197754, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 97590 + }, + { + "epoch": 7.008976660682226, + "grad_norm": 1.1159368753433228, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 97600 + }, + { + "epoch": 7.009694793536804, + "grad_norm": 1.2170041799545288, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 97610 + }, + { + "epoch": 7.010412926391383, + "grad_norm": 1.2761963605880737, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 97620 + }, + { + "epoch": 7.011131059245961, + "grad_norm": 1.1703165769577026, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 97630 + }, + { + "epoch": 7.011849192100539, + "grad_norm": 1.0011869668960571, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 97640 + }, + { + "epoch": 7.012567324955117, + "grad_norm": 1.2599170207977295, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 97650 + }, + { + "epoch": 7.013285457809695, + "grad_norm": 0.9646086692810059, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 97660 + }, + { + "epoch": 7.014003590664273, + "grad_norm": 1.067461609840393, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 97670 + }, + { + "epoch": 7.014721723518851, + "grad_norm": 0.9157150983810425, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 97680 + }, + { + "epoch": 7.015439856373429, + "grad_norm": 1.5808709859848022, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 97690 + }, + { + "epoch": 7.016157989228007, + "grad_norm": 1.069395661354065, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 97700 + }, + { + "epoch": 7.016876122082586, + "grad_norm": 1.180887222290039, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 97710 + }, + { + "epoch": 7.017594254937164, + "grad_norm": 1.0960854291915894, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 97720 + }, + { + "epoch": 7.018312387791742, + "grad_norm": 0.9090136885643005, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 97730 + }, + { + "epoch": 7.01903052064632, + "grad_norm": 0.992369532585144, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 97740 + }, + { + "epoch": 7.019748653500898, + "grad_norm": 1.1090840101242065, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 97750 + }, + { + "epoch": 7.020466786355476, + "grad_norm": 1.173752784729004, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 97760 + }, + { + "epoch": 7.021184919210054, + "grad_norm": 1.1630373001098633, + "learning_rate": 0.0002, + "loss": 0.496, + "step": 97770 + }, + { + "epoch": 7.021903052064632, + "grad_norm": 1.34774649143219, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 97780 + }, + { + "epoch": 7.02262118491921, + "grad_norm": 1.0631234645843506, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 97790 + }, + { + "epoch": 7.023339317773788, + "grad_norm": 1.1396355628967285, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 97800 + }, + { + "epoch": 7.024057450628367, + "grad_norm": 1.0061511993408203, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 97810 + }, + { + "epoch": 7.024775583482945, + "grad_norm": 0.8545233607292175, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 97820 + }, + { + "epoch": 7.025493716337523, + "grad_norm": 1.1746221780776978, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 97830 + }, + { + "epoch": 7.026211849192101, + "grad_norm": 0.9705178737640381, + "learning_rate": 0.0002, + "loss": 0.5056, + "step": 97840 + }, + { + "epoch": 7.026929982046679, + "grad_norm": 0.9517123103141785, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 97850 + }, + { + "epoch": 7.027648114901257, + "grad_norm": 1.0428272485733032, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 97860 + }, + { + "epoch": 7.028366247755835, + "grad_norm": 1.020277976989746, + "learning_rate": 0.0002, + "loss": 0.5108, + "step": 97870 + }, + { + "epoch": 7.029084380610413, + "grad_norm": 1.1434438228607178, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 97880 + }, + { + "epoch": 7.029802513464991, + "grad_norm": 0.8937026858329773, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 97890 + }, + { + "epoch": 7.0305206463195695, + "grad_norm": 0.9241712093353271, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 97900 + }, + { + "epoch": 7.0312387791741475, + "grad_norm": 1.0576003789901733, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 97910 + }, + { + "epoch": 7.0319569120287255, + "grad_norm": 0.9046192765235901, + "learning_rate": 0.0002, + "loss": 0.483, + "step": 97920 + }, + { + "epoch": 7.0326750448833035, + "grad_norm": 0.9557563662528992, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 97930 + }, + { + "epoch": 7.0333931777378815, + "grad_norm": 1.0260612964630127, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 97940 + }, + { + "epoch": 7.0341113105924595, + "grad_norm": 1.005668044090271, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 97950 + }, + { + "epoch": 7.0348294434470375, + "grad_norm": 1.0715222358703613, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 97960 + }, + { + "epoch": 7.0355475763016155, + "grad_norm": 0.9782606363296509, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 97970 + }, + { + "epoch": 7.0362657091561935, + "grad_norm": 0.970796525478363, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 97980 + }, + { + "epoch": 7.036983842010772, + "grad_norm": 1.0109657049179077, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 97990 + }, + { + "epoch": 7.03770197486535, + "grad_norm": 1.0419244766235352, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 98000 + }, + { + "epoch": 7.038420107719928, + "grad_norm": 1.140035629272461, + "learning_rate": 0.0002, + "loss": 0.5009, + "step": 98010 + }, + { + "epoch": 7.039138240574506, + "grad_norm": 1.148266315460205, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 98020 + }, + { + "epoch": 7.039856373429084, + "grad_norm": 1.0584349632263184, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 98030 + }, + { + "epoch": 7.040574506283662, + "grad_norm": 1.0054830312728882, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 98040 + }, + { + "epoch": 7.04129263913824, + "grad_norm": 1.3186599016189575, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 98050 + }, + { + "epoch": 7.042010771992818, + "grad_norm": 1.5720367431640625, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 98060 + }, + { + "epoch": 7.042728904847396, + "grad_norm": 1.0619040727615356, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 98070 + }, + { + "epoch": 7.0434470377019744, + "grad_norm": 1.1936930418014526, + "learning_rate": 0.0002, + "loss": 0.4769, + "step": 98080 + }, + { + "epoch": 7.044165170556553, + "grad_norm": 1.1437066793441772, + "learning_rate": 0.0002, + "loss": 0.476, + "step": 98090 + }, + { + "epoch": 7.044883303411131, + "grad_norm": 1.1040478944778442, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 98100 + }, + { + "epoch": 7.045601436265709, + "grad_norm": 1.2150214910507202, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 98110 + }, + { + "epoch": 7.046319569120287, + "grad_norm": 1.1224234104156494, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 98120 + }, + { + "epoch": 7.047037701974865, + "grad_norm": 1.256640076637268, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 98130 + }, + { + "epoch": 7.047755834829443, + "grad_norm": 1.2098320722579956, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 98140 + }, + { + "epoch": 7.048473967684021, + "grad_norm": 1.0719431638717651, + "learning_rate": 0.0002, + "loss": 0.5187, + "step": 98150 + }, + { + "epoch": 7.049192100538599, + "grad_norm": 1.5370041131973267, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 98160 + }, + { + "epoch": 7.049910233393177, + "grad_norm": 1.166554570198059, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 98170 + }, + { + "epoch": 7.050628366247756, + "grad_norm": 0.927842378616333, + "learning_rate": 0.0002, + "loss": 0.476, + "step": 98180 + }, + { + "epoch": 7.051346499102334, + "grad_norm": 0.9756902456283569, + "learning_rate": 0.0002, + "loss": 0.4905, + "step": 98190 + }, + { + "epoch": 7.052064631956912, + "grad_norm": 0.994195282459259, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 98200 + }, + { + "epoch": 7.05278276481149, + "grad_norm": 1.1864269971847534, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 98210 + }, + { + "epoch": 7.053500897666068, + "grad_norm": 0.8431169390678406, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 98220 + }, + { + "epoch": 7.054219030520646, + "grad_norm": 1.233312726020813, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 98230 + }, + { + "epoch": 7.054937163375224, + "grad_norm": 1.0040699243545532, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 98240 + }, + { + "epoch": 7.055655296229802, + "grad_norm": 1.004325032234192, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 98250 + }, + { + "epoch": 7.05637342908438, + "grad_norm": 1.1213003396987915, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 98260 + }, + { + "epoch": 7.057091561938959, + "grad_norm": 1.115504264831543, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 98270 + }, + { + "epoch": 7.057809694793537, + "grad_norm": 0.9618098139762878, + "learning_rate": 0.0002, + "loss": 0.4699, + "step": 98280 + }, + { + "epoch": 7.058527827648115, + "grad_norm": 0.9967533946037292, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 98290 + }, + { + "epoch": 7.059245960502693, + "grad_norm": 1.061136245727539, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 98300 + }, + { + "epoch": 7.059964093357271, + "grad_norm": 1.3787742853164673, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 98310 + }, + { + "epoch": 7.060682226211849, + "grad_norm": 1.0541613101959229, + "learning_rate": 0.0002, + "loss": 0.5003, + "step": 98320 + }, + { + "epoch": 7.061400359066427, + "grad_norm": 1.3264026641845703, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 98330 + }, + { + "epoch": 7.062118491921005, + "grad_norm": 0.9874539375305176, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 98340 + }, + { + "epoch": 7.062836624775583, + "grad_norm": 0.8959392309188843, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 98350 + }, + { + "epoch": 7.063554757630161, + "grad_norm": 0.9952960014343262, + "learning_rate": 0.0002, + "loss": 0.5031, + "step": 98360 + }, + { + "epoch": 7.06427289048474, + "grad_norm": 1.0395413637161255, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 98370 + }, + { + "epoch": 7.064991023339318, + "grad_norm": 0.9314938187599182, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 98380 + }, + { + "epoch": 7.065709156193896, + "grad_norm": 1.0952500104904175, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 98390 + }, + { + "epoch": 7.066427289048474, + "grad_norm": 0.8393705487251282, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 98400 + }, + { + "epoch": 7.067145421903052, + "grad_norm": 1.0407543182373047, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 98410 + }, + { + "epoch": 7.06786355475763, + "grad_norm": 1.015194296836853, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 98420 + }, + { + "epoch": 7.068581687612208, + "grad_norm": 1.0878134965896606, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 98430 + }, + { + "epoch": 7.069299820466786, + "grad_norm": 1.0402575731277466, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 98440 + }, + { + "epoch": 7.070017953321364, + "grad_norm": 0.8770583271980286, + "learning_rate": 0.0002, + "loss": 0.4895, + "step": 98450 + }, + { + "epoch": 7.070736086175943, + "grad_norm": 1.0066659450531006, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 98460 + }, + { + "epoch": 7.071454219030521, + "grad_norm": 1.1627628803253174, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 98470 + }, + { + "epoch": 7.072172351885099, + "grad_norm": 1.1217474937438965, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 98480 + }, + { + "epoch": 7.072890484739677, + "grad_norm": 1.1825461387634277, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 98490 + }, + { + "epoch": 7.073608617594255, + "grad_norm": 1.2198481559753418, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 98500 + }, + { + "epoch": 7.074326750448833, + "grad_norm": 1.0615922212600708, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 98510 + }, + { + "epoch": 7.075044883303411, + "grad_norm": 1.1725428104400635, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 98520 + }, + { + "epoch": 7.075763016157989, + "grad_norm": 1.0269757509231567, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 98530 + }, + { + "epoch": 7.076481149012567, + "grad_norm": 0.9191881418228149, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 98540 + }, + { + "epoch": 7.077199281867145, + "grad_norm": 1.2156354188919067, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 98550 + }, + { + "epoch": 7.077917414721724, + "grad_norm": 1.1455811262130737, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 98560 + }, + { + "epoch": 7.078635547576302, + "grad_norm": 1.1971662044525146, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 98570 + }, + { + "epoch": 7.07935368043088, + "grad_norm": 1.1876308917999268, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 98580 + }, + { + "epoch": 7.080071813285458, + "grad_norm": 1.0847078561782837, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 98590 + }, + { + "epoch": 7.080789946140036, + "grad_norm": 1.1745446920394897, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 98600 + }, + { + "epoch": 7.081508078994614, + "grad_norm": 1.133808970451355, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 98610 + }, + { + "epoch": 7.082226211849192, + "grad_norm": 0.8598989248275757, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 98620 + }, + { + "epoch": 7.08294434470377, + "grad_norm": 0.9775993824005127, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 98630 + }, + { + "epoch": 7.083662477558348, + "grad_norm": 1.1053773164749146, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 98640 + }, + { + "epoch": 7.084380610412927, + "grad_norm": 1.1902083158493042, + "learning_rate": 0.0002, + "loss": 0.4975, + "step": 98650 + }, + { + "epoch": 7.085098743267505, + "grad_norm": 1.2208364009857178, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 98660 + }, + { + "epoch": 7.085816876122083, + "grad_norm": 1.3565878868103027, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 98670 + }, + { + "epoch": 7.086535008976661, + "grad_norm": 1.1915233135223389, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 98680 + }, + { + "epoch": 7.087253141831239, + "grad_norm": 0.7820531725883484, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 98690 + }, + { + "epoch": 7.087971274685817, + "grad_norm": 1.3015085458755493, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 98700 + }, + { + "epoch": 7.088689407540395, + "grad_norm": 1.1178984642028809, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 98710 + }, + { + "epoch": 7.089407540394973, + "grad_norm": 1.0407224893569946, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 98720 + }, + { + "epoch": 7.090125673249551, + "grad_norm": 1.070882797241211, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 98730 + }, + { + "epoch": 7.09084380610413, + "grad_norm": 1.0723912715911865, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 98740 + }, + { + "epoch": 7.091561938958708, + "grad_norm": 0.9973018169403076, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 98750 + }, + { + "epoch": 7.092280071813286, + "grad_norm": 1.2216873168945312, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 98760 + }, + { + "epoch": 7.092998204667864, + "grad_norm": 0.9081874489784241, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 98770 + }, + { + "epoch": 7.093716337522442, + "grad_norm": 1.141811490058899, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 98780 + }, + { + "epoch": 7.09443447037702, + "grad_norm": 0.9687919020652771, + "learning_rate": 0.0002, + "loss": 0.4975, + "step": 98790 + }, + { + "epoch": 7.095152603231598, + "grad_norm": 1.0691136121749878, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 98800 + }, + { + "epoch": 7.095870736086176, + "grad_norm": 1.100003957748413, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 98810 + }, + { + "epoch": 7.096588868940754, + "grad_norm": 1.0004968643188477, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 98820 + }, + { + "epoch": 7.097307001795333, + "grad_norm": 1.0497100353240967, + "learning_rate": 0.0002, + "loss": 0.532, + "step": 98830 + }, + { + "epoch": 7.098025134649911, + "grad_norm": 1.0173693895339966, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 98840 + }, + { + "epoch": 7.098743267504489, + "grad_norm": 1.3046447038650513, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 98850 + }, + { + "epoch": 7.099461400359067, + "grad_norm": 1.1587737798690796, + "learning_rate": 0.0002, + "loss": 0.4968, + "step": 98860 + }, + { + "epoch": 7.100179533213645, + "grad_norm": 0.9734950661659241, + "learning_rate": 0.0002, + "loss": 0.5003, + "step": 98870 + }, + { + "epoch": 7.100897666068223, + "grad_norm": 1.2131417989730835, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 98880 + }, + { + "epoch": 7.101615798922801, + "grad_norm": 1.2643247842788696, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 98890 + }, + { + "epoch": 7.102333931777379, + "grad_norm": 1.0531554222106934, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 98900 + }, + { + "epoch": 7.103052064631957, + "grad_norm": 1.0205429792404175, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 98910 + }, + { + "epoch": 7.103770197486535, + "grad_norm": 1.1247005462646484, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 98920 + }, + { + "epoch": 7.1044883303411135, + "grad_norm": 1.1993550062179565, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 98930 + }, + { + "epoch": 7.1052064631956915, + "grad_norm": 1.1030243635177612, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 98940 + }, + { + "epoch": 7.1059245960502695, + "grad_norm": 1.134373426437378, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 98950 + }, + { + "epoch": 7.1066427289048475, + "grad_norm": 1.0449906587600708, + "learning_rate": 0.0002, + "loss": 0.4968, + "step": 98960 + }, + { + "epoch": 7.1073608617594255, + "grad_norm": 0.9911691546440125, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 98970 + }, + { + "epoch": 7.1080789946140035, + "grad_norm": 1.2021015882492065, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 98980 + }, + { + "epoch": 7.1087971274685815, + "grad_norm": 1.1013414859771729, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 98990 + }, + { + "epoch": 7.1095152603231595, + "grad_norm": 1.0632404088974, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 99000 + }, + { + "epoch": 7.1102333931777375, + "grad_norm": 1.1499850749969482, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 99010 + }, + { + "epoch": 7.110951526032316, + "grad_norm": 1.1187937259674072, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 99020 + }, + { + "epoch": 7.111669658886894, + "grad_norm": 1.109269618988037, + "learning_rate": 0.0002, + "loss": 0.4913, + "step": 99030 + }, + { + "epoch": 7.112387791741472, + "grad_norm": 1.04684317111969, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 99040 + }, + { + "epoch": 7.11310592459605, + "grad_norm": 1.142975926399231, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 99050 + }, + { + "epoch": 7.113824057450628, + "grad_norm": 1.0006840229034424, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 99060 + }, + { + "epoch": 7.114542190305206, + "grad_norm": 1.1721967458724976, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 99070 + }, + { + "epoch": 7.115260323159784, + "grad_norm": 1.0295040607452393, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 99080 + }, + { + "epoch": 7.115978456014362, + "grad_norm": 1.2406680583953857, + "learning_rate": 0.0002, + "loss": 0.5251, + "step": 99090 + }, + { + "epoch": 7.11669658886894, + "grad_norm": 1.2812756299972534, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 99100 + }, + { + "epoch": 7.117414721723518, + "grad_norm": 0.9559424519538879, + "learning_rate": 0.0002, + "loss": 0.5016, + "step": 99110 + }, + { + "epoch": 7.118132854578097, + "grad_norm": 1.2253276109695435, + "learning_rate": 0.0002, + "loss": 0.5077, + "step": 99120 + }, + { + "epoch": 7.118850987432675, + "grad_norm": 0.9636382460594177, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 99130 + }, + { + "epoch": 7.119569120287253, + "grad_norm": 0.9765542149543762, + "learning_rate": 0.0002, + "loss": 0.481, + "step": 99140 + }, + { + "epoch": 7.120287253141831, + "grad_norm": 0.8722323775291443, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 99150 + }, + { + "epoch": 7.121005385996409, + "grad_norm": 1.2198525667190552, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 99160 + }, + { + "epoch": 7.121723518850987, + "grad_norm": 0.9809777140617371, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 99170 + }, + { + "epoch": 7.122441651705565, + "grad_norm": 0.9328579902648926, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 99180 + }, + { + "epoch": 7.123159784560143, + "grad_norm": 1.0994173288345337, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 99190 + }, + { + "epoch": 7.123877917414721, + "grad_norm": 0.9433317184448242, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 99200 + }, + { + "epoch": 7.1245960502693, + "grad_norm": 0.9754116535186768, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 99210 + }, + { + "epoch": 7.125314183123878, + "grad_norm": 1.3194613456726074, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 99220 + }, + { + "epoch": 7.126032315978456, + "grad_norm": 1.166597604751587, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 99230 + }, + { + "epoch": 7.126750448833034, + "grad_norm": 1.1221239566802979, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 99240 + }, + { + "epoch": 7.127468581687612, + "grad_norm": 1.1992909908294678, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 99250 + }, + { + "epoch": 7.12818671454219, + "grad_norm": 1.0624475479125977, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 99260 + }, + { + "epoch": 7.128904847396768, + "grad_norm": 0.9556567668914795, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 99270 + }, + { + "epoch": 7.129622980251346, + "grad_norm": 1.3168047666549683, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 99280 + }, + { + "epoch": 7.130341113105924, + "grad_norm": 1.0971012115478516, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 99290 + }, + { + "epoch": 7.131059245960503, + "grad_norm": 1.287570595741272, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 99300 + }, + { + "epoch": 7.131777378815081, + "grad_norm": 1.4277496337890625, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 99310 + }, + { + "epoch": 7.132495511669659, + "grad_norm": 0.933844268321991, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 99320 + }, + { + "epoch": 7.133213644524237, + "grad_norm": 1.0423851013183594, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 99330 + }, + { + "epoch": 7.133931777378815, + "grad_norm": 1.0162577629089355, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 99340 + }, + { + "epoch": 7.134649910233393, + "grad_norm": 1.0845975875854492, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 99350 + }, + { + "epoch": 7.135368043087971, + "grad_norm": 1.0210866928100586, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 99360 + }, + { + "epoch": 7.136086175942549, + "grad_norm": 0.9540662169456482, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 99370 + }, + { + "epoch": 7.136804308797127, + "grad_norm": 0.9962146878242493, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 99380 + }, + { + "epoch": 7.137522441651706, + "grad_norm": 1.021399736404419, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 99390 + }, + { + "epoch": 7.138240574506284, + "grad_norm": 1.227946400642395, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 99400 + }, + { + "epoch": 7.138958707360862, + "grad_norm": 1.2851567268371582, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 99410 + }, + { + "epoch": 7.13967684021544, + "grad_norm": 0.9820418953895569, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 99420 + }, + { + "epoch": 7.140394973070018, + "grad_norm": 0.9503002762794495, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 99430 + }, + { + "epoch": 7.141113105924596, + "grad_norm": 0.924704372882843, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 99440 + }, + { + "epoch": 7.141831238779174, + "grad_norm": 1.1376171112060547, + "learning_rate": 0.0002, + "loss": 0.4548, + "step": 99450 + }, + { + "epoch": 7.142549371633752, + "grad_norm": 1.2862539291381836, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 99460 + }, + { + "epoch": 7.14326750448833, + "grad_norm": 1.1068240404129028, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 99470 + }, + { + "epoch": 7.143985637342908, + "grad_norm": 1.3112517595291138, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 99480 + }, + { + "epoch": 7.144703770197487, + "grad_norm": 1.0884982347488403, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 99490 + }, + { + "epoch": 7.145421903052065, + "grad_norm": 1.2093886137008667, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 99500 + }, + { + "epoch": 7.146140035906643, + "grad_norm": 0.9628178477287292, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 99510 + }, + { + "epoch": 7.146858168761221, + "grad_norm": 1.1300674676895142, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 99520 + }, + { + "epoch": 7.147576301615799, + "grad_norm": 0.8746275901794434, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 99530 + }, + { + "epoch": 7.148294434470377, + "grad_norm": 1.034233808517456, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 99540 + }, + { + "epoch": 7.149012567324955, + "grad_norm": 1.0235376358032227, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 99550 + }, + { + "epoch": 7.149730700179533, + "grad_norm": 1.048659324645996, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 99560 + }, + { + "epoch": 7.150448833034111, + "grad_norm": 1.278841495513916, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 99570 + }, + { + "epoch": 7.15116696588869, + "grad_norm": 1.0460485219955444, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 99580 + }, + { + "epoch": 7.151885098743268, + "grad_norm": 1.070234775543213, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 99590 + }, + { + "epoch": 7.152603231597846, + "grad_norm": 1.1036664247512817, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 99600 + }, + { + "epoch": 7.153321364452424, + "grad_norm": 1.212744116783142, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 99610 + }, + { + "epoch": 7.154039497307002, + "grad_norm": 1.1095936298370361, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 99620 + }, + { + "epoch": 7.15475763016158, + "grad_norm": 1.1953791379928589, + "learning_rate": 0.0002, + "loss": 0.4783, + "step": 99630 + }, + { + "epoch": 7.155475763016158, + "grad_norm": 1.3188790082931519, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 99640 + }, + { + "epoch": 7.156193895870736, + "grad_norm": 0.8723140358924866, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 99650 + }, + { + "epoch": 7.156912028725314, + "grad_norm": 0.9156793355941772, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 99660 + }, + { + "epoch": 7.157630161579892, + "grad_norm": 0.9418860673904419, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 99670 + }, + { + "epoch": 7.158348294434471, + "grad_norm": 1.0322530269622803, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 99680 + }, + { + "epoch": 7.159066427289049, + "grad_norm": 1.0246423482894897, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 99690 + }, + { + "epoch": 7.159784560143627, + "grad_norm": 0.8930608630180359, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 99700 + }, + { + "epoch": 7.160502692998205, + "grad_norm": 1.038223385810852, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 99710 + }, + { + "epoch": 7.161220825852783, + "grad_norm": 1.1020445823669434, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 99720 + }, + { + "epoch": 7.161938958707361, + "grad_norm": 0.9623728394508362, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 99730 + }, + { + "epoch": 7.162657091561939, + "grad_norm": 1.0490144491195679, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 99740 + }, + { + "epoch": 7.163375224416517, + "grad_norm": 1.039595127105713, + "learning_rate": 0.0002, + "loss": 0.4951, + "step": 99750 + }, + { + "epoch": 7.164093357271095, + "grad_norm": 1.2656937837600708, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 99760 + }, + { + "epoch": 7.164811490125674, + "grad_norm": 1.469683289527893, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 99770 + }, + { + "epoch": 7.165529622980252, + "grad_norm": 1.1830174922943115, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 99780 + }, + { + "epoch": 7.16624775583483, + "grad_norm": 1.144771933555603, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 99790 + }, + { + "epoch": 7.166965888689408, + "grad_norm": 0.8902682662010193, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 99800 + }, + { + "epoch": 7.167684021543986, + "grad_norm": 1.0538955926895142, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 99810 + }, + { + "epoch": 7.168402154398564, + "grad_norm": 1.3387681245803833, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 99820 + }, + { + "epoch": 7.169120287253142, + "grad_norm": 1.1162230968475342, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 99830 + }, + { + "epoch": 7.16983842010772, + "grad_norm": 0.9946745038032532, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 99840 + }, + { + "epoch": 7.170556552962298, + "grad_norm": 1.0431642532348633, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 99850 + }, + { + "epoch": 7.1712746858168765, + "grad_norm": 1.1344799995422363, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 99860 + }, + { + "epoch": 7.1719928186714546, + "grad_norm": 0.8978185653686523, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 99870 + }, + { + "epoch": 7.1727109515260326, + "grad_norm": 1.2808794975280762, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 99880 + }, + { + "epoch": 7.1734290843806106, + "grad_norm": 1.0654441118240356, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 99890 + }, + { + "epoch": 7.174147217235189, + "grad_norm": 1.2751258611679077, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 99900 + }, + { + "epoch": 7.174865350089767, + "grad_norm": 0.9488890171051025, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 99910 + }, + { + "epoch": 7.175583482944345, + "grad_norm": 1.2057361602783203, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 99920 + }, + { + "epoch": 7.176301615798923, + "grad_norm": 1.2620776891708374, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 99930 + }, + { + "epoch": 7.177019748653501, + "grad_norm": 1.0042833089828491, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 99940 + }, + { + "epoch": 7.177737881508079, + "grad_norm": 0.9716517329216003, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 99950 + }, + { + "epoch": 7.1784560143626575, + "grad_norm": 0.9876767992973328, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 99960 + }, + { + "epoch": 7.1791741472172355, + "grad_norm": 1.0020827054977417, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 99970 + }, + { + "epoch": 7.1798922800718135, + "grad_norm": 1.0674978494644165, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 99980 + }, + { + "epoch": 7.1806104129263915, + "grad_norm": 1.3148112297058105, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 99990 + }, + { + "epoch": 7.1813285457809695, + "grad_norm": 1.048911690711975, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 100000 + }, + { + "epoch": 7.1820466786355475, + "grad_norm": 1.0747761726379395, + "learning_rate": 0.0002, + "loss": 0.5144, + "step": 100010 + }, + { + "epoch": 7.1827648114901255, + "grad_norm": 1.1818102598190308, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 100020 + }, + { + "epoch": 7.1834829443447035, + "grad_norm": 0.9548772573471069, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 100030 + }, + { + "epoch": 7.1842010771992815, + "grad_norm": 1.2127790451049805, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 100040 + }, + { + "epoch": 7.18491921005386, + "grad_norm": 1.1227222681045532, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 100050 + }, + { + "epoch": 7.185637342908438, + "grad_norm": 1.1687812805175781, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 100060 + }, + { + "epoch": 7.186355475763016, + "grad_norm": 0.9948291182518005, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 100070 + }, + { + "epoch": 7.187073608617594, + "grad_norm": 1.140623688697815, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 100080 + }, + { + "epoch": 7.187791741472172, + "grad_norm": 1.0152307748794556, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 100090 + }, + { + "epoch": 7.18850987432675, + "grad_norm": 1.049146056175232, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 100100 + }, + { + "epoch": 7.189228007181328, + "grad_norm": 0.9283392429351807, + "learning_rate": 0.0002, + "loss": 0.4833, + "step": 100110 + }, + { + "epoch": 7.189946140035906, + "grad_norm": 0.9900078177452087, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 100120 + }, + { + "epoch": 7.190664272890484, + "grad_norm": 0.9017449021339417, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 100130 + }, + { + "epoch": 7.191382405745063, + "grad_norm": 1.0106319189071655, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 100140 + }, + { + "epoch": 7.192100538599641, + "grad_norm": 0.985713541507721, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 100150 + }, + { + "epoch": 7.192818671454219, + "grad_norm": 1.074846863746643, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 100160 + }, + { + "epoch": 7.193536804308797, + "grad_norm": 1.1982495784759521, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 100170 + }, + { + "epoch": 7.194254937163375, + "grad_norm": 0.9354469180107117, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 100180 + }, + { + "epoch": 7.194973070017953, + "grad_norm": 1.289989948272705, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 100190 + }, + { + "epoch": 7.195691202872531, + "grad_norm": 1.2959555387496948, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 100200 + }, + { + "epoch": 7.196409335727109, + "grad_norm": 1.127426266670227, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 100210 + }, + { + "epoch": 7.197127468581687, + "grad_norm": 1.1479859352111816, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 100220 + }, + { + "epoch": 7.197845601436265, + "grad_norm": 0.9798394441604614, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 100230 + }, + { + "epoch": 7.198563734290844, + "grad_norm": 1.155127763748169, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 100240 + }, + { + "epoch": 7.199281867145422, + "grad_norm": 1.051482081413269, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 100250 + }, + { + "epoch": 7.2, + "grad_norm": 1.0441079139709473, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 100260 + }, + { + "epoch": 7.200718132854578, + "grad_norm": 0.9930968284606934, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 100270 + }, + { + "epoch": 7.201436265709156, + "grad_norm": 1.001161813735962, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 100280 + }, + { + "epoch": 7.202154398563734, + "grad_norm": 1.075697898864746, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 100290 + }, + { + "epoch": 7.202872531418312, + "grad_norm": 1.359117031097412, + "learning_rate": 0.0002, + "loss": 0.5232, + "step": 100300 + }, + { + "epoch": 7.20359066427289, + "grad_norm": 0.9824917316436768, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 100310 + }, + { + "epoch": 7.204308797127468, + "grad_norm": 1.0275092124938965, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 100320 + }, + { + "epoch": 7.205026929982047, + "grad_norm": 1.1662230491638184, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 100330 + }, + { + "epoch": 7.205745062836625, + "grad_norm": 1.0671597719192505, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 100340 + }, + { + "epoch": 7.206463195691203, + "grad_norm": 1.6219303607940674, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 100350 + }, + { + "epoch": 7.207181328545781, + "grad_norm": 1.098658561706543, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 100360 + }, + { + "epoch": 7.207899461400359, + "grad_norm": 1.1623865365982056, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 100370 + }, + { + "epoch": 7.208617594254937, + "grad_norm": 0.9317528009414673, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 100380 + }, + { + "epoch": 7.209335727109515, + "grad_norm": 1.1576400995254517, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 100390 + }, + { + "epoch": 7.210053859964093, + "grad_norm": 1.111785888671875, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 100400 + }, + { + "epoch": 7.210771992818671, + "grad_norm": 1.0347126722335815, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 100410 + }, + { + "epoch": 7.211490125673249, + "grad_norm": 1.2763441801071167, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 100420 + }, + { + "epoch": 7.212208258527828, + "grad_norm": 1.4479249715805054, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 100430 + }, + { + "epoch": 7.212926391382406, + "grad_norm": 1.0243892669677734, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 100440 + }, + { + "epoch": 7.213644524236984, + "grad_norm": 1.099047064781189, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 100450 + }, + { + "epoch": 7.214362657091562, + "grad_norm": 0.9364129900932312, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 100460 + }, + { + "epoch": 7.21508078994614, + "grad_norm": 0.9328993558883667, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 100470 + }, + { + "epoch": 7.215798922800718, + "grad_norm": 1.336569905281067, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 100480 + }, + { + "epoch": 7.216517055655296, + "grad_norm": 1.090484380722046, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 100490 + }, + { + "epoch": 7.217235188509874, + "grad_norm": 0.8246992826461792, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 100500 + }, + { + "epoch": 7.217953321364452, + "grad_norm": 1.1569660902023315, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 100510 + }, + { + "epoch": 7.218671454219031, + "grad_norm": 0.9871801733970642, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 100520 + }, + { + "epoch": 7.219389587073609, + "grad_norm": 0.9819903373718262, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 100530 + }, + { + "epoch": 7.220107719928187, + "grad_norm": 1.251344919204712, + "learning_rate": 0.0002, + "loss": 0.4942, + "step": 100540 + }, + { + "epoch": 7.220825852782765, + "grad_norm": 1.2649824619293213, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 100550 + }, + { + "epoch": 7.221543985637343, + "grad_norm": 1.1401978731155396, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 100560 + }, + { + "epoch": 7.222262118491921, + "grad_norm": 1.1615785360336304, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 100570 + }, + { + "epoch": 7.222980251346499, + "grad_norm": 1.1743568181991577, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 100580 + }, + { + "epoch": 7.223698384201077, + "grad_norm": 1.1526521444320679, + "learning_rate": 0.0002, + "loss": 0.5526, + "step": 100590 + }, + { + "epoch": 7.224416517055655, + "grad_norm": 1.1919556856155396, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 100600 + }, + { + "epoch": 7.225134649910234, + "grad_norm": 1.1855655908584595, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 100610 + }, + { + "epoch": 7.225852782764812, + "grad_norm": 1.1512478590011597, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 100620 + }, + { + "epoch": 7.22657091561939, + "grad_norm": 0.8307192325592041, + "learning_rate": 0.0002, + "loss": 0.5179, + "step": 100630 + }, + { + "epoch": 7.227289048473968, + "grad_norm": 1.269504189491272, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 100640 + }, + { + "epoch": 7.228007181328546, + "grad_norm": 1.2145130634307861, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 100650 + }, + { + "epoch": 7.228725314183124, + "grad_norm": 1.0325201749801636, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 100660 + }, + { + "epoch": 7.229443447037702, + "grad_norm": 0.9242451190948486, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 100670 + }, + { + "epoch": 7.23016157989228, + "grad_norm": 1.3832745552062988, + "learning_rate": 0.0002, + "loss": 0.4692, + "step": 100680 + }, + { + "epoch": 7.230879712746858, + "grad_norm": 0.9716517925262451, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 100690 + }, + { + "epoch": 7.231597845601437, + "grad_norm": 1.0162315368652344, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 100700 + }, + { + "epoch": 7.232315978456015, + "grad_norm": 1.1335854530334473, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 100710 + }, + { + "epoch": 7.233034111310593, + "grad_norm": 0.9655877947807312, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 100720 + }, + { + "epoch": 7.233752244165171, + "grad_norm": 1.373853087425232, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 100730 + }, + { + "epoch": 7.234470377019749, + "grad_norm": 1.14335298538208, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 100740 + }, + { + "epoch": 7.235188509874327, + "grad_norm": 1.0966235399246216, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 100750 + }, + { + "epoch": 7.235906642728905, + "grad_norm": 1.1448538303375244, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 100760 + }, + { + "epoch": 7.236624775583483, + "grad_norm": 1.431077003479004, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 100770 + }, + { + "epoch": 7.237342908438061, + "grad_norm": 1.148725986480713, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 100780 + }, + { + "epoch": 7.238061041292639, + "grad_norm": 1.2375414371490479, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 100790 + }, + { + "epoch": 7.238779174147218, + "grad_norm": 1.0722655057907104, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 100800 + }, + { + "epoch": 7.239497307001796, + "grad_norm": 1.1120193004608154, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 100810 + }, + { + "epoch": 7.240215439856374, + "grad_norm": 1.1200876235961914, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 100820 + }, + { + "epoch": 7.240933572710952, + "grad_norm": 0.9498430490493774, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 100830 + }, + { + "epoch": 7.24165170556553, + "grad_norm": 1.0005161762237549, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 100840 + }, + { + "epoch": 7.242369838420108, + "grad_norm": 1.1116056442260742, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 100850 + }, + { + "epoch": 7.243087971274686, + "grad_norm": 1.2970526218414307, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 100860 + }, + { + "epoch": 7.243806104129264, + "grad_norm": 0.9523774981498718, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 100870 + }, + { + "epoch": 7.244524236983842, + "grad_norm": 1.0484211444854736, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 100880 + }, + { + "epoch": 7.2452423698384205, + "grad_norm": 1.2013362646102905, + "learning_rate": 0.0002, + "loss": 0.5118, + "step": 100890 + }, + { + "epoch": 7.2459605026929985, + "grad_norm": 1.0352288484573364, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 100900 + }, + { + "epoch": 7.2466786355475765, + "grad_norm": 1.2752721309661865, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 100910 + }, + { + "epoch": 7.2473967684021545, + "grad_norm": 0.9587982892990112, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 100920 + }, + { + "epoch": 7.2481149012567325, + "grad_norm": 1.57708740234375, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 100930 + }, + { + "epoch": 7.2488330341113105, + "grad_norm": 1.1802852153778076, + "learning_rate": 0.0002, + "loss": 0.5068, + "step": 100940 + }, + { + "epoch": 7.2495511669658885, + "grad_norm": 1.192427396774292, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 100950 + }, + { + "epoch": 7.2502692998204665, + "grad_norm": 1.138766884803772, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 100960 + }, + { + "epoch": 7.2509874326750445, + "grad_norm": 1.1480544805526733, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 100970 + }, + { + "epoch": 7.2517055655296225, + "grad_norm": 1.096941351890564, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 100980 + }, + { + "epoch": 7.252423698384201, + "grad_norm": 1.16941499710083, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 100990 + }, + { + "epoch": 7.253141831238779, + "grad_norm": 1.138398289680481, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 101000 + }, + { + "epoch": 7.253859964093357, + "grad_norm": 0.9534326791763306, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 101010 + }, + { + "epoch": 7.254578096947935, + "grad_norm": 1.2834177017211914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 101020 + }, + { + "epoch": 7.255296229802513, + "grad_norm": 1.0083826780319214, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 101030 + }, + { + "epoch": 7.256014362657091, + "grad_norm": 0.8869968056678772, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 101040 + }, + { + "epoch": 7.256732495511669, + "grad_norm": 1.1779630184173584, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 101050 + }, + { + "epoch": 7.257450628366247, + "grad_norm": 0.9937887787818909, + "learning_rate": 0.0002, + "loss": 0.5422, + "step": 101060 + }, + { + "epoch": 7.258168761220825, + "grad_norm": 0.9739404916763306, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 101070 + }, + { + "epoch": 7.258886894075404, + "grad_norm": 0.9721621870994568, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 101080 + }, + { + "epoch": 7.259605026929982, + "grad_norm": 1.0670732259750366, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 101090 + }, + { + "epoch": 7.26032315978456, + "grad_norm": 1.0157248973846436, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 101100 + }, + { + "epoch": 7.261041292639138, + "grad_norm": 0.6791224479675293, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 101110 + }, + { + "epoch": 7.261759425493716, + "grad_norm": 1.168717622756958, + "learning_rate": 0.0002, + "loss": 0.5095, + "step": 101120 + }, + { + "epoch": 7.262477558348294, + "grad_norm": 1.1143511533737183, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 101130 + }, + { + "epoch": 7.263195691202872, + "grad_norm": 1.088230013847351, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 101140 + }, + { + "epoch": 7.26391382405745, + "grad_norm": 1.1834399700164795, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 101150 + }, + { + "epoch": 7.264631956912028, + "grad_norm": 1.0157420635223389, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 101160 + }, + { + "epoch": 7.265350089766607, + "grad_norm": 1.103623390197754, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 101170 + }, + { + "epoch": 7.266068222621185, + "grad_norm": 1.2007834911346436, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 101180 + }, + { + "epoch": 7.266786355475763, + "grad_norm": 1.204030156135559, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 101190 + }, + { + "epoch": 7.267504488330341, + "grad_norm": 1.0954475402832031, + "learning_rate": 0.0002, + "loss": 0.5361, + "step": 101200 + }, + { + "epoch": 7.268222621184919, + "grad_norm": 1.0195337533950806, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 101210 + }, + { + "epoch": 7.268940754039497, + "grad_norm": 1.0377559661865234, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 101220 + }, + { + "epoch": 7.269658886894075, + "grad_norm": 1.1147254705429077, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 101230 + }, + { + "epoch": 7.270377019748653, + "grad_norm": 1.0451658964157104, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 101240 + }, + { + "epoch": 7.271095152603231, + "grad_norm": 1.2418344020843506, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 101250 + }, + { + "epoch": 7.27181328545781, + "grad_norm": 1.100477933883667, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 101260 + }, + { + "epoch": 7.272531418312388, + "grad_norm": 1.0112155675888062, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 101270 + }, + { + "epoch": 7.273249551166966, + "grad_norm": 1.3673237562179565, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 101280 + }, + { + "epoch": 7.273967684021544, + "grad_norm": 1.0272409915924072, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 101290 + }, + { + "epoch": 7.274685816876122, + "grad_norm": 1.1041511297225952, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 101300 + }, + { + "epoch": 7.2754039497307, + "grad_norm": 1.1367343664169312, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 101310 + }, + { + "epoch": 7.276122082585278, + "grad_norm": 0.936102569103241, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 101320 + }, + { + "epoch": 7.276840215439856, + "grad_norm": 1.1409412622451782, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 101330 + }, + { + "epoch": 7.277558348294434, + "grad_norm": 1.103954553604126, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 101340 + }, + { + "epoch": 7.278276481149012, + "grad_norm": 1.0316593647003174, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 101350 + }, + { + "epoch": 7.278994614003591, + "grad_norm": 1.2040457725524902, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 101360 + }, + { + "epoch": 7.279712746858169, + "grad_norm": 1.0609431266784668, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 101370 + }, + { + "epoch": 7.280430879712747, + "grad_norm": 1.0759286880493164, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 101380 + }, + { + "epoch": 7.281149012567325, + "grad_norm": 1.128455400466919, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 101390 + }, + { + "epoch": 7.281867145421903, + "grad_norm": 1.2482393980026245, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 101400 + }, + { + "epoch": 7.282585278276481, + "grad_norm": 1.216482400894165, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 101410 + }, + { + "epoch": 7.283303411131059, + "grad_norm": 1.1360549926757812, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 101420 + }, + { + "epoch": 7.284021543985637, + "grad_norm": 1.1246616840362549, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 101430 + }, + { + "epoch": 7.284739676840215, + "grad_norm": 1.2419198751449585, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 101440 + }, + { + "epoch": 7.285457809694794, + "grad_norm": 1.169204831123352, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 101450 + }, + { + "epoch": 7.286175942549372, + "grad_norm": 0.988856852054596, + "learning_rate": 0.0002, + "loss": 0.562, + "step": 101460 + }, + { + "epoch": 7.28689407540395, + "grad_norm": 1.0422797203063965, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 101470 + }, + { + "epoch": 7.287612208258528, + "grad_norm": 0.9522702097892761, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 101480 + }, + { + "epoch": 7.288330341113106, + "grad_norm": 1.2551125288009644, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 101490 + }, + { + "epoch": 7.289048473967684, + "grad_norm": 1.4335172176361084, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 101500 + }, + { + "epoch": 7.289766606822262, + "grad_norm": 1.1649556159973145, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 101510 + }, + { + "epoch": 7.29048473967684, + "grad_norm": 1.1837944984436035, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 101520 + }, + { + "epoch": 7.291202872531418, + "grad_norm": 1.1103264093399048, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 101530 + }, + { + "epoch": 7.291921005385996, + "grad_norm": 1.0029321908950806, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 101540 + }, + { + "epoch": 7.292639138240575, + "grad_norm": 1.1226013898849487, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 101550 + }, + { + "epoch": 7.293357271095153, + "grad_norm": 1.368054986000061, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 101560 + }, + { + "epoch": 7.294075403949731, + "grad_norm": 1.20630943775177, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 101570 + }, + { + "epoch": 7.294793536804309, + "grad_norm": 1.004388689994812, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 101580 + }, + { + "epoch": 7.295511669658887, + "grad_norm": 1.029399037361145, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 101590 + }, + { + "epoch": 7.296229802513465, + "grad_norm": 1.1087204217910767, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 101600 + }, + { + "epoch": 7.296947935368043, + "grad_norm": 1.1086976528167725, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 101610 + }, + { + "epoch": 7.297666068222621, + "grad_norm": 1.2080177068710327, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 101620 + }, + { + "epoch": 7.298384201077199, + "grad_norm": 1.0005929470062256, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 101630 + }, + { + "epoch": 7.299102333931778, + "grad_norm": 1.0818030834197998, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 101640 + }, + { + "epoch": 7.299820466786356, + "grad_norm": 1.3539172410964966, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 101650 + }, + { + "epoch": 7.300538599640934, + "grad_norm": 1.2323400974273682, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 101660 + }, + { + "epoch": 7.301256732495512, + "grad_norm": 1.0842500925064087, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 101670 + }, + { + "epoch": 7.30197486535009, + "grad_norm": 1.0156948566436768, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 101680 + }, + { + "epoch": 7.302692998204668, + "grad_norm": 0.9736073613166809, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 101690 + }, + { + "epoch": 7.303411131059246, + "grad_norm": 1.130902886390686, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 101700 + }, + { + "epoch": 7.304129263913824, + "grad_norm": 1.0969539880752563, + "learning_rate": 0.0002, + "loss": 0.5118, + "step": 101710 + }, + { + "epoch": 7.304847396768402, + "grad_norm": 1.1104915142059326, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 101720 + }, + { + "epoch": 7.30556552962298, + "grad_norm": 1.3659855127334595, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 101730 + }, + { + "epoch": 7.306283662477559, + "grad_norm": 1.1095956563949585, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 101740 + }, + { + "epoch": 7.307001795332137, + "grad_norm": 1.1549444198608398, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 101750 + }, + { + "epoch": 7.307719928186715, + "grad_norm": 1.0718402862548828, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 101760 + }, + { + "epoch": 7.308438061041293, + "grad_norm": 1.151033639907837, + "learning_rate": 0.0002, + "loss": 0.4963, + "step": 101770 + }, + { + "epoch": 7.309156193895871, + "grad_norm": 0.9531689882278442, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 101780 + }, + { + "epoch": 7.309874326750449, + "grad_norm": 1.3025462627410889, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 101790 + }, + { + "epoch": 7.310592459605027, + "grad_norm": 1.062644600868225, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 101800 + }, + { + "epoch": 7.311310592459605, + "grad_norm": 1.1687922477722168, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 101810 + }, + { + "epoch": 7.312028725314184, + "grad_norm": 1.2879260778427124, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 101820 + }, + { + "epoch": 7.312746858168762, + "grad_norm": 0.9876636862754822, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 101830 + }, + { + "epoch": 7.31346499102334, + "grad_norm": 0.8604402542114258, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 101840 + }, + { + "epoch": 7.314183123877918, + "grad_norm": 1.1162822246551514, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 101850 + }, + { + "epoch": 7.314901256732496, + "grad_norm": 1.095772624015808, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 101860 + }, + { + "epoch": 7.315619389587074, + "grad_norm": 1.0100891590118408, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 101870 + }, + { + "epoch": 7.316337522441652, + "grad_norm": 0.9602094888687134, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 101880 + }, + { + "epoch": 7.31705565529623, + "grad_norm": 1.2045155763626099, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 101890 + }, + { + "epoch": 7.317773788150808, + "grad_norm": 1.014012098312378, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 101900 + }, + { + "epoch": 7.318491921005386, + "grad_norm": 1.0581108331680298, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 101910 + }, + { + "epoch": 7.3192100538599645, + "grad_norm": 0.9462026953697205, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 101920 + }, + { + "epoch": 7.3199281867145425, + "grad_norm": 1.0593115091323853, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 101930 + }, + { + "epoch": 7.3206463195691205, + "grad_norm": 1.1326113939285278, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 101940 + }, + { + "epoch": 7.3213644524236985, + "grad_norm": 0.933236300945282, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 101950 + }, + { + "epoch": 7.3220825852782765, + "grad_norm": 0.9311601519584656, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 101960 + }, + { + "epoch": 7.3228007181328545, + "grad_norm": 1.2303248643875122, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 101970 + }, + { + "epoch": 7.3235188509874325, + "grad_norm": 1.1904213428497314, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 101980 + }, + { + "epoch": 7.3242369838420105, + "grad_norm": 1.281388759613037, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 101990 + }, + { + "epoch": 7.3249551166965885, + "grad_norm": 1.0551466941833496, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 102000 + }, + { + "epoch": 7.325673249551167, + "grad_norm": 1.3299282789230347, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 102010 + }, + { + "epoch": 7.326391382405745, + "grad_norm": 1.2172462940216064, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 102020 + }, + { + "epoch": 7.327109515260323, + "grad_norm": 1.0828213691711426, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 102030 + }, + { + "epoch": 7.327827648114901, + "grad_norm": 1.336836338043213, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 102040 + }, + { + "epoch": 7.328545780969479, + "grad_norm": 1.1681890487670898, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 102050 + }, + { + "epoch": 7.329263913824057, + "grad_norm": 0.9713141918182373, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 102060 + }, + { + "epoch": 7.329982046678635, + "grad_norm": 0.919150710105896, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 102070 + }, + { + "epoch": 7.330700179533213, + "grad_norm": 1.1288635730743408, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 102080 + }, + { + "epoch": 7.331418312387791, + "grad_norm": 1.1016335487365723, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 102090 + }, + { + "epoch": 7.332136445242369, + "grad_norm": 0.8584099411964417, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 102100 + }, + { + "epoch": 7.332854578096948, + "grad_norm": 1.1394617557525635, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 102110 + }, + { + "epoch": 7.333572710951526, + "grad_norm": 1.0681827068328857, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 102120 + }, + { + "epoch": 7.334290843806104, + "grad_norm": 1.1277847290039062, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 102130 + }, + { + "epoch": 7.335008976660682, + "grad_norm": 1.093695044517517, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 102140 + }, + { + "epoch": 7.33572710951526, + "grad_norm": 1.2288036346435547, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 102150 + }, + { + "epoch": 7.336445242369838, + "grad_norm": 1.0734258890151978, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 102160 + }, + { + "epoch": 7.337163375224416, + "grad_norm": 1.1947388648986816, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 102170 + }, + { + "epoch": 7.337881508078994, + "grad_norm": 0.9444851279258728, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 102180 + }, + { + "epoch": 7.338599640933572, + "grad_norm": 1.0540008544921875, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 102190 + }, + { + "epoch": 7.339317773788151, + "grad_norm": 1.1238518953323364, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 102200 + }, + { + "epoch": 7.340035906642729, + "grad_norm": 1.129989743232727, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 102210 + }, + { + "epoch": 7.340754039497307, + "grad_norm": 0.8847355842590332, + "learning_rate": 0.0002, + "loss": 0.5158, + "step": 102220 + }, + { + "epoch": 7.341472172351885, + "grad_norm": 1.1628837585449219, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 102230 + }, + { + "epoch": 7.342190305206463, + "grad_norm": 1.1139917373657227, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 102240 + }, + { + "epoch": 7.342908438061041, + "grad_norm": 1.113997220993042, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 102250 + }, + { + "epoch": 7.343626570915619, + "grad_norm": 1.2163578271865845, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 102260 + }, + { + "epoch": 7.344344703770197, + "grad_norm": 1.0641776323318481, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 102270 + }, + { + "epoch": 7.345062836624775, + "grad_norm": 1.2397149801254272, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 102280 + }, + { + "epoch": 7.345780969479353, + "grad_norm": 1.3043087720870972, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 102290 + }, + { + "epoch": 7.346499102333932, + "grad_norm": 1.0568885803222656, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 102300 + }, + { + "epoch": 7.34721723518851, + "grad_norm": 1.1168477535247803, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 102310 + }, + { + "epoch": 7.347935368043088, + "grad_norm": 1.0510926246643066, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 102320 + }, + { + "epoch": 7.348653500897666, + "grad_norm": 1.0340518951416016, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 102330 + }, + { + "epoch": 7.349371633752244, + "grad_norm": 1.0256576538085938, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 102340 + }, + { + "epoch": 7.350089766606822, + "grad_norm": 1.1578398942947388, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 102350 + }, + { + "epoch": 7.3508078994614, + "grad_norm": 0.9840098023414612, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 102360 + }, + { + "epoch": 7.351526032315978, + "grad_norm": 1.1200997829437256, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 102370 + }, + { + "epoch": 7.352244165170557, + "grad_norm": 1.3507630825042725, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 102380 + }, + { + "epoch": 7.352962298025135, + "grad_norm": 1.156908631324768, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 102390 + }, + { + "epoch": 7.353680430879713, + "grad_norm": 1.2381980419158936, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 102400 + }, + { + "epoch": 7.354398563734291, + "grad_norm": 1.2751537561416626, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 102410 + }, + { + "epoch": 7.355116696588869, + "grad_norm": 1.2542656660079956, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 102420 + }, + { + "epoch": 7.355834829443447, + "grad_norm": 1.1342339515686035, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 102430 + }, + { + "epoch": 7.356552962298025, + "grad_norm": 1.1476532220840454, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 102440 + }, + { + "epoch": 7.357271095152603, + "grad_norm": 1.0370854139328003, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 102450 + }, + { + "epoch": 7.357989228007181, + "grad_norm": 1.137521505355835, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 102460 + }, + { + "epoch": 7.358707360861759, + "grad_norm": 1.1226446628570557, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 102470 + }, + { + "epoch": 7.359425493716338, + "grad_norm": 0.975045382976532, + "learning_rate": 0.0002, + "loss": 0.5395, + "step": 102480 + }, + { + "epoch": 7.360143626570916, + "grad_norm": 1.0371936559677124, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 102490 + }, + { + "epoch": 7.360861759425494, + "grad_norm": 1.264593482017517, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 102500 + }, + { + "epoch": 7.361579892280072, + "grad_norm": 1.2820146083831787, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 102510 + }, + { + "epoch": 7.36229802513465, + "grad_norm": 1.3086479902267456, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 102520 + }, + { + "epoch": 7.363016157989228, + "grad_norm": 1.1097291707992554, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 102530 + }, + { + "epoch": 7.363734290843806, + "grad_norm": 1.3544751405715942, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 102540 + }, + { + "epoch": 7.364452423698384, + "grad_norm": 1.2640280723571777, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 102550 + }, + { + "epoch": 7.365170556552962, + "grad_norm": 0.932267963886261, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 102560 + }, + { + "epoch": 7.365888689407541, + "grad_norm": 1.259298324584961, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 102570 + }, + { + "epoch": 7.366606822262119, + "grad_norm": 1.0883609056472778, + "learning_rate": 0.0002, + "loss": 0.5067, + "step": 102580 + }, + { + "epoch": 7.367324955116697, + "grad_norm": 1.5364124774932861, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 102590 + }, + { + "epoch": 7.368043087971275, + "grad_norm": 1.2528936862945557, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 102600 + }, + { + "epoch": 7.368761220825853, + "grad_norm": 0.9821929335594177, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 102610 + }, + { + "epoch": 7.369479353680431, + "grad_norm": 1.284264326095581, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 102620 + }, + { + "epoch": 7.370197486535009, + "grad_norm": 0.941703736782074, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 102630 + }, + { + "epoch": 7.370915619389587, + "grad_norm": 1.121385931968689, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 102640 + }, + { + "epoch": 7.371633752244165, + "grad_norm": 1.0397694110870361, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 102650 + }, + { + "epoch": 7.372351885098743, + "grad_norm": 1.0811786651611328, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 102660 + }, + { + "epoch": 7.373070017953322, + "grad_norm": 1.2080687284469604, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 102670 + }, + { + "epoch": 7.3737881508079, + "grad_norm": 1.0456428527832031, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 102680 + }, + { + "epoch": 7.374506283662478, + "grad_norm": 1.1772913932800293, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 102690 + }, + { + "epoch": 7.375224416517056, + "grad_norm": 1.209205150604248, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 102700 + }, + { + "epoch": 7.375942549371634, + "grad_norm": 1.220784068107605, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 102710 + }, + { + "epoch": 7.376660682226212, + "grad_norm": 1.0235114097595215, + "learning_rate": 0.0002, + "loss": 0.5084, + "step": 102720 + }, + { + "epoch": 7.37737881508079, + "grad_norm": 1.13937246799469, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 102730 + }, + { + "epoch": 7.378096947935368, + "grad_norm": 1.1369940042495728, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 102740 + }, + { + "epoch": 7.378815080789946, + "grad_norm": 0.9204146265983582, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 102750 + }, + { + "epoch": 7.379533213644525, + "grad_norm": 1.0428136587142944, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 102760 + }, + { + "epoch": 7.380251346499103, + "grad_norm": 1.3043127059936523, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 102770 + }, + { + "epoch": 7.380969479353681, + "grad_norm": 1.1984827518463135, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 102780 + }, + { + "epoch": 7.381687612208259, + "grad_norm": 1.169627070426941, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 102790 + }, + { + "epoch": 7.382405745062837, + "grad_norm": 0.9647679924964905, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 102800 + }, + { + "epoch": 7.383123877917415, + "grad_norm": 1.1284246444702148, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 102810 + }, + { + "epoch": 7.383842010771993, + "grad_norm": 0.9789248704910278, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 102820 + }, + { + "epoch": 7.384560143626571, + "grad_norm": 1.191469669342041, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 102830 + }, + { + "epoch": 7.385278276481149, + "grad_norm": 1.0203280448913574, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 102840 + }, + { + "epoch": 7.385996409335727, + "grad_norm": 1.1877976655960083, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 102850 + }, + { + "epoch": 7.3867145421903055, + "grad_norm": 1.2310867309570312, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 102860 + }, + { + "epoch": 7.3874326750448835, + "grad_norm": 1.0421714782714844, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 102870 + }, + { + "epoch": 7.3881508078994615, + "grad_norm": 1.2161095142364502, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 102880 + }, + { + "epoch": 7.3888689407540395, + "grad_norm": 0.9794706106185913, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 102890 + }, + { + "epoch": 7.3895870736086176, + "grad_norm": 1.2623358964920044, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 102900 + }, + { + "epoch": 7.3903052064631956, + "grad_norm": 0.9731680750846863, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 102910 + }, + { + "epoch": 7.3910233393177736, + "grad_norm": 1.2712689638137817, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 102920 + }, + { + "epoch": 7.391741472172352, + "grad_norm": 0.9469414949417114, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 102930 + }, + { + "epoch": 7.3924596050269304, + "grad_norm": 1.238718867301941, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 102940 + }, + { + "epoch": 7.3931777378815084, + "grad_norm": 1.262328028678894, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 102950 + }, + { + "epoch": 7.3938958707360865, + "grad_norm": 0.9899580478668213, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 102960 + }, + { + "epoch": 7.3946140035906645, + "grad_norm": 1.1182234287261963, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 102970 + }, + { + "epoch": 7.3953321364452425, + "grad_norm": 1.0213241577148438, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 102980 + }, + { + "epoch": 7.3960502692998205, + "grad_norm": 1.3077130317687988, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 102990 + }, + { + "epoch": 7.3967684021543985, + "grad_norm": 0.8821753263473511, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 103000 + }, + { + "epoch": 7.3974865350089765, + "grad_norm": 1.1906793117523193, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 103010 + }, + { + "epoch": 7.3982046678635545, + "grad_norm": 0.9587275981903076, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 103020 + }, + { + "epoch": 7.3989228007181325, + "grad_norm": 1.1806607246398926, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 103030 + }, + { + "epoch": 7.399640933572711, + "grad_norm": 1.0863158702850342, + "learning_rate": 0.0002, + "loss": 0.4866, + "step": 103040 + }, + { + "epoch": 7.400359066427289, + "grad_norm": 1.3175718784332275, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 103050 + }, + { + "epoch": 7.401077199281867, + "grad_norm": 1.0932444334030151, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 103060 + }, + { + "epoch": 7.401795332136445, + "grad_norm": 1.079542636871338, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 103070 + }, + { + "epoch": 7.402513464991023, + "grad_norm": 0.9434978365898132, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 103080 + }, + { + "epoch": 7.403231597845601, + "grad_norm": 1.2751423120498657, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 103090 + }, + { + "epoch": 7.403949730700179, + "grad_norm": 1.232871413230896, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 103100 + }, + { + "epoch": 7.404667863554757, + "grad_norm": 0.9898984432220459, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 103110 + }, + { + "epoch": 7.405385996409335, + "grad_norm": 0.8187330961227417, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 103120 + }, + { + "epoch": 7.406104129263914, + "grad_norm": 1.0267345905303955, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 103130 + }, + { + "epoch": 7.406822262118492, + "grad_norm": 1.018702507019043, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 103140 + }, + { + "epoch": 7.40754039497307, + "grad_norm": 1.2904773950576782, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 103150 + }, + { + "epoch": 7.408258527827648, + "grad_norm": 1.0485228300094604, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 103160 + }, + { + "epoch": 7.408976660682226, + "grad_norm": 1.112001895904541, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 103170 + }, + { + "epoch": 7.409694793536804, + "grad_norm": 0.9980560541152954, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 103180 + }, + { + "epoch": 7.410412926391382, + "grad_norm": 1.002909541130066, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 103190 + }, + { + "epoch": 7.41113105924596, + "grad_norm": 1.2632182836532593, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 103200 + }, + { + "epoch": 7.411849192100538, + "grad_norm": 0.8257913589477539, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 103210 + }, + { + "epoch": 7.412567324955116, + "grad_norm": 0.9777436852455139, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 103220 + }, + { + "epoch": 7.413285457809695, + "grad_norm": 1.1428900957107544, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 103230 + }, + { + "epoch": 7.414003590664273, + "grad_norm": 1.2036991119384766, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 103240 + }, + { + "epoch": 7.414721723518851, + "grad_norm": 1.0227148532867432, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 103250 + }, + { + "epoch": 7.415439856373429, + "grad_norm": 1.160910964012146, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 103260 + }, + { + "epoch": 7.416157989228007, + "grad_norm": 1.2486878633499146, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 103270 + }, + { + "epoch": 7.416876122082585, + "grad_norm": 0.9630030393600464, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 103280 + }, + { + "epoch": 7.417594254937163, + "grad_norm": 1.4181947708129883, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 103290 + }, + { + "epoch": 7.418312387791741, + "grad_norm": 1.173350214958191, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 103300 + }, + { + "epoch": 7.419030520646319, + "grad_norm": 1.2790213823318481, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 103310 + }, + { + "epoch": 7.419748653500898, + "grad_norm": 1.3033418655395508, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 103320 + }, + { + "epoch": 7.420466786355476, + "grad_norm": 1.1796131134033203, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 103330 + }, + { + "epoch": 7.421184919210054, + "grad_norm": 1.2483408451080322, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 103340 + }, + { + "epoch": 7.421903052064632, + "grad_norm": 1.174924373626709, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 103350 + }, + { + "epoch": 7.42262118491921, + "grad_norm": 0.9597971439361572, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 103360 + }, + { + "epoch": 7.423339317773788, + "grad_norm": 1.029307246208191, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 103370 + }, + { + "epoch": 7.424057450628366, + "grad_norm": 1.2511323690414429, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 103380 + }, + { + "epoch": 7.424775583482944, + "grad_norm": 0.9973678588867188, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 103390 + }, + { + "epoch": 7.425493716337522, + "grad_norm": 1.248966932296753, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 103400 + }, + { + "epoch": 7.4262118491921, + "grad_norm": 1.1157349348068237, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 103410 + }, + { + "epoch": 7.426929982046679, + "grad_norm": 1.268991470336914, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 103420 + }, + { + "epoch": 7.427648114901257, + "grad_norm": 1.163036823272705, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 103430 + }, + { + "epoch": 7.428366247755835, + "grad_norm": 1.136313796043396, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 103440 + }, + { + "epoch": 7.429084380610413, + "grad_norm": 1.3698488473892212, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 103450 + }, + { + "epoch": 7.429802513464991, + "grad_norm": 1.136257290840149, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 103460 + }, + { + "epoch": 7.430520646319569, + "grad_norm": 1.236160397529602, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 103470 + }, + { + "epoch": 7.431238779174147, + "grad_norm": 1.1289445161819458, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 103480 + }, + { + "epoch": 7.431956912028725, + "grad_norm": 1.197693943977356, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 103490 + }, + { + "epoch": 7.432675044883303, + "grad_norm": 1.2970328330993652, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 103500 + }, + { + "epoch": 7.433393177737882, + "grad_norm": 1.1042685508728027, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 103510 + }, + { + "epoch": 7.43411131059246, + "grad_norm": 1.1035256385803223, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 103520 + }, + { + "epoch": 7.434829443447038, + "grad_norm": 1.210533618927002, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 103530 + }, + { + "epoch": 7.435547576301616, + "grad_norm": 1.0207868814468384, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 103540 + }, + { + "epoch": 7.436265709156194, + "grad_norm": 1.023432970046997, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 103550 + }, + { + "epoch": 7.436983842010772, + "grad_norm": 1.1517932415008545, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 103560 + }, + { + "epoch": 7.43770197486535, + "grad_norm": 1.2798852920532227, + "learning_rate": 0.0002, + "loss": 0.4931, + "step": 103570 + }, + { + "epoch": 7.438420107719928, + "grad_norm": 0.9245955348014832, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 103580 + }, + { + "epoch": 7.439138240574506, + "grad_norm": 1.0329653024673462, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 103590 + }, + { + "epoch": 7.439856373429085, + "grad_norm": 0.9156534671783447, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 103600 + }, + { + "epoch": 7.440574506283663, + "grad_norm": 1.0112179517745972, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 103610 + }, + { + "epoch": 7.441292639138241, + "grad_norm": 1.0597492456436157, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 103620 + }, + { + "epoch": 7.442010771992819, + "grad_norm": 1.0997483730316162, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 103630 + }, + { + "epoch": 7.442728904847397, + "grad_norm": 1.0250455141067505, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 103640 + }, + { + "epoch": 7.443447037701975, + "grad_norm": 1.0806883573532104, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 103650 + }, + { + "epoch": 7.444165170556553, + "grad_norm": 1.2387017011642456, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 103660 + }, + { + "epoch": 7.444883303411131, + "grad_norm": 1.0246366262435913, + "learning_rate": 0.0002, + "loss": 0.5084, + "step": 103670 + }, + { + "epoch": 7.445601436265709, + "grad_norm": 1.071362853050232, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 103680 + }, + { + "epoch": 7.446319569120288, + "grad_norm": 1.1581261157989502, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 103690 + }, + { + "epoch": 7.447037701974866, + "grad_norm": 1.1136809587478638, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 103700 + }, + { + "epoch": 7.447755834829444, + "grad_norm": 1.3133236169815063, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 103710 + }, + { + "epoch": 7.448473967684022, + "grad_norm": 1.163678765296936, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 103720 + }, + { + "epoch": 7.4491921005386, + "grad_norm": 1.121063232421875, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 103730 + }, + { + "epoch": 7.449910233393178, + "grad_norm": 1.1806761026382446, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 103740 + }, + { + "epoch": 7.450628366247756, + "grad_norm": 0.9124397039413452, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 103750 + }, + { + "epoch": 7.451346499102334, + "grad_norm": 1.0819965600967407, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 103760 + }, + { + "epoch": 7.452064631956912, + "grad_norm": 1.260360836982727, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 103770 + }, + { + "epoch": 7.45278276481149, + "grad_norm": 1.3185076713562012, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 103780 + }, + { + "epoch": 7.453500897666069, + "grad_norm": 1.182569146156311, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 103790 + }, + { + "epoch": 7.454219030520647, + "grad_norm": 1.42801034450531, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 103800 + }, + { + "epoch": 7.454937163375225, + "grad_norm": 1.1232067346572876, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 103810 + }, + { + "epoch": 7.455655296229803, + "grad_norm": 0.9760740399360657, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 103820 + }, + { + "epoch": 7.456373429084381, + "grad_norm": 1.1086724996566772, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 103830 + }, + { + "epoch": 7.457091561938959, + "grad_norm": 1.293244481086731, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 103840 + }, + { + "epoch": 7.457809694793537, + "grad_norm": 1.0689499378204346, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 103850 + }, + { + "epoch": 7.458527827648115, + "grad_norm": 1.208716869354248, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 103860 + }, + { + "epoch": 7.459245960502693, + "grad_norm": 1.0105576515197754, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 103870 + }, + { + "epoch": 7.4599640933572715, + "grad_norm": 1.1546603441238403, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 103880 + }, + { + "epoch": 7.4606822262118495, + "grad_norm": 1.258599042892456, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 103890 + }, + { + "epoch": 7.4614003590664275, + "grad_norm": 1.2506718635559082, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 103900 + }, + { + "epoch": 7.4621184919210055, + "grad_norm": 1.0375752449035645, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 103910 + }, + { + "epoch": 7.4628366247755835, + "grad_norm": 1.0918235778808594, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 103920 + }, + { + "epoch": 7.4635547576301615, + "grad_norm": 1.2511614561080933, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 103930 + }, + { + "epoch": 7.4642728904847395, + "grad_norm": 0.9855675101280212, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 103940 + }, + { + "epoch": 7.4649910233393175, + "grad_norm": 1.1818993091583252, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 103950 + }, + { + "epoch": 7.4657091561938955, + "grad_norm": 1.2684056758880615, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 103960 + }, + { + "epoch": 7.4664272890484735, + "grad_norm": 1.3526806831359863, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 103970 + }, + { + "epoch": 7.467145421903052, + "grad_norm": 1.1802287101745605, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 103980 + }, + { + "epoch": 7.46786355475763, + "grad_norm": 1.0627036094665527, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 103990 + }, + { + "epoch": 7.468581687612208, + "grad_norm": 1.2383025884628296, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 104000 + }, + { + "epoch": 7.469299820466786, + "grad_norm": 1.2024378776550293, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 104010 + }, + { + "epoch": 7.470017953321364, + "grad_norm": 0.8383823037147522, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 104020 + }, + { + "epoch": 7.470736086175942, + "grad_norm": 1.0333143472671509, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 104030 + }, + { + "epoch": 7.47145421903052, + "grad_norm": 1.232338309288025, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 104040 + }, + { + "epoch": 7.472172351885098, + "grad_norm": 1.1523895263671875, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 104050 + }, + { + "epoch": 7.472890484739676, + "grad_norm": 1.2198411226272583, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 104060 + }, + { + "epoch": 7.473608617594255, + "grad_norm": 1.1921417713165283, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 104070 + }, + { + "epoch": 7.474326750448833, + "grad_norm": 1.174011468887329, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 104080 + }, + { + "epoch": 7.475044883303411, + "grad_norm": 1.3201649188995361, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 104090 + }, + { + "epoch": 7.475763016157989, + "grad_norm": 0.9371066689491272, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 104100 + }, + { + "epoch": 7.476481149012567, + "grad_norm": 1.4846594333648682, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 104110 + }, + { + "epoch": 7.477199281867145, + "grad_norm": 1.1780450344085693, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 104120 + }, + { + "epoch": 7.477917414721723, + "grad_norm": 1.2080824375152588, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 104130 + }, + { + "epoch": 7.478635547576301, + "grad_norm": 1.0390220880508423, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 104140 + }, + { + "epoch": 7.479353680430879, + "grad_norm": 0.8703257441520691, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 104150 + }, + { + "epoch": 7.480071813285457, + "grad_norm": 1.017080307006836, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 104160 + }, + { + "epoch": 7.480789946140036, + "grad_norm": 1.2483022212982178, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 104170 + }, + { + "epoch": 7.481508078994614, + "grad_norm": 1.0958250761032104, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 104180 + }, + { + "epoch": 7.482226211849192, + "grad_norm": 1.1949903964996338, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 104190 + }, + { + "epoch": 7.48294434470377, + "grad_norm": 1.2361127138137817, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 104200 + }, + { + "epoch": 7.483662477558348, + "grad_norm": 1.2279026508331299, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 104210 + }, + { + "epoch": 7.484380610412926, + "grad_norm": 1.0336331129074097, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 104220 + }, + { + "epoch": 7.485098743267504, + "grad_norm": 1.0021189451217651, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 104230 + }, + { + "epoch": 7.485816876122082, + "grad_norm": 1.1586246490478516, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 104240 + }, + { + "epoch": 7.486535008976661, + "grad_norm": 0.9006508588790894, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 104250 + }, + { + "epoch": 7.487253141831239, + "grad_norm": 1.2152459621429443, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 104260 + }, + { + "epoch": 7.487971274685817, + "grad_norm": 1.0048519372940063, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 104270 + }, + { + "epoch": 7.488689407540395, + "grad_norm": 1.1151599884033203, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 104280 + }, + { + "epoch": 7.489407540394973, + "grad_norm": 0.9922400116920471, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 104290 + }, + { + "epoch": 7.490125673249551, + "grad_norm": 1.137277364730835, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 104300 + }, + { + "epoch": 7.490843806104129, + "grad_norm": 1.381284475326538, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 104310 + }, + { + "epoch": 7.491561938958707, + "grad_norm": 1.0104176998138428, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 104320 + }, + { + "epoch": 7.492280071813285, + "grad_norm": 1.1292575597763062, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 104330 + }, + { + "epoch": 7.492998204667863, + "grad_norm": 1.0010626316070557, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 104340 + }, + { + "epoch": 7.493716337522442, + "grad_norm": 0.9468943476676941, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 104350 + }, + { + "epoch": 7.49443447037702, + "grad_norm": 1.0348953008651733, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 104360 + }, + { + "epoch": 7.495152603231598, + "grad_norm": 1.0347660779953003, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 104370 + }, + { + "epoch": 7.495870736086176, + "grad_norm": 1.1240533590316772, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 104380 + }, + { + "epoch": 7.496588868940754, + "grad_norm": 0.8433300852775574, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 104390 + }, + { + "epoch": 7.497307001795332, + "grad_norm": 1.0124489068984985, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 104400 + }, + { + "epoch": 7.49802513464991, + "grad_norm": 1.050297498703003, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 104410 + }, + { + "epoch": 7.498743267504488, + "grad_norm": 1.226494312286377, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 104420 + }, + { + "epoch": 7.499461400359066, + "grad_norm": 1.0367873907089233, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 104430 + }, + { + "epoch": 7.500179533213645, + "grad_norm": 1.2138985395431519, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 104440 + }, + { + "epoch": 7.500897666068223, + "grad_norm": 1.2024848461151123, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 104450 + }, + { + "epoch": 7.501615798922801, + "grad_norm": 0.9568573832511902, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 104460 + }, + { + "epoch": 7.502333931777379, + "grad_norm": 0.959540605545044, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 104470 + }, + { + "epoch": 7.503052064631957, + "grad_norm": 1.1272302865982056, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 104480 + }, + { + "epoch": 7.503770197486535, + "grad_norm": 1.1625477075576782, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 104490 + }, + { + "epoch": 7.504488330341113, + "grad_norm": 1.1393729448318481, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 104500 + }, + { + "epoch": 7.505206463195691, + "grad_norm": 1.1496871709823608, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 104510 + }, + { + "epoch": 7.505924596050269, + "grad_norm": 1.10691237449646, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 104520 + }, + { + "epoch": 7.506642728904847, + "grad_norm": 1.1505173444747925, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 104530 + }, + { + "epoch": 7.507360861759426, + "grad_norm": 1.2328600883483887, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 104540 + }, + { + "epoch": 7.508078994614004, + "grad_norm": 1.0103087425231934, + "learning_rate": 0.0002, + "loss": 0.5457, + "step": 104550 + }, + { + "epoch": 7.508797127468582, + "grad_norm": 1.1978994607925415, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 104560 + }, + { + "epoch": 7.50951526032316, + "grad_norm": 1.070842981338501, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 104570 + }, + { + "epoch": 7.510233393177738, + "grad_norm": 1.1058868169784546, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 104580 + }, + { + "epoch": 7.510951526032316, + "grad_norm": 1.383592963218689, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 104590 + }, + { + "epoch": 7.511669658886894, + "grad_norm": 1.2177189588546753, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 104600 + }, + { + "epoch": 7.512387791741472, + "grad_norm": 1.7231167554855347, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 104610 + }, + { + "epoch": 7.513105924596051, + "grad_norm": 0.9763862490653992, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 104620 + }, + { + "epoch": 7.513824057450629, + "grad_norm": 1.242191195487976, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 104630 + }, + { + "epoch": 7.514542190305207, + "grad_norm": 0.9510217308998108, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 104640 + }, + { + "epoch": 7.515260323159785, + "grad_norm": 1.260542631149292, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 104650 + }, + { + "epoch": 7.515978456014363, + "grad_norm": 0.9604901075363159, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 104660 + }, + { + "epoch": 7.516696588868941, + "grad_norm": 1.0860100984573364, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 104670 + }, + { + "epoch": 7.517414721723519, + "grad_norm": 0.9627196192741394, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 104680 + }, + { + "epoch": 7.518132854578097, + "grad_norm": 1.0736050605773926, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 104690 + }, + { + "epoch": 7.518850987432675, + "grad_norm": 1.150801420211792, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 104700 + }, + { + "epoch": 7.519569120287253, + "grad_norm": 1.1193088293075562, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 104710 + }, + { + "epoch": 7.520287253141831, + "grad_norm": 1.0462759733200073, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 104720 + }, + { + "epoch": 7.52100538599641, + "grad_norm": 0.8539935946464539, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 104730 + }, + { + "epoch": 7.521723518850988, + "grad_norm": 1.1345696449279785, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 104740 + }, + { + "epoch": 7.522441651705566, + "grad_norm": 1.0367025136947632, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 104750 + }, + { + "epoch": 7.523159784560144, + "grad_norm": 1.3531326055526733, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 104760 + }, + { + "epoch": 7.523877917414722, + "grad_norm": 0.8530771136283875, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 104770 + }, + { + "epoch": 7.5245960502693, + "grad_norm": 1.0597292184829712, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 104780 + }, + { + "epoch": 7.525314183123878, + "grad_norm": 1.0896775722503662, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 104790 + }, + { + "epoch": 7.526032315978456, + "grad_norm": 1.3138227462768555, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 104800 + }, + { + "epoch": 7.526750448833035, + "grad_norm": 0.9158141016960144, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 104810 + }, + { + "epoch": 7.527468581687613, + "grad_norm": 1.1566123962402344, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 104820 + }, + { + "epoch": 7.528186714542191, + "grad_norm": 1.138040542602539, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 104830 + }, + { + "epoch": 7.528904847396769, + "grad_norm": 1.0407382249832153, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 104840 + }, + { + "epoch": 7.529622980251347, + "grad_norm": 1.104064702987671, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 104850 + }, + { + "epoch": 7.530341113105925, + "grad_norm": 1.040507435798645, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 104860 + }, + { + "epoch": 7.531059245960503, + "grad_norm": 1.146317958831787, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 104870 + }, + { + "epoch": 7.531777378815081, + "grad_norm": 1.0730783939361572, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 104880 + }, + { + "epoch": 7.532495511669659, + "grad_norm": 1.2540011405944824, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 104890 + }, + { + "epoch": 7.533213644524237, + "grad_norm": 1.0158214569091797, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 104900 + }, + { + "epoch": 7.533931777378815, + "grad_norm": 1.0645452737808228, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 104910 + }, + { + "epoch": 7.5346499102333935, + "grad_norm": 1.1173311471939087, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 104920 + }, + { + "epoch": 7.5353680430879715, + "grad_norm": 1.091782808303833, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 104930 + }, + { + "epoch": 7.5360861759425495, + "grad_norm": 1.1219462156295776, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 104940 + }, + { + "epoch": 7.5368043087971275, + "grad_norm": 1.2164716720581055, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 104950 + }, + { + "epoch": 7.5375224416517055, + "grad_norm": 1.0167542695999146, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 104960 + }, + { + "epoch": 7.5382405745062835, + "grad_norm": 1.029844045639038, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 104970 + }, + { + "epoch": 7.5389587073608615, + "grad_norm": 1.004914402961731, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 104980 + }, + { + "epoch": 7.5396768402154395, + "grad_norm": 1.151977300643921, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 104990 + }, + { + "epoch": 7.540394973070018, + "grad_norm": 1.063069462776184, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 105000 + }, + { + "epoch": 7.541113105924596, + "grad_norm": 0.9950627684593201, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 105010 + }, + { + "epoch": 7.541831238779174, + "grad_norm": 0.9897221922874451, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 105020 + }, + { + "epoch": 7.542549371633752, + "grad_norm": 1.220423698425293, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 105030 + }, + { + "epoch": 7.54326750448833, + "grad_norm": 1.0800561904907227, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 105040 + }, + { + "epoch": 7.543985637342908, + "grad_norm": 1.1115468740463257, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 105050 + }, + { + "epoch": 7.544703770197486, + "grad_norm": 1.1754465103149414, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 105060 + }, + { + "epoch": 7.545421903052064, + "grad_norm": 0.8769645690917969, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 105070 + }, + { + "epoch": 7.546140035906642, + "grad_norm": 1.0276274681091309, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 105080 + }, + { + "epoch": 7.54685816876122, + "grad_norm": 1.2642459869384766, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 105090 + }, + { + "epoch": 7.547576301615799, + "grad_norm": 1.1204240322113037, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 105100 + }, + { + "epoch": 7.548294434470377, + "grad_norm": 1.1700465679168701, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 105110 + }, + { + "epoch": 7.549012567324955, + "grad_norm": 0.921738862991333, + "learning_rate": 0.0002, + "loss": 0.5494, + "step": 105120 + }, + { + "epoch": 7.549730700179533, + "grad_norm": 1.0517377853393555, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 105130 + }, + { + "epoch": 7.550448833034111, + "grad_norm": 0.8750519156455994, + "learning_rate": 0.0002, + "loss": 0.5369, + "step": 105140 + }, + { + "epoch": 7.551166965888689, + "grad_norm": 0.9947483539581299, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 105150 + }, + { + "epoch": 7.551885098743267, + "grad_norm": 1.133035659790039, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 105160 + }, + { + "epoch": 7.552603231597845, + "grad_norm": 1.0302581787109375, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 105170 + }, + { + "epoch": 7.553321364452424, + "grad_norm": 1.0290307998657227, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 105180 + }, + { + "epoch": 7.554039497307002, + "grad_norm": 1.2476361989974976, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 105190 + }, + { + "epoch": 7.55475763016158, + "grad_norm": 1.1051201820373535, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 105200 + }, + { + "epoch": 7.555475763016158, + "grad_norm": 1.4432711601257324, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 105210 + }, + { + "epoch": 7.556193895870736, + "grad_norm": 1.1134647130966187, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 105220 + }, + { + "epoch": 7.556912028725314, + "grad_norm": 1.2649270296096802, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 105230 + }, + { + "epoch": 7.557630161579892, + "grad_norm": 0.9547544717788696, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 105240 + }, + { + "epoch": 7.55834829443447, + "grad_norm": 1.153113842010498, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 105250 + }, + { + "epoch": 7.559066427289048, + "grad_norm": 1.0354572534561157, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 105260 + }, + { + "epoch": 7.559784560143626, + "grad_norm": 1.2131483554840088, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 105270 + }, + { + "epoch": 7.560502692998204, + "grad_norm": 0.9127926826477051, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 105280 + }, + { + "epoch": 7.561220825852783, + "grad_norm": 1.1065036058425903, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 105290 + }, + { + "epoch": 7.561938958707361, + "grad_norm": 1.133322834968567, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 105300 + }, + { + "epoch": 7.562657091561939, + "grad_norm": 0.9822283387184143, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 105310 + }, + { + "epoch": 7.563375224416517, + "grad_norm": 1.0777708292007446, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 105320 + }, + { + "epoch": 7.564093357271095, + "grad_norm": 1.0826656818389893, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 105330 + }, + { + "epoch": 7.564811490125673, + "grad_norm": 1.1842281818389893, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 105340 + }, + { + "epoch": 7.565529622980251, + "grad_norm": 1.1248035430908203, + "learning_rate": 0.0002, + "loss": 0.553, + "step": 105350 + }, + { + "epoch": 7.566247755834829, + "grad_norm": 0.9905921220779419, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 105360 + }, + { + "epoch": 7.566965888689408, + "grad_norm": 1.0215412378311157, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 105370 + }, + { + "epoch": 7.567684021543986, + "grad_norm": 1.2403844594955444, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 105380 + }, + { + "epoch": 7.568402154398564, + "grad_norm": 1.2371299266815186, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 105390 + }, + { + "epoch": 7.569120287253142, + "grad_norm": 1.2021104097366333, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 105400 + }, + { + "epoch": 7.56983842010772, + "grad_norm": 1.1641038656234741, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 105410 + }, + { + "epoch": 7.570556552962298, + "grad_norm": 1.1443949937820435, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 105420 + }, + { + "epoch": 7.571274685816876, + "grad_norm": 1.1318271160125732, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 105430 + }, + { + "epoch": 7.571992818671454, + "grad_norm": 1.3928632736206055, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 105440 + }, + { + "epoch": 7.572710951526032, + "grad_norm": 1.1141331195831299, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 105450 + }, + { + "epoch": 7.57342908438061, + "grad_norm": 1.301546573638916, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 105460 + }, + { + "epoch": 7.574147217235188, + "grad_norm": 1.1085830926895142, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 105470 + }, + { + "epoch": 7.574865350089767, + "grad_norm": 0.9858543872833252, + "learning_rate": 0.0002, + "loss": 0.532, + "step": 105480 + }, + { + "epoch": 7.575583482944345, + "grad_norm": 1.0768673419952393, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 105490 + }, + { + "epoch": 7.576301615798923, + "grad_norm": 1.0940971374511719, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 105500 + }, + { + "epoch": 7.577019748653501, + "grad_norm": 1.2131849527359009, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 105510 + }, + { + "epoch": 7.577737881508079, + "grad_norm": 1.139255166053772, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 105520 + }, + { + "epoch": 7.578456014362657, + "grad_norm": 1.1880031824111938, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 105530 + }, + { + "epoch": 7.579174147217235, + "grad_norm": 1.1227078437805176, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 105540 + }, + { + "epoch": 7.579892280071813, + "grad_norm": 0.9665518999099731, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 105550 + }, + { + "epoch": 7.580610412926392, + "grad_norm": 1.2579736709594727, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 105560 + }, + { + "epoch": 7.58132854578097, + "grad_norm": 1.3003990650177002, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 105570 + }, + { + "epoch": 7.582046678635548, + "grad_norm": 1.0537091493606567, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 105580 + }, + { + "epoch": 7.582764811490126, + "grad_norm": 1.2199420928955078, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 105590 + }, + { + "epoch": 7.583482944344704, + "grad_norm": 1.1907626390457153, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 105600 + }, + { + "epoch": 7.584201077199282, + "grad_norm": 1.0684664249420166, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 105610 + }, + { + "epoch": 7.58491921005386, + "grad_norm": 1.1190338134765625, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 105620 + }, + { + "epoch": 7.585637342908438, + "grad_norm": 1.0873574018478394, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 105630 + }, + { + "epoch": 7.586355475763016, + "grad_norm": 1.0512418746948242, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 105640 + }, + { + "epoch": 7.587073608617594, + "grad_norm": 1.3036644458770752, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 105650 + }, + { + "epoch": 7.587791741472173, + "grad_norm": 1.037948489189148, + "learning_rate": 0.0002, + "loss": 0.5598, + "step": 105660 + }, + { + "epoch": 7.588509874326751, + "grad_norm": 0.987514317035675, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 105670 + }, + { + "epoch": 7.589228007181329, + "grad_norm": 1.2718415260314941, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 105680 + }, + { + "epoch": 7.589946140035907, + "grad_norm": 1.2168786525726318, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 105690 + }, + { + "epoch": 7.590664272890485, + "grad_norm": 1.0258911848068237, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 105700 + }, + { + "epoch": 7.591382405745063, + "grad_norm": 1.0203795433044434, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 105710 + }, + { + "epoch": 7.592100538599641, + "grad_norm": 1.1677968502044678, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 105720 + }, + { + "epoch": 7.592818671454219, + "grad_norm": 1.4036188125610352, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 105730 + }, + { + "epoch": 7.593536804308797, + "grad_norm": 1.0176831483840942, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 105740 + }, + { + "epoch": 7.594254937163376, + "grad_norm": 1.1458805799484253, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 105750 + }, + { + "epoch": 7.594973070017954, + "grad_norm": 1.038974642753601, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 105760 + }, + { + "epoch": 7.595691202872532, + "grad_norm": 1.247301697731018, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 105770 + }, + { + "epoch": 7.59640933572711, + "grad_norm": 0.8886832594871521, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 105780 + }, + { + "epoch": 7.597127468581688, + "grad_norm": 1.1210025548934937, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 105790 + }, + { + "epoch": 7.597845601436266, + "grad_norm": 1.1681327819824219, + "learning_rate": 0.0002, + "loss": 0.5422, + "step": 105800 + }, + { + "epoch": 7.598563734290844, + "grad_norm": 1.1547762155532837, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 105810 + }, + { + "epoch": 7.599281867145422, + "grad_norm": 1.1720976829528809, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 105820 + }, + { + "epoch": 7.6, + "grad_norm": 1.0706144571304321, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 105830 + }, + { + "epoch": 7.600718132854578, + "grad_norm": 1.031205415725708, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 105840 + }, + { + "epoch": 7.6014362657091565, + "grad_norm": 1.1801010370254517, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 105850 + }, + { + "epoch": 7.6021543985637345, + "grad_norm": 1.0154755115509033, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 105860 + }, + { + "epoch": 7.6028725314183125, + "grad_norm": 1.0330030918121338, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 105870 + }, + { + "epoch": 7.6035906642728905, + "grad_norm": 0.9404476881027222, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 105880 + }, + { + "epoch": 7.6043087971274685, + "grad_norm": 1.0264246463775635, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 105890 + }, + { + "epoch": 7.6050269299820465, + "grad_norm": 1.154560923576355, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 105900 + }, + { + "epoch": 7.6057450628366245, + "grad_norm": 0.8954422473907471, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 105910 + }, + { + "epoch": 7.6064631956912026, + "grad_norm": 0.9354978799819946, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 105920 + }, + { + "epoch": 7.607181328545781, + "grad_norm": 1.2349580526351929, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 105930 + }, + { + "epoch": 7.607899461400359, + "grad_norm": 1.0203192234039307, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 105940 + }, + { + "epoch": 7.608617594254937, + "grad_norm": 0.8431771397590637, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 105950 + }, + { + "epoch": 7.6093357271095154, + "grad_norm": 1.1733695268630981, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 105960 + }, + { + "epoch": 7.6100538599640934, + "grad_norm": 0.965118408203125, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 105970 + }, + { + "epoch": 7.6107719928186714, + "grad_norm": 0.987450897693634, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 105980 + }, + { + "epoch": 7.6114901256732495, + "grad_norm": 1.2337433099746704, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 105990 + }, + { + "epoch": 7.6122082585278275, + "grad_norm": 1.2976964712142944, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 106000 + }, + { + "epoch": 7.6129263913824055, + "grad_norm": 1.0748823881149292, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 106010 + }, + { + "epoch": 7.6136445242369835, + "grad_norm": 1.2771751880645752, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 106020 + }, + { + "epoch": 7.6143626570915615, + "grad_norm": 0.9651449918746948, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 106030 + }, + { + "epoch": 7.61508078994614, + "grad_norm": 1.4248602390289307, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 106040 + }, + { + "epoch": 7.615798922800718, + "grad_norm": 1.1568830013275146, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 106050 + }, + { + "epoch": 7.616517055655296, + "grad_norm": 1.2090665102005005, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 106060 + }, + { + "epoch": 7.617235188509874, + "grad_norm": 1.0982604026794434, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 106070 + }, + { + "epoch": 7.617953321364452, + "grad_norm": 1.0705735683441162, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 106080 + }, + { + "epoch": 7.61867145421903, + "grad_norm": 1.1313707828521729, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 106090 + }, + { + "epoch": 7.619389587073608, + "grad_norm": 1.2538282871246338, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 106100 + }, + { + "epoch": 7.620107719928186, + "grad_norm": 1.374280571937561, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 106110 + }, + { + "epoch": 7.620825852782765, + "grad_norm": 1.024248719215393, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 106120 + }, + { + "epoch": 7.621543985637343, + "grad_norm": 0.9976266622543335, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 106130 + }, + { + "epoch": 7.622262118491921, + "grad_norm": 1.2104789018630981, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 106140 + }, + { + "epoch": 7.622980251346499, + "grad_norm": 1.154041051864624, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 106150 + }, + { + "epoch": 7.623698384201077, + "grad_norm": 1.1514118909835815, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 106160 + }, + { + "epoch": 7.624416517055655, + "grad_norm": 0.9994077086448669, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 106170 + }, + { + "epoch": 7.625134649910233, + "grad_norm": 1.0648950338363647, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 106180 + }, + { + "epoch": 7.625852782764811, + "grad_norm": 1.247307538986206, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 106190 + }, + { + "epoch": 7.626570915619389, + "grad_norm": 1.2144126892089844, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 106200 + }, + { + "epoch": 7.627289048473967, + "grad_norm": 1.196209192276001, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 106210 + }, + { + "epoch": 7.628007181328546, + "grad_norm": 1.0064209699630737, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 106220 + }, + { + "epoch": 7.628725314183124, + "grad_norm": 1.0938220024108887, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 106230 + }, + { + "epoch": 7.629443447037702, + "grad_norm": 1.0046473741531372, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 106240 + }, + { + "epoch": 7.63016157989228, + "grad_norm": 1.1092835664749146, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 106250 + }, + { + "epoch": 7.630879712746858, + "grad_norm": 1.0419597625732422, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 106260 + }, + { + "epoch": 7.631597845601436, + "grad_norm": 1.115281581878662, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 106270 + }, + { + "epoch": 7.632315978456014, + "grad_norm": 0.926291823387146, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 106280 + }, + { + "epoch": 7.633034111310592, + "grad_norm": 1.2301737070083618, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 106290 + }, + { + "epoch": 7.63375224416517, + "grad_norm": 1.2254445552825928, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 106300 + }, + { + "epoch": 7.634470377019749, + "grad_norm": 0.9048781394958496, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 106310 + }, + { + "epoch": 7.635188509874327, + "grad_norm": 0.9848755598068237, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 106320 + }, + { + "epoch": 7.635906642728905, + "grad_norm": 1.056156873703003, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 106330 + }, + { + "epoch": 7.636624775583483, + "grad_norm": 1.2103949785232544, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 106340 + }, + { + "epoch": 7.637342908438061, + "grad_norm": 0.9873999953269958, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 106350 + }, + { + "epoch": 7.638061041292639, + "grad_norm": 1.0306750535964966, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 106360 + }, + { + "epoch": 7.638779174147217, + "grad_norm": 1.1849476099014282, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 106370 + }, + { + "epoch": 7.639497307001795, + "grad_norm": 1.231707215309143, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 106380 + }, + { + "epoch": 7.640215439856373, + "grad_norm": 1.194321632385254, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 106390 + }, + { + "epoch": 7.640933572710951, + "grad_norm": 1.0539367198944092, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 106400 + }, + { + "epoch": 7.64165170556553, + "grad_norm": 1.1701070070266724, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 106410 + }, + { + "epoch": 7.642369838420108, + "grad_norm": 1.2178397178649902, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 106420 + }, + { + "epoch": 7.643087971274686, + "grad_norm": 0.9702774286270142, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 106430 + }, + { + "epoch": 7.643806104129264, + "grad_norm": 1.0613373517990112, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 106440 + }, + { + "epoch": 7.644524236983842, + "grad_norm": 1.0604264736175537, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 106450 + }, + { + "epoch": 7.64524236983842, + "grad_norm": 0.8836958408355713, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 106460 + }, + { + "epoch": 7.645960502692998, + "grad_norm": 1.1939433813095093, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 106470 + }, + { + "epoch": 7.646678635547576, + "grad_norm": 1.1198155879974365, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 106480 + }, + { + "epoch": 7.647396768402155, + "grad_norm": 1.1567481756210327, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 106490 + }, + { + "epoch": 7.648114901256733, + "grad_norm": 1.1108657121658325, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 106500 + }, + { + "epoch": 7.648833034111311, + "grad_norm": 1.116945505142212, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 106510 + }, + { + "epoch": 7.649551166965889, + "grad_norm": 0.951562762260437, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 106520 + }, + { + "epoch": 7.650269299820467, + "grad_norm": 1.1393115520477295, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 106530 + }, + { + "epoch": 7.650987432675045, + "grad_norm": 1.0645884275436401, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 106540 + }, + { + "epoch": 7.651705565529623, + "grad_norm": 1.0742363929748535, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 106550 + }, + { + "epoch": 7.652423698384201, + "grad_norm": 1.2417876720428467, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 106560 + }, + { + "epoch": 7.653141831238779, + "grad_norm": 1.1374881267547607, + "learning_rate": 0.0002, + "loss": 0.5232, + "step": 106570 + }, + { + "epoch": 7.653859964093357, + "grad_norm": 1.0783830881118774, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 106580 + }, + { + "epoch": 7.654578096947935, + "grad_norm": 1.014607548713684, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 106590 + }, + { + "epoch": 7.655296229802514, + "grad_norm": 0.9155649542808533, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 106600 + }, + { + "epoch": 7.656014362657092, + "grad_norm": 1.0671756267547607, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 106610 + }, + { + "epoch": 7.65673249551167, + "grad_norm": 0.9360224008560181, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 106620 + }, + { + "epoch": 7.657450628366248, + "grad_norm": 1.1457395553588867, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 106630 + }, + { + "epoch": 7.658168761220826, + "grad_norm": 0.9849295020103455, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 106640 + }, + { + "epoch": 7.658886894075404, + "grad_norm": 1.0622800588607788, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 106650 + }, + { + "epoch": 7.659605026929982, + "grad_norm": 0.8352060914039612, + "learning_rate": 0.0002, + "loss": 0.5494, + "step": 106660 + }, + { + "epoch": 7.66032315978456, + "grad_norm": 1.1975891590118408, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 106670 + }, + { + "epoch": 7.661041292639139, + "grad_norm": 1.1585075855255127, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 106680 + }, + { + "epoch": 7.661759425493717, + "grad_norm": 1.1387015581130981, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 106690 + }, + { + "epoch": 7.662477558348295, + "grad_norm": 1.2752996683120728, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 106700 + }, + { + "epoch": 7.663195691202873, + "grad_norm": 1.1885957717895508, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 106710 + }, + { + "epoch": 7.663913824057451, + "grad_norm": 0.9355967044830322, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 106720 + }, + { + "epoch": 7.664631956912029, + "grad_norm": 1.0528348684310913, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 106730 + }, + { + "epoch": 7.665350089766607, + "grad_norm": 1.1075369119644165, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 106740 + }, + { + "epoch": 7.666068222621185, + "grad_norm": 1.2078553438186646, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 106750 + }, + { + "epoch": 7.666786355475763, + "grad_norm": 0.9850115776062012, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 106760 + }, + { + "epoch": 7.667504488330341, + "grad_norm": 1.1855263710021973, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 106770 + }, + { + "epoch": 7.66822262118492, + "grad_norm": 1.3375587463378906, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 106780 + }, + { + "epoch": 7.668940754039498, + "grad_norm": 0.8773086071014404, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 106790 + }, + { + "epoch": 7.669658886894076, + "grad_norm": 1.293311595916748, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 106800 + }, + { + "epoch": 7.670377019748654, + "grad_norm": 1.1973644495010376, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 106810 + }, + { + "epoch": 7.671095152603232, + "grad_norm": 1.0847374200820923, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 106820 + }, + { + "epoch": 7.67181328545781, + "grad_norm": 0.98153156042099, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 106830 + }, + { + "epoch": 7.672531418312388, + "grad_norm": 1.049188494682312, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 106840 + }, + { + "epoch": 7.673249551166966, + "grad_norm": 1.0110270977020264, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 106850 + }, + { + "epoch": 7.673967684021544, + "grad_norm": 1.046575903892517, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 106860 + }, + { + "epoch": 7.6746858168761225, + "grad_norm": 0.9939501285552979, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 106870 + }, + { + "epoch": 7.6754039497307005, + "grad_norm": 1.1165480613708496, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 106880 + }, + { + "epoch": 7.6761220825852785, + "grad_norm": 0.8909515738487244, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 106890 + }, + { + "epoch": 7.6768402154398565, + "grad_norm": 0.99685138463974, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 106900 + }, + { + "epoch": 7.6775583482944345, + "grad_norm": 0.9978061318397522, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 106910 + }, + { + "epoch": 7.6782764811490125, + "grad_norm": 1.2148759365081787, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 106920 + }, + { + "epoch": 7.6789946140035905, + "grad_norm": 1.2721340656280518, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 106930 + }, + { + "epoch": 7.6797127468581685, + "grad_norm": 1.0458247661590576, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 106940 + }, + { + "epoch": 7.6804308797127465, + "grad_norm": 0.9900956749916077, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 106950 + }, + { + "epoch": 7.6811490125673245, + "grad_norm": 1.0812790393829346, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 106960 + }, + { + "epoch": 7.681867145421903, + "grad_norm": 1.1479923725128174, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 106970 + }, + { + "epoch": 7.682585278276481, + "grad_norm": 0.7898157238960266, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 106980 + }, + { + "epoch": 7.683303411131059, + "grad_norm": 1.4052869081497192, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 106990 + }, + { + "epoch": 7.684021543985637, + "grad_norm": 1.3122624158859253, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 107000 + }, + { + "epoch": 7.684739676840215, + "grad_norm": 1.0138102769851685, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 107010 + }, + { + "epoch": 7.685457809694793, + "grad_norm": 1.0716434717178345, + "learning_rate": 0.0002, + "loss": 0.5447, + "step": 107020 + }, + { + "epoch": 7.686175942549371, + "grad_norm": 1.2208350896835327, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 107030 + }, + { + "epoch": 7.686894075403949, + "grad_norm": 1.3777594566345215, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 107040 + }, + { + "epoch": 7.687612208258528, + "grad_norm": 1.1951156854629517, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 107050 + }, + { + "epoch": 7.688330341113106, + "grad_norm": 0.987120509147644, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 107060 + }, + { + "epoch": 7.689048473967684, + "grad_norm": 0.9455362558364868, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 107070 + }, + { + "epoch": 7.689766606822262, + "grad_norm": 0.9832291007041931, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 107080 + }, + { + "epoch": 7.69048473967684, + "grad_norm": 1.046239972114563, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 107090 + }, + { + "epoch": 7.691202872531418, + "grad_norm": 1.1121305227279663, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 107100 + }, + { + "epoch": 7.691921005385996, + "grad_norm": 1.0636173486709595, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 107110 + }, + { + "epoch": 7.692639138240574, + "grad_norm": 1.2166199684143066, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 107120 + }, + { + "epoch": 7.693357271095152, + "grad_norm": 1.0859293937683105, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 107130 + }, + { + "epoch": 7.69407540394973, + "grad_norm": 0.9719768166542053, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 107140 + }, + { + "epoch": 7.694793536804308, + "grad_norm": 1.5153313875198364, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 107150 + }, + { + "epoch": 7.695511669658887, + "grad_norm": 1.1787729263305664, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 107160 + }, + { + "epoch": 7.696229802513465, + "grad_norm": 0.9926921129226685, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 107170 + }, + { + "epoch": 7.696947935368043, + "grad_norm": 1.0670396089553833, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 107180 + }, + { + "epoch": 7.697666068222621, + "grad_norm": 1.022409200668335, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 107190 + }, + { + "epoch": 7.698384201077199, + "grad_norm": 0.9605807065963745, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 107200 + }, + { + "epoch": 7.699102333931777, + "grad_norm": 1.2187163829803467, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 107210 + }, + { + "epoch": 7.699820466786355, + "grad_norm": 1.2335593700408936, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 107220 + }, + { + "epoch": 7.700538599640933, + "grad_norm": 1.159769892692566, + "learning_rate": 0.0002, + "loss": 0.5494, + "step": 107230 + }, + { + "epoch": 7.701256732495512, + "grad_norm": 0.9486351013183594, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 107240 + }, + { + "epoch": 7.70197486535009, + "grad_norm": 1.2952953577041626, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 107250 + }, + { + "epoch": 7.702692998204668, + "grad_norm": 0.9187726974487305, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 107260 + }, + { + "epoch": 7.703411131059246, + "grad_norm": 1.0610202550888062, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 107270 + }, + { + "epoch": 7.704129263913824, + "grad_norm": 1.0553513765335083, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 107280 + }, + { + "epoch": 7.704847396768402, + "grad_norm": 1.0521212816238403, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 107290 + }, + { + "epoch": 7.70556552962298, + "grad_norm": 1.197798252105713, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 107300 + }, + { + "epoch": 7.706283662477558, + "grad_norm": 1.1656016111373901, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 107310 + }, + { + "epoch": 7.707001795332136, + "grad_norm": 1.1318942308425903, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 107320 + }, + { + "epoch": 7.707719928186714, + "grad_norm": 1.2302566766738892, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 107330 + }, + { + "epoch": 7.708438061041292, + "grad_norm": 1.2854527235031128, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 107340 + }, + { + "epoch": 7.709156193895871, + "grad_norm": 1.2395009994506836, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 107350 + }, + { + "epoch": 7.709874326750449, + "grad_norm": 1.2834311723709106, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 107360 + }, + { + "epoch": 7.710592459605027, + "grad_norm": 0.9438875317573547, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 107370 + }, + { + "epoch": 7.711310592459605, + "grad_norm": 1.2651551961898804, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 107380 + }, + { + "epoch": 7.712028725314183, + "grad_norm": 1.0880811214447021, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 107390 + }, + { + "epoch": 7.712746858168761, + "grad_norm": 1.077873706817627, + "learning_rate": 0.0002, + "loss": 0.532, + "step": 107400 + }, + { + "epoch": 7.713464991023339, + "grad_norm": 1.183581829071045, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 107410 + }, + { + "epoch": 7.714183123877917, + "grad_norm": 0.903417706489563, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 107420 + }, + { + "epoch": 7.714901256732496, + "grad_norm": 1.0142052173614502, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 107430 + }, + { + "epoch": 7.715619389587074, + "grad_norm": 1.287375807762146, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 107440 + }, + { + "epoch": 7.716337522441652, + "grad_norm": 1.036961555480957, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 107450 + }, + { + "epoch": 7.71705565529623, + "grad_norm": 1.053189992904663, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 107460 + }, + { + "epoch": 7.717773788150808, + "grad_norm": 1.0782629251480103, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 107470 + }, + { + "epoch": 7.718491921005386, + "grad_norm": 1.2815700769424438, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 107480 + }, + { + "epoch": 7.719210053859964, + "grad_norm": 1.0254477262496948, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 107490 + }, + { + "epoch": 7.719928186714542, + "grad_norm": 1.2113746404647827, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 107500 + }, + { + "epoch": 7.72064631956912, + "grad_norm": 1.1663107872009277, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 107510 + }, + { + "epoch": 7.721364452423698, + "grad_norm": 1.1120136976242065, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 107520 + }, + { + "epoch": 7.722082585278277, + "grad_norm": 0.9561337828636169, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 107530 + }, + { + "epoch": 7.722800718132855, + "grad_norm": 1.0723344087600708, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 107540 + }, + { + "epoch": 7.723518850987433, + "grad_norm": 1.1457021236419678, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 107550 + }, + { + "epoch": 7.724236983842011, + "grad_norm": 1.1626014709472656, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 107560 + }, + { + "epoch": 7.724955116696589, + "grad_norm": 1.0837032794952393, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 107570 + }, + { + "epoch": 7.725673249551167, + "grad_norm": 1.1355236768722534, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 107580 + }, + { + "epoch": 7.726391382405745, + "grad_norm": 0.9753133654594421, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 107590 + }, + { + "epoch": 7.727109515260323, + "grad_norm": 1.1424425840377808, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 107600 + }, + { + "epoch": 7.727827648114902, + "grad_norm": 0.8058976531028748, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 107610 + }, + { + "epoch": 7.72854578096948, + "grad_norm": 1.1998937129974365, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 107620 + }, + { + "epoch": 7.729263913824058, + "grad_norm": 1.0383063554763794, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 107630 + }, + { + "epoch": 7.729982046678636, + "grad_norm": 1.069886565208435, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 107640 + }, + { + "epoch": 7.730700179533214, + "grad_norm": 1.113100290298462, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 107650 + }, + { + "epoch": 7.731418312387792, + "grad_norm": 1.1166869401931763, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 107660 + }, + { + "epoch": 7.73213644524237, + "grad_norm": 1.3739103078842163, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 107670 + }, + { + "epoch": 7.732854578096948, + "grad_norm": 0.9432857036590576, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 107680 + }, + { + "epoch": 7.733572710951526, + "grad_norm": 1.0611073970794678, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 107690 + }, + { + "epoch": 7.734290843806104, + "grad_norm": 1.052598476409912, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 107700 + }, + { + "epoch": 7.735008976660682, + "grad_norm": 1.080534815788269, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 107710 + }, + { + "epoch": 7.735727109515261, + "grad_norm": 1.3288558721542358, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 107720 + }, + { + "epoch": 7.736445242369839, + "grad_norm": 1.1469939947128296, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 107730 + }, + { + "epoch": 7.737163375224417, + "grad_norm": 0.9235124588012695, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 107740 + }, + { + "epoch": 7.737881508078995, + "grad_norm": 1.2601470947265625, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 107750 + }, + { + "epoch": 7.738599640933573, + "grad_norm": 1.181703805923462, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 107760 + }, + { + "epoch": 7.739317773788151, + "grad_norm": 0.9549161195755005, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 107770 + }, + { + "epoch": 7.740035906642729, + "grad_norm": 1.078458547592163, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 107780 + }, + { + "epoch": 7.740754039497307, + "grad_norm": 1.1542205810546875, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 107790 + }, + { + "epoch": 7.741472172351886, + "grad_norm": 1.288838505744934, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 107800 + }, + { + "epoch": 7.742190305206464, + "grad_norm": 0.972050666809082, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 107810 + }, + { + "epoch": 7.742908438061042, + "grad_norm": 0.9113378524780273, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 107820 + }, + { + "epoch": 7.74362657091562, + "grad_norm": 1.207448959350586, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 107830 + }, + { + "epoch": 7.744344703770198, + "grad_norm": 1.2151618003845215, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 107840 + }, + { + "epoch": 7.745062836624776, + "grad_norm": 1.0792107582092285, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 107850 + }, + { + "epoch": 7.745780969479354, + "grad_norm": 0.9030680656433105, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 107860 + }, + { + "epoch": 7.746499102333932, + "grad_norm": 1.120816707611084, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 107870 + }, + { + "epoch": 7.74721723518851, + "grad_norm": 1.221238374710083, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 107880 + }, + { + "epoch": 7.747935368043088, + "grad_norm": 1.2627668380737305, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 107890 + }, + { + "epoch": 7.748653500897666, + "grad_norm": 1.4177098274230957, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 107900 + }, + { + "epoch": 7.7493716337522445, + "grad_norm": 1.2448033094406128, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 107910 + }, + { + "epoch": 7.7500897666068225, + "grad_norm": 1.1706769466400146, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 107920 + }, + { + "epoch": 7.7508078994614005, + "grad_norm": 0.9637128114700317, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 107930 + }, + { + "epoch": 7.7515260323159785, + "grad_norm": 1.129179835319519, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 107940 + }, + { + "epoch": 7.7522441651705565, + "grad_norm": 1.3793165683746338, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 107950 + }, + { + "epoch": 7.7529622980251345, + "grad_norm": 1.0685398578643799, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 107960 + }, + { + "epoch": 7.7536804308797125, + "grad_norm": 0.9382266998291016, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 107970 + }, + { + "epoch": 7.7543985637342905, + "grad_norm": 1.0740195512771606, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 107980 + }, + { + "epoch": 7.755116696588869, + "grad_norm": 1.292909860610962, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 107990 + }, + { + "epoch": 7.755834829443447, + "grad_norm": 1.2145541906356812, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 108000 + }, + { + "epoch": 7.756552962298025, + "grad_norm": 0.9905714988708496, + "learning_rate": 0.0002, + "loss": 0.5443, + "step": 108010 + }, + { + "epoch": 7.757271095152603, + "grad_norm": 1.1003599166870117, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 108020 + }, + { + "epoch": 7.757989228007181, + "grad_norm": 1.0429667234420776, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 108030 + }, + { + "epoch": 7.758707360861759, + "grad_norm": 0.8607417941093445, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 108040 + }, + { + "epoch": 7.759425493716337, + "grad_norm": 1.0659228563308716, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 108050 + }, + { + "epoch": 7.760143626570915, + "grad_norm": 1.0484120845794678, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 108060 + }, + { + "epoch": 7.760861759425493, + "grad_norm": 1.1236662864685059, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 108070 + }, + { + "epoch": 7.761579892280071, + "grad_norm": 1.0550786256790161, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 108080 + }, + { + "epoch": 7.76229802513465, + "grad_norm": 1.178968906402588, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 108090 + }, + { + "epoch": 7.763016157989228, + "grad_norm": 0.9117124080657959, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 108100 + }, + { + "epoch": 7.763734290843806, + "grad_norm": 1.1276684999465942, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 108110 + }, + { + "epoch": 7.764452423698384, + "grad_norm": 1.0472416877746582, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 108120 + }, + { + "epoch": 7.765170556552962, + "grad_norm": 0.8711934685707092, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 108130 + }, + { + "epoch": 7.76588868940754, + "grad_norm": 1.0953301191329956, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 108140 + }, + { + "epoch": 7.766606822262118, + "grad_norm": 1.1367015838623047, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 108150 + }, + { + "epoch": 7.767324955116696, + "grad_norm": 1.324832797050476, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 108160 + }, + { + "epoch": 7.768043087971275, + "grad_norm": 1.0333607196807861, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 108170 + }, + { + "epoch": 7.768761220825853, + "grad_norm": 1.1580414772033691, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 108180 + }, + { + "epoch": 7.769479353680431, + "grad_norm": 1.1693189144134521, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 108190 + }, + { + "epoch": 7.770197486535009, + "grad_norm": 1.0650800466537476, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 108200 + }, + { + "epoch": 7.770915619389587, + "grad_norm": 1.0890787839889526, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 108210 + }, + { + "epoch": 7.771633752244165, + "grad_norm": 1.065359115600586, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 108220 + }, + { + "epoch": 7.772351885098743, + "grad_norm": 0.864976704120636, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 108230 + }, + { + "epoch": 7.773070017953321, + "grad_norm": 0.9769368171691895, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 108240 + }, + { + "epoch": 7.773788150807899, + "grad_norm": 1.2894748449325562, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 108250 + }, + { + "epoch": 7.774506283662477, + "grad_norm": 1.1528522968292236, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 108260 + }, + { + "epoch": 7.775224416517055, + "grad_norm": 1.1542086601257324, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 108270 + }, + { + "epoch": 7.775942549371634, + "grad_norm": 1.3909233808517456, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 108280 + }, + { + "epoch": 7.776660682226212, + "grad_norm": 0.9855168461799622, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 108290 + }, + { + "epoch": 7.77737881508079, + "grad_norm": 1.0425859689712524, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 108300 + }, + { + "epoch": 7.778096947935368, + "grad_norm": 1.0025626420974731, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 108310 + }, + { + "epoch": 7.778815080789946, + "grad_norm": 1.036100149154663, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 108320 + }, + { + "epoch": 7.779533213644524, + "grad_norm": 0.9820912480354309, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 108330 + }, + { + "epoch": 7.780251346499102, + "grad_norm": 1.4552558660507202, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 108340 + }, + { + "epoch": 7.78096947935368, + "grad_norm": 1.1851739883422852, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 108350 + }, + { + "epoch": 7.781687612208259, + "grad_norm": 0.9678618311882019, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 108360 + }, + { + "epoch": 7.782405745062837, + "grad_norm": 1.052158236503601, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 108370 + }, + { + "epoch": 7.783123877917415, + "grad_norm": 0.8977556228637695, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 108380 + }, + { + "epoch": 7.783842010771993, + "grad_norm": 1.2486764192581177, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 108390 + }, + { + "epoch": 7.784560143626571, + "grad_norm": 1.020477056503296, + "learning_rate": 0.0002, + "loss": 0.553, + "step": 108400 + }, + { + "epoch": 7.785278276481149, + "grad_norm": 1.1957271099090576, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 108410 + }, + { + "epoch": 7.785996409335727, + "grad_norm": 1.0586557388305664, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 108420 + }, + { + "epoch": 7.786714542190305, + "grad_norm": 0.8806754946708679, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 108430 + }, + { + "epoch": 7.787432675044883, + "grad_norm": 1.0272849798202515, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 108440 + }, + { + "epoch": 7.788150807899461, + "grad_norm": 1.052829623222351, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 108450 + }, + { + "epoch": 7.788868940754039, + "grad_norm": 1.276508092880249, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 108460 + }, + { + "epoch": 7.789587073608618, + "grad_norm": 0.9878475069999695, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 108470 + }, + { + "epoch": 7.790305206463196, + "grad_norm": 0.9568123817443848, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 108480 + }, + { + "epoch": 7.791023339317774, + "grad_norm": 1.097121238708496, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 108490 + }, + { + "epoch": 7.791741472172352, + "grad_norm": 1.188984751701355, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 108500 + }, + { + "epoch": 7.79245960502693, + "grad_norm": 0.9185505509376526, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 108510 + }, + { + "epoch": 7.793177737881508, + "grad_norm": 0.9427091479301453, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 108520 + }, + { + "epoch": 7.793895870736086, + "grad_norm": 1.0734131336212158, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 108530 + }, + { + "epoch": 7.794614003590664, + "grad_norm": 1.1126554012298584, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 108540 + }, + { + "epoch": 7.795332136445243, + "grad_norm": 1.1394606828689575, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 108550 + }, + { + "epoch": 7.796050269299821, + "grad_norm": 0.9328436851501465, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 108560 + }, + { + "epoch": 7.796768402154399, + "grad_norm": 1.1082807779312134, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 108570 + }, + { + "epoch": 7.797486535008977, + "grad_norm": 1.1107451915740967, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 108580 + }, + { + "epoch": 7.798204667863555, + "grad_norm": 1.1145843267440796, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 108590 + }, + { + "epoch": 7.798922800718133, + "grad_norm": 0.9881244897842407, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 108600 + }, + { + "epoch": 7.799640933572711, + "grad_norm": 1.022754192352295, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 108610 + }, + { + "epoch": 7.800359066427289, + "grad_norm": 1.197089672088623, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 108620 + }, + { + "epoch": 7.801077199281867, + "grad_norm": 1.0599340200424194, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 108630 + }, + { + "epoch": 7.801795332136445, + "grad_norm": 1.1776701211929321, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 108640 + }, + { + "epoch": 7.802513464991024, + "grad_norm": 0.9674487709999084, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 108650 + }, + { + "epoch": 7.803231597845602, + "grad_norm": 0.9964252710342407, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 108660 + }, + { + "epoch": 7.80394973070018, + "grad_norm": 1.0302894115447998, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 108670 + }, + { + "epoch": 7.804667863554758, + "grad_norm": 1.3224111795425415, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 108680 + }, + { + "epoch": 7.805385996409336, + "grad_norm": 1.2263908386230469, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 108690 + }, + { + "epoch": 7.806104129263914, + "grad_norm": 1.3223700523376465, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 108700 + }, + { + "epoch": 7.806822262118492, + "grad_norm": 1.0767865180969238, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 108710 + }, + { + "epoch": 7.80754039497307, + "grad_norm": 1.0822714567184448, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 108720 + }, + { + "epoch": 7.808258527827648, + "grad_norm": 1.2550771236419678, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 108730 + }, + { + "epoch": 7.808976660682227, + "grad_norm": 1.0170459747314453, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 108740 + }, + { + "epoch": 7.809694793536805, + "grad_norm": 1.1515722274780273, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 108750 + }, + { + "epoch": 7.810412926391383, + "grad_norm": 1.327756643295288, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 108760 + }, + { + "epoch": 7.811131059245961, + "grad_norm": 1.0545963048934937, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 108770 + }, + { + "epoch": 7.811849192100539, + "grad_norm": 1.0827748775482178, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 108780 + }, + { + "epoch": 7.812567324955117, + "grad_norm": 1.010693073272705, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 108790 + }, + { + "epoch": 7.813285457809695, + "grad_norm": 1.2254958152770996, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 108800 + }, + { + "epoch": 7.814003590664273, + "grad_norm": 0.9775252938270569, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 108810 + }, + { + "epoch": 7.814721723518851, + "grad_norm": 0.9968659281730652, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 108820 + }, + { + "epoch": 7.815439856373429, + "grad_norm": 0.9968136548995972, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 108830 + }, + { + "epoch": 7.8161579892280075, + "grad_norm": 1.0271786451339722, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 108840 + }, + { + "epoch": 7.8168761220825855, + "grad_norm": 1.332309603691101, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 108850 + }, + { + "epoch": 7.8175942549371635, + "grad_norm": 1.2836099863052368, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 108860 + }, + { + "epoch": 7.8183123877917415, + "grad_norm": 0.9816291332244873, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 108870 + }, + { + "epoch": 7.8190305206463195, + "grad_norm": 1.1243056058883667, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 108880 + }, + { + "epoch": 7.8197486535008975, + "grad_norm": 1.2360351085662842, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 108890 + }, + { + "epoch": 7.8204667863554755, + "grad_norm": 1.2734822034835815, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 108900 + }, + { + "epoch": 7.8211849192100535, + "grad_norm": 1.2423732280731201, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 108910 + }, + { + "epoch": 7.821903052064632, + "grad_norm": 0.969839334487915, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 108920 + }, + { + "epoch": 7.82262118491921, + "grad_norm": 1.1603267192840576, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 108930 + }, + { + "epoch": 7.823339317773788, + "grad_norm": 1.1748993396759033, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 108940 + }, + { + "epoch": 7.824057450628366, + "grad_norm": 1.246304988861084, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 108950 + }, + { + "epoch": 7.824775583482944, + "grad_norm": 0.9472703337669373, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 108960 + }, + { + "epoch": 7.825493716337522, + "grad_norm": 1.22053062915802, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 108970 + }, + { + "epoch": 7.8262118491921004, + "grad_norm": 1.0310567617416382, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 108980 + }, + { + "epoch": 7.8269299820466784, + "grad_norm": 1.1211191415786743, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 108990 + }, + { + "epoch": 7.8276481149012564, + "grad_norm": 0.9057613015174866, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 109000 + }, + { + "epoch": 7.8283662477558345, + "grad_norm": 1.0615124702453613, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 109010 + }, + { + "epoch": 7.8290843806104125, + "grad_norm": 0.9669250845909119, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 109020 + }, + { + "epoch": 7.829802513464991, + "grad_norm": 1.1100435256958008, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 109030 + }, + { + "epoch": 7.830520646319569, + "grad_norm": 1.2583600282669067, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 109040 + }, + { + "epoch": 7.831238779174147, + "grad_norm": 1.228148102760315, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 109050 + }, + { + "epoch": 7.831956912028725, + "grad_norm": 1.0673317909240723, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 109060 + }, + { + "epoch": 7.832675044883303, + "grad_norm": 1.169648289680481, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 109070 + }, + { + "epoch": 7.833393177737881, + "grad_norm": 1.0065253973007202, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 109080 + }, + { + "epoch": 7.834111310592459, + "grad_norm": 1.1310595273971558, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 109090 + }, + { + "epoch": 7.834829443447037, + "grad_norm": 0.9469314217567444, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 109100 + }, + { + "epoch": 7.835547576301616, + "grad_norm": 1.1143816709518433, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 109110 + }, + { + "epoch": 7.836265709156194, + "grad_norm": 1.0617737770080566, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 109120 + }, + { + "epoch": 7.836983842010772, + "grad_norm": 1.0489295721054077, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 109130 + }, + { + "epoch": 7.83770197486535, + "grad_norm": 1.2900800704956055, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 109140 + }, + { + "epoch": 7.838420107719928, + "grad_norm": 1.1539736986160278, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 109150 + }, + { + "epoch": 7.839138240574506, + "grad_norm": 1.0503592491149902, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 109160 + }, + { + "epoch": 7.839856373429084, + "grad_norm": 1.134155035018921, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 109170 + }, + { + "epoch": 7.840574506283662, + "grad_norm": 1.042429804801941, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 109180 + }, + { + "epoch": 7.84129263913824, + "grad_norm": 1.0549449920654297, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 109190 + }, + { + "epoch": 7.842010771992818, + "grad_norm": 0.9603164196014404, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 109200 + }, + { + "epoch": 7.842728904847397, + "grad_norm": 1.3291586637496948, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 109210 + }, + { + "epoch": 7.843447037701975, + "grad_norm": 0.7739448547363281, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 109220 + }, + { + "epoch": 7.844165170556553, + "grad_norm": 1.0020095109939575, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 109230 + }, + { + "epoch": 7.844883303411131, + "grad_norm": 0.9480768442153931, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 109240 + }, + { + "epoch": 7.845601436265709, + "grad_norm": 1.0376673936843872, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 109250 + }, + { + "epoch": 7.846319569120287, + "grad_norm": 0.9776299595832825, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 109260 + }, + { + "epoch": 7.847037701974865, + "grad_norm": 1.0477584600448608, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 109270 + }, + { + "epoch": 7.847755834829443, + "grad_norm": 1.162746548652649, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 109280 + }, + { + "epoch": 7.848473967684021, + "grad_norm": 1.0150725841522217, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 109290 + }, + { + "epoch": 7.8491921005386, + "grad_norm": 1.0144163370132446, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 109300 + }, + { + "epoch": 7.849910233393178, + "grad_norm": 0.9614455103874207, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 109310 + }, + { + "epoch": 7.850628366247756, + "grad_norm": 1.223591685295105, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 109320 + }, + { + "epoch": 7.851346499102334, + "grad_norm": 1.149753212928772, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 109330 + }, + { + "epoch": 7.852064631956912, + "grad_norm": 0.8418117165565491, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 109340 + }, + { + "epoch": 7.85278276481149, + "grad_norm": 1.3950735330581665, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 109350 + }, + { + "epoch": 7.853500897666068, + "grad_norm": 1.315022587776184, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 109360 + }, + { + "epoch": 7.854219030520646, + "grad_norm": 0.9699475765228271, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 109370 + }, + { + "epoch": 7.854937163375224, + "grad_norm": 1.0460443496704102, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 109380 + }, + { + "epoch": 7.855655296229802, + "grad_norm": 1.0051870346069336, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 109390 + }, + { + "epoch": 7.856373429084381, + "grad_norm": 1.1087634563446045, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 109400 + }, + { + "epoch": 7.857091561938959, + "grad_norm": 1.0926934480667114, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 109410 + }, + { + "epoch": 7.857809694793537, + "grad_norm": 0.9953354597091675, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 109420 + }, + { + "epoch": 7.858527827648115, + "grad_norm": 1.170961856842041, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 109430 + }, + { + "epoch": 7.859245960502693, + "grad_norm": 1.2087738513946533, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 109440 + }, + { + "epoch": 7.859964093357271, + "grad_norm": 0.969118595123291, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 109450 + }, + { + "epoch": 7.860682226211849, + "grad_norm": 1.2040046453475952, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 109460 + }, + { + "epoch": 7.861400359066427, + "grad_norm": 0.9882297515869141, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 109470 + }, + { + "epoch": 7.862118491921006, + "grad_norm": 1.0635188817977905, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 109480 + }, + { + "epoch": 7.862836624775584, + "grad_norm": 1.174045205116272, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 109490 + }, + { + "epoch": 7.863554757630162, + "grad_norm": 0.9702258706092834, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 109500 + }, + { + "epoch": 7.86427289048474, + "grad_norm": 0.8843887448310852, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 109510 + }, + { + "epoch": 7.864991023339318, + "grad_norm": 0.961931049823761, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 109520 + }, + { + "epoch": 7.865709156193896, + "grad_norm": 0.9497876763343811, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 109530 + }, + { + "epoch": 7.866427289048474, + "grad_norm": 1.0348241329193115, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 109540 + }, + { + "epoch": 7.867145421903052, + "grad_norm": 1.0796928405761719, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 109550 + }, + { + "epoch": 7.86786355475763, + "grad_norm": 1.2193728685379028, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 109560 + }, + { + "epoch": 7.868581687612208, + "grad_norm": 0.8161213994026184, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 109570 + }, + { + "epoch": 7.869299820466786, + "grad_norm": 1.062281608581543, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 109580 + }, + { + "epoch": 7.870017953321365, + "grad_norm": 1.0982999801635742, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 109590 + }, + { + "epoch": 7.870736086175943, + "grad_norm": 1.057931661605835, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 109600 + }, + { + "epoch": 7.871454219030521, + "grad_norm": 1.1201120615005493, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 109610 + }, + { + "epoch": 7.872172351885099, + "grad_norm": 1.2803348302841187, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 109620 + }, + { + "epoch": 7.872890484739677, + "grad_norm": 1.1370888948440552, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 109630 + }, + { + "epoch": 7.873608617594255, + "grad_norm": 1.1025199890136719, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 109640 + }, + { + "epoch": 7.874326750448833, + "grad_norm": 0.9794017672538757, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 109650 + }, + { + "epoch": 7.875044883303411, + "grad_norm": 1.0693902969360352, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 109660 + }, + { + "epoch": 7.87576301615799, + "grad_norm": 1.1972219944000244, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 109670 + }, + { + "epoch": 7.876481149012568, + "grad_norm": 1.5061790943145752, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 109680 + }, + { + "epoch": 7.877199281867146, + "grad_norm": 1.194033145904541, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 109690 + }, + { + "epoch": 7.877917414721724, + "grad_norm": 1.1381443738937378, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 109700 + }, + { + "epoch": 7.878635547576302, + "grad_norm": 1.1147687435150146, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 109710 + }, + { + "epoch": 7.87935368043088, + "grad_norm": 1.0469177961349487, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 109720 + }, + { + "epoch": 7.880071813285458, + "grad_norm": 1.066167950630188, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 109730 + }, + { + "epoch": 7.880789946140036, + "grad_norm": 1.1696351766586304, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 109740 + }, + { + "epoch": 7.881508078994614, + "grad_norm": 1.0112557411193848, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 109750 + }, + { + "epoch": 7.882226211849192, + "grad_norm": 1.0896331071853638, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 109760 + }, + { + "epoch": 7.88294434470377, + "grad_norm": 1.1275625228881836, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 109770 + }, + { + "epoch": 7.883662477558349, + "grad_norm": 0.859959602355957, + "learning_rate": 0.0002, + "loss": 0.5248, + "step": 109780 + }, + { + "epoch": 7.884380610412927, + "grad_norm": 1.1432042121887207, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 109790 + }, + { + "epoch": 7.885098743267505, + "grad_norm": 1.0156069993972778, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 109800 + }, + { + "epoch": 7.885816876122083, + "grad_norm": 0.8594014048576355, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 109810 + }, + { + "epoch": 7.886535008976661, + "grad_norm": 0.8861605525016785, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 109820 + }, + { + "epoch": 7.887253141831239, + "grad_norm": 0.9504907131195068, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 109830 + }, + { + "epoch": 7.887971274685817, + "grad_norm": 1.0248312950134277, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 109840 + }, + { + "epoch": 7.888689407540395, + "grad_norm": 1.1179074048995972, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 109850 + }, + { + "epoch": 7.8894075403949735, + "grad_norm": 0.9005255103111267, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 109860 + }, + { + "epoch": 7.8901256732495515, + "grad_norm": 1.0487693548202515, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 109870 + }, + { + "epoch": 7.8908438061041295, + "grad_norm": 1.2038270235061646, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 109880 + }, + { + "epoch": 7.8915619389587075, + "grad_norm": 0.9288236498832703, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 109890 + }, + { + "epoch": 7.8922800718132855, + "grad_norm": 0.959175169467926, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 109900 + }, + { + "epoch": 7.8929982046678635, + "grad_norm": 0.9703200459480286, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 109910 + }, + { + "epoch": 7.8937163375224415, + "grad_norm": 1.2670199871063232, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 109920 + }, + { + "epoch": 7.8944344703770195, + "grad_norm": 1.3127061128616333, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 109930 + }, + { + "epoch": 7.8951526032315975, + "grad_norm": 1.072664737701416, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 109940 + }, + { + "epoch": 7.8958707360861755, + "grad_norm": 1.0517730712890625, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 109950 + }, + { + "epoch": 7.896588868940754, + "grad_norm": 0.8665887713432312, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 109960 + }, + { + "epoch": 7.897307001795332, + "grad_norm": 1.2894970178604126, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 109970 + }, + { + "epoch": 7.89802513464991, + "grad_norm": 1.1201982498168945, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 109980 + }, + { + "epoch": 7.898743267504488, + "grad_norm": 1.0165940523147583, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 109990 + }, + { + "epoch": 7.899461400359066, + "grad_norm": 1.1439729928970337, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 110000 + }, + { + "epoch": 7.900179533213644, + "grad_norm": 1.0404242277145386, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 110010 + }, + { + "epoch": 7.900897666068222, + "grad_norm": 1.015904426574707, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 110020 + }, + { + "epoch": 7.9016157989228, + "grad_norm": 1.1397117376327515, + "learning_rate": 0.0002, + "loss": 0.5395, + "step": 110030 + }, + { + "epoch": 7.902333931777379, + "grad_norm": 1.5121701955795288, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 110040 + }, + { + "epoch": 7.903052064631957, + "grad_norm": 1.1664289236068726, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 110050 + }, + { + "epoch": 7.903770197486535, + "grad_norm": 1.1808925867080688, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 110060 + }, + { + "epoch": 7.904488330341113, + "grad_norm": 0.997465968132019, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 110070 + }, + { + "epoch": 7.905206463195691, + "grad_norm": 1.164481520652771, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 110080 + }, + { + "epoch": 7.905924596050269, + "grad_norm": 1.3008257150650024, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 110090 + }, + { + "epoch": 7.906642728904847, + "grad_norm": 1.067894697189331, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 110100 + }, + { + "epoch": 7.907360861759425, + "grad_norm": 1.0160772800445557, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 110110 + }, + { + "epoch": 7.908078994614003, + "grad_norm": 1.0485782623291016, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 110120 + }, + { + "epoch": 7.908797127468581, + "grad_norm": 1.2126682996749878, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 110130 + }, + { + "epoch": 7.909515260323159, + "grad_norm": 1.124619722366333, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 110140 + }, + { + "epoch": 7.910233393177738, + "grad_norm": 1.1250736713409424, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 110150 + }, + { + "epoch": 7.910951526032316, + "grad_norm": 0.9558429718017578, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 110160 + }, + { + "epoch": 7.911669658886894, + "grad_norm": 1.1605639457702637, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 110170 + }, + { + "epoch": 7.912387791741472, + "grad_norm": 1.4227420091629028, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 110180 + }, + { + "epoch": 7.91310592459605, + "grad_norm": 1.1452029943466187, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 110190 + }, + { + "epoch": 7.913824057450628, + "grad_norm": 0.9975438714027405, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 110200 + }, + { + "epoch": 7.914542190305206, + "grad_norm": 1.0418251752853394, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 110210 + }, + { + "epoch": 7.915260323159784, + "grad_norm": 1.2578071355819702, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 110220 + }, + { + "epoch": 7.915978456014363, + "grad_norm": 0.9857864379882812, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 110230 + }, + { + "epoch": 7.916696588868941, + "grad_norm": 1.2045122385025024, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 110240 + }, + { + "epoch": 7.917414721723519, + "grad_norm": 1.0540096759796143, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 110250 + }, + { + "epoch": 7.918132854578097, + "grad_norm": 1.3578428030014038, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 110260 + }, + { + "epoch": 7.918850987432675, + "grad_norm": 1.1917411088943481, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 110270 + }, + { + "epoch": 7.919569120287253, + "grad_norm": 0.953195333480835, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 110280 + }, + { + "epoch": 7.920287253141831, + "grad_norm": 1.060767650604248, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 110290 + }, + { + "epoch": 7.921005385996409, + "grad_norm": 1.0920186042785645, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 110300 + }, + { + "epoch": 7.921723518850987, + "grad_norm": 1.0263668298721313, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 110310 + }, + { + "epoch": 7.922441651705565, + "grad_norm": 1.0305999517440796, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 110320 + }, + { + "epoch": 7.923159784560143, + "grad_norm": 1.2554773092269897, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 110330 + }, + { + "epoch": 7.923877917414722, + "grad_norm": 1.1688004732131958, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 110340 + }, + { + "epoch": 7.9245960502693, + "grad_norm": 0.996721625328064, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 110350 + }, + { + "epoch": 7.925314183123878, + "grad_norm": 1.000508427619934, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 110360 + }, + { + "epoch": 7.926032315978456, + "grad_norm": 1.0895634889602661, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 110370 + }, + { + "epoch": 7.926750448833034, + "grad_norm": 0.9376350045204163, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 110380 + }, + { + "epoch": 7.927468581687612, + "grad_norm": 0.9476872086524963, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 110390 + }, + { + "epoch": 7.92818671454219, + "grad_norm": 1.142225742340088, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 110400 + }, + { + "epoch": 7.928904847396768, + "grad_norm": 1.2613552808761597, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 110410 + }, + { + "epoch": 7.929622980251347, + "grad_norm": 1.0425217151641846, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 110420 + }, + { + "epoch": 7.930341113105925, + "grad_norm": 1.1250224113464355, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 110430 + }, + { + "epoch": 7.931059245960503, + "grad_norm": 1.1487616300582886, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 110440 + }, + { + "epoch": 7.931777378815081, + "grad_norm": 1.009817123413086, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 110450 + }, + { + "epoch": 7.932495511669659, + "grad_norm": 1.0866706371307373, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 110460 + }, + { + "epoch": 7.933213644524237, + "grad_norm": 0.9821379780769348, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 110470 + }, + { + "epoch": 7.933931777378815, + "grad_norm": 1.042220115661621, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 110480 + }, + { + "epoch": 7.934649910233393, + "grad_norm": 1.018154263496399, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 110490 + }, + { + "epoch": 7.935368043087971, + "grad_norm": 1.0129317045211792, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 110500 + }, + { + "epoch": 7.936086175942549, + "grad_norm": 1.0918302536010742, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 110510 + }, + { + "epoch": 7.936804308797128, + "grad_norm": 1.3739500045776367, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 110520 + }, + { + "epoch": 7.937522441651706, + "grad_norm": 0.9313759803771973, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 110530 + }, + { + "epoch": 7.938240574506284, + "grad_norm": 1.0325546264648438, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 110540 + }, + { + "epoch": 7.938958707360862, + "grad_norm": 1.0858685970306396, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 110550 + }, + { + "epoch": 7.93967684021544, + "grad_norm": 0.9607970118522644, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 110560 + }, + { + "epoch": 7.940394973070018, + "grad_norm": 1.2014137506484985, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 110570 + }, + { + "epoch": 7.941113105924596, + "grad_norm": 1.0917125940322876, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 110580 + }, + { + "epoch": 7.941831238779174, + "grad_norm": 1.0328655242919922, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 110590 + }, + { + "epoch": 7.942549371633753, + "grad_norm": 0.9071711897850037, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 110600 + }, + { + "epoch": 7.943267504488331, + "grad_norm": 1.0363129377365112, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 110610 + }, + { + "epoch": 7.943985637342909, + "grad_norm": 1.1908930540084839, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 110620 + }, + { + "epoch": 7.944703770197487, + "grad_norm": 1.1436357498168945, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 110630 + }, + { + "epoch": 7.945421903052065, + "grad_norm": 1.2671914100646973, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 110640 + }, + { + "epoch": 7.946140035906643, + "grad_norm": 1.0665358304977417, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 110650 + }, + { + "epoch": 7.946858168761221, + "grad_norm": 1.065150499343872, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 110660 + }, + { + "epoch": 7.947576301615799, + "grad_norm": 1.3114454746246338, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 110670 + }, + { + "epoch": 7.948294434470377, + "grad_norm": 1.439401388168335, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 110680 + }, + { + "epoch": 7.949012567324955, + "grad_norm": 1.0176633596420288, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 110690 + }, + { + "epoch": 7.949730700179533, + "grad_norm": 1.2536396980285645, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 110700 + }, + { + "epoch": 7.950448833034112, + "grad_norm": 1.1297016143798828, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 110710 + }, + { + "epoch": 7.95116696588869, + "grad_norm": 0.9819521307945251, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 110720 + }, + { + "epoch": 7.951885098743268, + "grad_norm": 1.0327529907226562, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 110730 + }, + { + "epoch": 7.952603231597846, + "grad_norm": 1.003000259399414, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 110740 + }, + { + "epoch": 7.953321364452424, + "grad_norm": 0.9818766117095947, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 110750 + }, + { + "epoch": 7.954039497307002, + "grad_norm": 1.1950650215148926, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 110760 + }, + { + "epoch": 7.95475763016158, + "grad_norm": 1.1700283288955688, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 110770 + }, + { + "epoch": 7.955475763016158, + "grad_norm": 0.8310879468917847, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 110780 + }, + { + "epoch": 7.9561938958707366, + "grad_norm": 1.3428716659545898, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 110790 + }, + { + "epoch": 7.956912028725315, + "grad_norm": 1.2581387758255005, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 110800 + }, + { + "epoch": 7.957630161579893, + "grad_norm": 1.0624088048934937, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 110810 + }, + { + "epoch": 7.958348294434471, + "grad_norm": 1.0604743957519531, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 110820 + }, + { + "epoch": 7.959066427289049, + "grad_norm": 1.3024394512176514, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 110830 + }, + { + "epoch": 7.959784560143627, + "grad_norm": 0.9976829886436462, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 110840 + }, + { + "epoch": 7.960502692998205, + "grad_norm": 1.2092949151992798, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 110850 + }, + { + "epoch": 7.961220825852783, + "grad_norm": 1.0752426385879517, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 110860 + }, + { + "epoch": 7.961938958707361, + "grad_norm": 0.9072325229644775, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 110870 + }, + { + "epoch": 7.962657091561939, + "grad_norm": 1.1252259016036987, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 110880 + }, + { + "epoch": 7.963375224416517, + "grad_norm": 1.002448558807373, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 110890 + }, + { + "epoch": 7.9640933572710955, + "grad_norm": 0.9354956150054932, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 110900 + }, + { + "epoch": 7.9648114901256735, + "grad_norm": 1.1560840606689453, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 110910 + }, + { + "epoch": 7.9655296229802515, + "grad_norm": 1.169173240661621, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 110920 + }, + { + "epoch": 7.9662477558348295, + "grad_norm": 1.169741153717041, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 110930 + }, + { + "epoch": 7.9669658886894075, + "grad_norm": 1.092739224433899, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 110940 + }, + { + "epoch": 7.9676840215439855, + "grad_norm": 0.901034414768219, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 110950 + }, + { + "epoch": 7.9684021543985635, + "grad_norm": 1.1143959760665894, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 110960 + }, + { + "epoch": 7.9691202872531415, + "grad_norm": 1.1839512586593628, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 110970 + }, + { + "epoch": 7.96983842010772, + "grad_norm": 0.9340457320213318, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 110980 + }, + { + "epoch": 7.970556552962298, + "grad_norm": 1.0368584394454956, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 110990 + }, + { + "epoch": 7.971274685816876, + "grad_norm": 1.0153379440307617, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 111000 + }, + { + "epoch": 7.971992818671454, + "grad_norm": 1.0815552473068237, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 111010 + }, + { + "epoch": 7.972710951526032, + "grad_norm": 1.0502792596817017, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 111020 + }, + { + "epoch": 7.97342908438061, + "grad_norm": 1.3402234315872192, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 111030 + }, + { + "epoch": 7.974147217235188, + "grad_norm": 1.155196189880371, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 111040 + }, + { + "epoch": 7.974865350089766, + "grad_norm": 1.2841416597366333, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 111050 + }, + { + "epoch": 7.975583482944344, + "grad_norm": 1.1467466354370117, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 111060 + }, + { + "epoch": 7.976301615798922, + "grad_norm": 1.1308223009109497, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 111070 + }, + { + "epoch": 7.977019748653501, + "grad_norm": 1.0641266107559204, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 111080 + }, + { + "epoch": 7.977737881508079, + "grad_norm": 1.0808128118515015, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 111090 + }, + { + "epoch": 7.978456014362657, + "grad_norm": 1.2631522417068481, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 111100 + }, + { + "epoch": 7.979174147217235, + "grad_norm": 1.1176106929779053, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 111110 + }, + { + "epoch": 7.979892280071813, + "grad_norm": 1.183842658996582, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 111120 + }, + { + "epoch": 7.980610412926391, + "grad_norm": 0.9207148551940918, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 111130 + }, + { + "epoch": 7.981328545780969, + "grad_norm": 1.314513087272644, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 111140 + }, + { + "epoch": 7.982046678635547, + "grad_norm": 1.4508297443389893, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 111150 + }, + { + "epoch": 7.982764811490125, + "grad_norm": 1.1941379308700562, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 111160 + }, + { + "epoch": 7.983482944344704, + "grad_norm": 1.0326071977615356, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 111170 + }, + { + "epoch": 7.984201077199282, + "grad_norm": 1.1843258142471313, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 111180 + }, + { + "epoch": 7.98491921005386, + "grad_norm": 0.98868727684021, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 111190 + }, + { + "epoch": 7.985637342908438, + "grad_norm": 1.0722097158432007, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 111200 + }, + { + "epoch": 7.986355475763016, + "grad_norm": 1.254882574081421, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 111210 + }, + { + "epoch": 7.987073608617594, + "grad_norm": 1.1299649477005005, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 111220 + }, + { + "epoch": 7.987791741472172, + "grad_norm": 1.0343568325042725, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 111230 + }, + { + "epoch": 7.98850987432675, + "grad_norm": 1.173403024673462, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 111240 + }, + { + "epoch": 7.989228007181328, + "grad_norm": 1.2749351263046265, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 111250 + }, + { + "epoch": 7.989946140035906, + "grad_norm": 1.1579365730285645, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 111260 + }, + { + "epoch": 7.990664272890485, + "grad_norm": 1.2069926261901855, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 111270 + }, + { + "epoch": 7.991382405745063, + "grad_norm": 1.1962283849716187, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 111280 + }, + { + "epoch": 7.992100538599641, + "grad_norm": 0.9776540398597717, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 111290 + }, + { + "epoch": 7.992818671454219, + "grad_norm": 0.9829531311988831, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 111300 + }, + { + "epoch": 7.993536804308797, + "grad_norm": 1.3035449981689453, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 111310 + }, + { + "epoch": 7.994254937163375, + "grad_norm": 1.3423140048980713, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 111320 + }, + { + "epoch": 7.994973070017953, + "grad_norm": 1.1216566562652588, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 111330 + }, + { + "epoch": 7.995691202872531, + "grad_norm": 1.0143498182296753, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 111340 + }, + { + "epoch": 7.99640933572711, + "grad_norm": 1.0691397190093994, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 111350 + }, + { + "epoch": 7.997127468581688, + "grad_norm": 1.3484272956848145, + "learning_rate": 0.0002, + "loss": 0.5412, + "step": 111360 + }, + { + "epoch": 7.997845601436266, + "grad_norm": 0.9939428567886353, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 111370 + }, + { + "epoch": 7.998563734290844, + "grad_norm": 1.0009615421295166, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 111380 + }, + { + "epoch": 7.999281867145422, + "grad_norm": 0.986566424369812, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 111390 + }, + { + "epoch": 8.0, + "grad_norm": 0.9135745167732239, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 111400 + }, + { + "epoch": 8.0, + "eval_loss": 1.1793164014816284, + "eval_runtime": 55.1651, + "eval_samples_per_second": 13.287, + "eval_steps_per_second": 1.668, + "step": 111400 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.155342565297357e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-111400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5f2903e0143529ba3dac5eda041ffd6dcb63724 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1845a0b074e2c313c9e5d8df2d7d2ef41c53740ac7fc8fbfe3d314f315337229 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d07835765f221d2c77dae57ebb217c2e91de1d25 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541e1136f4d2f630646bf90a2bf81d8e01b0f3123b3545d5458dc227dc336ede +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cbbfb6d1296f51b4eca2322e4a28b6ddc31b3cd6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beba1b35a46c216251bfa76d806db7b3f39d8ce6e65dcf693c24a0c2a0631577 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..614f37e4e6a8066459ce44ada5e306c378c76fd3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5afe9078e078a9972fcbff2a2fd4e3a3f2b61c05ccf3c629b5240c27f37b346b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..25383a96e31060eb248e1515cef06007c2e09669 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/trainer_state.json @@ -0,0 +1,9785 @@ +{ + "best_metric": 1.09147310256958, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 13925, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.444178206621696e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d87ec1347107e335c324ad5fb5c049217911e3f6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825fda2e0f779accd405aa421ad4f67319e2d7a0b9107c0e455fda1229924f5b +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ba57f3d75c2349e68487dba99eb2b07276643e8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ceb28c0f8732b2dbd9a7c0c46e53a31b04aacf7c4b2cb159c079d400f2ca62 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..099c0898e7f8b127671fa4f7b06cfb86f592885b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4dfa5c73c7b3d480a97eed94a4bcb1c8c5a27593e6bca4b6b579689882861d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..54fda3d02f9c7451e41f44e034c11d17d9d56e1e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:002f2b66ea6105266eb195da2b691e5361abf426954e2fbb7546ff4235aafa14 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f4d3a84f1ccfe8fc52e469dc738d352f3ba8f60f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/trainer_state.json @@ -0,0 +1,19544 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 27850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2888356413243392e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8fb55ef495680508d795976446a205457e9485cd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1872d8306c5b10f9d181b835c81cf47e0494389e3f3c06565b8bdda06555d7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..47415aa2329b7180b8cc306980100f479b89d06e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:617fce82686e17f86c32c139bf748f1e16deffefe05fb1a0af1908120b34eaa4 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b6e19ee3056d880d063895bdb7f151ebd9ba594 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87f21e61aed5de04752effe467bd414f3e01e768e38f1a04f3807db63af6159d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c5b6859245e10a01b6b318823624f0db4cc3902 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190354344b8ee055ba999d32c782cc3147c7b3e3ae8cc7be34ea93d579899b30 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..77ead3a3f88603ea5cdbbd009c1fbdbd4870aa64 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/trainer_state.json @@ -0,0 +1,29296 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 41775, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9332534619865088e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-41775/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a187658ce380572d4a6ec24c1b38d09202dd5ab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9859570c18229c472a3fdb3986a6a537e197e8e83a8b73f475a2b0662afc5e0e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..778fbcb27e48ec78a8bd0f9dc22fbb3c2f706380 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ae1c055702f3dbaf6c1458354e108c9289c1f111ae4f2fd3940c2c22d2a877 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9c82f0657325bc599030aa06b37a83ffd1fcdcd0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1ad2ee1f312e183971226dfba256e44f680ba011cb7aad1016406ecd0e3135 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e237727ed75ce600ed8ce03f83f50a4b5c06476 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43b003ea16ad12b68d24e3e8c4c364d6bd43d9b7984fd004e9b6465103caa838 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0da8bdba777af6f6149b4b32595df124bdc97e65 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/trainer_state.json @@ -0,0 +1,39055 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 55700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + }, + { + "epoch": 3.000359066427289, + "grad_norm": 0.8630077838897705, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 41780 + }, + { + "epoch": 3.001077199281867, + "grad_norm": 0.8901806473731995, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 41790 + }, + { + "epoch": 3.0017953321364454, + "grad_norm": 0.8291767835617065, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 41800 + }, + { + "epoch": 3.0025134649910235, + "grad_norm": 0.792519211769104, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 3.0032315978456015, + "grad_norm": 1.1330063343048096, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 41820 + }, + { + "epoch": 3.0039497307001795, + "grad_norm": 0.9401350617408752, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 41830 + }, + { + "epoch": 3.0046678635547575, + "grad_norm": 0.8065463304519653, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 41840 + }, + { + "epoch": 3.005385996409336, + "grad_norm": 0.8309979438781738, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 41850 + }, + { + "epoch": 3.006104129263914, + "grad_norm": 0.7432689070701599, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 41860 + }, + { + "epoch": 3.006822262118492, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 41870 + }, + { + "epoch": 3.00754039497307, + "grad_norm": 1.4364255666732788, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 41880 + }, + { + "epoch": 3.008258527827648, + "grad_norm": 0.9023072123527527, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 41890 + }, + { + "epoch": 3.0089766606822264, + "grad_norm": 0.7790587544441223, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 41900 + }, + { + "epoch": 3.0096947935368044, + "grad_norm": 0.9163706302642822, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 41910 + }, + { + "epoch": 3.0104129263913824, + "grad_norm": 0.8147963285446167, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 41920 + }, + { + "epoch": 3.0111310592459604, + "grad_norm": 0.8432748913764954, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 41930 + }, + { + "epoch": 3.011849192100539, + "grad_norm": 0.9216182231903076, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 41940 + }, + { + "epoch": 3.012567324955117, + "grad_norm": 0.62154221534729, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 41950 + }, + { + "epoch": 3.013285457809695, + "grad_norm": 0.8902392387390137, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 41960 + }, + { + "epoch": 3.014003590664273, + "grad_norm": 0.9601083993911743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 41970 + }, + { + "epoch": 3.014721723518851, + "grad_norm": 0.8938809037208557, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 41980 + }, + { + "epoch": 3.0154398563734293, + "grad_norm": 1.0621999502182007, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 41990 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 0.7310585379600525, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 42000 + }, + { + "epoch": 3.0168761220825853, + "grad_norm": 0.8475853800773621, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 42010 + }, + { + "epoch": 3.0175942549371633, + "grad_norm": 0.8509864807128906, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 42020 + }, + { + "epoch": 3.0183123877917413, + "grad_norm": 0.7461876273155212, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 42030 + }, + { + "epoch": 3.0190305206463197, + "grad_norm": 0.7734265327453613, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 42040 + }, + { + "epoch": 3.0197486535008977, + "grad_norm": 0.9056455492973328, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 42050 + }, + { + "epoch": 3.0204667863554757, + "grad_norm": 0.9183889031410217, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 42060 + }, + { + "epoch": 3.0211849192100537, + "grad_norm": 1.0777326822280884, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 42070 + }, + { + "epoch": 3.021903052064632, + "grad_norm": 0.9217308163642883, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 42080 + }, + { + "epoch": 3.02262118491921, + "grad_norm": 0.8220202326774597, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42090 + }, + { + "epoch": 3.023339317773788, + "grad_norm": 0.8454978466033936, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 42100 + }, + { + "epoch": 3.024057450628366, + "grad_norm": 0.8116370439529419, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 42110 + }, + { + "epoch": 3.024775583482944, + "grad_norm": 0.8064935207366943, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 42120 + }, + { + "epoch": 3.0254937163375226, + "grad_norm": 0.9718650579452515, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 42130 + }, + { + "epoch": 3.0262118491921006, + "grad_norm": 0.8817588090896606, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 42140 + }, + { + "epoch": 3.0269299820466786, + "grad_norm": 0.7757318615913391, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 42150 + }, + { + "epoch": 3.0276481149012566, + "grad_norm": 0.7500545382499695, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 42160 + }, + { + "epoch": 3.0283662477558346, + "grad_norm": 0.72913658618927, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 42170 + }, + { + "epoch": 3.029084380610413, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 42180 + }, + { + "epoch": 3.029802513464991, + "grad_norm": 0.7682021856307983, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 42190 + }, + { + "epoch": 3.030520646319569, + "grad_norm": 0.8145958781242371, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 42200 + }, + { + "epoch": 3.031238779174147, + "grad_norm": 1.0546396970748901, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 42210 + }, + { + "epoch": 3.0319569120287255, + "grad_norm": 0.8222804665565491, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 42220 + }, + { + "epoch": 3.0326750448833035, + "grad_norm": 0.8245829343795776, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 42230 + }, + { + "epoch": 3.0333931777378815, + "grad_norm": 0.9059963822364807, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 42240 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 1.026747465133667, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 42250 + }, + { + "epoch": 3.0348294434470375, + "grad_norm": 0.9108404517173767, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42260 + }, + { + "epoch": 3.035547576301616, + "grad_norm": 0.9828516840934753, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 42270 + }, + { + "epoch": 3.036265709156194, + "grad_norm": 0.9664266705513, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 42280 + }, + { + "epoch": 3.036983842010772, + "grad_norm": 0.7577654719352722, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42290 + }, + { + "epoch": 3.03770197486535, + "grad_norm": 0.8331853151321411, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 42300 + }, + { + "epoch": 3.038420107719928, + "grad_norm": 0.8017228245735168, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 42310 + }, + { + "epoch": 3.0391382405745064, + "grad_norm": 1.0316718816757202, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 42320 + }, + { + "epoch": 3.0398563734290844, + "grad_norm": 0.9379803538322449, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 42330 + }, + { + "epoch": 3.0405745062836624, + "grad_norm": 0.7554476857185364, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 42340 + }, + { + "epoch": 3.0412926391382404, + "grad_norm": 0.7377917766571045, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 42350 + }, + { + "epoch": 3.042010771992819, + "grad_norm": 1.0655276775360107, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 42360 + }, + { + "epoch": 3.042728904847397, + "grad_norm": 0.7748511433601379, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 42370 + }, + { + "epoch": 3.043447037701975, + "grad_norm": 0.848649799823761, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 42380 + }, + { + "epoch": 3.044165170556553, + "grad_norm": 0.7754636406898499, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 42390 + }, + { + "epoch": 3.044883303411131, + "grad_norm": 0.8173656463623047, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 42400 + }, + { + "epoch": 3.0456014362657093, + "grad_norm": 0.7881983518600464, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 42410 + }, + { + "epoch": 3.0463195691202873, + "grad_norm": 0.971072256565094, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 42420 + }, + { + "epoch": 3.0470377019748653, + "grad_norm": 0.8400143384933472, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 42430 + }, + { + "epoch": 3.0477558348294433, + "grad_norm": 1.0028647184371948, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 42440 + }, + { + "epoch": 3.0484739676840213, + "grad_norm": 0.9728034734725952, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 42450 + }, + { + "epoch": 3.0491921005386, + "grad_norm": 0.937633752822876, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 42460 + }, + { + "epoch": 3.049910233393178, + "grad_norm": 1.0265642404556274, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 42470 + }, + { + "epoch": 3.050628366247756, + "grad_norm": 0.9733216762542725, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 42480 + }, + { + "epoch": 3.051346499102334, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 42490 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 0.7515231370925903, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 42500 + }, + { + "epoch": 3.0527827648114902, + "grad_norm": 0.9115300178527832, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 42510 + }, + { + "epoch": 3.0535008976660682, + "grad_norm": 0.7403655648231506, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 42520 + }, + { + "epoch": 3.0542190305206462, + "grad_norm": 0.7826810479164124, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 42530 + }, + { + "epoch": 3.0549371633752243, + "grad_norm": 0.8007349371910095, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 42540 + }, + { + "epoch": 3.0556552962298027, + "grad_norm": 0.7975959777832031, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 42550 + }, + { + "epoch": 3.0563734290843807, + "grad_norm": 0.9665228128433228, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42560 + }, + { + "epoch": 3.0570915619389587, + "grad_norm": 0.8386123180389404, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 42570 + }, + { + "epoch": 3.0578096947935367, + "grad_norm": 0.7437782287597656, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 42580 + }, + { + "epoch": 3.0585278276481147, + "grad_norm": 0.8360698223114014, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 42590 + }, + { + "epoch": 3.059245960502693, + "grad_norm": 0.8982073664665222, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42600 + }, + { + "epoch": 3.059964093357271, + "grad_norm": 0.9425758719444275, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 42610 + }, + { + "epoch": 3.060682226211849, + "grad_norm": 0.8567131161689758, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42620 + }, + { + "epoch": 3.061400359066427, + "grad_norm": 0.9322942495346069, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 42630 + }, + { + "epoch": 3.0621184919210056, + "grad_norm": 0.8283235430717468, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 42640 + }, + { + "epoch": 3.0628366247755836, + "grad_norm": 0.8457967638969421, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 42650 + }, + { + "epoch": 3.0635547576301616, + "grad_norm": 0.8205100893974304, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42660 + }, + { + "epoch": 3.0642728904847396, + "grad_norm": 0.8385181427001953, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 42670 + }, + { + "epoch": 3.0649910233393176, + "grad_norm": 1.2959390878677368, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 42680 + }, + { + "epoch": 3.065709156193896, + "grad_norm": 0.7150540351867676, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 42690 + }, + { + "epoch": 3.066427289048474, + "grad_norm": 0.6647360920906067, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 42700 + }, + { + "epoch": 3.067145421903052, + "grad_norm": 0.9148316979408264, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 42710 + }, + { + "epoch": 3.06786355475763, + "grad_norm": 0.8606209754943848, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 42720 + }, + { + "epoch": 3.068581687612208, + "grad_norm": 1.4255632162094116, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42730 + }, + { + "epoch": 3.0692998204667865, + "grad_norm": 0.9131710529327393, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 42740 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 0.9560360908508301, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 42750 + }, + { + "epoch": 3.0707360861759425, + "grad_norm": 0.9278100728988647, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42760 + }, + { + "epoch": 3.0714542190305205, + "grad_norm": 0.7258471846580505, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 42770 + }, + { + "epoch": 3.072172351885099, + "grad_norm": 1.1537690162658691, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 42780 + }, + { + "epoch": 3.072890484739677, + "grad_norm": 0.8562588691711426, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 42790 + }, + { + "epoch": 3.073608617594255, + "grad_norm": 1.0271626710891724, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 42800 + }, + { + "epoch": 3.074326750448833, + "grad_norm": 0.85148024559021, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 42810 + }, + { + "epoch": 3.075044883303411, + "grad_norm": 0.805772602558136, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 42820 + }, + { + "epoch": 3.0757630161579894, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 42830 + }, + { + "epoch": 3.0764811490125674, + "grad_norm": 0.7997274994850159, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 42840 + }, + { + "epoch": 3.0771992818671454, + "grad_norm": 0.8739321231842041, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 42850 + }, + { + "epoch": 3.0779174147217234, + "grad_norm": 0.833951473236084, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 42860 + }, + { + "epoch": 3.0786355475763014, + "grad_norm": 0.8813839554786682, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 42870 + }, + { + "epoch": 3.07935368043088, + "grad_norm": 0.9020521640777588, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 42880 + }, + { + "epoch": 3.080071813285458, + "grad_norm": 0.888148844242096, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 42890 + }, + { + "epoch": 3.080789946140036, + "grad_norm": 0.8110589385032654, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 42900 + }, + { + "epoch": 3.081508078994614, + "grad_norm": 0.818738579750061, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 42910 + }, + { + "epoch": 3.082226211849192, + "grad_norm": 0.9607479572296143, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 42920 + }, + { + "epoch": 3.0829443447037703, + "grad_norm": 0.8162698745727539, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 42930 + }, + { + "epoch": 3.0836624775583483, + "grad_norm": 0.8170801997184753, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 42940 + }, + { + "epoch": 3.0843806104129263, + "grad_norm": 0.9250763654708862, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 42950 + }, + { + "epoch": 3.0850987432675043, + "grad_norm": 0.898097813129425, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 42960 + }, + { + "epoch": 3.0858168761220828, + "grad_norm": 0.9398433566093445, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 42970 + }, + { + "epoch": 3.0865350089766608, + "grad_norm": 1.052808165550232, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 42980 + }, + { + "epoch": 3.087253141831239, + "grad_norm": 0.8974723219871521, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 42990 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 0.7517408728599548, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 43000 + }, + { + "epoch": 3.088689407540395, + "grad_norm": 0.8054485321044922, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 43010 + }, + { + "epoch": 3.0894075403949732, + "grad_norm": 0.9896154999732971, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 43020 + }, + { + "epoch": 3.0901256732495512, + "grad_norm": 0.7887356281280518, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 43030 + }, + { + "epoch": 3.0908438061041292, + "grad_norm": 1.0119125843048096, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 43040 + }, + { + "epoch": 3.0915619389587072, + "grad_norm": 0.8753892779350281, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 43050 + }, + { + "epoch": 3.0922800718132857, + "grad_norm": 0.8322654962539673, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43060 + }, + { + "epoch": 3.0929982046678637, + "grad_norm": 1.0605992078781128, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 43070 + }, + { + "epoch": 3.0937163375224417, + "grad_norm": 0.8783912062644958, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 43080 + }, + { + "epoch": 3.0944344703770197, + "grad_norm": 0.8839107751846313, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 43090 + }, + { + "epoch": 3.0951526032315977, + "grad_norm": 1.1655086278915405, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 43100 + }, + { + "epoch": 3.095870736086176, + "grad_norm": 0.7051523327827454, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 43110 + }, + { + "epoch": 3.096588868940754, + "grad_norm": 0.7793807983398438, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43120 + }, + { + "epoch": 3.097307001795332, + "grad_norm": 0.8352194428443909, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 43130 + }, + { + "epoch": 3.09802513464991, + "grad_norm": 0.9684847593307495, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 43140 + }, + { + "epoch": 3.098743267504488, + "grad_norm": 1.1106340885162354, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 43150 + }, + { + "epoch": 3.0994614003590666, + "grad_norm": 0.7814911603927612, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 43160 + }, + { + "epoch": 3.1001795332136446, + "grad_norm": 0.7923110723495483, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 43170 + }, + { + "epoch": 3.1008976660682226, + "grad_norm": 0.87022864818573, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 43180 + }, + { + "epoch": 3.1016157989228006, + "grad_norm": 0.9352855682373047, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 43190 + }, + { + "epoch": 3.1023339317773786, + "grad_norm": 0.8548445105552673, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 43200 + }, + { + "epoch": 3.103052064631957, + "grad_norm": 0.9576025009155273, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 43210 + }, + { + "epoch": 3.103770197486535, + "grad_norm": 0.7430430054664612, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 43220 + }, + { + "epoch": 3.104488330341113, + "grad_norm": 0.9619144797325134, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 43230 + }, + { + "epoch": 3.105206463195691, + "grad_norm": 0.8622338771820068, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 43240 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 0.853489339351654, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43250 + }, + { + "epoch": 3.1066427289048475, + "grad_norm": 0.9253206849098206, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 43260 + }, + { + "epoch": 3.1073608617594255, + "grad_norm": 0.9700671434402466, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 43270 + }, + { + "epoch": 3.1080789946140035, + "grad_norm": 1.0550731420516968, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 43280 + }, + { + "epoch": 3.1087971274685815, + "grad_norm": 0.939452052116394, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 43290 + }, + { + "epoch": 3.10951526032316, + "grad_norm": 0.8855276107788086, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 43300 + }, + { + "epoch": 3.110233393177738, + "grad_norm": 0.92197185754776, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 43310 + }, + { + "epoch": 3.110951526032316, + "grad_norm": 0.8825578689575195, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 43320 + }, + { + "epoch": 3.111669658886894, + "grad_norm": 0.9964608550071716, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 43330 + }, + { + "epoch": 3.1123877917414724, + "grad_norm": 0.9070520401000977, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 43340 + }, + { + "epoch": 3.1131059245960504, + "grad_norm": 0.9699633717536926, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 43350 + }, + { + "epoch": 3.1138240574506284, + "grad_norm": 0.7384091019630432, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 43360 + }, + { + "epoch": 3.1145421903052064, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 43370 + }, + { + "epoch": 3.1152603231597844, + "grad_norm": 0.8906524181365967, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 43380 + }, + { + "epoch": 3.115978456014363, + "grad_norm": 0.8850129246711731, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 43390 + }, + { + "epoch": 3.116696588868941, + "grad_norm": 0.7091860771179199, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 43400 + }, + { + "epoch": 3.117414721723519, + "grad_norm": 0.8992764949798584, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 43410 + }, + { + "epoch": 3.118132854578097, + "grad_norm": 0.9166698455810547, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43420 + }, + { + "epoch": 3.118850987432675, + "grad_norm": 1.1195749044418335, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 43430 + }, + { + "epoch": 3.1195691202872533, + "grad_norm": 0.9414069652557373, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 43440 + }, + { + "epoch": 3.1202872531418313, + "grad_norm": 0.7641217112541199, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 43450 + }, + { + "epoch": 3.1210053859964093, + "grad_norm": 1.2659285068511963, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 43460 + }, + { + "epoch": 3.1217235188509873, + "grad_norm": 0.9968213438987732, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 43470 + }, + { + "epoch": 3.1224416517055653, + "grad_norm": 0.8819042444229126, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 43480 + }, + { + "epoch": 3.1231597845601438, + "grad_norm": 0.9124775528907776, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 43490 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 0.868354082107544, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 3.1245960502692998, + "grad_norm": 0.7367526292800903, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 43510 + }, + { + "epoch": 3.1253141831238778, + "grad_norm": 0.7553679943084717, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43520 + }, + { + "epoch": 3.126032315978456, + "grad_norm": 0.7970008850097656, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 43530 + }, + { + "epoch": 3.126750448833034, + "grad_norm": 0.9117488861083984, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 43540 + }, + { + "epoch": 3.127468581687612, + "grad_norm": 0.8004103899002075, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 43550 + }, + { + "epoch": 3.12818671454219, + "grad_norm": 0.736518919467926, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 43560 + }, + { + "epoch": 3.128904847396768, + "grad_norm": 0.8568395376205444, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 43570 + }, + { + "epoch": 3.1296229802513467, + "grad_norm": 0.9344052672386169, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 43580 + }, + { + "epoch": 3.1303411131059247, + "grad_norm": 0.7986525297164917, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 43590 + }, + { + "epoch": 3.1310592459605027, + "grad_norm": 0.8283242583274841, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 43600 + }, + { + "epoch": 3.1317773788150807, + "grad_norm": 0.6534292101860046, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 43610 + }, + { + "epoch": 3.132495511669659, + "grad_norm": 0.9585428833961487, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 43620 + }, + { + "epoch": 3.133213644524237, + "grad_norm": 0.8299157023429871, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 43630 + }, + { + "epoch": 3.133931777378815, + "grad_norm": 0.9050052762031555, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 43640 + }, + { + "epoch": 3.134649910233393, + "grad_norm": 1.0457062721252441, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 43650 + }, + { + "epoch": 3.135368043087971, + "grad_norm": 0.907691240310669, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 43660 + }, + { + "epoch": 3.1360861759425496, + "grad_norm": 0.8868935108184814, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 43670 + }, + { + "epoch": 3.1368043087971276, + "grad_norm": 0.8585456609725952, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 43680 + }, + { + "epoch": 3.1375224416517056, + "grad_norm": 1.0402741432189941, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 43690 + }, + { + "epoch": 3.1382405745062836, + "grad_norm": 1.0866798162460327, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 43700 + }, + { + "epoch": 3.1389587073608616, + "grad_norm": 0.7637296915054321, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 43710 + }, + { + "epoch": 3.13967684021544, + "grad_norm": 0.755235493183136, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 43720 + }, + { + "epoch": 3.140394973070018, + "grad_norm": 0.7258853316307068, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 43730 + }, + { + "epoch": 3.141113105924596, + "grad_norm": 1.0425268411636353, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 43740 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 43750 + }, + { + "epoch": 3.142549371633752, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 43760 + }, + { + "epoch": 3.1432675044883305, + "grad_norm": 0.9879246354103088, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 43770 + }, + { + "epoch": 3.1439856373429085, + "grad_norm": 0.7853389382362366, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 43780 + }, + { + "epoch": 3.1447037701974865, + "grad_norm": 1.0245232582092285, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 43790 + }, + { + "epoch": 3.1454219030520645, + "grad_norm": 0.8486390113830566, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 43800 + }, + { + "epoch": 3.146140035906643, + "grad_norm": 0.8536406755447388, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 43810 + }, + { + "epoch": 3.146858168761221, + "grad_norm": 0.9653734564781189, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 43820 + }, + { + "epoch": 3.147576301615799, + "grad_norm": 0.8292608857154846, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 43830 + }, + { + "epoch": 3.148294434470377, + "grad_norm": 1.147524118423462, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 43840 + }, + { + "epoch": 3.149012567324955, + "grad_norm": 0.9317546486854553, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 43850 + }, + { + "epoch": 3.1497307001795334, + "grad_norm": 0.8651045560836792, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 43860 + }, + { + "epoch": 3.1504488330341114, + "grad_norm": 0.8718969225883484, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 43870 + }, + { + "epoch": 3.1511669658886894, + "grad_norm": 1.0140702724456787, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 43880 + }, + { + "epoch": 3.1518850987432674, + "grad_norm": 0.75941401720047, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43890 + }, + { + "epoch": 3.152603231597846, + "grad_norm": 0.6618940234184265, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 43900 + }, + { + "epoch": 3.153321364452424, + "grad_norm": 1.0013338327407837, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 43910 + }, + { + "epoch": 3.154039497307002, + "grad_norm": 0.8735299706459045, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 43920 + }, + { + "epoch": 3.15475763016158, + "grad_norm": 1.141914963722229, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 43930 + }, + { + "epoch": 3.155475763016158, + "grad_norm": 1.0916038751602173, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 43940 + }, + { + "epoch": 3.1561938958707363, + "grad_norm": 0.7042547464370728, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 43950 + }, + { + "epoch": 3.1569120287253143, + "grad_norm": 0.9885236620903015, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 43960 + }, + { + "epoch": 3.1576301615798923, + "grad_norm": 0.8083009719848633, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 43970 + }, + { + "epoch": 3.1583482944344703, + "grad_norm": 1.082627296447754, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 43980 + }, + { + "epoch": 3.1590664272890483, + "grad_norm": 0.9293290376663208, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 43990 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 0.861003041267395, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 44000 + }, + { + "epoch": 3.1605026929982047, + "grad_norm": 0.9565994143486023, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 44010 + }, + { + "epoch": 3.1612208258527827, + "grad_norm": 0.9609305262565613, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 44020 + }, + { + "epoch": 3.1619389587073607, + "grad_norm": 0.847830593585968, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 44030 + }, + { + "epoch": 3.1626570915619387, + "grad_norm": 0.852357804775238, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 44040 + }, + { + "epoch": 3.163375224416517, + "grad_norm": 0.8634562492370605, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44050 + }, + { + "epoch": 3.164093357271095, + "grad_norm": 1.0259950160980225, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 44060 + }, + { + "epoch": 3.164811490125673, + "grad_norm": 0.9615250825881958, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 44070 + }, + { + "epoch": 3.165529622980251, + "grad_norm": 0.9892165660858154, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 44080 + }, + { + "epoch": 3.1662477558348296, + "grad_norm": 0.8827354907989502, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 44090 + }, + { + "epoch": 3.1669658886894076, + "grad_norm": 0.9258168339729309, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 44100 + }, + { + "epoch": 3.1676840215439857, + "grad_norm": 0.7983399033546448, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 44110 + }, + { + "epoch": 3.1684021543985637, + "grad_norm": 0.9917809963226318, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 44120 + }, + { + "epoch": 3.1691202872531417, + "grad_norm": 1.058927297592163, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44130 + }, + { + "epoch": 3.16983842010772, + "grad_norm": 1.0095895528793335, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44140 + }, + { + "epoch": 3.170556552962298, + "grad_norm": 0.9032495617866516, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 44150 + }, + { + "epoch": 3.171274685816876, + "grad_norm": 0.9391272664070129, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 44160 + }, + { + "epoch": 3.171992818671454, + "grad_norm": 0.990755558013916, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44170 + }, + { + "epoch": 3.172710951526032, + "grad_norm": 0.9310759902000427, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 44180 + }, + { + "epoch": 3.1734290843806106, + "grad_norm": 0.7698856592178345, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 44190 + }, + { + "epoch": 3.1741472172351886, + "grad_norm": 0.7735867500305176, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 44200 + }, + { + "epoch": 3.1748653500897666, + "grad_norm": 1.1447525024414062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 44210 + }, + { + "epoch": 3.1755834829443446, + "grad_norm": 0.8667060136795044, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 44220 + }, + { + "epoch": 3.176301615798923, + "grad_norm": 0.8596829771995544, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 44230 + }, + { + "epoch": 3.177019748653501, + "grad_norm": 0.8607654571533203, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 44240 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 0.9346948266029358, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 44250 + }, + { + "epoch": 3.178456014362657, + "grad_norm": 0.852344810962677, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 44260 + }, + { + "epoch": 3.179174147217235, + "grad_norm": 0.9260450005531311, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 44270 + }, + { + "epoch": 3.1798922800718135, + "grad_norm": 0.924053430557251, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 44280 + }, + { + "epoch": 3.1806104129263915, + "grad_norm": 1.001965045928955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 44290 + }, + { + "epoch": 3.1813285457809695, + "grad_norm": 0.943215012550354, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44300 + }, + { + "epoch": 3.1820466786355475, + "grad_norm": 1.006977915763855, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 44310 + }, + { + "epoch": 3.1827648114901255, + "grad_norm": 0.9768950343132019, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 44320 + }, + { + "epoch": 3.183482944344704, + "grad_norm": 0.9297489523887634, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 44330 + }, + { + "epoch": 3.184201077199282, + "grad_norm": 0.9110919237136841, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 44340 + }, + { + "epoch": 3.18491921005386, + "grad_norm": 0.9821381568908691, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 44350 + }, + { + "epoch": 3.185637342908438, + "grad_norm": 0.8451243042945862, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 44360 + }, + { + "epoch": 3.1863554757630164, + "grad_norm": 0.9676638245582581, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 44370 + }, + { + "epoch": 3.1870736086175944, + "grad_norm": 0.9826035499572754, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 44380 + }, + { + "epoch": 3.1877917414721724, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 44390 + }, + { + "epoch": 3.1885098743267504, + "grad_norm": 0.7766330242156982, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 44400 + }, + { + "epoch": 3.1892280071813284, + "grad_norm": 0.9302349090576172, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 44410 + }, + { + "epoch": 3.189946140035907, + "grad_norm": 0.8335331082344055, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 44420 + }, + { + "epoch": 3.190664272890485, + "grad_norm": 0.6722736358642578, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 44430 + }, + { + "epoch": 3.191382405745063, + "grad_norm": 0.9047536849975586, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 44440 + }, + { + "epoch": 3.192100538599641, + "grad_norm": 0.9653822183609009, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 44450 + }, + { + "epoch": 3.192818671454219, + "grad_norm": 0.7750703692436218, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 44460 + }, + { + "epoch": 3.1935368043087973, + "grad_norm": 0.7767539024353027, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 44470 + }, + { + "epoch": 3.1942549371633753, + "grad_norm": 0.8597778081893921, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44480 + }, + { + "epoch": 3.1949730700179533, + "grad_norm": 1.1711493730545044, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 44490 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 0.9025220274925232, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 44500 + }, + { + "epoch": 3.1964093357271093, + "grad_norm": 0.8084979057312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44510 + }, + { + "epoch": 3.1971274685816877, + "grad_norm": 0.8475074172019958, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44520 + }, + { + "epoch": 3.1978456014362657, + "grad_norm": 0.9915644526481628, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 44530 + }, + { + "epoch": 3.1985637342908437, + "grad_norm": 0.992231547832489, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 44540 + }, + { + "epoch": 3.1992818671454217, + "grad_norm": 0.9804556369781494, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 44550 + }, + { + "epoch": 3.2, + "grad_norm": 1.045558214187622, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 44560 + }, + { + "epoch": 3.200718132854578, + "grad_norm": 1.0880261659622192, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 44570 + }, + { + "epoch": 3.201436265709156, + "grad_norm": 0.9511138200759888, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44580 + }, + { + "epoch": 3.202154398563734, + "grad_norm": 0.9115344882011414, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 44590 + }, + { + "epoch": 3.202872531418312, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 44600 + }, + { + "epoch": 3.2035906642728906, + "grad_norm": 0.8209697604179382, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44610 + }, + { + "epoch": 3.2043087971274686, + "grad_norm": 0.9220197796821594, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44620 + }, + { + "epoch": 3.2050269299820466, + "grad_norm": 0.8859700560569763, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 44630 + }, + { + "epoch": 3.2057450628366246, + "grad_norm": 0.9772757291793823, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 44640 + }, + { + "epoch": 3.206463195691203, + "grad_norm": 0.9385574460029602, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 44650 + }, + { + "epoch": 3.207181328545781, + "grad_norm": 0.839958906173706, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 44660 + }, + { + "epoch": 3.207899461400359, + "grad_norm": 0.860478401184082, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 44670 + }, + { + "epoch": 3.208617594254937, + "grad_norm": 0.846886396408081, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 44680 + }, + { + "epoch": 3.209335727109515, + "grad_norm": 0.8591006398200989, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 44690 + }, + { + "epoch": 3.2100538599640935, + "grad_norm": 0.9236023426055908, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 44700 + }, + { + "epoch": 3.2107719928186715, + "grad_norm": 0.7348999977111816, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44710 + }, + { + "epoch": 3.2114901256732495, + "grad_norm": 1.0041730403900146, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 44720 + }, + { + "epoch": 3.2122082585278275, + "grad_norm": 0.8382687568664551, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 44730 + }, + { + "epoch": 3.2129263913824055, + "grad_norm": 0.8253511190414429, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 44740 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 0.9589242935180664, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 44750 + }, + { + "epoch": 3.214362657091562, + "grad_norm": 0.8938157558441162, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 44760 + }, + { + "epoch": 3.21508078994614, + "grad_norm": 1.0085135698318481, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 44770 + }, + { + "epoch": 3.215798922800718, + "grad_norm": 0.8647134304046631, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 44780 + }, + { + "epoch": 3.216517055655296, + "grad_norm": 1.09453284740448, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 44790 + }, + { + "epoch": 3.2172351885098744, + "grad_norm": 0.8710666298866272, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 44800 + }, + { + "epoch": 3.2179533213644524, + "grad_norm": 0.8080880641937256, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 44810 + }, + { + "epoch": 3.2186714542190304, + "grad_norm": 1.0440675020217896, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 44820 + }, + { + "epoch": 3.2193895870736084, + "grad_norm": 1.1036376953125, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 44830 + }, + { + "epoch": 3.220107719928187, + "grad_norm": 0.8783546686172485, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44840 + }, + { + "epoch": 3.220825852782765, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 44850 + }, + { + "epoch": 3.221543985637343, + "grad_norm": 1.0099157094955444, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 44860 + }, + { + "epoch": 3.222262118491921, + "grad_norm": 1.054928183555603, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 44870 + }, + { + "epoch": 3.222980251346499, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 44880 + }, + { + "epoch": 3.2236983842010773, + "grad_norm": 0.9730798602104187, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 44890 + }, + { + "epoch": 3.2244165170556554, + "grad_norm": 0.7911382913589478, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 44900 + }, + { + "epoch": 3.2251346499102334, + "grad_norm": 0.9574400782585144, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 44910 + }, + { + "epoch": 3.2258527827648114, + "grad_norm": 0.8101068139076233, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 44920 + }, + { + "epoch": 3.22657091561939, + "grad_norm": 0.754146933555603, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 44930 + }, + { + "epoch": 3.227289048473968, + "grad_norm": 0.7471939921379089, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 44940 + }, + { + "epoch": 3.228007181328546, + "grad_norm": 1.0040855407714844, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 44950 + }, + { + "epoch": 3.228725314183124, + "grad_norm": 1.0016074180603027, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 44960 + }, + { + "epoch": 3.229443447037702, + "grad_norm": 1.0432976484298706, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 44970 + }, + { + "epoch": 3.2301615798922803, + "grad_norm": 0.8517055511474609, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 44980 + }, + { + "epoch": 3.2308797127468583, + "grad_norm": 0.9174178242683411, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 44990 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 0.9733774065971375, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 45000 + }, + { + "epoch": 3.2323159784560143, + "grad_norm": 0.9074714779853821, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 45010 + }, + { + "epoch": 3.2330341113105923, + "grad_norm": 0.8802759051322937, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 45020 + }, + { + "epoch": 3.2337522441651707, + "grad_norm": 1.0620871782302856, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 45030 + }, + { + "epoch": 3.2344703770197487, + "grad_norm": 0.8069542050361633, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 45040 + }, + { + "epoch": 3.2351885098743267, + "grad_norm": 0.9139137864112854, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 45050 + }, + { + "epoch": 3.2359066427289047, + "grad_norm": 0.8936411142349243, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 45060 + }, + { + "epoch": 3.2366247755834827, + "grad_norm": 0.9098079204559326, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 45070 + }, + { + "epoch": 3.237342908438061, + "grad_norm": 1.062953233718872, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45080 + }, + { + "epoch": 3.238061041292639, + "grad_norm": 0.8656470775604248, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 45090 + }, + { + "epoch": 3.238779174147217, + "grad_norm": 0.9299449920654297, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 45100 + }, + { + "epoch": 3.239497307001795, + "grad_norm": 1.0102022886276245, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 45110 + }, + { + "epoch": 3.2402154398563736, + "grad_norm": 0.8074561953544617, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 45120 + }, + { + "epoch": 3.2409335727109516, + "grad_norm": 1.044105887413025, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 45130 + }, + { + "epoch": 3.2416517055655296, + "grad_norm": 0.8742762207984924, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 45140 + }, + { + "epoch": 3.2423698384201076, + "grad_norm": 0.8240015506744385, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 45150 + }, + { + "epoch": 3.2430879712746856, + "grad_norm": 0.8438951373100281, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 45160 + }, + { + "epoch": 3.243806104129264, + "grad_norm": 1.02358877658844, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 45170 + }, + { + "epoch": 3.244524236983842, + "grad_norm": 0.8824774026870728, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 45180 + }, + { + "epoch": 3.24524236983842, + "grad_norm": 0.971015989780426, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 45190 + }, + { + "epoch": 3.245960502692998, + "grad_norm": 0.9282383918762207, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 45200 + }, + { + "epoch": 3.2466786355475765, + "grad_norm": 0.7908362746238708, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 45210 + }, + { + "epoch": 3.2473967684021545, + "grad_norm": 1.0721662044525146, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 45220 + }, + { + "epoch": 3.2481149012567325, + "grad_norm": 0.9516810774803162, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 45230 + }, + { + "epoch": 3.2488330341113105, + "grad_norm": 0.7914131283760071, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 45240 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 0.8492292761802673, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 45250 + }, + { + "epoch": 3.250269299820467, + "grad_norm": 0.8880114555358887, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 45260 + }, + { + "epoch": 3.250987432675045, + "grad_norm": 0.7808310985565186, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 45270 + }, + { + "epoch": 3.251705565529623, + "grad_norm": 0.8566828966140747, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 45280 + }, + { + "epoch": 3.252423698384201, + "grad_norm": 0.7929658889770508, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45290 + }, + { + "epoch": 3.253141831238779, + "grad_norm": 0.678207516670227, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 45300 + }, + { + "epoch": 3.2538599640933574, + "grad_norm": 0.9963029623031616, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45310 + }, + { + "epoch": 3.2545780969479354, + "grad_norm": 0.835304856300354, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 45320 + }, + { + "epoch": 3.2552962298025134, + "grad_norm": 0.7281617522239685, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 45330 + }, + { + "epoch": 3.2560143626570914, + "grad_norm": 1.244890570640564, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 45340 + }, + { + "epoch": 3.2567324955116694, + "grad_norm": 0.8372750282287598, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 45350 + }, + { + "epoch": 3.257450628366248, + "grad_norm": 1.0029667615890503, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 45360 + }, + { + "epoch": 3.258168761220826, + "grad_norm": 0.8561908602714539, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 45370 + }, + { + "epoch": 3.258886894075404, + "grad_norm": 1.0058085918426514, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 45380 + }, + { + "epoch": 3.259605026929982, + "grad_norm": 0.7768221497535706, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 45390 + }, + { + "epoch": 3.2603231597845603, + "grad_norm": 0.8443793058395386, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 45400 + }, + { + "epoch": 3.2610412926391383, + "grad_norm": 1.0140392780303955, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 45410 + }, + { + "epoch": 3.2617594254937163, + "grad_norm": 0.8397058248519897, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 45420 + }, + { + "epoch": 3.2624775583482943, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 45430 + }, + { + "epoch": 3.2631956912028723, + "grad_norm": 1.0279473066329956, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 45440 + }, + { + "epoch": 3.263913824057451, + "grad_norm": 1.207457184791565, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 45450 + }, + { + "epoch": 3.264631956912029, + "grad_norm": 0.8121998906135559, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 45460 + }, + { + "epoch": 3.265350089766607, + "grad_norm": 1.037733554840088, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 45470 + }, + { + "epoch": 3.266068222621185, + "grad_norm": 0.9305754899978638, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 45480 + }, + { + "epoch": 3.2667863554757632, + "grad_norm": 0.9733602404594421, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 45500 + }, + { + "epoch": 3.2682226211849192, + "grad_norm": 0.8601692318916321, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45510 + }, + { + "epoch": 3.2689407540394972, + "grad_norm": 0.7921277284622192, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 45520 + }, + { + "epoch": 3.2696588868940752, + "grad_norm": 0.8324153423309326, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 45530 + }, + { + "epoch": 3.2703770197486537, + "grad_norm": 0.85141521692276, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 45540 + }, + { + "epoch": 3.2710951526032317, + "grad_norm": 0.9399608373641968, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 45550 + }, + { + "epoch": 3.2718132854578097, + "grad_norm": 0.9829166531562805, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 45560 + }, + { + "epoch": 3.2725314183123877, + "grad_norm": 0.9936266541481018, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 45570 + }, + { + "epoch": 3.2732495511669657, + "grad_norm": 1.036165714263916, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 45580 + }, + { + "epoch": 3.273967684021544, + "grad_norm": 0.8988680243492126, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45590 + }, + { + "epoch": 3.274685816876122, + "grad_norm": 0.9173405766487122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 45600 + }, + { + "epoch": 3.2754039497307, + "grad_norm": 0.9967324733734131, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 45610 + }, + { + "epoch": 3.276122082585278, + "grad_norm": 0.9097777009010315, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 45620 + }, + { + "epoch": 3.276840215439856, + "grad_norm": 1.0559430122375488, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 45630 + }, + { + "epoch": 3.2775583482944346, + "grad_norm": 0.9583360552787781, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 45640 + }, + { + "epoch": 3.2782764811490126, + "grad_norm": 0.7630334496498108, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 45650 + }, + { + "epoch": 3.2789946140035906, + "grad_norm": 0.9955230355262756, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 45660 + }, + { + "epoch": 3.2797127468581686, + "grad_norm": 0.8685793876647949, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45670 + }, + { + "epoch": 3.280430879712747, + "grad_norm": 0.919913113117218, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 45680 + }, + { + "epoch": 3.281149012567325, + "grad_norm": 0.826144814491272, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 45690 + }, + { + "epoch": 3.281867145421903, + "grad_norm": 0.9750179052352905, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 45700 + }, + { + "epoch": 3.282585278276481, + "grad_norm": 0.7931897640228271, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 45710 + }, + { + "epoch": 3.283303411131059, + "grad_norm": 1.0380089282989502, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 45720 + }, + { + "epoch": 3.2840215439856375, + "grad_norm": 0.8220566511154175, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 45730 + }, + { + "epoch": 3.2847396768402155, + "grad_norm": 0.9688239693641663, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 45740 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 0.8760311603546143, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 45750 + }, + { + "epoch": 3.2861759425493715, + "grad_norm": 0.8103382587432861, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 45760 + }, + { + "epoch": 3.28689407540395, + "grad_norm": 0.8835865259170532, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 45770 + }, + { + "epoch": 3.287612208258528, + "grad_norm": 0.9021160006523132, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45780 + }, + { + "epoch": 3.288330341113106, + "grad_norm": 0.8182386159896851, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 45790 + }, + { + "epoch": 3.289048473967684, + "grad_norm": 0.8555024862289429, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45800 + }, + { + "epoch": 3.289766606822262, + "grad_norm": 1.0982348918914795, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 45810 + }, + { + "epoch": 3.2904847396768404, + "grad_norm": 1.06246817111969, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 45820 + }, + { + "epoch": 3.2912028725314184, + "grad_norm": 1.1727149486541748, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 45830 + }, + { + "epoch": 3.2919210053859964, + "grad_norm": 0.8224700093269348, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 45840 + }, + { + "epoch": 3.2926391382405744, + "grad_norm": 0.8195698261260986, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 45850 + }, + { + "epoch": 3.2933572710951524, + "grad_norm": 0.8424476981163025, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 45860 + }, + { + "epoch": 3.294075403949731, + "grad_norm": 0.9804632067680359, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 45870 + }, + { + "epoch": 3.294793536804309, + "grad_norm": 0.8701804876327515, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 45880 + }, + { + "epoch": 3.295511669658887, + "grad_norm": 0.8876864910125732, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 45890 + }, + { + "epoch": 3.296229802513465, + "grad_norm": 1.0105448961257935, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 45900 + }, + { + "epoch": 3.296947935368043, + "grad_norm": 0.847017228603363, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 45910 + }, + { + "epoch": 3.2976660682226213, + "grad_norm": 0.7610297799110413, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 45920 + }, + { + "epoch": 3.2983842010771993, + "grad_norm": 0.7272670269012451, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 45930 + }, + { + "epoch": 3.2991023339317773, + "grad_norm": 0.8243510127067566, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 45940 + }, + { + "epoch": 3.2998204667863553, + "grad_norm": 1.0113074779510498, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 45950 + }, + { + "epoch": 3.3005385996409338, + "grad_norm": 0.8578087687492371, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 45960 + }, + { + "epoch": 3.3012567324955118, + "grad_norm": 0.9511606097221375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 45970 + }, + { + "epoch": 3.3019748653500898, + "grad_norm": 0.8612566590309143, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 45980 + }, + { + "epoch": 3.3026929982046678, + "grad_norm": 0.8702331185340881, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 45990 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 1.0229583978652954, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 46000 + }, + { + "epoch": 3.304129263913824, + "grad_norm": 1.1775577068328857, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 46010 + }, + { + "epoch": 3.3048473967684022, + "grad_norm": 0.9922171831130981, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 46020 + }, + { + "epoch": 3.3055655296229802, + "grad_norm": 0.8246880769729614, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 46030 + }, + { + "epoch": 3.3062836624775582, + "grad_norm": 0.9351653456687927, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 46040 + }, + { + "epoch": 3.3070017953321367, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 46050 + }, + { + "epoch": 3.3077199281867147, + "grad_norm": 0.9753885269165039, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 46060 + }, + { + "epoch": 3.3084380610412927, + "grad_norm": 0.8532425165176392, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 46070 + }, + { + "epoch": 3.3091561938958707, + "grad_norm": 0.9722012877464294, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 46080 + }, + { + "epoch": 3.3098743267504487, + "grad_norm": 0.8950021266937256, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 46090 + }, + { + "epoch": 3.3105924596050267, + "grad_norm": 0.8536333441734314, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46100 + }, + { + "epoch": 3.311310592459605, + "grad_norm": 0.9423946738243103, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46110 + }, + { + "epoch": 3.312028725314183, + "grad_norm": 0.8573169112205505, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 46120 + }, + { + "epoch": 3.312746858168761, + "grad_norm": 1.0122376680374146, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 46130 + }, + { + "epoch": 3.313464991023339, + "grad_norm": 0.7492560744285583, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 46140 + }, + { + "epoch": 3.3141831238779176, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 46150 + }, + { + "epoch": 3.3149012567324956, + "grad_norm": 1.1191970109939575, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 46160 + }, + { + "epoch": 3.3156193895870736, + "grad_norm": 0.9847373962402344, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 46170 + }, + { + "epoch": 3.3163375224416516, + "grad_norm": 0.7315911054611206, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 46180 + }, + { + "epoch": 3.3170556552962296, + "grad_norm": 0.8267890214920044, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 46190 + }, + { + "epoch": 3.317773788150808, + "grad_norm": 0.8898099064826965, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 46200 + }, + { + "epoch": 3.318491921005386, + "grad_norm": 0.8525369167327881, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 46210 + }, + { + "epoch": 3.319210053859964, + "grad_norm": 0.8074760437011719, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 46220 + }, + { + "epoch": 3.319928186714542, + "grad_norm": 0.8473616242408752, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 46230 + }, + { + "epoch": 3.3206463195691205, + "grad_norm": 0.8678314089775085, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 46240 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 0.8718782067298889, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 46250 + }, + { + "epoch": 3.3220825852782765, + "grad_norm": 0.9384858012199402, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 46260 + }, + { + "epoch": 3.3228007181328545, + "grad_norm": 0.9295032620429993, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 46270 + }, + { + "epoch": 3.3235188509874325, + "grad_norm": 0.9472482800483704, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 46280 + }, + { + "epoch": 3.324236983842011, + "grad_norm": 0.7970638275146484, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 46290 + }, + { + "epoch": 3.324955116696589, + "grad_norm": 0.9508723020553589, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 46300 + }, + { + "epoch": 3.325673249551167, + "grad_norm": 0.9153636693954468, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 46310 + }, + { + "epoch": 3.326391382405745, + "grad_norm": 0.7890323400497437, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 46320 + }, + { + "epoch": 3.3271095152603234, + "grad_norm": 0.8711825609207153, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46330 + }, + { + "epoch": 3.3278276481149014, + "grad_norm": 0.9938926696777344, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 46340 + }, + { + "epoch": 3.3285457809694794, + "grad_norm": 0.8497524857521057, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 46350 + }, + { + "epoch": 3.3292639138240574, + "grad_norm": 0.9191650748252869, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 46360 + }, + { + "epoch": 3.3299820466786354, + "grad_norm": 0.8974085450172424, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 46370 + }, + { + "epoch": 3.3307001795332134, + "grad_norm": 0.9928934574127197, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 46380 + }, + { + "epoch": 3.331418312387792, + "grad_norm": 0.9011030197143555, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46390 + }, + { + "epoch": 3.33213644524237, + "grad_norm": 0.898594856262207, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 46400 + }, + { + "epoch": 3.332854578096948, + "grad_norm": 0.7506672143936157, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 46410 + }, + { + "epoch": 3.333572710951526, + "grad_norm": 0.9239172339439392, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 46420 + }, + { + "epoch": 3.3342908438061043, + "grad_norm": 1.0749682188034058, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46430 + }, + { + "epoch": 3.3350089766606823, + "grad_norm": 0.9262617230415344, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 46440 + }, + { + "epoch": 3.3357271095152603, + "grad_norm": 0.8681274056434631, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 46450 + }, + { + "epoch": 3.3364452423698383, + "grad_norm": 0.9558620452880859, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 46460 + }, + { + "epoch": 3.3371633752244163, + "grad_norm": 0.8907097578048706, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 46470 + }, + { + "epoch": 3.3378815080789948, + "grad_norm": 1.0941565036773682, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 46480 + }, + { + "epoch": 3.3385996409335728, + "grad_norm": 0.8971590995788574, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 46490 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 1.0315606594085693, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 46500 + }, + { + "epoch": 3.3400359066427288, + "grad_norm": 0.7717124223709106, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 46510 + }, + { + "epoch": 3.340754039497307, + "grad_norm": 0.8060970902442932, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 46520 + }, + { + "epoch": 3.341472172351885, + "grad_norm": 0.969510018825531, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 46530 + }, + { + "epoch": 3.342190305206463, + "grad_norm": 0.8837248682975769, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 46540 + }, + { + "epoch": 3.342908438061041, + "grad_norm": 0.9561076164245605, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 46550 + }, + { + "epoch": 3.343626570915619, + "grad_norm": 0.8529208898544312, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 46560 + }, + { + "epoch": 3.3443447037701977, + "grad_norm": 1.1300519704818726, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 46570 + }, + { + "epoch": 3.3450628366247757, + "grad_norm": 0.8330956101417542, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46580 + }, + { + "epoch": 3.3457809694793537, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 46590 + }, + { + "epoch": 3.3464991023339317, + "grad_norm": 1.0470821857452393, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 46600 + }, + { + "epoch": 3.34721723518851, + "grad_norm": 0.9933704137802124, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46610 + }, + { + "epoch": 3.347935368043088, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 46620 + }, + { + "epoch": 3.348653500897666, + "grad_norm": 0.9746946692466736, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46630 + }, + { + "epoch": 3.349371633752244, + "grad_norm": 0.8607267141342163, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46640 + }, + { + "epoch": 3.350089766606822, + "grad_norm": 0.800335705280304, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 46650 + }, + { + "epoch": 3.3508078994614, + "grad_norm": 1.0083239078521729, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 46660 + }, + { + "epoch": 3.3515260323159786, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 46670 + }, + { + "epoch": 3.3522441651705566, + "grad_norm": 0.9378824234008789, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46680 + }, + { + "epoch": 3.3529622980251346, + "grad_norm": 0.8490564227104187, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 46690 + }, + { + "epoch": 3.3536804308797126, + "grad_norm": 1.0415582656860352, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 46700 + }, + { + "epoch": 3.354398563734291, + "grad_norm": 0.8514367938041687, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 46710 + }, + { + "epoch": 3.355116696588869, + "grad_norm": 0.7691360712051392, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 46720 + }, + { + "epoch": 3.355834829443447, + "grad_norm": 0.8345438241958618, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 46730 + }, + { + "epoch": 3.356552962298025, + "grad_norm": 1.023492693901062, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 46740 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 46750 + }, + { + "epoch": 3.3579892280071815, + "grad_norm": 0.9029248356819153, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 46760 + }, + { + "epoch": 3.3587073608617595, + "grad_norm": 0.9109513759613037, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 46770 + }, + { + "epoch": 3.3594254937163375, + "grad_norm": 0.7757390141487122, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 46780 + }, + { + "epoch": 3.3601436265709155, + "grad_norm": 0.794035792350769, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46790 + }, + { + "epoch": 3.360861759425494, + "grad_norm": 0.8211429715156555, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 46800 + }, + { + "epoch": 3.361579892280072, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46810 + }, + { + "epoch": 3.36229802513465, + "grad_norm": 0.9392538070678711, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 46820 + }, + { + "epoch": 3.363016157989228, + "grad_norm": 0.8297873139381409, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 46830 + }, + { + "epoch": 3.363734290843806, + "grad_norm": 0.9158190488815308, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 46840 + }, + { + "epoch": 3.3644524236983844, + "grad_norm": 1.1449424028396606, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 46850 + }, + { + "epoch": 3.3651705565529624, + "grad_norm": 0.8718444108963013, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 46860 + }, + { + "epoch": 3.3658886894075404, + "grad_norm": 0.7744014263153076, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 46870 + }, + { + "epoch": 3.3666068222621184, + "grad_norm": 0.8392460942268372, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 46880 + }, + { + "epoch": 3.367324955116697, + "grad_norm": 1.0424989461898804, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 46890 + }, + { + "epoch": 3.368043087971275, + "grad_norm": 1.4696359634399414, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 46900 + }, + { + "epoch": 3.368761220825853, + "grad_norm": 0.9298201203346252, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46910 + }, + { + "epoch": 3.369479353680431, + "grad_norm": 0.8965262770652771, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 46920 + }, + { + "epoch": 3.370197486535009, + "grad_norm": 0.9395381808280945, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 46930 + }, + { + "epoch": 3.370915619389587, + "grad_norm": 0.9069047570228577, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 46940 + }, + { + "epoch": 3.3716337522441653, + "grad_norm": 0.9208605885505676, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46950 + }, + { + "epoch": 3.3723518850987433, + "grad_norm": 0.9493077397346497, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 46960 + }, + { + "epoch": 3.3730700179533213, + "grad_norm": 1.0804208517074585, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 46970 + }, + { + "epoch": 3.3737881508078993, + "grad_norm": 0.9465714693069458, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 46980 + }, + { + "epoch": 3.3745062836624777, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 46990 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 1.0199357271194458, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 47000 + }, + { + "epoch": 3.3759425493716337, + "grad_norm": 0.8999426960945129, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 47010 + }, + { + "epoch": 3.3766606822262117, + "grad_norm": 0.8923690319061279, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 47020 + }, + { + "epoch": 3.3773788150807897, + "grad_norm": 0.7459347248077393, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 47030 + }, + { + "epoch": 3.378096947935368, + "grad_norm": 0.7702858448028564, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 47040 + }, + { + "epoch": 3.378815080789946, + "grad_norm": 0.8296625018119812, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 47050 + }, + { + "epoch": 3.379533213644524, + "grad_norm": 1.2952555418014526, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47060 + }, + { + "epoch": 3.380251346499102, + "grad_norm": 0.7778869271278381, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 47070 + }, + { + "epoch": 3.3809694793536806, + "grad_norm": 0.9151549339294434, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 47080 + }, + { + "epoch": 3.3816876122082586, + "grad_norm": 0.7883925437927246, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 47090 + }, + { + "epoch": 3.3824057450628366, + "grad_norm": 0.9602295756340027, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 47100 + }, + { + "epoch": 3.3831238779174146, + "grad_norm": 0.7953121066093445, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47110 + }, + { + "epoch": 3.3838420107719926, + "grad_norm": 1.110148549079895, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 47120 + }, + { + "epoch": 3.384560143626571, + "grad_norm": 0.9359608888626099, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 47130 + }, + { + "epoch": 3.385278276481149, + "grad_norm": 0.7877762317657471, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 47140 + }, + { + "epoch": 3.385996409335727, + "grad_norm": 0.8586933016777039, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47150 + }, + { + "epoch": 3.386714542190305, + "grad_norm": 0.8920878767967224, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 47160 + }, + { + "epoch": 3.3874326750448835, + "grad_norm": 0.9692603349685669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 47170 + }, + { + "epoch": 3.3881508078994615, + "grad_norm": 0.9038610458374023, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 47180 + }, + { + "epoch": 3.3888689407540395, + "grad_norm": 1.6299188137054443, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 47190 + }, + { + "epoch": 3.3895870736086176, + "grad_norm": 0.9704291820526123, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 47200 + }, + { + "epoch": 3.3903052064631956, + "grad_norm": 0.9503401517868042, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 47210 + }, + { + "epoch": 3.3910233393177736, + "grad_norm": 1.0051378011703491, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 47220 + }, + { + "epoch": 3.391741472172352, + "grad_norm": 0.7336357235908508, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 47230 + }, + { + "epoch": 3.39245960502693, + "grad_norm": 0.9847398996353149, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47240 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 0.8100917339324951, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 47250 + }, + { + "epoch": 3.393895870736086, + "grad_norm": 0.9752838611602783, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 47260 + }, + { + "epoch": 3.3946140035906645, + "grad_norm": 0.9400623440742493, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 47270 + }, + { + "epoch": 3.3953321364452425, + "grad_norm": 0.7310057878494263, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 47280 + }, + { + "epoch": 3.3960502692998205, + "grad_norm": 0.8898789286613464, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 47290 + }, + { + "epoch": 3.3967684021543985, + "grad_norm": 1.0157585144042969, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 47300 + }, + { + "epoch": 3.3974865350089765, + "grad_norm": 0.9108527898788452, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 47310 + }, + { + "epoch": 3.398204667863555, + "grad_norm": 0.9796249270439148, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 47320 + }, + { + "epoch": 3.398922800718133, + "grad_norm": 0.8176435232162476, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 47330 + }, + { + "epoch": 3.399640933572711, + "grad_norm": 0.9981188178062439, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 47340 + }, + { + "epoch": 3.400359066427289, + "grad_norm": 0.9774404764175415, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47350 + }, + { + "epoch": 3.4010771992818674, + "grad_norm": 0.8624991774559021, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 47360 + }, + { + "epoch": 3.4017953321364454, + "grad_norm": 0.9191665053367615, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 47370 + }, + { + "epoch": 3.4025134649910234, + "grad_norm": 0.7971290946006775, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 47380 + }, + { + "epoch": 3.4032315978456014, + "grad_norm": 0.8336732983589172, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 47390 + }, + { + "epoch": 3.4039497307001794, + "grad_norm": 0.7730334401130676, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 47400 + }, + { + "epoch": 3.404667863554758, + "grad_norm": 0.8559145927429199, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 47410 + }, + { + "epoch": 3.405385996409336, + "grad_norm": 1.0261447429656982, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 47420 + }, + { + "epoch": 3.406104129263914, + "grad_norm": 0.9931781888008118, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 47430 + }, + { + "epoch": 3.406822262118492, + "grad_norm": 0.8971807360649109, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 47440 + }, + { + "epoch": 3.4075403949730703, + "grad_norm": 0.8886999487876892, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 47450 + }, + { + "epoch": 3.4082585278276483, + "grad_norm": 0.9551735520362854, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 47460 + }, + { + "epoch": 3.4089766606822263, + "grad_norm": 0.9066859483718872, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 47470 + }, + { + "epoch": 3.4096947935368043, + "grad_norm": 0.9192125201225281, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 47480 + }, + { + "epoch": 3.4104129263913823, + "grad_norm": 0.9332839250564575, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 47490 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 0.745563805103302, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47500 + }, + { + "epoch": 3.4118491921005387, + "grad_norm": 0.6843905448913574, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 47510 + }, + { + "epoch": 3.4125673249551167, + "grad_norm": 0.8063111305236816, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 47520 + }, + { + "epoch": 3.4132854578096947, + "grad_norm": 0.9666593670845032, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 47530 + }, + { + "epoch": 3.4140035906642727, + "grad_norm": 0.8112747073173523, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47540 + }, + { + "epoch": 3.414721723518851, + "grad_norm": 0.820807933807373, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 47550 + }, + { + "epoch": 3.415439856373429, + "grad_norm": 0.8476285338401794, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 47560 + }, + { + "epoch": 3.416157989228007, + "grad_norm": 1.0232552289962769, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47570 + }, + { + "epoch": 3.416876122082585, + "grad_norm": 0.8749372363090515, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 47580 + }, + { + "epoch": 3.417594254937163, + "grad_norm": 0.8117937445640564, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 47590 + }, + { + "epoch": 3.4183123877917416, + "grad_norm": 0.9010460376739502, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 47600 + }, + { + "epoch": 3.4190305206463196, + "grad_norm": 0.8955527544021606, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 47610 + }, + { + "epoch": 3.4197486535008976, + "grad_norm": 0.884186327457428, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 47620 + }, + { + "epoch": 3.4204667863554756, + "grad_norm": 0.8995241522789001, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 47630 + }, + { + "epoch": 3.421184919210054, + "grad_norm": 1.0627013444900513, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47640 + }, + { + "epoch": 3.421903052064632, + "grad_norm": 0.8619979619979858, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 47650 + }, + { + "epoch": 3.42262118491921, + "grad_norm": 0.9682498574256897, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 47660 + }, + { + "epoch": 3.423339317773788, + "grad_norm": 0.9614400863647461, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 47670 + }, + { + "epoch": 3.424057450628366, + "grad_norm": 0.7986962795257568, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 47680 + }, + { + "epoch": 3.4247755834829445, + "grad_norm": 0.8255957961082458, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 47690 + }, + { + "epoch": 3.4254937163375225, + "grad_norm": 0.9139757752418518, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 47700 + }, + { + "epoch": 3.4262118491921005, + "grad_norm": 0.8086292743682861, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 47710 + }, + { + "epoch": 3.4269299820466785, + "grad_norm": 0.8852273225784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 47720 + }, + { + "epoch": 3.427648114901257, + "grad_norm": 0.7568784356117249, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 47730 + }, + { + "epoch": 3.428366247755835, + "grad_norm": 0.8933039903640747, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 47740 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 0.8101669549942017, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 47750 + }, + { + "epoch": 3.429802513464991, + "grad_norm": 0.7021054625511169, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 47760 + }, + { + "epoch": 3.430520646319569, + "grad_norm": 0.8282538652420044, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 47770 + }, + { + "epoch": 3.431238779174147, + "grad_norm": 0.8168348670005798, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 47780 + }, + { + "epoch": 3.4319569120287254, + "grad_norm": 0.9504001140594482, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 47790 + }, + { + "epoch": 3.4326750448833034, + "grad_norm": 0.7500190734863281, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47800 + }, + { + "epoch": 3.4333931777378814, + "grad_norm": 0.8645710945129395, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 47810 + }, + { + "epoch": 3.4341113105924594, + "grad_norm": 0.8088704943656921, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 47820 + }, + { + "epoch": 3.434829443447038, + "grad_norm": 0.9981673955917358, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 47830 + }, + { + "epoch": 3.435547576301616, + "grad_norm": 0.9363315105438232, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 47840 + }, + { + "epoch": 3.436265709156194, + "grad_norm": 0.8471030592918396, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 47850 + }, + { + "epoch": 3.436983842010772, + "grad_norm": 0.9447668790817261, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 47860 + }, + { + "epoch": 3.43770197486535, + "grad_norm": 0.9494127631187439, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 47870 + }, + { + "epoch": 3.4384201077199283, + "grad_norm": 0.8340432643890381, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47880 + }, + { + "epoch": 3.4391382405745063, + "grad_norm": 0.8466387987136841, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 47890 + }, + { + "epoch": 3.4398563734290843, + "grad_norm": 0.9498962759971619, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47900 + }, + { + "epoch": 3.4405745062836623, + "grad_norm": 0.8490501046180725, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 47910 + }, + { + "epoch": 3.441292639138241, + "grad_norm": 0.9506490230560303, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 47920 + }, + { + "epoch": 3.442010771992819, + "grad_norm": 0.7944257855415344, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 47930 + }, + { + "epoch": 3.442728904847397, + "grad_norm": 0.9725518226623535, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 47940 + }, + { + "epoch": 3.443447037701975, + "grad_norm": 0.7823024392127991, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47950 + }, + { + "epoch": 3.444165170556553, + "grad_norm": 0.810565173625946, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 47960 + }, + { + "epoch": 3.4448833034111312, + "grad_norm": 0.9809024333953857, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 47970 + }, + { + "epoch": 3.4456014362657092, + "grad_norm": 0.8818578720092773, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 47980 + }, + { + "epoch": 3.4463195691202873, + "grad_norm": 0.9843092560768127, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 47990 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 0.916313886642456, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 48000 + }, + { + "epoch": 3.4477558348294433, + "grad_norm": 0.908442497253418, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 48010 + }, + { + "epoch": 3.4484739676840217, + "grad_norm": 0.9880178570747375, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 48020 + }, + { + "epoch": 3.4491921005385997, + "grad_norm": 0.9276854991912842, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 48030 + }, + { + "epoch": 3.4499102333931777, + "grad_norm": 1.0879448652267456, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 48040 + }, + { + "epoch": 3.4506283662477557, + "grad_norm": 0.7430389523506165, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 48050 + }, + { + "epoch": 3.4513464991023337, + "grad_norm": 1.0880072116851807, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 48060 + }, + { + "epoch": 3.452064631956912, + "grad_norm": 1.0424141883850098, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 48070 + }, + { + "epoch": 3.45278276481149, + "grad_norm": 0.926330029964447, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 48080 + }, + { + "epoch": 3.453500897666068, + "grad_norm": 0.8911219239234924, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 48090 + }, + { + "epoch": 3.454219030520646, + "grad_norm": 0.8727201223373413, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 48100 + }, + { + "epoch": 3.4549371633752246, + "grad_norm": 0.8573940396308899, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48110 + }, + { + "epoch": 3.4556552962298026, + "grad_norm": 1.0427064895629883, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 48120 + }, + { + "epoch": 3.4563734290843806, + "grad_norm": 0.8688231706619263, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 48130 + }, + { + "epoch": 3.4570915619389586, + "grad_norm": 0.8856009244918823, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 48140 + }, + { + "epoch": 3.4578096947935366, + "grad_norm": 0.9535353183746338, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 48150 + }, + { + "epoch": 3.458527827648115, + "grad_norm": 0.9466010928153992, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 48160 + }, + { + "epoch": 3.459245960502693, + "grad_norm": 0.9783535599708557, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 48170 + }, + { + "epoch": 3.459964093357271, + "grad_norm": 0.8010456562042236, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 48180 + }, + { + "epoch": 3.460682226211849, + "grad_norm": 0.8928955793380737, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 48190 + }, + { + "epoch": 3.4614003590664275, + "grad_norm": 0.7565838694572449, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 48200 + }, + { + "epoch": 3.4621184919210055, + "grad_norm": 1.0044180154800415, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 48210 + }, + { + "epoch": 3.4628366247755835, + "grad_norm": 0.8161038160324097, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 48220 + }, + { + "epoch": 3.4635547576301615, + "grad_norm": 1.1000211238861084, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 48230 + }, + { + "epoch": 3.4642728904847395, + "grad_norm": 0.7942240238189697, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 48240 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 0.7546432018280029, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 48250 + }, + { + "epoch": 3.465709156193896, + "grad_norm": 0.7705255150794983, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 48260 + }, + { + "epoch": 3.466427289048474, + "grad_norm": 0.7958067059516907, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 48270 + }, + { + "epoch": 3.467145421903052, + "grad_norm": 0.9199120402336121, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48280 + }, + { + "epoch": 3.46786355475763, + "grad_norm": 1.118672251701355, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 48290 + }, + { + "epoch": 3.4685816876122084, + "grad_norm": 0.9161015748977661, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 48300 + }, + { + "epoch": 3.4692998204667864, + "grad_norm": 1.1086218357086182, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 48310 + }, + { + "epoch": 3.4700179533213644, + "grad_norm": 1.0123368501663208, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 48320 + }, + { + "epoch": 3.4707360861759424, + "grad_norm": 0.7380602359771729, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 48330 + }, + { + "epoch": 3.4714542190305204, + "grad_norm": 0.8967105150222778, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 48340 + }, + { + "epoch": 3.472172351885099, + "grad_norm": 1.0134044885635376, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48350 + }, + { + "epoch": 3.472890484739677, + "grad_norm": 1.080815076828003, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 48360 + }, + { + "epoch": 3.473608617594255, + "grad_norm": 1.151721477508545, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 48370 + }, + { + "epoch": 3.474326750448833, + "grad_norm": 0.9436505436897278, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 48380 + }, + { + "epoch": 3.4750448833034113, + "grad_norm": 0.9154609441757202, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 48390 + }, + { + "epoch": 3.4757630161579893, + "grad_norm": 0.8943037986755371, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 48400 + }, + { + "epoch": 3.4764811490125673, + "grad_norm": 0.936988115310669, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 48410 + }, + { + "epoch": 3.4771992818671453, + "grad_norm": 0.826960027217865, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 48420 + }, + { + "epoch": 3.4779174147217233, + "grad_norm": 1.0487587451934814, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 48430 + }, + { + "epoch": 3.478635547576302, + "grad_norm": 0.729163646697998, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 48440 + }, + { + "epoch": 3.47935368043088, + "grad_norm": 0.8156948089599609, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 48450 + }, + { + "epoch": 3.480071813285458, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 48460 + }, + { + "epoch": 3.480789946140036, + "grad_norm": 0.9632692337036133, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 48470 + }, + { + "epoch": 3.4815080789946142, + "grad_norm": 1.0950212478637695, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 48480 + }, + { + "epoch": 3.4822262118491922, + "grad_norm": 0.8574318885803223, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 48490 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 0.8552606701850891, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 48500 + }, + { + "epoch": 3.4836624775583482, + "grad_norm": 0.9698445200920105, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 48510 + }, + { + "epoch": 3.4843806104129262, + "grad_norm": 0.9427815675735474, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 48520 + }, + { + "epoch": 3.4850987432675042, + "grad_norm": 0.7902070879936218, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 48530 + }, + { + "epoch": 3.4858168761220827, + "grad_norm": 1.0300066471099854, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 48540 + }, + { + "epoch": 3.4865350089766607, + "grad_norm": 1.1688778400421143, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 48550 + }, + { + "epoch": 3.4872531418312387, + "grad_norm": 1.0012071132659912, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 48560 + }, + { + "epoch": 3.4879712746858167, + "grad_norm": 1.112094759941101, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 48570 + }, + { + "epoch": 3.488689407540395, + "grad_norm": 0.8547284603118896, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 48580 + }, + { + "epoch": 3.489407540394973, + "grad_norm": 0.8827278017997742, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 48590 + }, + { + "epoch": 3.490125673249551, + "grad_norm": 0.9255490303039551, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 48600 + }, + { + "epoch": 3.490843806104129, + "grad_norm": 0.8000030517578125, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 48610 + }, + { + "epoch": 3.491561938958707, + "grad_norm": 0.9327391386032104, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 48620 + }, + { + "epoch": 3.4922800718132856, + "grad_norm": 0.9004138708114624, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 48630 + }, + { + "epoch": 3.4929982046678636, + "grad_norm": 0.9886971116065979, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 48640 + }, + { + "epoch": 3.4937163375224416, + "grad_norm": 0.9890487194061279, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 48650 + }, + { + "epoch": 3.4944344703770196, + "grad_norm": 0.7024438977241516, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 48660 + }, + { + "epoch": 3.495152603231598, + "grad_norm": 0.8397303223609924, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 48670 + }, + { + "epoch": 3.495870736086176, + "grad_norm": 0.9120950698852539, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 48680 + }, + { + "epoch": 3.496588868940754, + "grad_norm": 1.057299017906189, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48690 + }, + { + "epoch": 3.497307001795332, + "grad_norm": 0.821325957775116, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 48700 + }, + { + "epoch": 3.49802513464991, + "grad_norm": 1.0029970407485962, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 48710 + }, + { + "epoch": 3.4987432675044885, + "grad_norm": 0.9483712911605835, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 48720 + }, + { + "epoch": 3.4994614003590665, + "grad_norm": 0.9637855291366577, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 48730 + }, + { + "epoch": 3.5001795332136445, + "grad_norm": 0.6848894357681274, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 48740 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 0.7848573327064514, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 48750 + }, + { + "epoch": 3.501615798922801, + "grad_norm": 1.0341308116912842, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 48760 + }, + { + "epoch": 3.502333931777379, + "grad_norm": 0.8858218193054199, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 48770 + }, + { + "epoch": 3.503052064631957, + "grad_norm": 0.8366939425468445, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 48780 + }, + { + "epoch": 3.503770197486535, + "grad_norm": 0.7926092147827148, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 48790 + }, + { + "epoch": 3.504488330341113, + "grad_norm": 0.8503843545913696, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 48800 + }, + { + "epoch": 3.505206463195691, + "grad_norm": 0.8867869973182678, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 48810 + }, + { + "epoch": 3.5059245960502694, + "grad_norm": 1.0336930751800537, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 48820 + }, + { + "epoch": 3.5066427289048474, + "grad_norm": 0.8564051985740662, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 48830 + }, + { + "epoch": 3.5073608617594254, + "grad_norm": 0.9202605485916138, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 48840 + }, + { + "epoch": 3.508078994614004, + "grad_norm": 0.8838639855384827, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 48850 + }, + { + "epoch": 3.508797127468582, + "grad_norm": 0.8975196480751038, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48860 + }, + { + "epoch": 3.50951526032316, + "grad_norm": 0.8842370510101318, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 48870 + }, + { + "epoch": 3.510233393177738, + "grad_norm": 0.9195886254310608, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 48880 + }, + { + "epoch": 3.510951526032316, + "grad_norm": 0.986130952835083, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 48890 + }, + { + "epoch": 3.511669658886894, + "grad_norm": 0.8119593858718872, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 48900 + }, + { + "epoch": 3.5123877917414723, + "grad_norm": 0.9027136564254761, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 48910 + }, + { + "epoch": 3.5131059245960503, + "grad_norm": 0.8560537099838257, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 48920 + }, + { + "epoch": 3.5138240574506283, + "grad_norm": 0.7073559165000916, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 48930 + }, + { + "epoch": 3.5145421903052063, + "grad_norm": 0.8753304481506348, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 48940 + }, + { + "epoch": 3.5152603231597848, + "grad_norm": 0.9151145815849304, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 48950 + }, + { + "epoch": 3.5159784560143628, + "grad_norm": 0.7794315814971924, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 48960 + }, + { + "epoch": 3.5166965888689408, + "grad_norm": 0.9226023554801941, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 48970 + }, + { + "epoch": 3.5174147217235188, + "grad_norm": 0.8442051410675049, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48980 + }, + { + "epoch": 3.5181328545780968, + "grad_norm": 0.9769423007965088, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 48990 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 0.740347146987915, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 49000 + }, + { + "epoch": 3.519569120287253, + "grad_norm": 0.8963457345962524, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 49010 + }, + { + "epoch": 3.520287253141831, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 49020 + }, + { + "epoch": 3.521005385996409, + "grad_norm": 1.0486022233963013, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 49030 + }, + { + "epoch": 3.5217235188509877, + "grad_norm": 0.95393967628479, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 49040 + }, + { + "epoch": 3.5224416517055657, + "grad_norm": 0.8261157274246216, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49050 + }, + { + "epoch": 3.5231597845601437, + "grad_norm": 0.9321704506874084, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 49060 + }, + { + "epoch": 3.5238779174147217, + "grad_norm": 1.2596088647842407, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 49070 + }, + { + "epoch": 3.5245960502692997, + "grad_norm": 0.8584637641906738, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 49080 + }, + { + "epoch": 3.5253141831238777, + "grad_norm": 0.850520670413971, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 49090 + }, + { + "epoch": 3.526032315978456, + "grad_norm": 0.8915920257568359, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 49100 + }, + { + "epoch": 3.526750448833034, + "grad_norm": 0.9070239067077637, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 49110 + }, + { + "epoch": 3.527468581687612, + "grad_norm": 0.699878990650177, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 49120 + }, + { + "epoch": 3.5281867145421906, + "grad_norm": 0.9003779888153076, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 49130 + }, + { + "epoch": 3.5289048473967686, + "grad_norm": 0.7886711955070496, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 49140 + }, + { + "epoch": 3.5296229802513466, + "grad_norm": 0.7368922233581543, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 49150 + }, + { + "epoch": 3.5303411131059246, + "grad_norm": 0.8585197329521179, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 49160 + }, + { + "epoch": 3.5310592459605026, + "grad_norm": 1.0205435752868652, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 49170 + }, + { + "epoch": 3.5317773788150806, + "grad_norm": 0.8756650686264038, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 49180 + }, + { + "epoch": 3.532495511669659, + "grad_norm": 1.0278643369674683, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 49190 + }, + { + "epoch": 3.533213644524237, + "grad_norm": 0.8641911745071411, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 49200 + }, + { + "epoch": 3.533931777378815, + "grad_norm": 0.8730159401893616, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 49210 + }, + { + "epoch": 3.534649910233393, + "grad_norm": 0.918637216091156, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 49220 + }, + { + "epoch": 3.5353680430879715, + "grad_norm": 1.0467222929000854, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 49230 + }, + { + "epoch": 3.5360861759425495, + "grad_norm": 1.005009412765503, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 49240 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 0.9775063395500183, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 49250 + }, + { + "epoch": 3.5375224416517055, + "grad_norm": 0.8198322057723999, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 49260 + }, + { + "epoch": 3.5382405745062835, + "grad_norm": 0.8184829354286194, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 49270 + }, + { + "epoch": 3.5389587073608615, + "grad_norm": 0.9520270824432373, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 49280 + }, + { + "epoch": 3.53967684021544, + "grad_norm": 0.7816803455352783, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 49290 + }, + { + "epoch": 3.540394973070018, + "grad_norm": 0.6915702819824219, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 49300 + }, + { + "epoch": 3.541113105924596, + "grad_norm": 0.8282375931739807, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 49310 + }, + { + "epoch": 3.5418312387791744, + "grad_norm": 1.0797513723373413, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 49320 + }, + { + "epoch": 3.5425493716337524, + "grad_norm": 0.868671715259552, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 49330 + }, + { + "epoch": 3.5432675044883304, + "grad_norm": 0.8534455895423889, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 49340 + }, + { + "epoch": 3.5439856373429084, + "grad_norm": 0.816411554813385, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 49350 + }, + { + "epoch": 3.5447037701974864, + "grad_norm": 0.7813423275947571, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 49360 + }, + { + "epoch": 3.5454219030520644, + "grad_norm": 0.8002013564109802, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 49370 + }, + { + "epoch": 3.546140035906643, + "grad_norm": 0.9740113615989685, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 49380 + }, + { + "epoch": 3.546858168761221, + "grad_norm": 0.9046127200126648, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 49390 + }, + { + "epoch": 3.547576301615799, + "grad_norm": 0.8635150194168091, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 49400 + }, + { + "epoch": 3.5482944344703773, + "grad_norm": 0.9488558769226074, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 49410 + }, + { + "epoch": 3.5490125673249553, + "grad_norm": 0.9637090563774109, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 49420 + }, + { + "epoch": 3.5497307001795333, + "grad_norm": 1.042245626449585, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 49430 + }, + { + "epoch": 3.5504488330341113, + "grad_norm": 0.9076175689697266, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 49440 + }, + { + "epoch": 3.5511669658886893, + "grad_norm": 0.8480596542358398, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 49450 + }, + { + "epoch": 3.5518850987432673, + "grad_norm": 0.8483007550239563, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 49460 + }, + { + "epoch": 3.5526032315978457, + "grad_norm": 0.7855815887451172, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 49470 + }, + { + "epoch": 3.5533213644524237, + "grad_norm": 0.8435823917388916, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 49480 + }, + { + "epoch": 3.5540394973070017, + "grad_norm": 0.8613026142120361, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 49490 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 0.9654812812805176, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 49500 + }, + { + "epoch": 3.555475763016158, + "grad_norm": 0.8888838887214661, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 49510 + }, + { + "epoch": 3.556193895870736, + "grad_norm": 0.7718146443367004, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49520 + }, + { + "epoch": 3.556912028725314, + "grad_norm": 0.9487382173538208, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 49530 + }, + { + "epoch": 3.557630161579892, + "grad_norm": 0.9256559610366821, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 49540 + }, + { + "epoch": 3.55834829443447, + "grad_norm": 0.8879945874214172, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 49550 + }, + { + "epoch": 3.559066427289048, + "grad_norm": 0.8498744368553162, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 49560 + }, + { + "epoch": 3.5597845601436267, + "grad_norm": 0.9550948143005371, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 49570 + }, + { + "epoch": 3.5605026929982047, + "grad_norm": 0.8386164903640747, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 49580 + }, + { + "epoch": 3.5612208258527827, + "grad_norm": 0.925573468208313, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 49590 + }, + { + "epoch": 3.561938958707361, + "grad_norm": 0.8867112398147583, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 49600 + }, + { + "epoch": 3.562657091561939, + "grad_norm": 0.7638537883758545, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 49610 + }, + { + "epoch": 3.563375224416517, + "grad_norm": 0.9491845965385437, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 49620 + }, + { + "epoch": 3.564093357271095, + "grad_norm": 0.8384189605712891, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 49630 + }, + { + "epoch": 3.564811490125673, + "grad_norm": 0.8850575089454651, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 49640 + }, + { + "epoch": 3.565529622980251, + "grad_norm": 1.020916223526001, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 49650 + }, + { + "epoch": 3.5662477558348296, + "grad_norm": 0.9298280477523804, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 49660 + }, + { + "epoch": 3.5669658886894076, + "grad_norm": 0.9795742034912109, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 49670 + }, + { + "epoch": 3.5676840215439856, + "grad_norm": 0.9401193261146545, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 49680 + }, + { + "epoch": 3.568402154398564, + "grad_norm": 1.0383585691452026, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49690 + }, + { + "epoch": 3.569120287253142, + "grad_norm": 0.8370866179466248, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 49700 + }, + { + "epoch": 3.56983842010772, + "grad_norm": 0.8207486271858215, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 49710 + }, + { + "epoch": 3.570556552962298, + "grad_norm": 0.8551223278045654, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49720 + }, + { + "epoch": 3.571274685816876, + "grad_norm": 0.8041176199913025, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 49730 + }, + { + "epoch": 3.571992818671454, + "grad_norm": 0.9862527847290039, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 49740 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 0.7557165622711182, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 49750 + }, + { + "epoch": 3.5734290843806105, + "grad_norm": 1.0908563137054443, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 49760 + }, + { + "epoch": 3.5741472172351885, + "grad_norm": 0.7245369553565979, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 49770 + }, + { + "epoch": 3.5748653500897665, + "grad_norm": 0.7851184010505676, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 49780 + }, + { + "epoch": 3.575583482944345, + "grad_norm": 0.9443599581718445, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 49790 + }, + { + "epoch": 3.576301615798923, + "grad_norm": 1.021196961402893, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 49800 + }, + { + "epoch": 3.577019748653501, + "grad_norm": 0.9099196195602417, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 49810 + }, + { + "epoch": 3.577737881508079, + "grad_norm": 0.9397716522216797, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 49820 + }, + { + "epoch": 3.578456014362657, + "grad_norm": 0.9214922785758972, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 49830 + }, + { + "epoch": 3.579174147217235, + "grad_norm": 1.0053879022598267, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 49840 + }, + { + "epoch": 3.5798922800718134, + "grad_norm": 0.9415460228919983, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 49850 + }, + { + "epoch": 3.5806104129263914, + "grad_norm": 1.0807833671569824, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 49860 + }, + { + "epoch": 3.5813285457809694, + "grad_norm": 1.0070871114730835, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 49870 + }, + { + "epoch": 3.582046678635548, + "grad_norm": 0.9707024693489075, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 49880 + }, + { + "epoch": 3.582764811490126, + "grad_norm": 0.9979593753814697, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 49890 + }, + { + "epoch": 3.583482944344704, + "grad_norm": 0.7238648533821106, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 49900 + }, + { + "epoch": 3.584201077199282, + "grad_norm": 0.8168631792068481, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 49910 + }, + { + "epoch": 3.58491921005386, + "grad_norm": 0.8156409859657288, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 49920 + }, + { + "epoch": 3.585637342908438, + "grad_norm": 0.9256414175033569, + "learning_rate": 0.0002, + "loss": 0.6248, + "step": 49930 + }, + { + "epoch": 3.5863554757630163, + "grad_norm": 1.0090070962905884, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 49940 + }, + { + "epoch": 3.5870736086175943, + "grad_norm": 0.8257701992988586, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 49950 + }, + { + "epoch": 3.5877917414721723, + "grad_norm": 0.9189013242721558, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 49960 + }, + { + "epoch": 3.5885098743267507, + "grad_norm": 0.8497788310050964, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 49970 + }, + { + "epoch": 3.5892280071813287, + "grad_norm": 0.9596505761146545, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 49980 + }, + { + "epoch": 3.5899461400359067, + "grad_norm": 0.8773331642150879, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 49990 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 0.8952302932739258, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50000 + }, + { + "epoch": 3.5913824057450627, + "grad_norm": 0.7713809609413147, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 50010 + }, + { + "epoch": 3.5921005385996407, + "grad_norm": 1.0151346921920776, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 50020 + }, + { + "epoch": 3.592818671454219, + "grad_norm": 0.8793733716011047, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 50030 + }, + { + "epoch": 3.593536804308797, + "grad_norm": 0.8881325721740723, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 50040 + }, + { + "epoch": 3.594254937163375, + "grad_norm": 0.9346749782562256, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 50050 + }, + { + "epoch": 3.594973070017953, + "grad_norm": 0.8705052137374878, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 50060 + }, + { + "epoch": 3.5956912028725316, + "grad_norm": 1.039197564125061, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 50070 + }, + { + "epoch": 3.5964093357271096, + "grad_norm": 0.7053273320198059, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 50080 + }, + { + "epoch": 3.5971274685816876, + "grad_norm": 0.8268665671348572, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 50090 + }, + { + "epoch": 3.5978456014362656, + "grad_norm": 0.8921764492988586, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 50100 + }, + { + "epoch": 3.5985637342908436, + "grad_norm": 0.9756084680557251, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 50110 + }, + { + "epoch": 3.5992818671454216, + "grad_norm": 0.9275530576705933, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 50120 + }, + { + "epoch": 3.6, + "grad_norm": 0.9030009508132935, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 50130 + }, + { + "epoch": 3.600718132854578, + "grad_norm": 0.7805638909339905, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 50140 + }, + { + "epoch": 3.601436265709156, + "grad_norm": 0.7627325057983398, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 50150 + }, + { + "epoch": 3.6021543985637345, + "grad_norm": 0.7809714078903198, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 50160 + }, + { + "epoch": 3.6028725314183125, + "grad_norm": 0.7910378575325012, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 50170 + }, + { + "epoch": 3.6035906642728905, + "grad_norm": 1.004438042640686, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 50180 + }, + { + "epoch": 3.6043087971274685, + "grad_norm": 0.825969934463501, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 50190 + }, + { + "epoch": 3.6050269299820465, + "grad_norm": 0.8866565227508545, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 50200 + }, + { + "epoch": 3.6057450628366245, + "grad_norm": 0.8920543193817139, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 50210 + }, + { + "epoch": 3.606463195691203, + "grad_norm": 1.106584906578064, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 50220 + }, + { + "epoch": 3.607181328545781, + "grad_norm": 0.916607677936554, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 50230 + }, + { + "epoch": 3.607899461400359, + "grad_norm": 0.8014767169952393, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 50240 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 0.9556822776794434, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 50250 + }, + { + "epoch": 3.6093357271095154, + "grad_norm": 0.9630016684532166, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50260 + }, + { + "epoch": 3.6100538599640934, + "grad_norm": 0.9862125515937805, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 50270 + }, + { + "epoch": 3.6107719928186714, + "grad_norm": 1.0043333768844604, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 50280 + }, + { + "epoch": 3.6114901256732495, + "grad_norm": 0.9255319833755493, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 50290 + }, + { + "epoch": 3.6122082585278275, + "grad_norm": 1.012023687362671, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 50300 + }, + { + "epoch": 3.612926391382406, + "grad_norm": 1.0701122283935547, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50310 + }, + { + "epoch": 3.613644524236984, + "grad_norm": 0.8270810842514038, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 50320 + }, + { + "epoch": 3.614362657091562, + "grad_norm": 0.8881328105926514, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 50330 + }, + { + "epoch": 3.61508078994614, + "grad_norm": 0.9536844491958618, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 50340 + }, + { + "epoch": 3.6157989228007184, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 50350 + }, + { + "epoch": 3.6165170556552964, + "grad_norm": 0.834591805934906, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50360 + }, + { + "epoch": 3.6172351885098744, + "grad_norm": 0.903752863407135, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 50370 + }, + { + "epoch": 3.6179533213644524, + "grad_norm": 0.9148632884025574, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 50380 + }, + { + "epoch": 3.6186714542190304, + "grad_norm": 0.9280176162719727, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 50390 + }, + { + "epoch": 3.6193895870736084, + "grad_norm": 0.9524136781692505, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 50400 + }, + { + "epoch": 3.620107719928187, + "grad_norm": 1.1751197576522827, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 50410 + }, + { + "epoch": 3.620825852782765, + "grad_norm": 1.032279133796692, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 50420 + }, + { + "epoch": 3.621543985637343, + "grad_norm": 0.790741503238678, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 50430 + }, + { + "epoch": 3.6222621184919213, + "grad_norm": 0.9584221243858337, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 50440 + }, + { + "epoch": 3.6229802513464993, + "grad_norm": 0.7792508006095886, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 50450 + }, + { + "epoch": 3.6236983842010773, + "grad_norm": 0.8273448944091797, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 50460 + }, + { + "epoch": 3.6244165170556553, + "grad_norm": 0.8001132607460022, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 50470 + }, + { + "epoch": 3.6251346499102333, + "grad_norm": 1.077109694480896, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 50480 + }, + { + "epoch": 3.6258527827648113, + "grad_norm": 1.111274003982544, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 50490 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 50500 + }, + { + "epoch": 3.6272890484739677, + "grad_norm": 0.9217049479484558, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 50510 + }, + { + "epoch": 3.6280071813285457, + "grad_norm": 0.9362251162528992, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 50520 + }, + { + "epoch": 3.6287253141831237, + "grad_norm": 0.9435479044914246, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 50530 + }, + { + "epoch": 3.629443447037702, + "grad_norm": 0.7748915553092957, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 50540 + }, + { + "epoch": 3.63016157989228, + "grad_norm": 0.8238945007324219, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 50550 + }, + { + "epoch": 3.630879712746858, + "grad_norm": 0.8421505093574524, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 50560 + }, + { + "epoch": 3.631597845601436, + "grad_norm": 1.0272293090820312, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 50570 + }, + { + "epoch": 3.632315978456014, + "grad_norm": 0.7643818259239197, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 50580 + }, + { + "epoch": 3.6330341113105926, + "grad_norm": 0.9756225347518921, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 50590 + }, + { + "epoch": 3.6337522441651706, + "grad_norm": 0.9311570525169373, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 50600 + }, + { + "epoch": 3.6344703770197486, + "grad_norm": 0.8829827904701233, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 50610 + }, + { + "epoch": 3.6351885098743266, + "grad_norm": 0.9473454356193542, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 50620 + }, + { + "epoch": 3.635906642728905, + "grad_norm": 1.1023668050765991, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 50630 + }, + { + "epoch": 3.636624775583483, + "grad_norm": 0.8490299582481384, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 50640 + }, + { + "epoch": 3.637342908438061, + "grad_norm": 1.1129392385482788, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 50650 + }, + { + "epoch": 3.638061041292639, + "grad_norm": 1.0334501266479492, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 50660 + }, + { + "epoch": 3.638779174147217, + "grad_norm": 0.8397296667098999, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 50670 + }, + { + "epoch": 3.639497307001795, + "grad_norm": 0.7984256744384766, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 50680 + }, + { + "epoch": 3.6402154398563735, + "grad_norm": 1.1182054281234741, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 50690 + }, + { + "epoch": 3.6409335727109515, + "grad_norm": 0.8743279576301575, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 50700 + }, + { + "epoch": 3.6416517055655295, + "grad_norm": 0.9101628661155701, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 50710 + }, + { + "epoch": 3.642369838420108, + "grad_norm": 0.8866934180259705, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 50720 + }, + { + "epoch": 3.643087971274686, + "grad_norm": 0.863945484161377, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 50730 + }, + { + "epoch": 3.643806104129264, + "grad_norm": 1.0845744609832764, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 50740 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 0.8610911965370178, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 50750 + }, + { + "epoch": 3.64524236983842, + "grad_norm": 0.8502625226974487, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 50760 + }, + { + "epoch": 3.645960502692998, + "grad_norm": 0.847372829914093, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 50770 + }, + { + "epoch": 3.6466786355475764, + "grad_norm": 0.8649292588233948, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 50780 + }, + { + "epoch": 3.6473967684021544, + "grad_norm": 0.8742905855178833, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 50790 + }, + { + "epoch": 3.6481149012567324, + "grad_norm": 0.9546048641204834, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 50800 + }, + { + "epoch": 3.6488330341113104, + "grad_norm": 0.7893161773681641, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 50810 + }, + { + "epoch": 3.649551166965889, + "grad_norm": 0.9350247979164124, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 50820 + }, + { + "epoch": 3.650269299820467, + "grad_norm": 0.772149384021759, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 50830 + }, + { + "epoch": 3.650987432675045, + "grad_norm": 0.8281718492507935, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 50840 + }, + { + "epoch": 3.651705565529623, + "grad_norm": 0.8063850402832031, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 50850 + }, + { + "epoch": 3.652423698384201, + "grad_norm": 0.8101351261138916, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 50860 + }, + { + "epoch": 3.6531418312387793, + "grad_norm": 0.8747833371162415, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 50870 + }, + { + "epoch": 3.6538599640933573, + "grad_norm": 0.9634656310081482, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 50880 + }, + { + "epoch": 3.6545780969479353, + "grad_norm": 1.1646045446395874, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 50890 + }, + { + "epoch": 3.6552962298025133, + "grad_norm": 0.8538454174995422, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 3.656014362657092, + "grad_norm": 0.7639184594154358, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 50910 + }, + { + "epoch": 3.65673249551167, + "grad_norm": 0.8750212788581848, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 50920 + }, + { + "epoch": 3.657450628366248, + "grad_norm": 0.9161198735237122, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 50930 + }, + { + "epoch": 3.658168761220826, + "grad_norm": 0.7987924814224243, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 50940 + }, + { + "epoch": 3.658886894075404, + "grad_norm": 0.8939290642738342, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 50950 + }, + { + "epoch": 3.659605026929982, + "grad_norm": 0.9803797602653503, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 50960 + }, + { + "epoch": 3.6603231597845602, + "grad_norm": 1.2423512935638428, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 50970 + }, + { + "epoch": 3.6610412926391382, + "grad_norm": 1.0023225545883179, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 50980 + }, + { + "epoch": 3.6617594254937162, + "grad_norm": 0.9066677689552307, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 50990 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 0.8906226754188538, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 51000 + }, + { + "epoch": 3.6631956912028727, + "grad_norm": 0.7449954152107239, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51010 + }, + { + "epoch": 3.6639138240574507, + "grad_norm": 0.812612771987915, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 51020 + }, + { + "epoch": 3.6646319569120287, + "grad_norm": 0.861818253993988, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 51030 + }, + { + "epoch": 3.6653500897666067, + "grad_norm": 0.849726676940918, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 51040 + }, + { + "epoch": 3.6660682226211847, + "grad_norm": 0.9738494753837585, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 51050 + }, + { + "epoch": 3.666786355475763, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 51060 + }, + { + "epoch": 3.667504488330341, + "grad_norm": 0.9725563526153564, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 51070 + }, + { + "epoch": 3.668222621184919, + "grad_norm": 0.9366095066070557, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51080 + }, + { + "epoch": 3.668940754039497, + "grad_norm": 0.8012986779212952, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 51090 + }, + { + "epoch": 3.6696588868940756, + "grad_norm": 1.0646892786026, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51100 + }, + { + "epoch": 3.6703770197486536, + "grad_norm": 0.7245157361030579, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 51110 + }, + { + "epoch": 3.6710951526032316, + "grad_norm": 0.6938936114311218, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 51120 + }, + { + "epoch": 3.6718132854578096, + "grad_norm": 0.8461366295814514, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 51130 + }, + { + "epoch": 3.6725314183123876, + "grad_norm": 0.8392583131790161, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 51140 + }, + { + "epoch": 3.673249551166966, + "grad_norm": 0.7245259284973145, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 51150 + }, + { + "epoch": 3.673967684021544, + "grad_norm": 1.0742167234420776, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 51160 + }, + { + "epoch": 3.674685816876122, + "grad_norm": 0.9553889036178589, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 51170 + }, + { + "epoch": 3.6754039497307, + "grad_norm": 0.8713715672492981, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 51180 + }, + { + "epoch": 3.6761220825852785, + "grad_norm": 0.7499800324440002, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 51190 + }, + { + "epoch": 3.6768402154398565, + "grad_norm": 1.1118139028549194, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 51200 + }, + { + "epoch": 3.6775583482944345, + "grad_norm": 0.8146613836288452, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 51210 + }, + { + "epoch": 3.6782764811490125, + "grad_norm": 0.9331285357475281, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 51220 + }, + { + "epoch": 3.6789946140035905, + "grad_norm": 1.0497597455978394, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 51230 + }, + { + "epoch": 3.6797127468581685, + "grad_norm": 0.879814863204956, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51240 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 0.9896606802940369, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 51250 + }, + { + "epoch": 3.681149012567325, + "grad_norm": 0.928236186504364, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 51260 + }, + { + "epoch": 3.681867145421903, + "grad_norm": 0.8436732292175293, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 51270 + }, + { + "epoch": 3.6825852782764814, + "grad_norm": 0.93634432554245, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51280 + }, + { + "epoch": 3.6833034111310594, + "grad_norm": 0.8477143049240112, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 51290 + }, + { + "epoch": 3.6840215439856374, + "grad_norm": 0.8720934987068176, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 51300 + }, + { + "epoch": 3.6847396768402154, + "grad_norm": 0.7322931289672852, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 51310 + }, + { + "epoch": 3.6854578096947934, + "grad_norm": 1.0064427852630615, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 51320 + }, + { + "epoch": 3.6861759425493714, + "grad_norm": 1.0197817087173462, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 51330 + }, + { + "epoch": 3.68689407540395, + "grad_norm": 0.8764060139656067, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 51340 + }, + { + "epoch": 3.687612208258528, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 51350 + }, + { + "epoch": 3.688330341113106, + "grad_norm": 0.8389105200767517, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 51360 + }, + { + "epoch": 3.689048473967684, + "grad_norm": 0.9215750694274902, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 51370 + }, + { + "epoch": 3.6897666068222623, + "grad_norm": 0.8444913625717163, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 51380 + }, + { + "epoch": 3.6904847396768403, + "grad_norm": 0.9635153412818909, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 51390 + }, + { + "epoch": 3.6912028725314183, + "grad_norm": 1.0397378206253052, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 51400 + }, + { + "epoch": 3.6919210053859963, + "grad_norm": 0.9154748320579529, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 51410 + }, + { + "epoch": 3.6926391382405743, + "grad_norm": 0.906445324420929, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 51420 + }, + { + "epoch": 3.6933572710951523, + "grad_norm": 0.9237992763519287, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 51430 + }, + { + "epoch": 3.6940754039497308, + "grad_norm": 0.8796338438987732, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 51440 + }, + { + "epoch": 3.6947935368043088, + "grad_norm": 0.8613203763961792, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 51450 + }, + { + "epoch": 3.6955116696588868, + "grad_norm": 0.7957607507705688, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 51460 + }, + { + "epoch": 3.6962298025134652, + "grad_norm": 0.9183711409568787, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 51470 + }, + { + "epoch": 3.6969479353680432, + "grad_norm": 1.0108308792114258, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 51480 + }, + { + "epoch": 3.6976660682226212, + "grad_norm": 0.7768247127532959, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 51490 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 1.0051485300064087, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 51500 + }, + { + "epoch": 3.6991023339317772, + "grad_norm": 0.82451993227005, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 51510 + }, + { + "epoch": 3.6998204667863552, + "grad_norm": 0.9542286992073059, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 51520 + }, + { + "epoch": 3.7005385996409337, + "grad_norm": 0.693890392780304, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 51530 + }, + { + "epoch": 3.7012567324955117, + "grad_norm": 0.9068924784660339, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 51540 + }, + { + "epoch": 3.7019748653500897, + "grad_norm": 0.8694922924041748, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 51550 + }, + { + "epoch": 3.702692998204668, + "grad_norm": 0.941081702709198, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 51560 + }, + { + "epoch": 3.703411131059246, + "grad_norm": 0.7385984659194946, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 51570 + }, + { + "epoch": 3.704129263913824, + "grad_norm": 1.0399216413497925, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51580 + }, + { + "epoch": 3.704847396768402, + "grad_norm": 0.9802294969558716, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 51590 + }, + { + "epoch": 3.70556552962298, + "grad_norm": 1.0409669876098633, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51600 + }, + { + "epoch": 3.706283662477558, + "grad_norm": 0.8972786068916321, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 51610 + }, + { + "epoch": 3.7070017953321366, + "grad_norm": 1.1916245222091675, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 51620 + }, + { + "epoch": 3.7077199281867146, + "grad_norm": 0.9545385241508484, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 51630 + }, + { + "epoch": 3.7084380610412926, + "grad_norm": 1.0773427486419678, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 51640 + }, + { + "epoch": 3.7091561938958706, + "grad_norm": 1.0856024026870728, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 51650 + }, + { + "epoch": 3.709874326750449, + "grad_norm": 0.7678500413894653, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51660 + }, + { + "epoch": 3.710592459605027, + "grad_norm": 0.7276270985603333, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 51670 + }, + { + "epoch": 3.711310592459605, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 51680 + }, + { + "epoch": 3.712028725314183, + "grad_norm": 0.9037614464759827, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 51690 + }, + { + "epoch": 3.712746858168761, + "grad_norm": 0.9223412275314331, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51700 + }, + { + "epoch": 3.713464991023339, + "grad_norm": 0.8812923431396484, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 51710 + }, + { + "epoch": 3.7141831238779175, + "grad_norm": 0.8242456912994385, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 51720 + }, + { + "epoch": 3.7149012567324955, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 51730 + }, + { + "epoch": 3.7156193895870735, + "grad_norm": 0.8624704480171204, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 51740 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 0.9138273596763611, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51750 + }, + { + "epoch": 3.71705565529623, + "grad_norm": 0.8088571429252625, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 51760 + }, + { + "epoch": 3.717773788150808, + "grad_norm": 0.882808268070221, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 51770 + }, + { + "epoch": 3.718491921005386, + "grad_norm": 0.9368035197257996, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 51780 + }, + { + "epoch": 3.719210053859964, + "grad_norm": 0.8341794013977051, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 51790 + }, + { + "epoch": 3.719928186714542, + "grad_norm": 0.8692073225975037, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 51800 + }, + { + "epoch": 3.7206463195691204, + "grad_norm": 0.7566918730735779, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 51810 + }, + { + "epoch": 3.7213644524236984, + "grad_norm": 1.113138198852539, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 51820 + }, + { + "epoch": 3.7220825852782764, + "grad_norm": 0.8793158531188965, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 51830 + }, + { + "epoch": 3.722800718132855, + "grad_norm": 0.8856439590454102, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 51840 + }, + { + "epoch": 3.723518850987433, + "grad_norm": 1.0182029008865356, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 51850 + }, + { + "epoch": 3.724236983842011, + "grad_norm": 1.1177181005477905, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 51860 + }, + { + "epoch": 3.724955116696589, + "grad_norm": 0.6600990295410156, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 51870 + }, + { + "epoch": 3.725673249551167, + "grad_norm": 1.0563536882400513, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 51880 + }, + { + "epoch": 3.726391382405745, + "grad_norm": 1.1067734956741333, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 51890 + }, + { + "epoch": 3.7271095152603233, + "grad_norm": 1.0204616785049438, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 51900 + }, + { + "epoch": 3.7278276481149013, + "grad_norm": 0.8647155165672302, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51910 + }, + { + "epoch": 3.7285457809694793, + "grad_norm": 1.0754971504211426, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 51920 + }, + { + "epoch": 3.7292639138240573, + "grad_norm": 1.0448992252349854, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 51930 + }, + { + "epoch": 3.7299820466786358, + "grad_norm": 0.963434100151062, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 51940 + }, + { + "epoch": 3.7307001795332138, + "grad_norm": 0.8112701773643494, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51950 + }, + { + "epoch": 3.7314183123877918, + "grad_norm": 0.7975119948387146, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 51960 + }, + { + "epoch": 3.7321364452423698, + "grad_norm": 0.7953376173973083, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 51970 + }, + { + "epoch": 3.7328545780969478, + "grad_norm": 0.9519981741905212, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 51980 + }, + { + "epoch": 3.7335727109515258, + "grad_norm": 0.8705791234970093, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 51990 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 0.870205283164978, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 52000 + }, + { + "epoch": 3.735008976660682, + "grad_norm": 0.9558930993080139, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 52010 + }, + { + "epoch": 3.73572710951526, + "grad_norm": 0.9330434799194336, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 52020 + }, + { + "epoch": 3.7364452423698387, + "grad_norm": 0.783620297908783, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 52030 + }, + { + "epoch": 3.7371633752244167, + "grad_norm": 0.7575166821479797, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52040 + }, + { + "epoch": 3.7378815080789947, + "grad_norm": 1.0592705011367798, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 52050 + }, + { + "epoch": 3.7385996409335727, + "grad_norm": 0.9309433102607727, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 52060 + }, + { + "epoch": 3.7393177737881507, + "grad_norm": 0.972861647605896, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 52070 + }, + { + "epoch": 3.7400359066427287, + "grad_norm": 0.9318740963935852, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 52080 + }, + { + "epoch": 3.740754039497307, + "grad_norm": 0.7938477396965027, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 52090 + }, + { + "epoch": 3.741472172351885, + "grad_norm": 1.1515966653823853, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 52100 + }, + { + "epoch": 3.742190305206463, + "grad_norm": 1.076869010925293, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 52110 + }, + { + "epoch": 3.7429084380610416, + "grad_norm": 0.8516066670417786, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 52120 + }, + { + "epoch": 3.7436265709156196, + "grad_norm": 0.6853429079055786, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 52130 + }, + { + "epoch": 3.7443447037701976, + "grad_norm": 0.8179695010185242, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52140 + }, + { + "epoch": 3.7450628366247756, + "grad_norm": 0.8395232558250427, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 52150 + }, + { + "epoch": 3.7457809694793536, + "grad_norm": 1.0178003311157227, + "learning_rate": 0.0002, + "loss": 0.6902, + "step": 52160 + }, + { + "epoch": 3.7464991023339316, + "grad_norm": 1.1801023483276367, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 52170 + }, + { + "epoch": 3.74721723518851, + "grad_norm": 0.8215751647949219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 52180 + }, + { + "epoch": 3.747935368043088, + "grad_norm": 1.17083740234375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 52190 + }, + { + "epoch": 3.748653500897666, + "grad_norm": 0.9230290651321411, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 52200 + }, + { + "epoch": 3.749371633752244, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 52210 + }, + { + "epoch": 3.7500897666068225, + "grad_norm": 0.9690840244293213, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 52220 + }, + { + "epoch": 3.7508078994614005, + "grad_norm": 1.0022395849227905, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 52230 + }, + { + "epoch": 3.7515260323159785, + "grad_norm": 1.0489065647125244, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 52240 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 0.7880696058273315, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 52250 + }, + { + "epoch": 3.7529622980251345, + "grad_norm": 1.0255829095840454, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 52260 + }, + { + "epoch": 3.7536804308797125, + "grad_norm": 0.8470141291618347, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 52270 + }, + { + "epoch": 3.754398563734291, + "grad_norm": 0.9040523171424866, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 52280 + }, + { + "epoch": 3.755116696588869, + "grad_norm": 0.9564392566680908, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 52290 + }, + { + "epoch": 3.755834829443447, + "grad_norm": 0.907857358455658, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 52300 + }, + { + "epoch": 3.7565529622980254, + "grad_norm": 0.8929873704910278, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 52310 + }, + { + "epoch": 3.7572710951526034, + "grad_norm": 0.854434072971344, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 52320 + }, + { + "epoch": 3.7579892280071814, + "grad_norm": 0.8744779229164124, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 52330 + }, + { + "epoch": 3.7587073608617594, + "grad_norm": 0.9022667407989502, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52340 + }, + { + "epoch": 3.7594254937163374, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52350 + }, + { + "epoch": 3.7601436265709154, + "grad_norm": 1.0228430032730103, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 52360 + }, + { + "epoch": 3.760861759425494, + "grad_norm": 0.8593528270721436, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 52370 + }, + { + "epoch": 3.761579892280072, + "grad_norm": 0.9435563087463379, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 52380 + }, + { + "epoch": 3.76229802513465, + "grad_norm": 0.7545679807662964, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52390 + }, + { + "epoch": 3.7630161579892283, + "grad_norm": 0.9411585927009583, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52400 + }, + { + "epoch": 3.7637342908438063, + "grad_norm": 0.9764377474784851, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 52410 + }, + { + "epoch": 3.7644524236983843, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 52420 + }, + { + "epoch": 3.7651705565529623, + "grad_norm": 0.8765230774879456, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52430 + }, + { + "epoch": 3.7658886894075403, + "grad_norm": 0.9275036454200745, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 52440 + }, + { + "epoch": 3.7666068222621183, + "grad_norm": 0.967410147190094, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 52450 + }, + { + "epoch": 3.7673249551166967, + "grad_norm": 0.7738949060440063, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 52460 + }, + { + "epoch": 3.7680430879712747, + "grad_norm": 1.0828070640563965, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 52470 + }, + { + "epoch": 3.7687612208258527, + "grad_norm": 0.9570213556289673, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 52480 + }, + { + "epoch": 3.7694793536804307, + "grad_norm": 1.0688215494155884, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 52490 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 0.7970073223114014, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 52500 + }, + { + "epoch": 3.770915619389587, + "grad_norm": 0.7132976651191711, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 52510 + }, + { + "epoch": 3.771633752244165, + "grad_norm": 1.152268648147583, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 52520 + }, + { + "epoch": 3.772351885098743, + "grad_norm": 0.8645235896110535, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52530 + }, + { + "epoch": 3.773070017953321, + "grad_norm": 0.7725570201873779, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 52540 + }, + { + "epoch": 3.773788150807899, + "grad_norm": 0.9718102812767029, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 52550 + }, + { + "epoch": 3.7745062836624776, + "grad_norm": 0.7568017840385437, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 52560 + }, + { + "epoch": 3.7752244165170556, + "grad_norm": 0.9578912854194641, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 52570 + }, + { + "epoch": 3.7759425493716336, + "grad_norm": 0.8657314777374268, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 52580 + }, + { + "epoch": 3.776660682226212, + "grad_norm": 0.7564393281936646, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 52590 + }, + { + "epoch": 3.77737881508079, + "grad_norm": 0.7631160616874695, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 52600 + }, + { + "epoch": 3.778096947935368, + "grad_norm": 1.1852056980133057, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 52610 + }, + { + "epoch": 3.778815080789946, + "grad_norm": 1.0620790719985962, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 52620 + }, + { + "epoch": 3.779533213644524, + "grad_norm": 0.8677777647972107, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 52630 + }, + { + "epoch": 3.780251346499102, + "grad_norm": 0.9913218021392822, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 52640 + }, + { + "epoch": 3.7809694793536806, + "grad_norm": 0.9868429899215698, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 52650 + }, + { + "epoch": 3.7816876122082586, + "grad_norm": 0.8791782259941101, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 52660 + }, + { + "epoch": 3.7824057450628366, + "grad_norm": 0.9503955245018005, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 52670 + }, + { + "epoch": 3.7831238779174146, + "grad_norm": 0.8647131323814392, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 52680 + }, + { + "epoch": 3.783842010771993, + "grad_norm": 0.9819629788398743, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52690 + }, + { + "epoch": 3.784560143626571, + "grad_norm": 0.8548610210418701, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 52700 + }, + { + "epoch": 3.785278276481149, + "grad_norm": 0.8706230521202087, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 52710 + }, + { + "epoch": 3.785996409335727, + "grad_norm": 1.0032461881637573, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52720 + }, + { + "epoch": 3.786714542190305, + "grad_norm": 1.0578246116638184, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 52730 + }, + { + "epoch": 3.7874326750448835, + "grad_norm": 0.9854007363319397, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52740 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 0.8389187455177307, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 52750 + }, + { + "epoch": 3.7888689407540395, + "grad_norm": 0.9192399978637695, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 52760 + }, + { + "epoch": 3.7895870736086175, + "grad_norm": 0.9518283605575562, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 52770 + }, + { + "epoch": 3.790305206463196, + "grad_norm": 1.1296825408935547, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52780 + }, + { + "epoch": 3.791023339317774, + "grad_norm": 1.0589144229888916, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 52790 + }, + { + "epoch": 3.791741472172352, + "grad_norm": 0.8954343199729919, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 52800 + }, + { + "epoch": 3.79245960502693, + "grad_norm": 0.8283370733261108, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 52810 + }, + { + "epoch": 3.793177737881508, + "grad_norm": 0.910642683506012, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 52820 + }, + { + "epoch": 3.793895870736086, + "grad_norm": 0.9255108833312988, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 52830 + }, + { + "epoch": 3.7946140035906644, + "grad_norm": 0.8773723244667053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 52840 + }, + { + "epoch": 3.7953321364452424, + "grad_norm": 0.8454240560531616, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 52850 + }, + { + "epoch": 3.7960502692998204, + "grad_norm": 0.7636052966117859, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 52860 + }, + { + "epoch": 3.796768402154399, + "grad_norm": 0.9358382821083069, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 52870 + }, + { + "epoch": 3.797486535008977, + "grad_norm": 0.9662801623344421, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 52880 + }, + { + "epoch": 3.798204667863555, + "grad_norm": 0.995907187461853, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 52890 + }, + { + "epoch": 3.798922800718133, + "grad_norm": 0.8700127005577087, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 52900 + }, + { + "epoch": 3.799640933572711, + "grad_norm": 0.8987792134284973, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 52910 + }, + { + "epoch": 3.800359066427289, + "grad_norm": 0.9753904938697815, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 52920 + }, + { + "epoch": 3.8010771992818673, + "grad_norm": 0.7873555421829224, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 52930 + }, + { + "epoch": 3.8017953321364453, + "grad_norm": 0.8177929520606995, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 52940 + }, + { + "epoch": 3.8025134649910233, + "grad_norm": 0.8865532279014587, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 52950 + }, + { + "epoch": 3.8032315978456013, + "grad_norm": 0.9113775491714478, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 52960 + }, + { + "epoch": 3.8039497307001797, + "grad_norm": 0.9424585700035095, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 52970 + }, + { + "epoch": 3.8046678635547577, + "grad_norm": 0.8347237706184387, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 52980 + }, + { + "epoch": 3.8053859964093357, + "grad_norm": 0.826863169670105, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 52990 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 0.7313310503959656, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 53000 + }, + { + "epoch": 3.8068222621184917, + "grad_norm": 0.8352667093276978, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 53010 + }, + { + "epoch": 3.80754039497307, + "grad_norm": 0.748461127281189, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 53020 + }, + { + "epoch": 3.808258527827648, + "grad_norm": 0.943256139755249, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 53030 + }, + { + "epoch": 3.808976660682226, + "grad_norm": 1.0448410511016846, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 53040 + }, + { + "epoch": 3.809694793536804, + "grad_norm": 0.9047636985778809, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 53050 + }, + { + "epoch": 3.8104129263913826, + "grad_norm": 0.8594381213188171, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 53060 + }, + { + "epoch": 3.8111310592459606, + "grad_norm": 0.7593536972999573, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 53070 + }, + { + "epoch": 3.8118491921005386, + "grad_norm": 0.7189019918441772, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 53080 + }, + { + "epoch": 3.8125673249551166, + "grad_norm": 0.8569809198379517, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53090 + }, + { + "epoch": 3.8132854578096946, + "grad_norm": 0.923378050327301, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53100 + }, + { + "epoch": 3.8140035906642726, + "grad_norm": 0.9088824391365051, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 53110 + }, + { + "epoch": 3.814721723518851, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 53120 + }, + { + "epoch": 3.815439856373429, + "grad_norm": 0.8389552235603333, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 53130 + }, + { + "epoch": 3.816157989228007, + "grad_norm": 0.7940975427627563, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 53140 + }, + { + "epoch": 3.8168761220825855, + "grad_norm": 0.8389907479286194, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 53150 + }, + { + "epoch": 3.8175942549371635, + "grad_norm": 0.774206280708313, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 53160 + }, + { + "epoch": 3.8183123877917415, + "grad_norm": 1.189447283744812, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 53170 + }, + { + "epoch": 3.8190305206463195, + "grad_norm": 0.9875882863998413, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 53180 + }, + { + "epoch": 3.8197486535008975, + "grad_norm": 0.9205945134162903, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 53190 + }, + { + "epoch": 3.8204667863554755, + "grad_norm": 0.8312796354293823, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 53200 + }, + { + "epoch": 3.821184919210054, + "grad_norm": 0.9755756855010986, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 53210 + }, + { + "epoch": 3.821903052064632, + "grad_norm": 1.0722965002059937, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53220 + }, + { + "epoch": 3.82262118491921, + "grad_norm": 0.7720510959625244, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 53230 + }, + { + "epoch": 3.823339317773788, + "grad_norm": 1.020147681236267, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 53240 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 0.8241816759109497, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53250 + }, + { + "epoch": 3.8247755834829444, + "grad_norm": 0.8939895629882812, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 53260 + }, + { + "epoch": 3.8254937163375224, + "grad_norm": 1.010852336883545, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 53270 + }, + { + "epoch": 3.8262118491921004, + "grad_norm": 0.8201420307159424, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 53280 + }, + { + "epoch": 3.8269299820466784, + "grad_norm": 0.8797973990440369, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 53290 + }, + { + "epoch": 3.827648114901257, + "grad_norm": 0.9034950137138367, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 53300 + }, + { + "epoch": 3.828366247755835, + "grad_norm": 0.926802933216095, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 53310 + }, + { + "epoch": 3.829084380610413, + "grad_norm": 1.0205509662628174, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 53320 + }, + { + "epoch": 3.829802513464991, + "grad_norm": 0.9524099230766296, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 53330 + }, + { + "epoch": 3.8305206463195693, + "grad_norm": 0.9692625999450684, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 53340 + }, + { + "epoch": 3.8312387791741473, + "grad_norm": 0.7255275845527649, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 53350 + }, + { + "epoch": 3.8319569120287253, + "grad_norm": 0.7199059724807739, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53360 + }, + { + "epoch": 3.8326750448833034, + "grad_norm": 1.004464864730835, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 53370 + }, + { + "epoch": 3.8333931777378814, + "grad_norm": 0.9092583060264587, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53380 + }, + { + "epoch": 3.8341113105924594, + "grad_norm": 0.945091724395752, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 53390 + }, + { + "epoch": 3.834829443447038, + "grad_norm": 0.7980135679244995, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 53400 + }, + { + "epoch": 3.835547576301616, + "grad_norm": 0.7812868356704712, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 53410 + }, + { + "epoch": 3.836265709156194, + "grad_norm": 0.8957077860832214, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53420 + }, + { + "epoch": 3.8369838420107722, + "grad_norm": 0.9119600653648376, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 53430 + }, + { + "epoch": 3.8377019748653503, + "grad_norm": 0.8208187222480774, + "learning_rate": 0.0002, + "loss": 0.7346, + "step": 53440 + }, + { + "epoch": 3.8384201077199283, + "grad_norm": 0.7930439114570618, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 53450 + }, + { + "epoch": 3.8391382405745063, + "grad_norm": 0.8937777280807495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 53460 + }, + { + "epoch": 3.8398563734290843, + "grad_norm": 0.7583796977996826, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 53470 + }, + { + "epoch": 3.8405745062836623, + "grad_norm": 1.0735969543457031, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 53480 + }, + { + "epoch": 3.8412926391382407, + "grad_norm": 1.1106033325195312, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 53490 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 1.092631220817566, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 53500 + }, + { + "epoch": 3.8427289048473967, + "grad_norm": 0.9961787462234497, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 53510 + }, + { + "epoch": 3.8434470377019747, + "grad_norm": 0.833831250667572, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 53520 + }, + { + "epoch": 3.844165170556553, + "grad_norm": 1.0000009536743164, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 53530 + }, + { + "epoch": 3.844883303411131, + "grad_norm": 0.9784213304519653, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 53540 + }, + { + "epoch": 3.845601436265709, + "grad_norm": 0.8582558035850525, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 53550 + }, + { + "epoch": 3.846319569120287, + "grad_norm": 0.8267415761947632, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 53560 + }, + { + "epoch": 3.847037701974865, + "grad_norm": 0.8783000111579895, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 53570 + }, + { + "epoch": 3.8477558348294436, + "grad_norm": 0.9866999983787537, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 53580 + }, + { + "epoch": 3.8484739676840216, + "grad_norm": 0.8459296226501465, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 53590 + }, + { + "epoch": 3.8491921005385996, + "grad_norm": 0.9804834723472595, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 53600 + }, + { + "epoch": 3.8499102333931776, + "grad_norm": 0.951074481010437, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 53610 + }, + { + "epoch": 3.850628366247756, + "grad_norm": 0.8020104169845581, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 53620 + }, + { + "epoch": 3.851346499102334, + "grad_norm": 0.9296963214874268, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 53630 + }, + { + "epoch": 3.852064631956912, + "grad_norm": 0.8983652591705322, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 53640 + }, + { + "epoch": 3.85278276481149, + "grad_norm": 1.031858205795288, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 53650 + }, + { + "epoch": 3.853500897666068, + "grad_norm": 0.8943952918052673, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 53660 + }, + { + "epoch": 3.854219030520646, + "grad_norm": 1.0072312355041504, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 53670 + }, + { + "epoch": 3.8549371633752245, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 53680 + }, + { + "epoch": 3.8556552962298025, + "grad_norm": 0.834223210811615, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 53690 + }, + { + "epoch": 3.8563734290843805, + "grad_norm": 0.9872867465019226, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 53700 + }, + { + "epoch": 3.857091561938959, + "grad_norm": 0.7999459505081177, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53710 + }, + { + "epoch": 3.857809694793537, + "grad_norm": 0.717722475528717, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 53720 + }, + { + "epoch": 3.858527827648115, + "grad_norm": 1.0675442218780518, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 53730 + }, + { + "epoch": 3.859245960502693, + "grad_norm": 0.9789777398109436, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 53740 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 0.9318669438362122, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 53750 + }, + { + "epoch": 3.860682226211849, + "grad_norm": 0.9848631024360657, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 53760 + }, + { + "epoch": 3.8614003590664274, + "grad_norm": 0.8754391670227051, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 53770 + }, + { + "epoch": 3.8621184919210054, + "grad_norm": 0.9024585485458374, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 53780 + }, + { + "epoch": 3.8628366247755834, + "grad_norm": 0.8974794745445251, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 53790 + }, + { + "epoch": 3.8635547576301614, + "grad_norm": 0.8342790603637695, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 53800 + }, + { + "epoch": 3.86427289048474, + "grad_norm": 0.8177682757377625, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 53810 + }, + { + "epoch": 3.864991023339318, + "grad_norm": 1.0259089469909668, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 53820 + }, + { + "epoch": 3.865709156193896, + "grad_norm": 1.042290210723877, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 53830 + }, + { + "epoch": 3.866427289048474, + "grad_norm": 0.7316540479660034, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 53840 + }, + { + "epoch": 3.867145421903052, + "grad_norm": 0.9384970664978027, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53850 + }, + { + "epoch": 3.86786355475763, + "grad_norm": 0.9273143410682678, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53860 + }, + { + "epoch": 3.8685816876122083, + "grad_norm": 1.1183570623397827, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 53870 + }, + { + "epoch": 3.8692998204667863, + "grad_norm": 0.9455275535583496, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 53880 + }, + { + "epoch": 3.8700179533213643, + "grad_norm": 0.8702114820480347, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 53890 + }, + { + "epoch": 3.870736086175943, + "grad_norm": 0.8751053214073181, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53900 + }, + { + "epoch": 3.871454219030521, + "grad_norm": 0.9793110489845276, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 53910 + }, + { + "epoch": 3.872172351885099, + "grad_norm": 0.9705014824867249, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 53920 + }, + { + "epoch": 3.872890484739677, + "grad_norm": 1.051504373550415, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 53930 + }, + { + "epoch": 3.873608617594255, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 53940 + }, + { + "epoch": 3.874326750448833, + "grad_norm": 0.7828099727630615, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 53950 + }, + { + "epoch": 3.8750448833034112, + "grad_norm": 0.86341792345047, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 53960 + }, + { + "epoch": 3.8757630161579892, + "grad_norm": 1.114670991897583, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 53970 + }, + { + "epoch": 3.8764811490125672, + "grad_norm": 0.8559519052505493, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 53980 + }, + { + "epoch": 3.8771992818671457, + "grad_norm": 1.0518953800201416, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 53990 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 0.7157500982284546, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 54000 + }, + { + "epoch": 3.8786355475763017, + "grad_norm": 0.8390372395515442, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 54010 + }, + { + "epoch": 3.8793536804308797, + "grad_norm": 0.8486756086349487, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 54020 + }, + { + "epoch": 3.8800718132854577, + "grad_norm": 0.8361587524414062, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 54030 + }, + { + "epoch": 3.8807899461400357, + "grad_norm": 0.9490554928779602, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 54040 + }, + { + "epoch": 3.881508078994614, + "grad_norm": 1.0311323404312134, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 54050 + }, + { + "epoch": 3.882226211849192, + "grad_norm": 0.84800124168396, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54060 + }, + { + "epoch": 3.88294434470377, + "grad_norm": 0.8940879702568054, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 54070 + }, + { + "epoch": 3.883662477558348, + "grad_norm": 0.985542356967926, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 54080 + }, + { + "epoch": 3.8843806104129266, + "grad_norm": 0.8846475481987, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 54090 + }, + { + "epoch": 3.8850987432675046, + "grad_norm": 0.9186338186264038, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 54100 + }, + { + "epoch": 3.8858168761220826, + "grad_norm": 1.106598973274231, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 54110 + }, + { + "epoch": 3.8865350089766606, + "grad_norm": 0.8167300224304199, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 54120 + }, + { + "epoch": 3.8872531418312386, + "grad_norm": 0.9153622984886169, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 54130 + }, + { + "epoch": 3.8879712746858166, + "grad_norm": 0.8464475274085999, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 54140 + }, + { + "epoch": 3.888689407540395, + "grad_norm": 0.8889452815055847, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 54150 + }, + { + "epoch": 3.889407540394973, + "grad_norm": 0.7861065864562988, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 54160 + }, + { + "epoch": 3.890125673249551, + "grad_norm": 0.882674515247345, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 54170 + }, + { + "epoch": 3.8908438061041295, + "grad_norm": 0.8503835201263428, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 54180 + }, + { + "epoch": 3.8915619389587075, + "grad_norm": 0.888455331325531, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 54190 + }, + { + "epoch": 3.8922800718132855, + "grad_norm": 1.0473699569702148, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 54200 + }, + { + "epoch": 3.8929982046678635, + "grad_norm": 0.9548208713531494, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 54210 + }, + { + "epoch": 3.8937163375224415, + "grad_norm": 0.9158754944801331, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 54220 + }, + { + "epoch": 3.8944344703770195, + "grad_norm": 0.9001154899597168, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54230 + }, + { + "epoch": 3.895152603231598, + "grad_norm": 0.9736626148223877, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54240 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 0.8809846043586731, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 54250 + }, + { + "epoch": 3.896588868940754, + "grad_norm": 0.887583315372467, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 54260 + }, + { + "epoch": 3.8973070017953324, + "grad_norm": 0.8395712971687317, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 54270 + }, + { + "epoch": 3.8980251346499104, + "grad_norm": 0.8391315937042236, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 54280 + }, + { + "epoch": 3.8987432675044884, + "grad_norm": 0.8210049271583557, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54290 + }, + { + "epoch": 3.8994614003590664, + "grad_norm": 1.1364530324935913, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54300 + }, + { + "epoch": 3.9001795332136444, + "grad_norm": 0.7712056636810303, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 54310 + }, + { + "epoch": 3.9008976660682224, + "grad_norm": 0.9466049671173096, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 54320 + }, + { + "epoch": 3.901615798922801, + "grad_norm": 1.0367140769958496, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 54330 + }, + { + "epoch": 3.902333931777379, + "grad_norm": 1.0168321132659912, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 54340 + }, + { + "epoch": 3.903052064631957, + "grad_norm": 0.7830407619476318, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 54350 + }, + { + "epoch": 3.903770197486535, + "grad_norm": 0.9649789333343506, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 54360 + }, + { + "epoch": 3.9044883303411133, + "grad_norm": 0.681077778339386, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 54370 + }, + { + "epoch": 3.9052064631956913, + "grad_norm": 0.8970136046409607, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 54380 + }, + { + "epoch": 3.9059245960502693, + "grad_norm": 0.9155173301696777, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 54390 + }, + { + "epoch": 3.9066427289048473, + "grad_norm": 1.0447794198989868, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 54400 + }, + { + "epoch": 3.9073608617594253, + "grad_norm": 0.7823813557624817, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 54410 + }, + { + "epoch": 3.9080789946140033, + "grad_norm": 0.9289445877075195, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 54420 + }, + { + "epoch": 3.9087971274685818, + "grad_norm": 0.9983111619949341, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 54430 + }, + { + "epoch": 3.9095152603231598, + "grad_norm": 0.7952495813369751, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 54440 + }, + { + "epoch": 3.9102333931777378, + "grad_norm": 0.8045601844787598, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 54450 + }, + { + "epoch": 3.910951526032316, + "grad_norm": 0.936585009098053, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 54460 + }, + { + "epoch": 3.911669658886894, + "grad_norm": 0.745793879032135, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 54470 + }, + { + "epoch": 3.912387791741472, + "grad_norm": 0.9137616157531738, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 54480 + }, + { + "epoch": 3.9131059245960502, + "grad_norm": 0.826316237449646, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 54490 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 0.94313645362854, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 54500 + }, + { + "epoch": 3.9145421903052062, + "grad_norm": 1.045893907546997, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 54510 + }, + { + "epoch": 3.9152603231597847, + "grad_norm": 0.9122704863548279, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 54520 + }, + { + "epoch": 3.9159784560143627, + "grad_norm": 1.0999689102172852, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 54530 + }, + { + "epoch": 3.9166965888689407, + "grad_norm": 0.9281555414199829, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 54540 + }, + { + "epoch": 3.917414721723519, + "grad_norm": 1.1439622640609741, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 54550 + }, + { + "epoch": 3.918132854578097, + "grad_norm": 0.9375617504119873, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 54560 + }, + { + "epoch": 3.918850987432675, + "grad_norm": 0.92906653881073, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 54570 + }, + { + "epoch": 3.919569120287253, + "grad_norm": 1.0840893983840942, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 54580 + }, + { + "epoch": 3.920287253141831, + "grad_norm": 0.8145509362220764, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 54590 + }, + { + "epoch": 3.921005385996409, + "grad_norm": 0.973737895488739, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 54600 + }, + { + "epoch": 3.9217235188509876, + "grad_norm": 0.9302353858947754, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 54610 + }, + { + "epoch": 3.9224416517055656, + "grad_norm": 0.9167897701263428, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 54620 + }, + { + "epoch": 3.9231597845601436, + "grad_norm": 0.8096851706504822, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 54630 + }, + { + "epoch": 3.9238779174147216, + "grad_norm": 0.8006368279457092, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 54640 + }, + { + "epoch": 3.9245960502693, + "grad_norm": 0.7800863981246948, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 54650 + }, + { + "epoch": 3.925314183123878, + "grad_norm": 1.0331560373306274, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 54660 + }, + { + "epoch": 3.926032315978456, + "grad_norm": 1.0057517290115356, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 54670 + }, + { + "epoch": 3.926750448833034, + "grad_norm": 0.8920564651489258, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 54680 + }, + { + "epoch": 3.927468581687612, + "grad_norm": 0.7704599499702454, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 54690 + }, + { + "epoch": 3.92818671454219, + "grad_norm": 0.827032208442688, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 54700 + }, + { + "epoch": 3.9289048473967685, + "grad_norm": 1.0019268989562988, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 54710 + }, + { + "epoch": 3.9296229802513465, + "grad_norm": 0.862033486366272, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 54720 + }, + { + "epoch": 3.9303411131059245, + "grad_norm": 0.8965592980384827, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 54730 + }, + { + "epoch": 3.931059245960503, + "grad_norm": 0.7689077854156494, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 54740 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 0.846276581287384, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 54750 + }, + { + "epoch": 3.932495511669659, + "grad_norm": 0.8932713866233826, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 54760 + }, + { + "epoch": 3.933213644524237, + "grad_norm": 0.9711386561393738, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 54770 + }, + { + "epoch": 3.933931777378815, + "grad_norm": 0.9290250539779663, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 54780 + }, + { + "epoch": 3.934649910233393, + "grad_norm": 1.0897367000579834, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 54790 + }, + { + "epoch": 3.9353680430879714, + "grad_norm": 0.8451842665672302, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 54800 + }, + { + "epoch": 3.9360861759425494, + "grad_norm": 0.8400090336799622, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 54810 + }, + { + "epoch": 3.9368043087971274, + "grad_norm": 0.951383650302887, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 54820 + }, + { + "epoch": 3.937522441651706, + "grad_norm": 0.848838210105896, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 54830 + }, + { + "epoch": 3.938240574506284, + "grad_norm": 0.735763669013977, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 54840 + }, + { + "epoch": 3.938958707360862, + "grad_norm": 0.979037344455719, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 54850 + }, + { + "epoch": 3.93967684021544, + "grad_norm": 0.933674693107605, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 54860 + }, + { + "epoch": 3.940394973070018, + "grad_norm": 0.835593044757843, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 54870 + }, + { + "epoch": 3.941113105924596, + "grad_norm": 1.0034281015396118, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 54880 + }, + { + "epoch": 3.9418312387791743, + "grad_norm": 0.9732975959777832, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 54890 + }, + { + "epoch": 3.9425493716337523, + "grad_norm": 0.9666336178779602, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54900 + }, + { + "epoch": 3.9432675044883303, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 54910 + }, + { + "epoch": 3.9439856373429083, + "grad_norm": 0.8732092976570129, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 54920 + }, + { + "epoch": 3.9447037701974867, + "grad_norm": 1.139453649520874, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 54930 + }, + { + "epoch": 3.9454219030520647, + "grad_norm": 0.9044837951660156, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 54940 + }, + { + "epoch": 3.9461400359066428, + "grad_norm": 1.0496679544448853, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 54950 + }, + { + "epoch": 3.9468581687612208, + "grad_norm": 1.0099035501480103, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 54960 + }, + { + "epoch": 3.9475763016157988, + "grad_norm": 1.0694963932037354, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 54970 + }, + { + "epoch": 3.9482944344703768, + "grad_norm": 1.0012997388839722, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 54980 + }, + { + "epoch": 3.949012567324955, + "grad_norm": 0.8910513520240784, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 54990 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 1.0267579555511475, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 55000 + }, + { + "epoch": 3.950448833034111, + "grad_norm": 0.9786432385444641, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 55010 + }, + { + "epoch": 3.9511669658886897, + "grad_norm": 0.8703538775444031, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55020 + }, + { + "epoch": 3.9518850987432677, + "grad_norm": 0.8970484137535095, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 55030 + }, + { + "epoch": 3.9526032315978457, + "grad_norm": 0.8781577944755554, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 55040 + }, + { + "epoch": 3.9533213644524237, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 55050 + }, + { + "epoch": 3.9540394973070017, + "grad_norm": 0.851926326751709, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 55060 + }, + { + "epoch": 3.9547576301615797, + "grad_norm": 0.8597240447998047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 55070 + }, + { + "epoch": 3.955475763016158, + "grad_norm": 0.9461944699287415, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55080 + }, + { + "epoch": 3.956193895870736, + "grad_norm": 0.7576611042022705, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 55090 + }, + { + "epoch": 3.956912028725314, + "grad_norm": 0.9484710693359375, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 55100 + }, + { + "epoch": 3.957630161579892, + "grad_norm": 0.9487117528915405, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 55110 + }, + { + "epoch": 3.9583482944344706, + "grad_norm": 0.870090663433075, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55120 + }, + { + "epoch": 3.9590664272890486, + "grad_norm": 0.8496458530426025, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 55130 + }, + { + "epoch": 3.9597845601436266, + "grad_norm": 1.0121779441833496, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 55140 + }, + { + "epoch": 3.9605026929982046, + "grad_norm": 0.8912323713302612, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 55150 + }, + { + "epoch": 3.9612208258527826, + "grad_norm": 0.8398444652557373, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 55160 + }, + { + "epoch": 3.961938958707361, + "grad_norm": 0.8046348690986633, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 55170 + }, + { + "epoch": 3.962657091561939, + "grad_norm": 1.0369254350662231, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 55180 + }, + { + "epoch": 3.963375224416517, + "grad_norm": 1.172431230545044, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 55190 + }, + { + "epoch": 3.964093357271095, + "grad_norm": 0.8093554377555847, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 55200 + }, + { + "epoch": 3.9648114901256735, + "grad_norm": 0.8851078748703003, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 55210 + }, + { + "epoch": 3.9655296229802515, + "grad_norm": 0.7494266033172607, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 55220 + }, + { + "epoch": 3.9662477558348295, + "grad_norm": 0.9556898474693298, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 55230 + }, + { + "epoch": 3.9669658886894075, + "grad_norm": 1.016017198562622, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 55240 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 0.8425998091697693, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 55250 + }, + { + "epoch": 3.9684021543985635, + "grad_norm": 0.717673122882843, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 55260 + }, + { + "epoch": 3.969120287253142, + "grad_norm": 0.8366572856903076, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 55270 + }, + { + "epoch": 3.96983842010772, + "grad_norm": 0.8981583118438721, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 55280 + }, + { + "epoch": 3.970556552962298, + "grad_norm": 0.8868781328201294, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 55290 + }, + { + "epoch": 3.9712746858168764, + "grad_norm": 1.0632785558700562, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 55300 + }, + { + "epoch": 3.9719928186714544, + "grad_norm": 0.8813109993934631, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 55310 + }, + { + "epoch": 3.9727109515260324, + "grad_norm": 0.8225542306900024, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 55320 + }, + { + "epoch": 3.9734290843806104, + "grad_norm": 1.1391420364379883, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 55330 + }, + { + "epoch": 3.9741472172351884, + "grad_norm": 1.0371832847595215, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55340 + }, + { + "epoch": 3.9748653500897664, + "grad_norm": 1.0542186498641968, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 55350 + }, + { + "epoch": 3.975583482944345, + "grad_norm": 1.0178009271621704, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 55360 + }, + { + "epoch": 3.976301615798923, + "grad_norm": 0.7927802205085754, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 55370 + }, + { + "epoch": 3.977019748653501, + "grad_norm": 0.9350495934486389, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55380 + }, + { + "epoch": 3.977737881508079, + "grad_norm": 1.0240116119384766, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 55390 + }, + { + "epoch": 3.9784560143626573, + "grad_norm": 1.0279067754745483, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 55400 + }, + { + "epoch": 3.9791741472172353, + "grad_norm": 1.1228227615356445, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 55410 + }, + { + "epoch": 3.9798922800718133, + "grad_norm": 0.9500134587287903, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 55420 + }, + { + "epoch": 3.9806104129263913, + "grad_norm": 0.9229732155799866, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 55430 + }, + { + "epoch": 3.9813285457809693, + "grad_norm": 0.7946729063987732, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 55440 + }, + { + "epoch": 3.9820466786355477, + "grad_norm": 0.9987489581108093, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 55450 + }, + { + "epoch": 3.9827648114901257, + "grad_norm": 0.9670467972755432, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 55460 + }, + { + "epoch": 3.9834829443447037, + "grad_norm": 0.835028350353241, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 55470 + }, + { + "epoch": 3.9842010771992817, + "grad_norm": 0.8678702712059021, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 55480 + }, + { + "epoch": 3.98491921005386, + "grad_norm": 0.8581197261810303, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 55490 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 0.779848039150238, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 55500 + }, + { + "epoch": 3.986355475763016, + "grad_norm": 0.8827589154243469, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 55510 + }, + { + "epoch": 3.987073608617594, + "grad_norm": 1.0108301639556885, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55520 + }, + { + "epoch": 3.987791741472172, + "grad_norm": 0.8506004214286804, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 55530 + }, + { + "epoch": 3.98850987432675, + "grad_norm": 1.0297727584838867, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 55540 + }, + { + "epoch": 3.9892280071813286, + "grad_norm": 0.8579224944114685, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55550 + }, + { + "epoch": 3.9899461400359066, + "grad_norm": 0.8503788113594055, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 55560 + }, + { + "epoch": 3.9906642728904846, + "grad_norm": 1.1144801378250122, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 55570 + }, + { + "epoch": 3.991382405745063, + "grad_norm": 0.8418305516242981, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 55580 + }, + { + "epoch": 3.992100538599641, + "grad_norm": 1.0065871477127075, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 55590 + }, + { + "epoch": 3.992818671454219, + "grad_norm": 0.8160259127616882, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 55600 + }, + { + "epoch": 3.993536804308797, + "grad_norm": 0.8678009510040283, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55610 + }, + { + "epoch": 3.994254937163375, + "grad_norm": 0.863465428352356, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 55620 + }, + { + "epoch": 3.994973070017953, + "grad_norm": 0.9242135286331177, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 55630 + }, + { + "epoch": 3.9956912028725315, + "grad_norm": 1.0285470485687256, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 55640 + }, + { + "epoch": 3.9964093357271095, + "grad_norm": 0.8953320384025574, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 55650 + }, + { + "epoch": 3.9971274685816875, + "grad_norm": 0.915892481803894, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 55660 + }, + { + "epoch": 3.9978456014362656, + "grad_norm": 0.8235118985176086, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 55670 + }, + { + "epoch": 3.998563734290844, + "grad_norm": 1.0178656578063965, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 55680 + }, + { + "epoch": 3.999281867145422, + "grad_norm": 0.9926803708076477, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 55690 + }, + { + "epoch": 4.0, + "grad_norm": 0.9213629961013794, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 55700 + }, + { + "epoch": 4.0, + "eval_loss": 1.1152480840682983, + "eval_runtime": 55.2237, + "eval_samples_per_second": 13.273, + "eval_steps_per_second": 1.666, + "step": 55700 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.5776712826486784e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-55700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec482ffeaf8bfb83ad01f395311a6ad944b71ed8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ba4f7296c81834beec7082f8f8d57dfb4ecffbb3553e8edbe10c33aaf433b8f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a75f9790600efb356d56a7af15faafe5807fc9e8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab68e306d9deb47c4e67cc0b85e4104bfe5b64914b30c62f1f7a71bd27c33ad2 +size 55532922 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..23257e6774501f364361e825142e27d130b5948f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c2b3211843c0ecec5cd751afe63968b921432fa19293ae0a6b577965a7e815 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f10bbf8f6a704dfa47ec94b708f473476ee11a8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf8709157200b07d4eb771de92956f7e5c018cbc477fd94eda8d13cfbda1494 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b6c7323dc26eb5f8c09a01670192c06aa4e7adcc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/trainer_state.json @@ -0,0 +1,48807 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 69625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + }, + { + "epoch": 3.000359066427289, + "grad_norm": 0.8630077838897705, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 41780 + }, + { + "epoch": 3.001077199281867, + "grad_norm": 0.8901806473731995, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 41790 + }, + { + "epoch": 3.0017953321364454, + "grad_norm": 0.8291767835617065, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 41800 + }, + { + "epoch": 3.0025134649910235, + "grad_norm": 0.792519211769104, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 3.0032315978456015, + "grad_norm": 1.1330063343048096, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 41820 + }, + { + "epoch": 3.0039497307001795, + "grad_norm": 0.9401350617408752, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 41830 + }, + { + "epoch": 3.0046678635547575, + "grad_norm": 0.8065463304519653, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 41840 + }, + { + "epoch": 3.005385996409336, + "grad_norm": 0.8309979438781738, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 41850 + }, + { + "epoch": 3.006104129263914, + "grad_norm": 0.7432689070701599, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 41860 + }, + { + "epoch": 3.006822262118492, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 41870 + }, + { + "epoch": 3.00754039497307, + "grad_norm": 1.4364255666732788, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 41880 + }, + { + "epoch": 3.008258527827648, + "grad_norm": 0.9023072123527527, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 41890 + }, + { + "epoch": 3.0089766606822264, + "grad_norm": 0.7790587544441223, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 41900 + }, + { + "epoch": 3.0096947935368044, + "grad_norm": 0.9163706302642822, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 41910 + }, + { + "epoch": 3.0104129263913824, + "grad_norm": 0.8147963285446167, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 41920 + }, + { + "epoch": 3.0111310592459604, + "grad_norm": 0.8432748913764954, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 41930 + }, + { + "epoch": 3.011849192100539, + "grad_norm": 0.9216182231903076, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 41940 + }, + { + "epoch": 3.012567324955117, + "grad_norm": 0.62154221534729, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 41950 + }, + { + "epoch": 3.013285457809695, + "grad_norm": 0.8902392387390137, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 41960 + }, + { + "epoch": 3.014003590664273, + "grad_norm": 0.9601083993911743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 41970 + }, + { + "epoch": 3.014721723518851, + "grad_norm": 0.8938809037208557, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 41980 + }, + { + "epoch": 3.0154398563734293, + "grad_norm": 1.0621999502182007, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 41990 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 0.7310585379600525, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 42000 + }, + { + "epoch": 3.0168761220825853, + "grad_norm": 0.8475853800773621, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 42010 + }, + { + "epoch": 3.0175942549371633, + "grad_norm": 0.8509864807128906, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 42020 + }, + { + "epoch": 3.0183123877917413, + "grad_norm": 0.7461876273155212, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 42030 + }, + { + "epoch": 3.0190305206463197, + "grad_norm": 0.7734265327453613, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 42040 + }, + { + "epoch": 3.0197486535008977, + "grad_norm": 0.9056455492973328, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 42050 + }, + { + "epoch": 3.0204667863554757, + "grad_norm": 0.9183889031410217, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 42060 + }, + { + "epoch": 3.0211849192100537, + "grad_norm": 1.0777326822280884, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 42070 + }, + { + "epoch": 3.021903052064632, + "grad_norm": 0.9217308163642883, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 42080 + }, + { + "epoch": 3.02262118491921, + "grad_norm": 0.8220202326774597, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42090 + }, + { + "epoch": 3.023339317773788, + "grad_norm": 0.8454978466033936, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 42100 + }, + { + "epoch": 3.024057450628366, + "grad_norm": 0.8116370439529419, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 42110 + }, + { + "epoch": 3.024775583482944, + "grad_norm": 0.8064935207366943, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 42120 + }, + { + "epoch": 3.0254937163375226, + "grad_norm": 0.9718650579452515, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 42130 + }, + { + "epoch": 3.0262118491921006, + "grad_norm": 0.8817588090896606, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 42140 + }, + { + "epoch": 3.0269299820466786, + "grad_norm": 0.7757318615913391, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 42150 + }, + { + "epoch": 3.0276481149012566, + "grad_norm": 0.7500545382499695, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 42160 + }, + { + "epoch": 3.0283662477558346, + "grad_norm": 0.72913658618927, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 42170 + }, + { + "epoch": 3.029084380610413, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 42180 + }, + { + "epoch": 3.029802513464991, + "grad_norm": 0.7682021856307983, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 42190 + }, + { + "epoch": 3.030520646319569, + "grad_norm": 0.8145958781242371, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 42200 + }, + { + "epoch": 3.031238779174147, + "grad_norm": 1.0546396970748901, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 42210 + }, + { + "epoch": 3.0319569120287255, + "grad_norm": 0.8222804665565491, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 42220 + }, + { + "epoch": 3.0326750448833035, + "grad_norm": 0.8245829343795776, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 42230 + }, + { + "epoch": 3.0333931777378815, + "grad_norm": 0.9059963822364807, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 42240 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 1.026747465133667, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 42250 + }, + { + "epoch": 3.0348294434470375, + "grad_norm": 0.9108404517173767, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42260 + }, + { + "epoch": 3.035547576301616, + "grad_norm": 0.9828516840934753, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 42270 + }, + { + "epoch": 3.036265709156194, + "grad_norm": 0.9664266705513, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 42280 + }, + { + "epoch": 3.036983842010772, + "grad_norm": 0.7577654719352722, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42290 + }, + { + "epoch": 3.03770197486535, + "grad_norm": 0.8331853151321411, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 42300 + }, + { + "epoch": 3.038420107719928, + "grad_norm": 0.8017228245735168, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 42310 + }, + { + "epoch": 3.0391382405745064, + "grad_norm": 1.0316718816757202, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 42320 + }, + { + "epoch": 3.0398563734290844, + "grad_norm": 0.9379803538322449, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 42330 + }, + { + "epoch": 3.0405745062836624, + "grad_norm": 0.7554476857185364, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 42340 + }, + { + "epoch": 3.0412926391382404, + "grad_norm": 0.7377917766571045, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 42350 + }, + { + "epoch": 3.042010771992819, + "grad_norm": 1.0655276775360107, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 42360 + }, + { + "epoch": 3.042728904847397, + "grad_norm": 0.7748511433601379, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 42370 + }, + { + "epoch": 3.043447037701975, + "grad_norm": 0.848649799823761, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 42380 + }, + { + "epoch": 3.044165170556553, + "grad_norm": 0.7754636406898499, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 42390 + }, + { + "epoch": 3.044883303411131, + "grad_norm": 0.8173656463623047, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 42400 + }, + { + "epoch": 3.0456014362657093, + "grad_norm": 0.7881983518600464, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 42410 + }, + { + "epoch": 3.0463195691202873, + "grad_norm": 0.971072256565094, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 42420 + }, + { + "epoch": 3.0470377019748653, + "grad_norm": 0.8400143384933472, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 42430 + }, + { + "epoch": 3.0477558348294433, + "grad_norm": 1.0028647184371948, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 42440 + }, + { + "epoch": 3.0484739676840213, + "grad_norm": 0.9728034734725952, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 42450 + }, + { + "epoch": 3.0491921005386, + "grad_norm": 0.937633752822876, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 42460 + }, + { + "epoch": 3.049910233393178, + "grad_norm": 1.0265642404556274, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 42470 + }, + { + "epoch": 3.050628366247756, + "grad_norm": 0.9733216762542725, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 42480 + }, + { + "epoch": 3.051346499102334, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 42490 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 0.7515231370925903, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 42500 + }, + { + "epoch": 3.0527827648114902, + "grad_norm": 0.9115300178527832, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 42510 + }, + { + "epoch": 3.0535008976660682, + "grad_norm": 0.7403655648231506, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 42520 + }, + { + "epoch": 3.0542190305206462, + "grad_norm": 0.7826810479164124, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 42530 + }, + { + "epoch": 3.0549371633752243, + "grad_norm": 0.8007349371910095, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 42540 + }, + { + "epoch": 3.0556552962298027, + "grad_norm": 0.7975959777832031, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 42550 + }, + { + "epoch": 3.0563734290843807, + "grad_norm": 0.9665228128433228, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42560 + }, + { + "epoch": 3.0570915619389587, + "grad_norm": 0.8386123180389404, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 42570 + }, + { + "epoch": 3.0578096947935367, + "grad_norm": 0.7437782287597656, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 42580 + }, + { + "epoch": 3.0585278276481147, + "grad_norm": 0.8360698223114014, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 42590 + }, + { + "epoch": 3.059245960502693, + "grad_norm": 0.8982073664665222, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42600 + }, + { + "epoch": 3.059964093357271, + "grad_norm": 0.9425758719444275, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 42610 + }, + { + "epoch": 3.060682226211849, + "grad_norm": 0.8567131161689758, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42620 + }, + { + "epoch": 3.061400359066427, + "grad_norm": 0.9322942495346069, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 42630 + }, + { + "epoch": 3.0621184919210056, + "grad_norm": 0.8283235430717468, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 42640 + }, + { + "epoch": 3.0628366247755836, + "grad_norm": 0.8457967638969421, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 42650 + }, + { + "epoch": 3.0635547576301616, + "grad_norm": 0.8205100893974304, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42660 + }, + { + "epoch": 3.0642728904847396, + "grad_norm": 0.8385181427001953, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 42670 + }, + { + "epoch": 3.0649910233393176, + "grad_norm": 1.2959390878677368, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 42680 + }, + { + "epoch": 3.065709156193896, + "grad_norm": 0.7150540351867676, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 42690 + }, + { + "epoch": 3.066427289048474, + "grad_norm": 0.6647360920906067, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 42700 + }, + { + "epoch": 3.067145421903052, + "grad_norm": 0.9148316979408264, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 42710 + }, + { + "epoch": 3.06786355475763, + "grad_norm": 0.8606209754943848, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 42720 + }, + { + "epoch": 3.068581687612208, + "grad_norm": 1.4255632162094116, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42730 + }, + { + "epoch": 3.0692998204667865, + "grad_norm": 0.9131710529327393, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 42740 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 0.9560360908508301, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 42750 + }, + { + "epoch": 3.0707360861759425, + "grad_norm": 0.9278100728988647, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42760 + }, + { + "epoch": 3.0714542190305205, + "grad_norm": 0.7258471846580505, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 42770 + }, + { + "epoch": 3.072172351885099, + "grad_norm": 1.1537690162658691, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 42780 + }, + { + "epoch": 3.072890484739677, + "grad_norm": 0.8562588691711426, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 42790 + }, + { + "epoch": 3.073608617594255, + "grad_norm": 1.0271626710891724, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 42800 + }, + { + "epoch": 3.074326750448833, + "grad_norm": 0.85148024559021, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 42810 + }, + { + "epoch": 3.075044883303411, + "grad_norm": 0.805772602558136, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 42820 + }, + { + "epoch": 3.0757630161579894, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 42830 + }, + { + "epoch": 3.0764811490125674, + "grad_norm": 0.7997274994850159, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 42840 + }, + { + "epoch": 3.0771992818671454, + "grad_norm": 0.8739321231842041, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 42850 + }, + { + "epoch": 3.0779174147217234, + "grad_norm": 0.833951473236084, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 42860 + }, + { + "epoch": 3.0786355475763014, + "grad_norm": 0.8813839554786682, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 42870 + }, + { + "epoch": 3.07935368043088, + "grad_norm": 0.9020521640777588, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 42880 + }, + { + "epoch": 3.080071813285458, + "grad_norm": 0.888148844242096, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 42890 + }, + { + "epoch": 3.080789946140036, + "grad_norm": 0.8110589385032654, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 42900 + }, + { + "epoch": 3.081508078994614, + "grad_norm": 0.818738579750061, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 42910 + }, + { + "epoch": 3.082226211849192, + "grad_norm": 0.9607479572296143, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 42920 + }, + { + "epoch": 3.0829443447037703, + "grad_norm": 0.8162698745727539, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 42930 + }, + { + "epoch": 3.0836624775583483, + "grad_norm": 0.8170801997184753, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 42940 + }, + { + "epoch": 3.0843806104129263, + "grad_norm": 0.9250763654708862, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 42950 + }, + { + "epoch": 3.0850987432675043, + "grad_norm": 0.898097813129425, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 42960 + }, + { + "epoch": 3.0858168761220828, + "grad_norm": 0.9398433566093445, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 42970 + }, + { + "epoch": 3.0865350089766608, + "grad_norm": 1.052808165550232, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 42980 + }, + { + "epoch": 3.087253141831239, + "grad_norm": 0.8974723219871521, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 42990 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 0.7517408728599548, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 43000 + }, + { + "epoch": 3.088689407540395, + "grad_norm": 0.8054485321044922, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 43010 + }, + { + "epoch": 3.0894075403949732, + "grad_norm": 0.9896154999732971, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 43020 + }, + { + "epoch": 3.0901256732495512, + "grad_norm": 0.7887356281280518, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 43030 + }, + { + "epoch": 3.0908438061041292, + "grad_norm": 1.0119125843048096, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 43040 + }, + { + "epoch": 3.0915619389587072, + "grad_norm": 0.8753892779350281, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 43050 + }, + { + "epoch": 3.0922800718132857, + "grad_norm": 0.8322654962539673, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43060 + }, + { + "epoch": 3.0929982046678637, + "grad_norm": 1.0605992078781128, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 43070 + }, + { + "epoch": 3.0937163375224417, + "grad_norm": 0.8783912062644958, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 43080 + }, + { + "epoch": 3.0944344703770197, + "grad_norm": 0.8839107751846313, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 43090 + }, + { + "epoch": 3.0951526032315977, + "grad_norm": 1.1655086278915405, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 43100 + }, + { + "epoch": 3.095870736086176, + "grad_norm": 0.7051523327827454, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 43110 + }, + { + "epoch": 3.096588868940754, + "grad_norm": 0.7793807983398438, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43120 + }, + { + "epoch": 3.097307001795332, + "grad_norm": 0.8352194428443909, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 43130 + }, + { + "epoch": 3.09802513464991, + "grad_norm": 0.9684847593307495, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 43140 + }, + { + "epoch": 3.098743267504488, + "grad_norm": 1.1106340885162354, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 43150 + }, + { + "epoch": 3.0994614003590666, + "grad_norm": 0.7814911603927612, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 43160 + }, + { + "epoch": 3.1001795332136446, + "grad_norm": 0.7923110723495483, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 43170 + }, + { + "epoch": 3.1008976660682226, + "grad_norm": 0.87022864818573, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 43180 + }, + { + "epoch": 3.1016157989228006, + "grad_norm": 0.9352855682373047, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 43190 + }, + { + "epoch": 3.1023339317773786, + "grad_norm": 0.8548445105552673, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 43200 + }, + { + "epoch": 3.103052064631957, + "grad_norm": 0.9576025009155273, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 43210 + }, + { + "epoch": 3.103770197486535, + "grad_norm": 0.7430430054664612, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 43220 + }, + { + "epoch": 3.104488330341113, + "grad_norm": 0.9619144797325134, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 43230 + }, + { + "epoch": 3.105206463195691, + "grad_norm": 0.8622338771820068, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 43240 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 0.853489339351654, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43250 + }, + { + "epoch": 3.1066427289048475, + "grad_norm": 0.9253206849098206, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 43260 + }, + { + "epoch": 3.1073608617594255, + "grad_norm": 0.9700671434402466, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 43270 + }, + { + "epoch": 3.1080789946140035, + "grad_norm": 1.0550731420516968, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 43280 + }, + { + "epoch": 3.1087971274685815, + "grad_norm": 0.939452052116394, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 43290 + }, + { + "epoch": 3.10951526032316, + "grad_norm": 0.8855276107788086, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 43300 + }, + { + "epoch": 3.110233393177738, + "grad_norm": 0.92197185754776, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 43310 + }, + { + "epoch": 3.110951526032316, + "grad_norm": 0.8825578689575195, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 43320 + }, + { + "epoch": 3.111669658886894, + "grad_norm": 0.9964608550071716, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 43330 + }, + { + "epoch": 3.1123877917414724, + "grad_norm": 0.9070520401000977, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 43340 + }, + { + "epoch": 3.1131059245960504, + "grad_norm": 0.9699633717536926, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 43350 + }, + { + "epoch": 3.1138240574506284, + "grad_norm": 0.7384091019630432, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 43360 + }, + { + "epoch": 3.1145421903052064, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 43370 + }, + { + "epoch": 3.1152603231597844, + "grad_norm": 0.8906524181365967, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 43380 + }, + { + "epoch": 3.115978456014363, + "grad_norm": 0.8850129246711731, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 43390 + }, + { + "epoch": 3.116696588868941, + "grad_norm": 0.7091860771179199, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 43400 + }, + { + "epoch": 3.117414721723519, + "grad_norm": 0.8992764949798584, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 43410 + }, + { + "epoch": 3.118132854578097, + "grad_norm": 0.9166698455810547, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43420 + }, + { + "epoch": 3.118850987432675, + "grad_norm": 1.1195749044418335, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 43430 + }, + { + "epoch": 3.1195691202872533, + "grad_norm": 0.9414069652557373, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 43440 + }, + { + "epoch": 3.1202872531418313, + "grad_norm": 0.7641217112541199, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 43450 + }, + { + "epoch": 3.1210053859964093, + "grad_norm": 1.2659285068511963, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 43460 + }, + { + "epoch": 3.1217235188509873, + "grad_norm": 0.9968213438987732, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 43470 + }, + { + "epoch": 3.1224416517055653, + "grad_norm": 0.8819042444229126, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 43480 + }, + { + "epoch": 3.1231597845601438, + "grad_norm": 0.9124775528907776, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 43490 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 0.868354082107544, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 3.1245960502692998, + "grad_norm": 0.7367526292800903, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 43510 + }, + { + "epoch": 3.1253141831238778, + "grad_norm": 0.7553679943084717, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43520 + }, + { + "epoch": 3.126032315978456, + "grad_norm": 0.7970008850097656, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 43530 + }, + { + "epoch": 3.126750448833034, + "grad_norm": 0.9117488861083984, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 43540 + }, + { + "epoch": 3.127468581687612, + "grad_norm": 0.8004103899002075, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 43550 + }, + { + "epoch": 3.12818671454219, + "grad_norm": 0.736518919467926, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 43560 + }, + { + "epoch": 3.128904847396768, + "grad_norm": 0.8568395376205444, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 43570 + }, + { + "epoch": 3.1296229802513467, + "grad_norm": 0.9344052672386169, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 43580 + }, + { + "epoch": 3.1303411131059247, + "grad_norm": 0.7986525297164917, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 43590 + }, + { + "epoch": 3.1310592459605027, + "grad_norm": 0.8283242583274841, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 43600 + }, + { + "epoch": 3.1317773788150807, + "grad_norm": 0.6534292101860046, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 43610 + }, + { + "epoch": 3.132495511669659, + "grad_norm": 0.9585428833961487, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 43620 + }, + { + "epoch": 3.133213644524237, + "grad_norm": 0.8299157023429871, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 43630 + }, + { + "epoch": 3.133931777378815, + "grad_norm": 0.9050052762031555, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 43640 + }, + { + "epoch": 3.134649910233393, + "grad_norm": 1.0457062721252441, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 43650 + }, + { + "epoch": 3.135368043087971, + "grad_norm": 0.907691240310669, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 43660 + }, + { + "epoch": 3.1360861759425496, + "grad_norm": 0.8868935108184814, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 43670 + }, + { + "epoch": 3.1368043087971276, + "grad_norm": 0.8585456609725952, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 43680 + }, + { + "epoch": 3.1375224416517056, + "grad_norm": 1.0402741432189941, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 43690 + }, + { + "epoch": 3.1382405745062836, + "grad_norm": 1.0866798162460327, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 43700 + }, + { + "epoch": 3.1389587073608616, + "grad_norm": 0.7637296915054321, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 43710 + }, + { + "epoch": 3.13967684021544, + "grad_norm": 0.755235493183136, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 43720 + }, + { + "epoch": 3.140394973070018, + "grad_norm": 0.7258853316307068, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 43730 + }, + { + "epoch": 3.141113105924596, + "grad_norm": 1.0425268411636353, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 43740 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 43750 + }, + { + "epoch": 3.142549371633752, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 43760 + }, + { + "epoch": 3.1432675044883305, + "grad_norm": 0.9879246354103088, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 43770 + }, + { + "epoch": 3.1439856373429085, + "grad_norm": 0.7853389382362366, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 43780 + }, + { + "epoch": 3.1447037701974865, + "grad_norm": 1.0245232582092285, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 43790 + }, + { + "epoch": 3.1454219030520645, + "grad_norm": 0.8486390113830566, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 43800 + }, + { + "epoch": 3.146140035906643, + "grad_norm": 0.8536406755447388, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 43810 + }, + { + "epoch": 3.146858168761221, + "grad_norm": 0.9653734564781189, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 43820 + }, + { + "epoch": 3.147576301615799, + "grad_norm": 0.8292608857154846, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 43830 + }, + { + "epoch": 3.148294434470377, + "grad_norm": 1.147524118423462, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 43840 + }, + { + "epoch": 3.149012567324955, + "grad_norm": 0.9317546486854553, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 43850 + }, + { + "epoch": 3.1497307001795334, + "grad_norm": 0.8651045560836792, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 43860 + }, + { + "epoch": 3.1504488330341114, + "grad_norm": 0.8718969225883484, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 43870 + }, + { + "epoch": 3.1511669658886894, + "grad_norm": 1.0140702724456787, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 43880 + }, + { + "epoch": 3.1518850987432674, + "grad_norm": 0.75941401720047, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43890 + }, + { + "epoch": 3.152603231597846, + "grad_norm": 0.6618940234184265, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 43900 + }, + { + "epoch": 3.153321364452424, + "grad_norm": 1.0013338327407837, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 43910 + }, + { + "epoch": 3.154039497307002, + "grad_norm": 0.8735299706459045, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 43920 + }, + { + "epoch": 3.15475763016158, + "grad_norm": 1.141914963722229, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 43930 + }, + { + "epoch": 3.155475763016158, + "grad_norm": 1.0916038751602173, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 43940 + }, + { + "epoch": 3.1561938958707363, + "grad_norm": 0.7042547464370728, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 43950 + }, + { + "epoch": 3.1569120287253143, + "grad_norm": 0.9885236620903015, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 43960 + }, + { + "epoch": 3.1576301615798923, + "grad_norm": 0.8083009719848633, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 43970 + }, + { + "epoch": 3.1583482944344703, + "grad_norm": 1.082627296447754, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 43980 + }, + { + "epoch": 3.1590664272890483, + "grad_norm": 0.9293290376663208, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 43990 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 0.861003041267395, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 44000 + }, + { + "epoch": 3.1605026929982047, + "grad_norm": 0.9565994143486023, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 44010 + }, + { + "epoch": 3.1612208258527827, + "grad_norm": 0.9609305262565613, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 44020 + }, + { + "epoch": 3.1619389587073607, + "grad_norm": 0.847830593585968, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 44030 + }, + { + "epoch": 3.1626570915619387, + "grad_norm": 0.852357804775238, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 44040 + }, + { + "epoch": 3.163375224416517, + "grad_norm": 0.8634562492370605, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44050 + }, + { + "epoch": 3.164093357271095, + "grad_norm": 1.0259950160980225, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 44060 + }, + { + "epoch": 3.164811490125673, + "grad_norm": 0.9615250825881958, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 44070 + }, + { + "epoch": 3.165529622980251, + "grad_norm": 0.9892165660858154, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 44080 + }, + { + "epoch": 3.1662477558348296, + "grad_norm": 0.8827354907989502, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 44090 + }, + { + "epoch": 3.1669658886894076, + "grad_norm": 0.9258168339729309, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 44100 + }, + { + "epoch": 3.1676840215439857, + "grad_norm": 0.7983399033546448, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 44110 + }, + { + "epoch": 3.1684021543985637, + "grad_norm": 0.9917809963226318, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 44120 + }, + { + "epoch": 3.1691202872531417, + "grad_norm": 1.058927297592163, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44130 + }, + { + "epoch": 3.16983842010772, + "grad_norm": 1.0095895528793335, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44140 + }, + { + "epoch": 3.170556552962298, + "grad_norm": 0.9032495617866516, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 44150 + }, + { + "epoch": 3.171274685816876, + "grad_norm": 0.9391272664070129, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 44160 + }, + { + "epoch": 3.171992818671454, + "grad_norm": 0.990755558013916, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44170 + }, + { + "epoch": 3.172710951526032, + "grad_norm": 0.9310759902000427, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 44180 + }, + { + "epoch": 3.1734290843806106, + "grad_norm": 0.7698856592178345, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 44190 + }, + { + "epoch": 3.1741472172351886, + "grad_norm": 0.7735867500305176, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 44200 + }, + { + "epoch": 3.1748653500897666, + "grad_norm": 1.1447525024414062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 44210 + }, + { + "epoch": 3.1755834829443446, + "grad_norm": 0.8667060136795044, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 44220 + }, + { + "epoch": 3.176301615798923, + "grad_norm": 0.8596829771995544, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 44230 + }, + { + "epoch": 3.177019748653501, + "grad_norm": 0.8607654571533203, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 44240 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 0.9346948266029358, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 44250 + }, + { + "epoch": 3.178456014362657, + "grad_norm": 0.852344810962677, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 44260 + }, + { + "epoch": 3.179174147217235, + "grad_norm": 0.9260450005531311, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 44270 + }, + { + "epoch": 3.1798922800718135, + "grad_norm": 0.924053430557251, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 44280 + }, + { + "epoch": 3.1806104129263915, + "grad_norm": 1.001965045928955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 44290 + }, + { + "epoch": 3.1813285457809695, + "grad_norm": 0.943215012550354, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44300 + }, + { + "epoch": 3.1820466786355475, + "grad_norm": 1.006977915763855, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 44310 + }, + { + "epoch": 3.1827648114901255, + "grad_norm": 0.9768950343132019, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 44320 + }, + { + "epoch": 3.183482944344704, + "grad_norm": 0.9297489523887634, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 44330 + }, + { + "epoch": 3.184201077199282, + "grad_norm": 0.9110919237136841, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 44340 + }, + { + "epoch": 3.18491921005386, + "grad_norm": 0.9821381568908691, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 44350 + }, + { + "epoch": 3.185637342908438, + "grad_norm": 0.8451243042945862, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 44360 + }, + { + "epoch": 3.1863554757630164, + "grad_norm": 0.9676638245582581, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 44370 + }, + { + "epoch": 3.1870736086175944, + "grad_norm": 0.9826035499572754, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 44380 + }, + { + "epoch": 3.1877917414721724, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 44390 + }, + { + "epoch": 3.1885098743267504, + "grad_norm": 0.7766330242156982, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 44400 + }, + { + "epoch": 3.1892280071813284, + "grad_norm": 0.9302349090576172, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 44410 + }, + { + "epoch": 3.189946140035907, + "grad_norm": 0.8335331082344055, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 44420 + }, + { + "epoch": 3.190664272890485, + "grad_norm": 0.6722736358642578, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 44430 + }, + { + "epoch": 3.191382405745063, + "grad_norm": 0.9047536849975586, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 44440 + }, + { + "epoch": 3.192100538599641, + "grad_norm": 0.9653822183609009, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 44450 + }, + { + "epoch": 3.192818671454219, + "grad_norm": 0.7750703692436218, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 44460 + }, + { + "epoch": 3.1935368043087973, + "grad_norm": 0.7767539024353027, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 44470 + }, + { + "epoch": 3.1942549371633753, + "grad_norm": 0.8597778081893921, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44480 + }, + { + "epoch": 3.1949730700179533, + "grad_norm": 1.1711493730545044, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 44490 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 0.9025220274925232, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 44500 + }, + { + "epoch": 3.1964093357271093, + "grad_norm": 0.8084979057312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44510 + }, + { + "epoch": 3.1971274685816877, + "grad_norm": 0.8475074172019958, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44520 + }, + { + "epoch": 3.1978456014362657, + "grad_norm": 0.9915644526481628, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 44530 + }, + { + "epoch": 3.1985637342908437, + "grad_norm": 0.992231547832489, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 44540 + }, + { + "epoch": 3.1992818671454217, + "grad_norm": 0.9804556369781494, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 44550 + }, + { + "epoch": 3.2, + "grad_norm": 1.045558214187622, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 44560 + }, + { + "epoch": 3.200718132854578, + "grad_norm": 1.0880261659622192, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 44570 + }, + { + "epoch": 3.201436265709156, + "grad_norm": 0.9511138200759888, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44580 + }, + { + "epoch": 3.202154398563734, + "grad_norm": 0.9115344882011414, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 44590 + }, + { + "epoch": 3.202872531418312, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 44600 + }, + { + "epoch": 3.2035906642728906, + "grad_norm": 0.8209697604179382, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44610 + }, + { + "epoch": 3.2043087971274686, + "grad_norm": 0.9220197796821594, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44620 + }, + { + "epoch": 3.2050269299820466, + "grad_norm": 0.8859700560569763, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 44630 + }, + { + "epoch": 3.2057450628366246, + "grad_norm": 0.9772757291793823, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 44640 + }, + { + "epoch": 3.206463195691203, + "grad_norm": 0.9385574460029602, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 44650 + }, + { + "epoch": 3.207181328545781, + "grad_norm": 0.839958906173706, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 44660 + }, + { + "epoch": 3.207899461400359, + "grad_norm": 0.860478401184082, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 44670 + }, + { + "epoch": 3.208617594254937, + "grad_norm": 0.846886396408081, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 44680 + }, + { + "epoch": 3.209335727109515, + "grad_norm": 0.8591006398200989, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 44690 + }, + { + "epoch": 3.2100538599640935, + "grad_norm": 0.9236023426055908, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 44700 + }, + { + "epoch": 3.2107719928186715, + "grad_norm": 0.7348999977111816, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44710 + }, + { + "epoch": 3.2114901256732495, + "grad_norm": 1.0041730403900146, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 44720 + }, + { + "epoch": 3.2122082585278275, + "grad_norm": 0.8382687568664551, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 44730 + }, + { + "epoch": 3.2129263913824055, + "grad_norm": 0.8253511190414429, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 44740 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 0.9589242935180664, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 44750 + }, + { + "epoch": 3.214362657091562, + "grad_norm": 0.8938157558441162, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 44760 + }, + { + "epoch": 3.21508078994614, + "grad_norm": 1.0085135698318481, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 44770 + }, + { + "epoch": 3.215798922800718, + "grad_norm": 0.8647134304046631, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 44780 + }, + { + "epoch": 3.216517055655296, + "grad_norm": 1.09453284740448, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 44790 + }, + { + "epoch": 3.2172351885098744, + "grad_norm": 0.8710666298866272, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 44800 + }, + { + "epoch": 3.2179533213644524, + "grad_norm": 0.8080880641937256, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 44810 + }, + { + "epoch": 3.2186714542190304, + "grad_norm": 1.0440675020217896, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 44820 + }, + { + "epoch": 3.2193895870736084, + "grad_norm": 1.1036376953125, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 44830 + }, + { + "epoch": 3.220107719928187, + "grad_norm": 0.8783546686172485, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44840 + }, + { + "epoch": 3.220825852782765, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 44850 + }, + { + "epoch": 3.221543985637343, + "grad_norm": 1.0099157094955444, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 44860 + }, + { + "epoch": 3.222262118491921, + "grad_norm": 1.054928183555603, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 44870 + }, + { + "epoch": 3.222980251346499, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 44880 + }, + { + "epoch": 3.2236983842010773, + "grad_norm": 0.9730798602104187, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 44890 + }, + { + "epoch": 3.2244165170556554, + "grad_norm": 0.7911382913589478, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 44900 + }, + { + "epoch": 3.2251346499102334, + "grad_norm": 0.9574400782585144, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 44910 + }, + { + "epoch": 3.2258527827648114, + "grad_norm": 0.8101068139076233, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 44920 + }, + { + "epoch": 3.22657091561939, + "grad_norm": 0.754146933555603, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 44930 + }, + { + "epoch": 3.227289048473968, + "grad_norm": 0.7471939921379089, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 44940 + }, + { + "epoch": 3.228007181328546, + "grad_norm": 1.0040855407714844, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 44950 + }, + { + "epoch": 3.228725314183124, + "grad_norm": 1.0016074180603027, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 44960 + }, + { + "epoch": 3.229443447037702, + "grad_norm": 1.0432976484298706, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 44970 + }, + { + "epoch": 3.2301615798922803, + "grad_norm": 0.8517055511474609, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 44980 + }, + { + "epoch": 3.2308797127468583, + "grad_norm": 0.9174178242683411, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 44990 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 0.9733774065971375, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 45000 + }, + { + "epoch": 3.2323159784560143, + "grad_norm": 0.9074714779853821, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 45010 + }, + { + "epoch": 3.2330341113105923, + "grad_norm": 0.8802759051322937, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 45020 + }, + { + "epoch": 3.2337522441651707, + "grad_norm": 1.0620871782302856, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 45030 + }, + { + "epoch": 3.2344703770197487, + "grad_norm": 0.8069542050361633, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 45040 + }, + { + "epoch": 3.2351885098743267, + "grad_norm": 0.9139137864112854, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 45050 + }, + { + "epoch": 3.2359066427289047, + "grad_norm": 0.8936411142349243, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 45060 + }, + { + "epoch": 3.2366247755834827, + "grad_norm": 0.9098079204559326, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 45070 + }, + { + "epoch": 3.237342908438061, + "grad_norm": 1.062953233718872, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45080 + }, + { + "epoch": 3.238061041292639, + "grad_norm": 0.8656470775604248, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 45090 + }, + { + "epoch": 3.238779174147217, + "grad_norm": 0.9299449920654297, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 45100 + }, + { + "epoch": 3.239497307001795, + "grad_norm": 1.0102022886276245, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 45110 + }, + { + "epoch": 3.2402154398563736, + "grad_norm": 0.8074561953544617, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 45120 + }, + { + "epoch": 3.2409335727109516, + "grad_norm": 1.044105887413025, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 45130 + }, + { + "epoch": 3.2416517055655296, + "grad_norm": 0.8742762207984924, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 45140 + }, + { + "epoch": 3.2423698384201076, + "grad_norm": 0.8240015506744385, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 45150 + }, + { + "epoch": 3.2430879712746856, + "grad_norm": 0.8438951373100281, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 45160 + }, + { + "epoch": 3.243806104129264, + "grad_norm": 1.02358877658844, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 45170 + }, + { + "epoch": 3.244524236983842, + "grad_norm": 0.8824774026870728, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 45180 + }, + { + "epoch": 3.24524236983842, + "grad_norm": 0.971015989780426, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 45190 + }, + { + "epoch": 3.245960502692998, + "grad_norm": 0.9282383918762207, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 45200 + }, + { + "epoch": 3.2466786355475765, + "grad_norm": 0.7908362746238708, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 45210 + }, + { + "epoch": 3.2473967684021545, + "grad_norm": 1.0721662044525146, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 45220 + }, + { + "epoch": 3.2481149012567325, + "grad_norm": 0.9516810774803162, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 45230 + }, + { + "epoch": 3.2488330341113105, + "grad_norm": 0.7914131283760071, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 45240 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 0.8492292761802673, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 45250 + }, + { + "epoch": 3.250269299820467, + "grad_norm": 0.8880114555358887, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 45260 + }, + { + "epoch": 3.250987432675045, + "grad_norm": 0.7808310985565186, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 45270 + }, + { + "epoch": 3.251705565529623, + "grad_norm": 0.8566828966140747, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 45280 + }, + { + "epoch": 3.252423698384201, + "grad_norm": 0.7929658889770508, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45290 + }, + { + "epoch": 3.253141831238779, + "grad_norm": 0.678207516670227, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 45300 + }, + { + "epoch": 3.2538599640933574, + "grad_norm": 0.9963029623031616, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45310 + }, + { + "epoch": 3.2545780969479354, + "grad_norm": 0.835304856300354, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 45320 + }, + { + "epoch": 3.2552962298025134, + "grad_norm": 0.7281617522239685, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 45330 + }, + { + "epoch": 3.2560143626570914, + "grad_norm": 1.244890570640564, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 45340 + }, + { + "epoch": 3.2567324955116694, + "grad_norm": 0.8372750282287598, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 45350 + }, + { + "epoch": 3.257450628366248, + "grad_norm": 1.0029667615890503, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 45360 + }, + { + "epoch": 3.258168761220826, + "grad_norm": 0.8561908602714539, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 45370 + }, + { + "epoch": 3.258886894075404, + "grad_norm": 1.0058085918426514, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 45380 + }, + { + "epoch": 3.259605026929982, + "grad_norm": 0.7768221497535706, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 45390 + }, + { + "epoch": 3.2603231597845603, + "grad_norm": 0.8443793058395386, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 45400 + }, + { + "epoch": 3.2610412926391383, + "grad_norm": 1.0140392780303955, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 45410 + }, + { + "epoch": 3.2617594254937163, + "grad_norm": 0.8397058248519897, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 45420 + }, + { + "epoch": 3.2624775583482943, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 45430 + }, + { + "epoch": 3.2631956912028723, + "grad_norm": 1.0279473066329956, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 45440 + }, + { + "epoch": 3.263913824057451, + "grad_norm": 1.207457184791565, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 45450 + }, + { + "epoch": 3.264631956912029, + "grad_norm": 0.8121998906135559, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 45460 + }, + { + "epoch": 3.265350089766607, + "grad_norm": 1.037733554840088, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 45470 + }, + { + "epoch": 3.266068222621185, + "grad_norm": 0.9305754899978638, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 45480 + }, + { + "epoch": 3.2667863554757632, + "grad_norm": 0.9733602404594421, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 45500 + }, + { + "epoch": 3.2682226211849192, + "grad_norm": 0.8601692318916321, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45510 + }, + { + "epoch": 3.2689407540394972, + "grad_norm": 0.7921277284622192, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 45520 + }, + { + "epoch": 3.2696588868940752, + "grad_norm": 0.8324153423309326, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 45530 + }, + { + "epoch": 3.2703770197486537, + "grad_norm": 0.85141521692276, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 45540 + }, + { + "epoch": 3.2710951526032317, + "grad_norm": 0.9399608373641968, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 45550 + }, + { + "epoch": 3.2718132854578097, + "grad_norm": 0.9829166531562805, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 45560 + }, + { + "epoch": 3.2725314183123877, + "grad_norm": 0.9936266541481018, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 45570 + }, + { + "epoch": 3.2732495511669657, + "grad_norm": 1.036165714263916, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 45580 + }, + { + "epoch": 3.273967684021544, + "grad_norm": 0.8988680243492126, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45590 + }, + { + "epoch": 3.274685816876122, + "grad_norm": 0.9173405766487122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 45600 + }, + { + "epoch": 3.2754039497307, + "grad_norm": 0.9967324733734131, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 45610 + }, + { + "epoch": 3.276122082585278, + "grad_norm": 0.9097777009010315, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 45620 + }, + { + "epoch": 3.276840215439856, + "grad_norm": 1.0559430122375488, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 45630 + }, + { + "epoch": 3.2775583482944346, + "grad_norm": 0.9583360552787781, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 45640 + }, + { + "epoch": 3.2782764811490126, + "grad_norm": 0.7630334496498108, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 45650 + }, + { + "epoch": 3.2789946140035906, + "grad_norm": 0.9955230355262756, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 45660 + }, + { + "epoch": 3.2797127468581686, + "grad_norm": 0.8685793876647949, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45670 + }, + { + "epoch": 3.280430879712747, + "grad_norm": 0.919913113117218, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 45680 + }, + { + "epoch": 3.281149012567325, + "grad_norm": 0.826144814491272, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 45690 + }, + { + "epoch": 3.281867145421903, + "grad_norm": 0.9750179052352905, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 45700 + }, + { + "epoch": 3.282585278276481, + "grad_norm": 0.7931897640228271, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 45710 + }, + { + "epoch": 3.283303411131059, + "grad_norm": 1.0380089282989502, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 45720 + }, + { + "epoch": 3.2840215439856375, + "grad_norm": 0.8220566511154175, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 45730 + }, + { + "epoch": 3.2847396768402155, + "grad_norm": 0.9688239693641663, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 45740 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 0.8760311603546143, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 45750 + }, + { + "epoch": 3.2861759425493715, + "grad_norm": 0.8103382587432861, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 45760 + }, + { + "epoch": 3.28689407540395, + "grad_norm": 0.8835865259170532, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 45770 + }, + { + "epoch": 3.287612208258528, + "grad_norm": 0.9021160006523132, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45780 + }, + { + "epoch": 3.288330341113106, + "grad_norm": 0.8182386159896851, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 45790 + }, + { + "epoch": 3.289048473967684, + "grad_norm": 0.8555024862289429, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45800 + }, + { + "epoch": 3.289766606822262, + "grad_norm": 1.0982348918914795, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 45810 + }, + { + "epoch": 3.2904847396768404, + "grad_norm": 1.06246817111969, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 45820 + }, + { + "epoch": 3.2912028725314184, + "grad_norm": 1.1727149486541748, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 45830 + }, + { + "epoch": 3.2919210053859964, + "grad_norm": 0.8224700093269348, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 45840 + }, + { + "epoch": 3.2926391382405744, + "grad_norm": 0.8195698261260986, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 45850 + }, + { + "epoch": 3.2933572710951524, + "grad_norm": 0.8424476981163025, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 45860 + }, + { + "epoch": 3.294075403949731, + "grad_norm": 0.9804632067680359, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 45870 + }, + { + "epoch": 3.294793536804309, + "grad_norm": 0.8701804876327515, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 45880 + }, + { + "epoch": 3.295511669658887, + "grad_norm": 0.8876864910125732, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 45890 + }, + { + "epoch": 3.296229802513465, + "grad_norm": 1.0105448961257935, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 45900 + }, + { + "epoch": 3.296947935368043, + "grad_norm": 0.847017228603363, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 45910 + }, + { + "epoch": 3.2976660682226213, + "grad_norm": 0.7610297799110413, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 45920 + }, + { + "epoch": 3.2983842010771993, + "grad_norm": 0.7272670269012451, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 45930 + }, + { + "epoch": 3.2991023339317773, + "grad_norm": 0.8243510127067566, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 45940 + }, + { + "epoch": 3.2998204667863553, + "grad_norm": 1.0113074779510498, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 45950 + }, + { + "epoch": 3.3005385996409338, + "grad_norm": 0.8578087687492371, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 45960 + }, + { + "epoch": 3.3012567324955118, + "grad_norm": 0.9511606097221375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 45970 + }, + { + "epoch": 3.3019748653500898, + "grad_norm": 0.8612566590309143, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 45980 + }, + { + "epoch": 3.3026929982046678, + "grad_norm": 0.8702331185340881, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 45990 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 1.0229583978652954, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 46000 + }, + { + "epoch": 3.304129263913824, + "grad_norm": 1.1775577068328857, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 46010 + }, + { + "epoch": 3.3048473967684022, + "grad_norm": 0.9922171831130981, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 46020 + }, + { + "epoch": 3.3055655296229802, + "grad_norm": 0.8246880769729614, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 46030 + }, + { + "epoch": 3.3062836624775582, + "grad_norm": 0.9351653456687927, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 46040 + }, + { + "epoch": 3.3070017953321367, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 46050 + }, + { + "epoch": 3.3077199281867147, + "grad_norm": 0.9753885269165039, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 46060 + }, + { + "epoch": 3.3084380610412927, + "grad_norm": 0.8532425165176392, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 46070 + }, + { + "epoch": 3.3091561938958707, + "grad_norm": 0.9722012877464294, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 46080 + }, + { + "epoch": 3.3098743267504487, + "grad_norm": 0.8950021266937256, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 46090 + }, + { + "epoch": 3.3105924596050267, + "grad_norm": 0.8536333441734314, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46100 + }, + { + "epoch": 3.311310592459605, + "grad_norm": 0.9423946738243103, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46110 + }, + { + "epoch": 3.312028725314183, + "grad_norm": 0.8573169112205505, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 46120 + }, + { + "epoch": 3.312746858168761, + "grad_norm": 1.0122376680374146, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 46130 + }, + { + "epoch": 3.313464991023339, + "grad_norm": 0.7492560744285583, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 46140 + }, + { + "epoch": 3.3141831238779176, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 46150 + }, + { + "epoch": 3.3149012567324956, + "grad_norm": 1.1191970109939575, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 46160 + }, + { + "epoch": 3.3156193895870736, + "grad_norm": 0.9847373962402344, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 46170 + }, + { + "epoch": 3.3163375224416516, + "grad_norm": 0.7315911054611206, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 46180 + }, + { + "epoch": 3.3170556552962296, + "grad_norm": 0.8267890214920044, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 46190 + }, + { + "epoch": 3.317773788150808, + "grad_norm": 0.8898099064826965, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 46200 + }, + { + "epoch": 3.318491921005386, + "grad_norm": 0.8525369167327881, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 46210 + }, + { + "epoch": 3.319210053859964, + "grad_norm": 0.8074760437011719, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 46220 + }, + { + "epoch": 3.319928186714542, + "grad_norm": 0.8473616242408752, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 46230 + }, + { + "epoch": 3.3206463195691205, + "grad_norm": 0.8678314089775085, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 46240 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 0.8718782067298889, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 46250 + }, + { + "epoch": 3.3220825852782765, + "grad_norm": 0.9384858012199402, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 46260 + }, + { + "epoch": 3.3228007181328545, + "grad_norm": 0.9295032620429993, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 46270 + }, + { + "epoch": 3.3235188509874325, + "grad_norm": 0.9472482800483704, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 46280 + }, + { + "epoch": 3.324236983842011, + "grad_norm": 0.7970638275146484, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 46290 + }, + { + "epoch": 3.324955116696589, + "grad_norm": 0.9508723020553589, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 46300 + }, + { + "epoch": 3.325673249551167, + "grad_norm": 0.9153636693954468, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 46310 + }, + { + "epoch": 3.326391382405745, + "grad_norm": 0.7890323400497437, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 46320 + }, + { + "epoch": 3.3271095152603234, + "grad_norm": 0.8711825609207153, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46330 + }, + { + "epoch": 3.3278276481149014, + "grad_norm": 0.9938926696777344, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 46340 + }, + { + "epoch": 3.3285457809694794, + "grad_norm": 0.8497524857521057, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 46350 + }, + { + "epoch": 3.3292639138240574, + "grad_norm": 0.9191650748252869, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 46360 + }, + { + "epoch": 3.3299820466786354, + "grad_norm": 0.8974085450172424, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 46370 + }, + { + "epoch": 3.3307001795332134, + "grad_norm": 0.9928934574127197, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 46380 + }, + { + "epoch": 3.331418312387792, + "grad_norm": 0.9011030197143555, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46390 + }, + { + "epoch": 3.33213644524237, + "grad_norm": 0.898594856262207, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 46400 + }, + { + "epoch": 3.332854578096948, + "grad_norm": 0.7506672143936157, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 46410 + }, + { + "epoch": 3.333572710951526, + "grad_norm": 0.9239172339439392, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 46420 + }, + { + "epoch": 3.3342908438061043, + "grad_norm": 1.0749682188034058, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46430 + }, + { + "epoch": 3.3350089766606823, + "grad_norm": 0.9262617230415344, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 46440 + }, + { + "epoch": 3.3357271095152603, + "grad_norm": 0.8681274056434631, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 46450 + }, + { + "epoch": 3.3364452423698383, + "grad_norm": 0.9558620452880859, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 46460 + }, + { + "epoch": 3.3371633752244163, + "grad_norm": 0.8907097578048706, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 46470 + }, + { + "epoch": 3.3378815080789948, + "grad_norm": 1.0941565036773682, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 46480 + }, + { + "epoch": 3.3385996409335728, + "grad_norm": 0.8971590995788574, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 46490 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 1.0315606594085693, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 46500 + }, + { + "epoch": 3.3400359066427288, + "grad_norm": 0.7717124223709106, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 46510 + }, + { + "epoch": 3.340754039497307, + "grad_norm": 0.8060970902442932, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 46520 + }, + { + "epoch": 3.341472172351885, + "grad_norm": 0.969510018825531, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 46530 + }, + { + "epoch": 3.342190305206463, + "grad_norm": 0.8837248682975769, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 46540 + }, + { + "epoch": 3.342908438061041, + "grad_norm": 0.9561076164245605, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 46550 + }, + { + "epoch": 3.343626570915619, + "grad_norm": 0.8529208898544312, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 46560 + }, + { + "epoch": 3.3443447037701977, + "grad_norm": 1.1300519704818726, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 46570 + }, + { + "epoch": 3.3450628366247757, + "grad_norm": 0.8330956101417542, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46580 + }, + { + "epoch": 3.3457809694793537, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 46590 + }, + { + "epoch": 3.3464991023339317, + "grad_norm": 1.0470821857452393, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 46600 + }, + { + "epoch": 3.34721723518851, + "grad_norm": 0.9933704137802124, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46610 + }, + { + "epoch": 3.347935368043088, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 46620 + }, + { + "epoch": 3.348653500897666, + "grad_norm": 0.9746946692466736, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46630 + }, + { + "epoch": 3.349371633752244, + "grad_norm": 0.8607267141342163, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46640 + }, + { + "epoch": 3.350089766606822, + "grad_norm": 0.800335705280304, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 46650 + }, + { + "epoch": 3.3508078994614, + "grad_norm": 1.0083239078521729, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 46660 + }, + { + "epoch": 3.3515260323159786, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 46670 + }, + { + "epoch": 3.3522441651705566, + "grad_norm": 0.9378824234008789, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46680 + }, + { + "epoch": 3.3529622980251346, + "grad_norm": 0.8490564227104187, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 46690 + }, + { + "epoch": 3.3536804308797126, + "grad_norm": 1.0415582656860352, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 46700 + }, + { + "epoch": 3.354398563734291, + "grad_norm": 0.8514367938041687, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 46710 + }, + { + "epoch": 3.355116696588869, + "grad_norm": 0.7691360712051392, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 46720 + }, + { + "epoch": 3.355834829443447, + "grad_norm": 0.8345438241958618, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 46730 + }, + { + "epoch": 3.356552962298025, + "grad_norm": 1.023492693901062, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 46740 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 46750 + }, + { + "epoch": 3.3579892280071815, + "grad_norm": 0.9029248356819153, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 46760 + }, + { + "epoch": 3.3587073608617595, + "grad_norm": 0.9109513759613037, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 46770 + }, + { + "epoch": 3.3594254937163375, + "grad_norm": 0.7757390141487122, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 46780 + }, + { + "epoch": 3.3601436265709155, + "grad_norm": 0.794035792350769, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46790 + }, + { + "epoch": 3.360861759425494, + "grad_norm": 0.8211429715156555, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 46800 + }, + { + "epoch": 3.361579892280072, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46810 + }, + { + "epoch": 3.36229802513465, + "grad_norm": 0.9392538070678711, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 46820 + }, + { + "epoch": 3.363016157989228, + "grad_norm": 0.8297873139381409, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 46830 + }, + { + "epoch": 3.363734290843806, + "grad_norm": 0.9158190488815308, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 46840 + }, + { + "epoch": 3.3644524236983844, + "grad_norm": 1.1449424028396606, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 46850 + }, + { + "epoch": 3.3651705565529624, + "grad_norm": 0.8718444108963013, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 46860 + }, + { + "epoch": 3.3658886894075404, + "grad_norm": 0.7744014263153076, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 46870 + }, + { + "epoch": 3.3666068222621184, + "grad_norm": 0.8392460942268372, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 46880 + }, + { + "epoch": 3.367324955116697, + "grad_norm": 1.0424989461898804, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 46890 + }, + { + "epoch": 3.368043087971275, + "grad_norm": 1.4696359634399414, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 46900 + }, + { + "epoch": 3.368761220825853, + "grad_norm": 0.9298201203346252, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46910 + }, + { + "epoch": 3.369479353680431, + "grad_norm": 0.8965262770652771, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 46920 + }, + { + "epoch": 3.370197486535009, + "grad_norm": 0.9395381808280945, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 46930 + }, + { + "epoch": 3.370915619389587, + "grad_norm": 0.9069047570228577, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 46940 + }, + { + "epoch": 3.3716337522441653, + "grad_norm": 0.9208605885505676, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46950 + }, + { + "epoch": 3.3723518850987433, + "grad_norm": 0.9493077397346497, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 46960 + }, + { + "epoch": 3.3730700179533213, + "grad_norm": 1.0804208517074585, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 46970 + }, + { + "epoch": 3.3737881508078993, + "grad_norm": 0.9465714693069458, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 46980 + }, + { + "epoch": 3.3745062836624777, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 46990 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 1.0199357271194458, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 47000 + }, + { + "epoch": 3.3759425493716337, + "grad_norm": 0.8999426960945129, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 47010 + }, + { + "epoch": 3.3766606822262117, + "grad_norm": 0.8923690319061279, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 47020 + }, + { + "epoch": 3.3773788150807897, + "grad_norm": 0.7459347248077393, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 47030 + }, + { + "epoch": 3.378096947935368, + "grad_norm": 0.7702858448028564, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 47040 + }, + { + "epoch": 3.378815080789946, + "grad_norm": 0.8296625018119812, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 47050 + }, + { + "epoch": 3.379533213644524, + "grad_norm": 1.2952555418014526, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47060 + }, + { + "epoch": 3.380251346499102, + "grad_norm": 0.7778869271278381, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 47070 + }, + { + "epoch": 3.3809694793536806, + "grad_norm": 0.9151549339294434, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 47080 + }, + { + "epoch": 3.3816876122082586, + "grad_norm": 0.7883925437927246, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 47090 + }, + { + "epoch": 3.3824057450628366, + "grad_norm": 0.9602295756340027, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 47100 + }, + { + "epoch": 3.3831238779174146, + "grad_norm": 0.7953121066093445, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47110 + }, + { + "epoch": 3.3838420107719926, + "grad_norm": 1.110148549079895, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 47120 + }, + { + "epoch": 3.384560143626571, + "grad_norm": 0.9359608888626099, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 47130 + }, + { + "epoch": 3.385278276481149, + "grad_norm": 0.7877762317657471, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 47140 + }, + { + "epoch": 3.385996409335727, + "grad_norm": 0.8586933016777039, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47150 + }, + { + "epoch": 3.386714542190305, + "grad_norm": 0.8920878767967224, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 47160 + }, + { + "epoch": 3.3874326750448835, + "grad_norm": 0.9692603349685669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 47170 + }, + { + "epoch": 3.3881508078994615, + "grad_norm": 0.9038610458374023, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 47180 + }, + { + "epoch": 3.3888689407540395, + "grad_norm": 1.6299188137054443, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 47190 + }, + { + "epoch": 3.3895870736086176, + "grad_norm": 0.9704291820526123, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 47200 + }, + { + "epoch": 3.3903052064631956, + "grad_norm": 0.9503401517868042, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 47210 + }, + { + "epoch": 3.3910233393177736, + "grad_norm": 1.0051378011703491, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 47220 + }, + { + "epoch": 3.391741472172352, + "grad_norm": 0.7336357235908508, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 47230 + }, + { + "epoch": 3.39245960502693, + "grad_norm": 0.9847398996353149, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47240 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 0.8100917339324951, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 47250 + }, + { + "epoch": 3.393895870736086, + "grad_norm": 0.9752838611602783, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 47260 + }, + { + "epoch": 3.3946140035906645, + "grad_norm": 0.9400623440742493, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 47270 + }, + { + "epoch": 3.3953321364452425, + "grad_norm": 0.7310057878494263, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 47280 + }, + { + "epoch": 3.3960502692998205, + "grad_norm": 0.8898789286613464, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 47290 + }, + { + "epoch": 3.3967684021543985, + "grad_norm": 1.0157585144042969, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 47300 + }, + { + "epoch": 3.3974865350089765, + "grad_norm": 0.9108527898788452, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 47310 + }, + { + "epoch": 3.398204667863555, + "grad_norm": 0.9796249270439148, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 47320 + }, + { + "epoch": 3.398922800718133, + "grad_norm": 0.8176435232162476, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 47330 + }, + { + "epoch": 3.399640933572711, + "grad_norm": 0.9981188178062439, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 47340 + }, + { + "epoch": 3.400359066427289, + "grad_norm": 0.9774404764175415, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47350 + }, + { + "epoch": 3.4010771992818674, + "grad_norm": 0.8624991774559021, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 47360 + }, + { + "epoch": 3.4017953321364454, + "grad_norm": 0.9191665053367615, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 47370 + }, + { + "epoch": 3.4025134649910234, + "grad_norm": 0.7971290946006775, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 47380 + }, + { + "epoch": 3.4032315978456014, + "grad_norm": 0.8336732983589172, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 47390 + }, + { + "epoch": 3.4039497307001794, + "grad_norm": 0.7730334401130676, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 47400 + }, + { + "epoch": 3.404667863554758, + "grad_norm": 0.8559145927429199, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 47410 + }, + { + "epoch": 3.405385996409336, + "grad_norm": 1.0261447429656982, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 47420 + }, + { + "epoch": 3.406104129263914, + "grad_norm": 0.9931781888008118, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 47430 + }, + { + "epoch": 3.406822262118492, + "grad_norm": 0.8971807360649109, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 47440 + }, + { + "epoch": 3.4075403949730703, + "grad_norm": 0.8886999487876892, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 47450 + }, + { + "epoch": 3.4082585278276483, + "grad_norm": 0.9551735520362854, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 47460 + }, + { + "epoch": 3.4089766606822263, + "grad_norm": 0.9066859483718872, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 47470 + }, + { + "epoch": 3.4096947935368043, + "grad_norm": 0.9192125201225281, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 47480 + }, + { + "epoch": 3.4104129263913823, + "grad_norm": 0.9332839250564575, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 47490 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 0.745563805103302, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47500 + }, + { + "epoch": 3.4118491921005387, + "grad_norm": 0.6843905448913574, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 47510 + }, + { + "epoch": 3.4125673249551167, + "grad_norm": 0.8063111305236816, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 47520 + }, + { + "epoch": 3.4132854578096947, + "grad_norm": 0.9666593670845032, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 47530 + }, + { + "epoch": 3.4140035906642727, + "grad_norm": 0.8112747073173523, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47540 + }, + { + "epoch": 3.414721723518851, + "grad_norm": 0.820807933807373, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 47550 + }, + { + "epoch": 3.415439856373429, + "grad_norm": 0.8476285338401794, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 47560 + }, + { + "epoch": 3.416157989228007, + "grad_norm": 1.0232552289962769, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47570 + }, + { + "epoch": 3.416876122082585, + "grad_norm": 0.8749372363090515, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 47580 + }, + { + "epoch": 3.417594254937163, + "grad_norm": 0.8117937445640564, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 47590 + }, + { + "epoch": 3.4183123877917416, + "grad_norm": 0.9010460376739502, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 47600 + }, + { + "epoch": 3.4190305206463196, + "grad_norm": 0.8955527544021606, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 47610 + }, + { + "epoch": 3.4197486535008976, + "grad_norm": 0.884186327457428, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 47620 + }, + { + "epoch": 3.4204667863554756, + "grad_norm": 0.8995241522789001, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 47630 + }, + { + "epoch": 3.421184919210054, + "grad_norm": 1.0627013444900513, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47640 + }, + { + "epoch": 3.421903052064632, + "grad_norm": 0.8619979619979858, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 47650 + }, + { + "epoch": 3.42262118491921, + "grad_norm": 0.9682498574256897, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 47660 + }, + { + "epoch": 3.423339317773788, + "grad_norm": 0.9614400863647461, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 47670 + }, + { + "epoch": 3.424057450628366, + "grad_norm": 0.7986962795257568, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 47680 + }, + { + "epoch": 3.4247755834829445, + "grad_norm": 0.8255957961082458, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 47690 + }, + { + "epoch": 3.4254937163375225, + "grad_norm": 0.9139757752418518, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 47700 + }, + { + "epoch": 3.4262118491921005, + "grad_norm": 0.8086292743682861, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 47710 + }, + { + "epoch": 3.4269299820466785, + "grad_norm": 0.8852273225784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 47720 + }, + { + "epoch": 3.427648114901257, + "grad_norm": 0.7568784356117249, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 47730 + }, + { + "epoch": 3.428366247755835, + "grad_norm": 0.8933039903640747, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 47740 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 0.8101669549942017, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 47750 + }, + { + "epoch": 3.429802513464991, + "grad_norm": 0.7021054625511169, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 47760 + }, + { + "epoch": 3.430520646319569, + "grad_norm": 0.8282538652420044, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 47770 + }, + { + "epoch": 3.431238779174147, + "grad_norm": 0.8168348670005798, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 47780 + }, + { + "epoch": 3.4319569120287254, + "grad_norm": 0.9504001140594482, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 47790 + }, + { + "epoch": 3.4326750448833034, + "grad_norm": 0.7500190734863281, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47800 + }, + { + "epoch": 3.4333931777378814, + "grad_norm": 0.8645710945129395, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 47810 + }, + { + "epoch": 3.4341113105924594, + "grad_norm": 0.8088704943656921, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 47820 + }, + { + "epoch": 3.434829443447038, + "grad_norm": 0.9981673955917358, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 47830 + }, + { + "epoch": 3.435547576301616, + "grad_norm": 0.9363315105438232, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 47840 + }, + { + "epoch": 3.436265709156194, + "grad_norm": 0.8471030592918396, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 47850 + }, + { + "epoch": 3.436983842010772, + "grad_norm": 0.9447668790817261, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 47860 + }, + { + "epoch": 3.43770197486535, + "grad_norm": 0.9494127631187439, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 47870 + }, + { + "epoch": 3.4384201077199283, + "grad_norm": 0.8340432643890381, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47880 + }, + { + "epoch": 3.4391382405745063, + "grad_norm": 0.8466387987136841, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 47890 + }, + { + "epoch": 3.4398563734290843, + "grad_norm": 0.9498962759971619, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47900 + }, + { + "epoch": 3.4405745062836623, + "grad_norm": 0.8490501046180725, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 47910 + }, + { + "epoch": 3.441292639138241, + "grad_norm": 0.9506490230560303, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 47920 + }, + { + "epoch": 3.442010771992819, + "grad_norm": 0.7944257855415344, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 47930 + }, + { + "epoch": 3.442728904847397, + "grad_norm": 0.9725518226623535, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 47940 + }, + { + "epoch": 3.443447037701975, + "grad_norm": 0.7823024392127991, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47950 + }, + { + "epoch": 3.444165170556553, + "grad_norm": 0.810565173625946, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 47960 + }, + { + "epoch": 3.4448833034111312, + "grad_norm": 0.9809024333953857, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 47970 + }, + { + "epoch": 3.4456014362657092, + "grad_norm": 0.8818578720092773, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 47980 + }, + { + "epoch": 3.4463195691202873, + "grad_norm": 0.9843092560768127, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 47990 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 0.916313886642456, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 48000 + }, + { + "epoch": 3.4477558348294433, + "grad_norm": 0.908442497253418, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 48010 + }, + { + "epoch": 3.4484739676840217, + "grad_norm": 0.9880178570747375, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 48020 + }, + { + "epoch": 3.4491921005385997, + "grad_norm": 0.9276854991912842, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 48030 + }, + { + "epoch": 3.4499102333931777, + "grad_norm": 1.0879448652267456, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 48040 + }, + { + "epoch": 3.4506283662477557, + "grad_norm": 0.7430389523506165, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 48050 + }, + { + "epoch": 3.4513464991023337, + "grad_norm": 1.0880072116851807, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 48060 + }, + { + "epoch": 3.452064631956912, + "grad_norm": 1.0424141883850098, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 48070 + }, + { + "epoch": 3.45278276481149, + "grad_norm": 0.926330029964447, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 48080 + }, + { + "epoch": 3.453500897666068, + "grad_norm": 0.8911219239234924, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 48090 + }, + { + "epoch": 3.454219030520646, + "grad_norm": 0.8727201223373413, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 48100 + }, + { + "epoch": 3.4549371633752246, + "grad_norm": 0.8573940396308899, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48110 + }, + { + "epoch": 3.4556552962298026, + "grad_norm": 1.0427064895629883, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 48120 + }, + { + "epoch": 3.4563734290843806, + "grad_norm": 0.8688231706619263, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 48130 + }, + { + "epoch": 3.4570915619389586, + "grad_norm": 0.8856009244918823, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 48140 + }, + { + "epoch": 3.4578096947935366, + "grad_norm": 0.9535353183746338, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 48150 + }, + { + "epoch": 3.458527827648115, + "grad_norm": 0.9466010928153992, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 48160 + }, + { + "epoch": 3.459245960502693, + "grad_norm": 0.9783535599708557, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 48170 + }, + { + "epoch": 3.459964093357271, + "grad_norm": 0.8010456562042236, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 48180 + }, + { + "epoch": 3.460682226211849, + "grad_norm": 0.8928955793380737, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 48190 + }, + { + "epoch": 3.4614003590664275, + "grad_norm": 0.7565838694572449, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 48200 + }, + { + "epoch": 3.4621184919210055, + "grad_norm": 1.0044180154800415, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 48210 + }, + { + "epoch": 3.4628366247755835, + "grad_norm": 0.8161038160324097, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 48220 + }, + { + "epoch": 3.4635547576301615, + "grad_norm": 1.1000211238861084, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 48230 + }, + { + "epoch": 3.4642728904847395, + "grad_norm": 0.7942240238189697, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 48240 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 0.7546432018280029, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 48250 + }, + { + "epoch": 3.465709156193896, + "grad_norm": 0.7705255150794983, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 48260 + }, + { + "epoch": 3.466427289048474, + "grad_norm": 0.7958067059516907, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 48270 + }, + { + "epoch": 3.467145421903052, + "grad_norm": 0.9199120402336121, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48280 + }, + { + "epoch": 3.46786355475763, + "grad_norm": 1.118672251701355, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 48290 + }, + { + "epoch": 3.4685816876122084, + "grad_norm": 0.9161015748977661, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 48300 + }, + { + "epoch": 3.4692998204667864, + "grad_norm": 1.1086218357086182, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 48310 + }, + { + "epoch": 3.4700179533213644, + "grad_norm": 1.0123368501663208, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 48320 + }, + { + "epoch": 3.4707360861759424, + "grad_norm": 0.7380602359771729, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 48330 + }, + { + "epoch": 3.4714542190305204, + "grad_norm": 0.8967105150222778, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 48340 + }, + { + "epoch": 3.472172351885099, + "grad_norm": 1.0134044885635376, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48350 + }, + { + "epoch": 3.472890484739677, + "grad_norm": 1.080815076828003, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 48360 + }, + { + "epoch": 3.473608617594255, + "grad_norm": 1.151721477508545, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 48370 + }, + { + "epoch": 3.474326750448833, + "grad_norm": 0.9436505436897278, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 48380 + }, + { + "epoch": 3.4750448833034113, + "grad_norm": 0.9154609441757202, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 48390 + }, + { + "epoch": 3.4757630161579893, + "grad_norm": 0.8943037986755371, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 48400 + }, + { + "epoch": 3.4764811490125673, + "grad_norm": 0.936988115310669, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 48410 + }, + { + "epoch": 3.4771992818671453, + "grad_norm": 0.826960027217865, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 48420 + }, + { + "epoch": 3.4779174147217233, + "grad_norm": 1.0487587451934814, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 48430 + }, + { + "epoch": 3.478635547576302, + "grad_norm": 0.729163646697998, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 48440 + }, + { + "epoch": 3.47935368043088, + "grad_norm": 0.8156948089599609, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 48450 + }, + { + "epoch": 3.480071813285458, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 48460 + }, + { + "epoch": 3.480789946140036, + "grad_norm": 0.9632692337036133, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 48470 + }, + { + "epoch": 3.4815080789946142, + "grad_norm": 1.0950212478637695, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 48480 + }, + { + "epoch": 3.4822262118491922, + "grad_norm": 0.8574318885803223, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 48490 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 0.8552606701850891, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 48500 + }, + { + "epoch": 3.4836624775583482, + "grad_norm": 0.9698445200920105, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 48510 + }, + { + "epoch": 3.4843806104129262, + "grad_norm": 0.9427815675735474, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 48520 + }, + { + "epoch": 3.4850987432675042, + "grad_norm": 0.7902070879936218, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 48530 + }, + { + "epoch": 3.4858168761220827, + "grad_norm": 1.0300066471099854, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 48540 + }, + { + "epoch": 3.4865350089766607, + "grad_norm": 1.1688778400421143, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 48550 + }, + { + "epoch": 3.4872531418312387, + "grad_norm": 1.0012071132659912, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 48560 + }, + { + "epoch": 3.4879712746858167, + "grad_norm": 1.112094759941101, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 48570 + }, + { + "epoch": 3.488689407540395, + "grad_norm": 0.8547284603118896, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 48580 + }, + { + "epoch": 3.489407540394973, + "grad_norm": 0.8827278017997742, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 48590 + }, + { + "epoch": 3.490125673249551, + "grad_norm": 0.9255490303039551, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 48600 + }, + { + "epoch": 3.490843806104129, + "grad_norm": 0.8000030517578125, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 48610 + }, + { + "epoch": 3.491561938958707, + "grad_norm": 0.9327391386032104, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 48620 + }, + { + "epoch": 3.4922800718132856, + "grad_norm": 0.9004138708114624, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 48630 + }, + { + "epoch": 3.4929982046678636, + "grad_norm": 0.9886971116065979, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 48640 + }, + { + "epoch": 3.4937163375224416, + "grad_norm": 0.9890487194061279, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 48650 + }, + { + "epoch": 3.4944344703770196, + "grad_norm": 0.7024438977241516, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 48660 + }, + { + "epoch": 3.495152603231598, + "grad_norm": 0.8397303223609924, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 48670 + }, + { + "epoch": 3.495870736086176, + "grad_norm": 0.9120950698852539, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 48680 + }, + { + "epoch": 3.496588868940754, + "grad_norm": 1.057299017906189, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48690 + }, + { + "epoch": 3.497307001795332, + "grad_norm": 0.821325957775116, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 48700 + }, + { + "epoch": 3.49802513464991, + "grad_norm": 1.0029970407485962, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 48710 + }, + { + "epoch": 3.4987432675044885, + "grad_norm": 0.9483712911605835, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 48720 + }, + { + "epoch": 3.4994614003590665, + "grad_norm": 0.9637855291366577, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 48730 + }, + { + "epoch": 3.5001795332136445, + "grad_norm": 0.6848894357681274, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 48740 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 0.7848573327064514, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 48750 + }, + { + "epoch": 3.501615798922801, + "grad_norm": 1.0341308116912842, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 48760 + }, + { + "epoch": 3.502333931777379, + "grad_norm": 0.8858218193054199, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 48770 + }, + { + "epoch": 3.503052064631957, + "grad_norm": 0.8366939425468445, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 48780 + }, + { + "epoch": 3.503770197486535, + "grad_norm": 0.7926092147827148, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 48790 + }, + { + "epoch": 3.504488330341113, + "grad_norm": 0.8503843545913696, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 48800 + }, + { + "epoch": 3.505206463195691, + "grad_norm": 0.8867869973182678, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 48810 + }, + { + "epoch": 3.5059245960502694, + "grad_norm": 1.0336930751800537, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 48820 + }, + { + "epoch": 3.5066427289048474, + "grad_norm": 0.8564051985740662, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 48830 + }, + { + "epoch": 3.5073608617594254, + "grad_norm": 0.9202605485916138, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 48840 + }, + { + "epoch": 3.508078994614004, + "grad_norm": 0.8838639855384827, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 48850 + }, + { + "epoch": 3.508797127468582, + "grad_norm": 0.8975196480751038, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48860 + }, + { + "epoch": 3.50951526032316, + "grad_norm": 0.8842370510101318, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 48870 + }, + { + "epoch": 3.510233393177738, + "grad_norm": 0.9195886254310608, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 48880 + }, + { + "epoch": 3.510951526032316, + "grad_norm": 0.986130952835083, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 48890 + }, + { + "epoch": 3.511669658886894, + "grad_norm": 0.8119593858718872, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 48900 + }, + { + "epoch": 3.5123877917414723, + "grad_norm": 0.9027136564254761, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 48910 + }, + { + "epoch": 3.5131059245960503, + "grad_norm": 0.8560537099838257, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 48920 + }, + { + "epoch": 3.5138240574506283, + "grad_norm": 0.7073559165000916, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 48930 + }, + { + "epoch": 3.5145421903052063, + "grad_norm": 0.8753304481506348, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 48940 + }, + { + "epoch": 3.5152603231597848, + "grad_norm": 0.9151145815849304, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 48950 + }, + { + "epoch": 3.5159784560143628, + "grad_norm": 0.7794315814971924, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 48960 + }, + { + "epoch": 3.5166965888689408, + "grad_norm": 0.9226023554801941, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 48970 + }, + { + "epoch": 3.5174147217235188, + "grad_norm": 0.8442051410675049, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48980 + }, + { + "epoch": 3.5181328545780968, + "grad_norm": 0.9769423007965088, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 48990 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 0.740347146987915, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 49000 + }, + { + "epoch": 3.519569120287253, + "grad_norm": 0.8963457345962524, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 49010 + }, + { + "epoch": 3.520287253141831, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 49020 + }, + { + "epoch": 3.521005385996409, + "grad_norm": 1.0486022233963013, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 49030 + }, + { + "epoch": 3.5217235188509877, + "grad_norm": 0.95393967628479, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 49040 + }, + { + "epoch": 3.5224416517055657, + "grad_norm": 0.8261157274246216, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49050 + }, + { + "epoch": 3.5231597845601437, + "grad_norm": 0.9321704506874084, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 49060 + }, + { + "epoch": 3.5238779174147217, + "grad_norm": 1.2596088647842407, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 49070 + }, + { + "epoch": 3.5245960502692997, + "grad_norm": 0.8584637641906738, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 49080 + }, + { + "epoch": 3.5253141831238777, + "grad_norm": 0.850520670413971, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 49090 + }, + { + "epoch": 3.526032315978456, + "grad_norm": 0.8915920257568359, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 49100 + }, + { + "epoch": 3.526750448833034, + "grad_norm": 0.9070239067077637, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 49110 + }, + { + "epoch": 3.527468581687612, + "grad_norm": 0.699878990650177, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 49120 + }, + { + "epoch": 3.5281867145421906, + "grad_norm": 0.9003779888153076, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 49130 + }, + { + "epoch": 3.5289048473967686, + "grad_norm": 0.7886711955070496, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 49140 + }, + { + "epoch": 3.5296229802513466, + "grad_norm": 0.7368922233581543, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 49150 + }, + { + "epoch": 3.5303411131059246, + "grad_norm": 0.8585197329521179, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 49160 + }, + { + "epoch": 3.5310592459605026, + "grad_norm": 1.0205435752868652, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 49170 + }, + { + "epoch": 3.5317773788150806, + "grad_norm": 0.8756650686264038, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 49180 + }, + { + "epoch": 3.532495511669659, + "grad_norm": 1.0278643369674683, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 49190 + }, + { + "epoch": 3.533213644524237, + "grad_norm": 0.8641911745071411, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 49200 + }, + { + "epoch": 3.533931777378815, + "grad_norm": 0.8730159401893616, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 49210 + }, + { + "epoch": 3.534649910233393, + "grad_norm": 0.918637216091156, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 49220 + }, + { + "epoch": 3.5353680430879715, + "grad_norm": 1.0467222929000854, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 49230 + }, + { + "epoch": 3.5360861759425495, + "grad_norm": 1.005009412765503, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 49240 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 0.9775063395500183, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 49250 + }, + { + "epoch": 3.5375224416517055, + "grad_norm": 0.8198322057723999, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 49260 + }, + { + "epoch": 3.5382405745062835, + "grad_norm": 0.8184829354286194, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 49270 + }, + { + "epoch": 3.5389587073608615, + "grad_norm": 0.9520270824432373, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 49280 + }, + { + "epoch": 3.53967684021544, + "grad_norm": 0.7816803455352783, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 49290 + }, + { + "epoch": 3.540394973070018, + "grad_norm": 0.6915702819824219, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 49300 + }, + { + "epoch": 3.541113105924596, + "grad_norm": 0.8282375931739807, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 49310 + }, + { + "epoch": 3.5418312387791744, + "grad_norm": 1.0797513723373413, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 49320 + }, + { + "epoch": 3.5425493716337524, + "grad_norm": 0.868671715259552, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 49330 + }, + { + "epoch": 3.5432675044883304, + "grad_norm": 0.8534455895423889, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 49340 + }, + { + "epoch": 3.5439856373429084, + "grad_norm": 0.816411554813385, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 49350 + }, + { + "epoch": 3.5447037701974864, + "grad_norm": 0.7813423275947571, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 49360 + }, + { + "epoch": 3.5454219030520644, + "grad_norm": 0.8002013564109802, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 49370 + }, + { + "epoch": 3.546140035906643, + "grad_norm": 0.9740113615989685, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 49380 + }, + { + "epoch": 3.546858168761221, + "grad_norm": 0.9046127200126648, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 49390 + }, + { + "epoch": 3.547576301615799, + "grad_norm": 0.8635150194168091, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 49400 + }, + { + "epoch": 3.5482944344703773, + "grad_norm": 0.9488558769226074, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 49410 + }, + { + "epoch": 3.5490125673249553, + "grad_norm": 0.9637090563774109, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 49420 + }, + { + "epoch": 3.5497307001795333, + "grad_norm": 1.042245626449585, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 49430 + }, + { + "epoch": 3.5504488330341113, + "grad_norm": 0.9076175689697266, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 49440 + }, + { + "epoch": 3.5511669658886893, + "grad_norm": 0.8480596542358398, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 49450 + }, + { + "epoch": 3.5518850987432673, + "grad_norm": 0.8483007550239563, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 49460 + }, + { + "epoch": 3.5526032315978457, + "grad_norm": 0.7855815887451172, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 49470 + }, + { + "epoch": 3.5533213644524237, + "grad_norm": 0.8435823917388916, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 49480 + }, + { + "epoch": 3.5540394973070017, + "grad_norm": 0.8613026142120361, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 49490 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 0.9654812812805176, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 49500 + }, + { + "epoch": 3.555475763016158, + "grad_norm": 0.8888838887214661, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 49510 + }, + { + "epoch": 3.556193895870736, + "grad_norm": 0.7718146443367004, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49520 + }, + { + "epoch": 3.556912028725314, + "grad_norm": 0.9487382173538208, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 49530 + }, + { + "epoch": 3.557630161579892, + "grad_norm": 0.9256559610366821, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 49540 + }, + { + "epoch": 3.55834829443447, + "grad_norm": 0.8879945874214172, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 49550 + }, + { + "epoch": 3.559066427289048, + "grad_norm": 0.8498744368553162, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 49560 + }, + { + "epoch": 3.5597845601436267, + "grad_norm": 0.9550948143005371, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 49570 + }, + { + "epoch": 3.5605026929982047, + "grad_norm": 0.8386164903640747, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 49580 + }, + { + "epoch": 3.5612208258527827, + "grad_norm": 0.925573468208313, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 49590 + }, + { + "epoch": 3.561938958707361, + "grad_norm": 0.8867112398147583, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 49600 + }, + { + "epoch": 3.562657091561939, + "grad_norm": 0.7638537883758545, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 49610 + }, + { + "epoch": 3.563375224416517, + "grad_norm": 0.9491845965385437, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 49620 + }, + { + "epoch": 3.564093357271095, + "grad_norm": 0.8384189605712891, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 49630 + }, + { + "epoch": 3.564811490125673, + "grad_norm": 0.8850575089454651, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 49640 + }, + { + "epoch": 3.565529622980251, + "grad_norm": 1.020916223526001, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 49650 + }, + { + "epoch": 3.5662477558348296, + "grad_norm": 0.9298280477523804, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 49660 + }, + { + "epoch": 3.5669658886894076, + "grad_norm": 0.9795742034912109, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 49670 + }, + { + "epoch": 3.5676840215439856, + "grad_norm": 0.9401193261146545, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 49680 + }, + { + "epoch": 3.568402154398564, + "grad_norm": 1.0383585691452026, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49690 + }, + { + "epoch": 3.569120287253142, + "grad_norm": 0.8370866179466248, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 49700 + }, + { + "epoch": 3.56983842010772, + "grad_norm": 0.8207486271858215, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 49710 + }, + { + "epoch": 3.570556552962298, + "grad_norm": 0.8551223278045654, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49720 + }, + { + "epoch": 3.571274685816876, + "grad_norm": 0.8041176199913025, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 49730 + }, + { + "epoch": 3.571992818671454, + "grad_norm": 0.9862527847290039, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 49740 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 0.7557165622711182, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 49750 + }, + { + "epoch": 3.5734290843806105, + "grad_norm": 1.0908563137054443, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 49760 + }, + { + "epoch": 3.5741472172351885, + "grad_norm": 0.7245369553565979, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 49770 + }, + { + "epoch": 3.5748653500897665, + "grad_norm": 0.7851184010505676, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 49780 + }, + { + "epoch": 3.575583482944345, + "grad_norm": 0.9443599581718445, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 49790 + }, + { + "epoch": 3.576301615798923, + "grad_norm": 1.021196961402893, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 49800 + }, + { + "epoch": 3.577019748653501, + "grad_norm": 0.9099196195602417, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 49810 + }, + { + "epoch": 3.577737881508079, + "grad_norm": 0.9397716522216797, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 49820 + }, + { + "epoch": 3.578456014362657, + "grad_norm": 0.9214922785758972, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 49830 + }, + { + "epoch": 3.579174147217235, + "grad_norm": 1.0053879022598267, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 49840 + }, + { + "epoch": 3.5798922800718134, + "grad_norm": 0.9415460228919983, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 49850 + }, + { + "epoch": 3.5806104129263914, + "grad_norm": 1.0807833671569824, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 49860 + }, + { + "epoch": 3.5813285457809694, + "grad_norm": 1.0070871114730835, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 49870 + }, + { + "epoch": 3.582046678635548, + "grad_norm": 0.9707024693489075, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 49880 + }, + { + "epoch": 3.582764811490126, + "grad_norm": 0.9979593753814697, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 49890 + }, + { + "epoch": 3.583482944344704, + "grad_norm": 0.7238648533821106, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 49900 + }, + { + "epoch": 3.584201077199282, + "grad_norm": 0.8168631792068481, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 49910 + }, + { + "epoch": 3.58491921005386, + "grad_norm": 0.8156409859657288, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 49920 + }, + { + "epoch": 3.585637342908438, + "grad_norm": 0.9256414175033569, + "learning_rate": 0.0002, + "loss": 0.6248, + "step": 49930 + }, + { + "epoch": 3.5863554757630163, + "grad_norm": 1.0090070962905884, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 49940 + }, + { + "epoch": 3.5870736086175943, + "grad_norm": 0.8257701992988586, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 49950 + }, + { + "epoch": 3.5877917414721723, + "grad_norm": 0.9189013242721558, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 49960 + }, + { + "epoch": 3.5885098743267507, + "grad_norm": 0.8497788310050964, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 49970 + }, + { + "epoch": 3.5892280071813287, + "grad_norm": 0.9596505761146545, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 49980 + }, + { + "epoch": 3.5899461400359067, + "grad_norm": 0.8773331642150879, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 49990 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 0.8952302932739258, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50000 + }, + { + "epoch": 3.5913824057450627, + "grad_norm": 0.7713809609413147, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 50010 + }, + { + "epoch": 3.5921005385996407, + "grad_norm": 1.0151346921920776, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 50020 + }, + { + "epoch": 3.592818671454219, + "grad_norm": 0.8793733716011047, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 50030 + }, + { + "epoch": 3.593536804308797, + "grad_norm": 0.8881325721740723, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 50040 + }, + { + "epoch": 3.594254937163375, + "grad_norm": 0.9346749782562256, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 50050 + }, + { + "epoch": 3.594973070017953, + "grad_norm": 0.8705052137374878, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 50060 + }, + { + "epoch": 3.5956912028725316, + "grad_norm": 1.039197564125061, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 50070 + }, + { + "epoch": 3.5964093357271096, + "grad_norm": 0.7053273320198059, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 50080 + }, + { + "epoch": 3.5971274685816876, + "grad_norm": 0.8268665671348572, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 50090 + }, + { + "epoch": 3.5978456014362656, + "grad_norm": 0.8921764492988586, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 50100 + }, + { + "epoch": 3.5985637342908436, + "grad_norm": 0.9756084680557251, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 50110 + }, + { + "epoch": 3.5992818671454216, + "grad_norm": 0.9275530576705933, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 50120 + }, + { + "epoch": 3.6, + "grad_norm": 0.9030009508132935, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 50130 + }, + { + "epoch": 3.600718132854578, + "grad_norm": 0.7805638909339905, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 50140 + }, + { + "epoch": 3.601436265709156, + "grad_norm": 0.7627325057983398, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 50150 + }, + { + "epoch": 3.6021543985637345, + "grad_norm": 0.7809714078903198, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 50160 + }, + { + "epoch": 3.6028725314183125, + "grad_norm": 0.7910378575325012, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 50170 + }, + { + "epoch": 3.6035906642728905, + "grad_norm": 1.004438042640686, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 50180 + }, + { + "epoch": 3.6043087971274685, + "grad_norm": 0.825969934463501, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 50190 + }, + { + "epoch": 3.6050269299820465, + "grad_norm": 0.8866565227508545, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 50200 + }, + { + "epoch": 3.6057450628366245, + "grad_norm": 0.8920543193817139, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 50210 + }, + { + "epoch": 3.606463195691203, + "grad_norm": 1.106584906578064, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 50220 + }, + { + "epoch": 3.607181328545781, + "grad_norm": 0.916607677936554, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 50230 + }, + { + "epoch": 3.607899461400359, + "grad_norm": 0.8014767169952393, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 50240 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 0.9556822776794434, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 50250 + }, + { + "epoch": 3.6093357271095154, + "grad_norm": 0.9630016684532166, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50260 + }, + { + "epoch": 3.6100538599640934, + "grad_norm": 0.9862125515937805, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 50270 + }, + { + "epoch": 3.6107719928186714, + "grad_norm": 1.0043333768844604, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 50280 + }, + { + "epoch": 3.6114901256732495, + "grad_norm": 0.9255319833755493, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 50290 + }, + { + "epoch": 3.6122082585278275, + "grad_norm": 1.012023687362671, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 50300 + }, + { + "epoch": 3.612926391382406, + "grad_norm": 1.0701122283935547, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50310 + }, + { + "epoch": 3.613644524236984, + "grad_norm": 0.8270810842514038, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 50320 + }, + { + "epoch": 3.614362657091562, + "grad_norm": 0.8881328105926514, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 50330 + }, + { + "epoch": 3.61508078994614, + "grad_norm": 0.9536844491958618, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 50340 + }, + { + "epoch": 3.6157989228007184, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 50350 + }, + { + "epoch": 3.6165170556552964, + "grad_norm": 0.834591805934906, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50360 + }, + { + "epoch": 3.6172351885098744, + "grad_norm": 0.903752863407135, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 50370 + }, + { + "epoch": 3.6179533213644524, + "grad_norm": 0.9148632884025574, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 50380 + }, + { + "epoch": 3.6186714542190304, + "grad_norm": 0.9280176162719727, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 50390 + }, + { + "epoch": 3.6193895870736084, + "grad_norm": 0.9524136781692505, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 50400 + }, + { + "epoch": 3.620107719928187, + "grad_norm": 1.1751197576522827, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 50410 + }, + { + "epoch": 3.620825852782765, + "grad_norm": 1.032279133796692, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 50420 + }, + { + "epoch": 3.621543985637343, + "grad_norm": 0.790741503238678, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 50430 + }, + { + "epoch": 3.6222621184919213, + "grad_norm": 0.9584221243858337, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 50440 + }, + { + "epoch": 3.6229802513464993, + "grad_norm": 0.7792508006095886, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 50450 + }, + { + "epoch": 3.6236983842010773, + "grad_norm": 0.8273448944091797, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 50460 + }, + { + "epoch": 3.6244165170556553, + "grad_norm": 0.8001132607460022, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 50470 + }, + { + "epoch": 3.6251346499102333, + "grad_norm": 1.077109694480896, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 50480 + }, + { + "epoch": 3.6258527827648113, + "grad_norm": 1.111274003982544, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 50490 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 50500 + }, + { + "epoch": 3.6272890484739677, + "grad_norm": 0.9217049479484558, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 50510 + }, + { + "epoch": 3.6280071813285457, + "grad_norm": 0.9362251162528992, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 50520 + }, + { + "epoch": 3.6287253141831237, + "grad_norm": 0.9435479044914246, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 50530 + }, + { + "epoch": 3.629443447037702, + "grad_norm": 0.7748915553092957, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 50540 + }, + { + "epoch": 3.63016157989228, + "grad_norm": 0.8238945007324219, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 50550 + }, + { + "epoch": 3.630879712746858, + "grad_norm": 0.8421505093574524, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 50560 + }, + { + "epoch": 3.631597845601436, + "grad_norm": 1.0272293090820312, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 50570 + }, + { + "epoch": 3.632315978456014, + "grad_norm": 0.7643818259239197, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 50580 + }, + { + "epoch": 3.6330341113105926, + "grad_norm": 0.9756225347518921, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 50590 + }, + { + "epoch": 3.6337522441651706, + "grad_norm": 0.9311570525169373, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 50600 + }, + { + "epoch": 3.6344703770197486, + "grad_norm": 0.8829827904701233, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 50610 + }, + { + "epoch": 3.6351885098743266, + "grad_norm": 0.9473454356193542, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 50620 + }, + { + "epoch": 3.635906642728905, + "grad_norm": 1.1023668050765991, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 50630 + }, + { + "epoch": 3.636624775583483, + "grad_norm": 0.8490299582481384, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 50640 + }, + { + "epoch": 3.637342908438061, + "grad_norm": 1.1129392385482788, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 50650 + }, + { + "epoch": 3.638061041292639, + "grad_norm": 1.0334501266479492, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 50660 + }, + { + "epoch": 3.638779174147217, + "grad_norm": 0.8397296667098999, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 50670 + }, + { + "epoch": 3.639497307001795, + "grad_norm": 0.7984256744384766, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 50680 + }, + { + "epoch": 3.6402154398563735, + "grad_norm": 1.1182054281234741, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 50690 + }, + { + "epoch": 3.6409335727109515, + "grad_norm": 0.8743279576301575, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 50700 + }, + { + "epoch": 3.6416517055655295, + "grad_norm": 0.9101628661155701, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 50710 + }, + { + "epoch": 3.642369838420108, + "grad_norm": 0.8866934180259705, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 50720 + }, + { + "epoch": 3.643087971274686, + "grad_norm": 0.863945484161377, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 50730 + }, + { + "epoch": 3.643806104129264, + "grad_norm": 1.0845744609832764, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 50740 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 0.8610911965370178, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 50750 + }, + { + "epoch": 3.64524236983842, + "grad_norm": 0.8502625226974487, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 50760 + }, + { + "epoch": 3.645960502692998, + "grad_norm": 0.847372829914093, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 50770 + }, + { + "epoch": 3.6466786355475764, + "grad_norm": 0.8649292588233948, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 50780 + }, + { + "epoch": 3.6473967684021544, + "grad_norm": 0.8742905855178833, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 50790 + }, + { + "epoch": 3.6481149012567324, + "grad_norm": 0.9546048641204834, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 50800 + }, + { + "epoch": 3.6488330341113104, + "grad_norm": 0.7893161773681641, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 50810 + }, + { + "epoch": 3.649551166965889, + "grad_norm": 0.9350247979164124, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 50820 + }, + { + "epoch": 3.650269299820467, + "grad_norm": 0.772149384021759, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 50830 + }, + { + "epoch": 3.650987432675045, + "grad_norm": 0.8281718492507935, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 50840 + }, + { + "epoch": 3.651705565529623, + "grad_norm": 0.8063850402832031, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 50850 + }, + { + "epoch": 3.652423698384201, + "grad_norm": 0.8101351261138916, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 50860 + }, + { + "epoch": 3.6531418312387793, + "grad_norm": 0.8747833371162415, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 50870 + }, + { + "epoch": 3.6538599640933573, + "grad_norm": 0.9634656310081482, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 50880 + }, + { + "epoch": 3.6545780969479353, + "grad_norm": 1.1646045446395874, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 50890 + }, + { + "epoch": 3.6552962298025133, + "grad_norm": 0.8538454174995422, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 3.656014362657092, + "grad_norm": 0.7639184594154358, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 50910 + }, + { + "epoch": 3.65673249551167, + "grad_norm": 0.8750212788581848, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 50920 + }, + { + "epoch": 3.657450628366248, + "grad_norm": 0.9161198735237122, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 50930 + }, + { + "epoch": 3.658168761220826, + "grad_norm": 0.7987924814224243, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 50940 + }, + { + "epoch": 3.658886894075404, + "grad_norm": 0.8939290642738342, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 50950 + }, + { + "epoch": 3.659605026929982, + "grad_norm": 0.9803797602653503, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 50960 + }, + { + "epoch": 3.6603231597845602, + "grad_norm": 1.2423512935638428, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 50970 + }, + { + "epoch": 3.6610412926391382, + "grad_norm": 1.0023225545883179, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 50980 + }, + { + "epoch": 3.6617594254937162, + "grad_norm": 0.9066677689552307, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 50990 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 0.8906226754188538, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 51000 + }, + { + "epoch": 3.6631956912028727, + "grad_norm": 0.7449954152107239, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51010 + }, + { + "epoch": 3.6639138240574507, + "grad_norm": 0.812612771987915, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 51020 + }, + { + "epoch": 3.6646319569120287, + "grad_norm": 0.861818253993988, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 51030 + }, + { + "epoch": 3.6653500897666067, + "grad_norm": 0.849726676940918, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 51040 + }, + { + "epoch": 3.6660682226211847, + "grad_norm": 0.9738494753837585, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 51050 + }, + { + "epoch": 3.666786355475763, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 51060 + }, + { + "epoch": 3.667504488330341, + "grad_norm": 0.9725563526153564, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 51070 + }, + { + "epoch": 3.668222621184919, + "grad_norm": 0.9366095066070557, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51080 + }, + { + "epoch": 3.668940754039497, + "grad_norm": 0.8012986779212952, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 51090 + }, + { + "epoch": 3.6696588868940756, + "grad_norm": 1.0646892786026, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51100 + }, + { + "epoch": 3.6703770197486536, + "grad_norm": 0.7245157361030579, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 51110 + }, + { + "epoch": 3.6710951526032316, + "grad_norm": 0.6938936114311218, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 51120 + }, + { + "epoch": 3.6718132854578096, + "grad_norm": 0.8461366295814514, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 51130 + }, + { + "epoch": 3.6725314183123876, + "grad_norm": 0.8392583131790161, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 51140 + }, + { + "epoch": 3.673249551166966, + "grad_norm": 0.7245259284973145, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 51150 + }, + { + "epoch": 3.673967684021544, + "grad_norm": 1.0742167234420776, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 51160 + }, + { + "epoch": 3.674685816876122, + "grad_norm": 0.9553889036178589, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 51170 + }, + { + "epoch": 3.6754039497307, + "grad_norm": 0.8713715672492981, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 51180 + }, + { + "epoch": 3.6761220825852785, + "grad_norm": 0.7499800324440002, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 51190 + }, + { + "epoch": 3.6768402154398565, + "grad_norm": 1.1118139028549194, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 51200 + }, + { + "epoch": 3.6775583482944345, + "grad_norm": 0.8146613836288452, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 51210 + }, + { + "epoch": 3.6782764811490125, + "grad_norm": 0.9331285357475281, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 51220 + }, + { + "epoch": 3.6789946140035905, + "grad_norm": 1.0497597455978394, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 51230 + }, + { + "epoch": 3.6797127468581685, + "grad_norm": 0.879814863204956, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51240 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 0.9896606802940369, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 51250 + }, + { + "epoch": 3.681149012567325, + "grad_norm": 0.928236186504364, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 51260 + }, + { + "epoch": 3.681867145421903, + "grad_norm": 0.8436732292175293, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 51270 + }, + { + "epoch": 3.6825852782764814, + "grad_norm": 0.93634432554245, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51280 + }, + { + "epoch": 3.6833034111310594, + "grad_norm": 0.8477143049240112, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 51290 + }, + { + "epoch": 3.6840215439856374, + "grad_norm": 0.8720934987068176, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 51300 + }, + { + "epoch": 3.6847396768402154, + "grad_norm": 0.7322931289672852, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 51310 + }, + { + "epoch": 3.6854578096947934, + "grad_norm": 1.0064427852630615, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 51320 + }, + { + "epoch": 3.6861759425493714, + "grad_norm": 1.0197817087173462, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 51330 + }, + { + "epoch": 3.68689407540395, + "grad_norm": 0.8764060139656067, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 51340 + }, + { + "epoch": 3.687612208258528, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 51350 + }, + { + "epoch": 3.688330341113106, + "grad_norm": 0.8389105200767517, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 51360 + }, + { + "epoch": 3.689048473967684, + "grad_norm": 0.9215750694274902, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 51370 + }, + { + "epoch": 3.6897666068222623, + "grad_norm": 0.8444913625717163, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 51380 + }, + { + "epoch": 3.6904847396768403, + "grad_norm": 0.9635153412818909, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 51390 + }, + { + "epoch": 3.6912028725314183, + "grad_norm": 1.0397378206253052, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 51400 + }, + { + "epoch": 3.6919210053859963, + "grad_norm": 0.9154748320579529, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 51410 + }, + { + "epoch": 3.6926391382405743, + "grad_norm": 0.906445324420929, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 51420 + }, + { + "epoch": 3.6933572710951523, + "grad_norm": 0.9237992763519287, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 51430 + }, + { + "epoch": 3.6940754039497308, + "grad_norm": 0.8796338438987732, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 51440 + }, + { + "epoch": 3.6947935368043088, + "grad_norm": 0.8613203763961792, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 51450 + }, + { + "epoch": 3.6955116696588868, + "grad_norm": 0.7957607507705688, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 51460 + }, + { + "epoch": 3.6962298025134652, + "grad_norm": 0.9183711409568787, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 51470 + }, + { + "epoch": 3.6969479353680432, + "grad_norm": 1.0108308792114258, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 51480 + }, + { + "epoch": 3.6976660682226212, + "grad_norm": 0.7768247127532959, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 51490 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 1.0051485300064087, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 51500 + }, + { + "epoch": 3.6991023339317772, + "grad_norm": 0.82451993227005, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 51510 + }, + { + "epoch": 3.6998204667863552, + "grad_norm": 0.9542286992073059, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 51520 + }, + { + "epoch": 3.7005385996409337, + "grad_norm": 0.693890392780304, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 51530 + }, + { + "epoch": 3.7012567324955117, + "grad_norm": 0.9068924784660339, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 51540 + }, + { + "epoch": 3.7019748653500897, + "grad_norm": 0.8694922924041748, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 51550 + }, + { + "epoch": 3.702692998204668, + "grad_norm": 0.941081702709198, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 51560 + }, + { + "epoch": 3.703411131059246, + "grad_norm": 0.7385984659194946, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 51570 + }, + { + "epoch": 3.704129263913824, + "grad_norm": 1.0399216413497925, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51580 + }, + { + "epoch": 3.704847396768402, + "grad_norm": 0.9802294969558716, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 51590 + }, + { + "epoch": 3.70556552962298, + "grad_norm": 1.0409669876098633, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51600 + }, + { + "epoch": 3.706283662477558, + "grad_norm": 0.8972786068916321, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 51610 + }, + { + "epoch": 3.7070017953321366, + "grad_norm": 1.1916245222091675, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 51620 + }, + { + "epoch": 3.7077199281867146, + "grad_norm": 0.9545385241508484, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 51630 + }, + { + "epoch": 3.7084380610412926, + "grad_norm": 1.0773427486419678, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 51640 + }, + { + "epoch": 3.7091561938958706, + "grad_norm": 1.0856024026870728, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 51650 + }, + { + "epoch": 3.709874326750449, + "grad_norm": 0.7678500413894653, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51660 + }, + { + "epoch": 3.710592459605027, + "grad_norm": 0.7276270985603333, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 51670 + }, + { + "epoch": 3.711310592459605, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 51680 + }, + { + "epoch": 3.712028725314183, + "grad_norm": 0.9037614464759827, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 51690 + }, + { + "epoch": 3.712746858168761, + "grad_norm": 0.9223412275314331, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51700 + }, + { + "epoch": 3.713464991023339, + "grad_norm": 0.8812923431396484, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 51710 + }, + { + "epoch": 3.7141831238779175, + "grad_norm": 0.8242456912994385, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 51720 + }, + { + "epoch": 3.7149012567324955, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 51730 + }, + { + "epoch": 3.7156193895870735, + "grad_norm": 0.8624704480171204, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 51740 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 0.9138273596763611, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51750 + }, + { + "epoch": 3.71705565529623, + "grad_norm": 0.8088571429252625, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 51760 + }, + { + "epoch": 3.717773788150808, + "grad_norm": 0.882808268070221, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 51770 + }, + { + "epoch": 3.718491921005386, + "grad_norm": 0.9368035197257996, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 51780 + }, + { + "epoch": 3.719210053859964, + "grad_norm": 0.8341794013977051, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 51790 + }, + { + "epoch": 3.719928186714542, + "grad_norm": 0.8692073225975037, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 51800 + }, + { + "epoch": 3.7206463195691204, + "grad_norm": 0.7566918730735779, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 51810 + }, + { + "epoch": 3.7213644524236984, + "grad_norm": 1.113138198852539, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 51820 + }, + { + "epoch": 3.7220825852782764, + "grad_norm": 0.8793158531188965, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 51830 + }, + { + "epoch": 3.722800718132855, + "grad_norm": 0.8856439590454102, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 51840 + }, + { + "epoch": 3.723518850987433, + "grad_norm": 1.0182029008865356, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 51850 + }, + { + "epoch": 3.724236983842011, + "grad_norm": 1.1177181005477905, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 51860 + }, + { + "epoch": 3.724955116696589, + "grad_norm": 0.6600990295410156, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 51870 + }, + { + "epoch": 3.725673249551167, + "grad_norm": 1.0563536882400513, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 51880 + }, + { + "epoch": 3.726391382405745, + "grad_norm": 1.1067734956741333, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 51890 + }, + { + "epoch": 3.7271095152603233, + "grad_norm": 1.0204616785049438, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 51900 + }, + { + "epoch": 3.7278276481149013, + "grad_norm": 0.8647155165672302, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51910 + }, + { + "epoch": 3.7285457809694793, + "grad_norm": 1.0754971504211426, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 51920 + }, + { + "epoch": 3.7292639138240573, + "grad_norm": 1.0448992252349854, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 51930 + }, + { + "epoch": 3.7299820466786358, + "grad_norm": 0.963434100151062, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 51940 + }, + { + "epoch": 3.7307001795332138, + "grad_norm": 0.8112701773643494, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51950 + }, + { + "epoch": 3.7314183123877918, + "grad_norm": 0.7975119948387146, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 51960 + }, + { + "epoch": 3.7321364452423698, + "grad_norm": 0.7953376173973083, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 51970 + }, + { + "epoch": 3.7328545780969478, + "grad_norm": 0.9519981741905212, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 51980 + }, + { + "epoch": 3.7335727109515258, + "grad_norm": 0.8705791234970093, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 51990 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 0.870205283164978, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 52000 + }, + { + "epoch": 3.735008976660682, + "grad_norm": 0.9558930993080139, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 52010 + }, + { + "epoch": 3.73572710951526, + "grad_norm": 0.9330434799194336, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 52020 + }, + { + "epoch": 3.7364452423698387, + "grad_norm": 0.783620297908783, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 52030 + }, + { + "epoch": 3.7371633752244167, + "grad_norm": 0.7575166821479797, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52040 + }, + { + "epoch": 3.7378815080789947, + "grad_norm": 1.0592705011367798, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 52050 + }, + { + "epoch": 3.7385996409335727, + "grad_norm": 0.9309433102607727, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 52060 + }, + { + "epoch": 3.7393177737881507, + "grad_norm": 0.972861647605896, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 52070 + }, + { + "epoch": 3.7400359066427287, + "grad_norm": 0.9318740963935852, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 52080 + }, + { + "epoch": 3.740754039497307, + "grad_norm": 0.7938477396965027, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 52090 + }, + { + "epoch": 3.741472172351885, + "grad_norm": 1.1515966653823853, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 52100 + }, + { + "epoch": 3.742190305206463, + "grad_norm": 1.076869010925293, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 52110 + }, + { + "epoch": 3.7429084380610416, + "grad_norm": 0.8516066670417786, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 52120 + }, + { + "epoch": 3.7436265709156196, + "grad_norm": 0.6853429079055786, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 52130 + }, + { + "epoch": 3.7443447037701976, + "grad_norm": 0.8179695010185242, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52140 + }, + { + "epoch": 3.7450628366247756, + "grad_norm": 0.8395232558250427, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 52150 + }, + { + "epoch": 3.7457809694793536, + "grad_norm": 1.0178003311157227, + "learning_rate": 0.0002, + "loss": 0.6902, + "step": 52160 + }, + { + "epoch": 3.7464991023339316, + "grad_norm": 1.1801023483276367, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 52170 + }, + { + "epoch": 3.74721723518851, + "grad_norm": 0.8215751647949219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 52180 + }, + { + "epoch": 3.747935368043088, + "grad_norm": 1.17083740234375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 52190 + }, + { + "epoch": 3.748653500897666, + "grad_norm": 0.9230290651321411, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 52200 + }, + { + "epoch": 3.749371633752244, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 52210 + }, + { + "epoch": 3.7500897666068225, + "grad_norm": 0.9690840244293213, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 52220 + }, + { + "epoch": 3.7508078994614005, + "grad_norm": 1.0022395849227905, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 52230 + }, + { + "epoch": 3.7515260323159785, + "grad_norm": 1.0489065647125244, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 52240 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 0.7880696058273315, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 52250 + }, + { + "epoch": 3.7529622980251345, + "grad_norm": 1.0255829095840454, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 52260 + }, + { + "epoch": 3.7536804308797125, + "grad_norm": 0.8470141291618347, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 52270 + }, + { + "epoch": 3.754398563734291, + "grad_norm": 0.9040523171424866, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 52280 + }, + { + "epoch": 3.755116696588869, + "grad_norm": 0.9564392566680908, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 52290 + }, + { + "epoch": 3.755834829443447, + "grad_norm": 0.907857358455658, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 52300 + }, + { + "epoch": 3.7565529622980254, + "grad_norm": 0.8929873704910278, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 52310 + }, + { + "epoch": 3.7572710951526034, + "grad_norm": 0.854434072971344, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 52320 + }, + { + "epoch": 3.7579892280071814, + "grad_norm": 0.8744779229164124, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 52330 + }, + { + "epoch": 3.7587073608617594, + "grad_norm": 0.9022667407989502, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52340 + }, + { + "epoch": 3.7594254937163374, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52350 + }, + { + "epoch": 3.7601436265709154, + "grad_norm": 1.0228430032730103, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 52360 + }, + { + "epoch": 3.760861759425494, + "grad_norm": 0.8593528270721436, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 52370 + }, + { + "epoch": 3.761579892280072, + "grad_norm": 0.9435563087463379, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 52380 + }, + { + "epoch": 3.76229802513465, + "grad_norm": 0.7545679807662964, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52390 + }, + { + "epoch": 3.7630161579892283, + "grad_norm": 0.9411585927009583, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52400 + }, + { + "epoch": 3.7637342908438063, + "grad_norm": 0.9764377474784851, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 52410 + }, + { + "epoch": 3.7644524236983843, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 52420 + }, + { + "epoch": 3.7651705565529623, + "grad_norm": 0.8765230774879456, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52430 + }, + { + "epoch": 3.7658886894075403, + "grad_norm": 0.9275036454200745, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 52440 + }, + { + "epoch": 3.7666068222621183, + "grad_norm": 0.967410147190094, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 52450 + }, + { + "epoch": 3.7673249551166967, + "grad_norm": 0.7738949060440063, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 52460 + }, + { + "epoch": 3.7680430879712747, + "grad_norm": 1.0828070640563965, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 52470 + }, + { + "epoch": 3.7687612208258527, + "grad_norm": 0.9570213556289673, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 52480 + }, + { + "epoch": 3.7694793536804307, + "grad_norm": 1.0688215494155884, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 52490 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 0.7970073223114014, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 52500 + }, + { + "epoch": 3.770915619389587, + "grad_norm": 0.7132976651191711, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 52510 + }, + { + "epoch": 3.771633752244165, + "grad_norm": 1.152268648147583, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 52520 + }, + { + "epoch": 3.772351885098743, + "grad_norm": 0.8645235896110535, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52530 + }, + { + "epoch": 3.773070017953321, + "grad_norm": 0.7725570201873779, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 52540 + }, + { + "epoch": 3.773788150807899, + "grad_norm": 0.9718102812767029, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 52550 + }, + { + "epoch": 3.7745062836624776, + "grad_norm": 0.7568017840385437, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 52560 + }, + { + "epoch": 3.7752244165170556, + "grad_norm": 0.9578912854194641, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 52570 + }, + { + "epoch": 3.7759425493716336, + "grad_norm": 0.8657314777374268, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 52580 + }, + { + "epoch": 3.776660682226212, + "grad_norm": 0.7564393281936646, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 52590 + }, + { + "epoch": 3.77737881508079, + "grad_norm": 0.7631160616874695, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 52600 + }, + { + "epoch": 3.778096947935368, + "grad_norm": 1.1852056980133057, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 52610 + }, + { + "epoch": 3.778815080789946, + "grad_norm": 1.0620790719985962, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 52620 + }, + { + "epoch": 3.779533213644524, + "grad_norm": 0.8677777647972107, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 52630 + }, + { + "epoch": 3.780251346499102, + "grad_norm": 0.9913218021392822, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 52640 + }, + { + "epoch": 3.7809694793536806, + "grad_norm": 0.9868429899215698, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 52650 + }, + { + "epoch": 3.7816876122082586, + "grad_norm": 0.8791782259941101, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 52660 + }, + { + "epoch": 3.7824057450628366, + "grad_norm": 0.9503955245018005, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 52670 + }, + { + "epoch": 3.7831238779174146, + "grad_norm": 0.8647131323814392, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 52680 + }, + { + "epoch": 3.783842010771993, + "grad_norm": 0.9819629788398743, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52690 + }, + { + "epoch": 3.784560143626571, + "grad_norm": 0.8548610210418701, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 52700 + }, + { + "epoch": 3.785278276481149, + "grad_norm": 0.8706230521202087, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 52710 + }, + { + "epoch": 3.785996409335727, + "grad_norm": 1.0032461881637573, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52720 + }, + { + "epoch": 3.786714542190305, + "grad_norm": 1.0578246116638184, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 52730 + }, + { + "epoch": 3.7874326750448835, + "grad_norm": 0.9854007363319397, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52740 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 0.8389187455177307, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 52750 + }, + { + "epoch": 3.7888689407540395, + "grad_norm": 0.9192399978637695, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 52760 + }, + { + "epoch": 3.7895870736086175, + "grad_norm": 0.9518283605575562, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 52770 + }, + { + "epoch": 3.790305206463196, + "grad_norm": 1.1296825408935547, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52780 + }, + { + "epoch": 3.791023339317774, + "grad_norm": 1.0589144229888916, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 52790 + }, + { + "epoch": 3.791741472172352, + "grad_norm": 0.8954343199729919, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 52800 + }, + { + "epoch": 3.79245960502693, + "grad_norm": 0.8283370733261108, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 52810 + }, + { + "epoch": 3.793177737881508, + "grad_norm": 0.910642683506012, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 52820 + }, + { + "epoch": 3.793895870736086, + "grad_norm": 0.9255108833312988, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 52830 + }, + { + "epoch": 3.7946140035906644, + "grad_norm": 0.8773723244667053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 52840 + }, + { + "epoch": 3.7953321364452424, + "grad_norm": 0.8454240560531616, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 52850 + }, + { + "epoch": 3.7960502692998204, + "grad_norm": 0.7636052966117859, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 52860 + }, + { + "epoch": 3.796768402154399, + "grad_norm": 0.9358382821083069, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 52870 + }, + { + "epoch": 3.797486535008977, + "grad_norm": 0.9662801623344421, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 52880 + }, + { + "epoch": 3.798204667863555, + "grad_norm": 0.995907187461853, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 52890 + }, + { + "epoch": 3.798922800718133, + "grad_norm": 0.8700127005577087, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 52900 + }, + { + "epoch": 3.799640933572711, + "grad_norm": 0.8987792134284973, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 52910 + }, + { + "epoch": 3.800359066427289, + "grad_norm": 0.9753904938697815, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 52920 + }, + { + "epoch": 3.8010771992818673, + "grad_norm": 0.7873555421829224, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 52930 + }, + { + "epoch": 3.8017953321364453, + "grad_norm": 0.8177929520606995, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 52940 + }, + { + "epoch": 3.8025134649910233, + "grad_norm": 0.8865532279014587, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 52950 + }, + { + "epoch": 3.8032315978456013, + "grad_norm": 0.9113775491714478, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 52960 + }, + { + "epoch": 3.8039497307001797, + "grad_norm": 0.9424585700035095, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 52970 + }, + { + "epoch": 3.8046678635547577, + "grad_norm": 0.8347237706184387, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 52980 + }, + { + "epoch": 3.8053859964093357, + "grad_norm": 0.826863169670105, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 52990 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 0.7313310503959656, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 53000 + }, + { + "epoch": 3.8068222621184917, + "grad_norm": 0.8352667093276978, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 53010 + }, + { + "epoch": 3.80754039497307, + "grad_norm": 0.748461127281189, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 53020 + }, + { + "epoch": 3.808258527827648, + "grad_norm": 0.943256139755249, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 53030 + }, + { + "epoch": 3.808976660682226, + "grad_norm": 1.0448410511016846, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 53040 + }, + { + "epoch": 3.809694793536804, + "grad_norm": 0.9047636985778809, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 53050 + }, + { + "epoch": 3.8104129263913826, + "grad_norm": 0.8594381213188171, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 53060 + }, + { + "epoch": 3.8111310592459606, + "grad_norm": 0.7593536972999573, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 53070 + }, + { + "epoch": 3.8118491921005386, + "grad_norm": 0.7189019918441772, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 53080 + }, + { + "epoch": 3.8125673249551166, + "grad_norm": 0.8569809198379517, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53090 + }, + { + "epoch": 3.8132854578096946, + "grad_norm": 0.923378050327301, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53100 + }, + { + "epoch": 3.8140035906642726, + "grad_norm": 0.9088824391365051, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 53110 + }, + { + "epoch": 3.814721723518851, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 53120 + }, + { + "epoch": 3.815439856373429, + "grad_norm": 0.8389552235603333, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 53130 + }, + { + "epoch": 3.816157989228007, + "grad_norm": 0.7940975427627563, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 53140 + }, + { + "epoch": 3.8168761220825855, + "grad_norm": 0.8389907479286194, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 53150 + }, + { + "epoch": 3.8175942549371635, + "grad_norm": 0.774206280708313, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 53160 + }, + { + "epoch": 3.8183123877917415, + "grad_norm": 1.189447283744812, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 53170 + }, + { + "epoch": 3.8190305206463195, + "grad_norm": 0.9875882863998413, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 53180 + }, + { + "epoch": 3.8197486535008975, + "grad_norm": 0.9205945134162903, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 53190 + }, + { + "epoch": 3.8204667863554755, + "grad_norm": 0.8312796354293823, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 53200 + }, + { + "epoch": 3.821184919210054, + "grad_norm": 0.9755756855010986, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 53210 + }, + { + "epoch": 3.821903052064632, + "grad_norm": 1.0722965002059937, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53220 + }, + { + "epoch": 3.82262118491921, + "grad_norm": 0.7720510959625244, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 53230 + }, + { + "epoch": 3.823339317773788, + "grad_norm": 1.020147681236267, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 53240 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 0.8241816759109497, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53250 + }, + { + "epoch": 3.8247755834829444, + "grad_norm": 0.8939895629882812, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 53260 + }, + { + "epoch": 3.8254937163375224, + "grad_norm": 1.010852336883545, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 53270 + }, + { + "epoch": 3.8262118491921004, + "grad_norm": 0.8201420307159424, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 53280 + }, + { + "epoch": 3.8269299820466784, + "grad_norm": 0.8797973990440369, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 53290 + }, + { + "epoch": 3.827648114901257, + "grad_norm": 0.9034950137138367, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 53300 + }, + { + "epoch": 3.828366247755835, + "grad_norm": 0.926802933216095, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 53310 + }, + { + "epoch": 3.829084380610413, + "grad_norm": 1.0205509662628174, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 53320 + }, + { + "epoch": 3.829802513464991, + "grad_norm": 0.9524099230766296, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 53330 + }, + { + "epoch": 3.8305206463195693, + "grad_norm": 0.9692625999450684, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 53340 + }, + { + "epoch": 3.8312387791741473, + "grad_norm": 0.7255275845527649, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 53350 + }, + { + "epoch": 3.8319569120287253, + "grad_norm": 0.7199059724807739, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53360 + }, + { + "epoch": 3.8326750448833034, + "grad_norm": 1.004464864730835, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 53370 + }, + { + "epoch": 3.8333931777378814, + "grad_norm": 0.9092583060264587, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53380 + }, + { + "epoch": 3.8341113105924594, + "grad_norm": 0.945091724395752, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 53390 + }, + { + "epoch": 3.834829443447038, + "grad_norm": 0.7980135679244995, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 53400 + }, + { + "epoch": 3.835547576301616, + "grad_norm": 0.7812868356704712, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 53410 + }, + { + "epoch": 3.836265709156194, + "grad_norm": 0.8957077860832214, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53420 + }, + { + "epoch": 3.8369838420107722, + "grad_norm": 0.9119600653648376, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 53430 + }, + { + "epoch": 3.8377019748653503, + "grad_norm": 0.8208187222480774, + "learning_rate": 0.0002, + "loss": 0.7346, + "step": 53440 + }, + { + "epoch": 3.8384201077199283, + "grad_norm": 0.7930439114570618, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 53450 + }, + { + "epoch": 3.8391382405745063, + "grad_norm": 0.8937777280807495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 53460 + }, + { + "epoch": 3.8398563734290843, + "grad_norm": 0.7583796977996826, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 53470 + }, + { + "epoch": 3.8405745062836623, + "grad_norm": 1.0735969543457031, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 53480 + }, + { + "epoch": 3.8412926391382407, + "grad_norm": 1.1106033325195312, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 53490 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 1.092631220817566, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 53500 + }, + { + "epoch": 3.8427289048473967, + "grad_norm": 0.9961787462234497, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 53510 + }, + { + "epoch": 3.8434470377019747, + "grad_norm": 0.833831250667572, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 53520 + }, + { + "epoch": 3.844165170556553, + "grad_norm": 1.0000009536743164, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 53530 + }, + { + "epoch": 3.844883303411131, + "grad_norm": 0.9784213304519653, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 53540 + }, + { + "epoch": 3.845601436265709, + "grad_norm": 0.8582558035850525, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 53550 + }, + { + "epoch": 3.846319569120287, + "grad_norm": 0.8267415761947632, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 53560 + }, + { + "epoch": 3.847037701974865, + "grad_norm": 0.8783000111579895, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 53570 + }, + { + "epoch": 3.8477558348294436, + "grad_norm": 0.9866999983787537, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 53580 + }, + { + "epoch": 3.8484739676840216, + "grad_norm": 0.8459296226501465, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 53590 + }, + { + "epoch": 3.8491921005385996, + "grad_norm": 0.9804834723472595, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 53600 + }, + { + "epoch": 3.8499102333931776, + "grad_norm": 0.951074481010437, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 53610 + }, + { + "epoch": 3.850628366247756, + "grad_norm": 0.8020104169845581, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 53620 + }, + { + "epoch": 3.851346499102334, + "grad_norm": 0.9296963214874268, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 53630 + }, + { + "epoch": 3.852064631956912, + "grad_norm": 0.8983652591705322, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 53640 + }, + { + "epoch": 3.85278276481149, + "grad_norm": 1.031858205795288, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 53650 + }, + { + "epoch": 3.853500897666068, + "grad_norm": 0.8943952918052673, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 53660 + }, + { + "epoch": 3.854219030520646, + "grad_norm": 1.0072312355041504, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 53670 + }, + { + "epoch": 3.8549371633752245, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 53680 + }, + { + "epoch": 3.8556552962298025, + "grad_norm": 0.834223210811615, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 53690 + }, + { + "epoch": 3.8563734290843805, + "grad_norm": 0.9872867465019226, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 53700 + }, + { + "epoch": 3.857091561938959, + "grad_norm": 0.7999459505081177, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53710 + }, + { + "epoch": 3.857809694793537, + "grad_norm": 0.717722475528717, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 53720 + }, + { + "epoch": 3.858527827648115, + "grad_norm": 1.0675442218780518, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 53730 + }, + { + "epoch": 3.859245960502693, + "grad_norm": 0.9789777398109436, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 53740 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 0.9318669438362122, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 53750 + }, + { + "epoch": 3.860682226211849, + "grad_norm": 0.9848631024360657, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 53760 + }, + { + "epoch": 3.8614003590664274, + "grad_norm": 0.8754391670227051, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 53770 + }, + { + "epoch": 3.8621184919210054, + "grad_norm": 0.9024585485458374, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 53780 + }, + { + "epoch": 3.8628366247755834, + "grad_norm": 0.8974794745445251, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 53790 + }, + { + "epoch": 3.8635547576301614, + "grad_norm": 0.8342790603637695, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 53800 + }, + { + "epoch": 3.86427289048474, + "grad_norm": 0.8177682757377625, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 53810 + }, + { + "epoch": 3.864991023339318, + "grad_norm": 1.0259089469909668, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 53820 + }, + { + "epoch": 3.865709156193896, + "grad_norm": 1.042290210723877, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 53830 + }, + { + "epoch": 3.866427289048474, + "grad_norm": 0.7316540479660034, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 53840 + }, + { + "epoch": 3.867145421903052, + "grad_norm": 0.9384970664978027, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53850 + }, + { + "epoch": 3.86786355475763, + "grad_norm": 0.9273143410682678, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53860 + }, + { + "epoch": 3.8685816876122083, + "grad_norm": 1.1183570623397827, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 53870 + }, + { + "epoch": 3.8692998204667863, + "grad_norm": 0.9455275535583496, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 53880 + }, + { + "epoch": 3.8700179533213643, + "grad_norm": 0.8702114820480347, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 53890 + }, + { + "epoch": 3.870736086175943, + "grad_norm": 0.8751053214073181, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53900 + }, + { + "epoch": 3.871454219030521, + "grad_norm": 0.9793110489845276, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 53910 + }, + { + "epoch": 3.872172351885099, + "grad_norm": 0.9705014824867249, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 53920 + }, + { + "epoch": 3.872890484739677, + "grad_norm": 1.051504373550415, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 53930 + }, + { + "epoch": 3.873608617594255, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 53940 + }, + { + "epoch": 3.874326750448833, + "grad_norm": 0.7828099727630615, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 53950 + }, + { + "epoch": 3.8750448833034112, + "grad_norm": 0.86341792345047, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 53960 + }, + { + "epoch": 3.8757630161579892, + "grad_norm": 1.114670991897583, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 53970 + }, + { + "epoch": 3.8764811490125672, + "grad_norm": 0.8559519052505493, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 53980 + }, + { + "epoch": 3.8771992818671457, + "grad_norm": 1.0518953800201416, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 53990 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 0.7157500982284546, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 54000 + }, + { + "epoch": 3.8786355475763017, + "grad_norm": 0.8390372395515442, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 54010 + }, + { + "epoch": 3.8793536804308797, + "grad_norm": 0.8486756086349487, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 54020 + }, + { + "epoch": 3.8800718132854577, + "grad_norm": 0.8361587524414062, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 54030 + }, + { + "epoch": 3.8807899461400357, + "grad_norm": 0.9490554928779602, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 54040 + }, + { + "epoch": 3.881508078994614, + "grad_norm": 1.0311323404312134, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 54050 + }, + { + "epoch": 3.882226211849192, + "grad_norm": 0.84800124168396, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54060 + }, + { + "epoch": 3.88294434470377, + "grad_norm": 0.8940879702568054, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 54070 + }, + { + "epoch": 3.883662477558348, + "grad_norm": 0.985542356967926, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 54080 + }, + { + "epoch": 3.8843806104129266, + "grad_norm": 0.8846475481987, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 54090 + }, + { + "epoch": 3.8850987432675046, + "grad_norm": 0.9186338186264038, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 54100 + }, + { + "epoch": 3.8858168761220826, + "grad_norm": 1.106598973274231, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 54110 + }, + { + "epoch": 3.8865350089766606, + "grad_norm": 0.8167300224304199, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 54120 + }, + { + "epoch": 3.8872531418312386, + "grad_norm": 0.9153622984886169, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 54130 + }, + { + "epoch": 3.8879712746858166, + "grad_norm": 0.8464475274085999, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 54140 + }, + { + "epoch": 3.888689407540395, + "grad_norm": 0.8889452815055847, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 54150 + }, + { + "epoch": 3.889407540394973, + "grad_norm": 0.7861065864562988, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 54160 + }, + { + "epoch": 3.890125673249551, + "grad_norm": 0.882674515247345, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 54170 + }, + { + "epoch": 3.8908438061041295, + "grad_norm": 0.8503835201263428, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 54180 + }, + { + "epoch": 3.8915619389587075, + "grad_norm": 0.888455331325531, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 54190 + }, + { + "epoch": 3.8922800718132855, + "grad_norm": 1.0473699569702148, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 54200 + }, + { + "epoch": 3.8929982046678635, + "grad_norm": 0.9548208713531494, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 54210 + }, + { + "epoch": 3.8937163375224415, + "grad_norm": 0.9158754944801331, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 54220 + }, + { + "epoch": 3.8944344703770195, + "grad_norm": 0.9001154899597168, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54230 + }, + { + "epoch": 3.895152603231598, + "grad_norm": 0.9736626148223877, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54240 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 0.8809846043586731, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 54250 + }, + { + "epoch": 3.896588868940754, + "grad_norm": 0.887583315372467, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 54260 + }, + { + "epoch": 3.8973070017953324, + "grad_norm": 0.8395712971687317, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 54270 + }, + { + "epoch": 3.8980251346499104, + "grad_norm": 0.8391315937042236, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 54280 + }, + { + "epoch": 3.8987432675044884, + "grad_norm": 0.8210049271583557, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54290 + }, + { + "epoch": 3.8994614003590664, + "grad_norm": 1.1364530324935913, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54300 + }, + { + "epoch": 3.9001795332136444, + "grad_norm": 0.7712056636810303, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 54310 + }, + { + "epoch": 3.9008976660682224, + "grad_norm": 0.9466049671173096, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 54320 + }, + { + "epoch": 3.901615798922801, + "grad_norm": 1.0367140769958496, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 54330 + }, + { + "epoch": 3.902333931777379, + "grad_norm": 1.0168321132659912, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 54340 + }, + { + "epoch": 3.903052064631957, + "grad_norm": 0.7830407619476318, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 54350 + }, + { + "epoch": 3.903770197486535, + "grad_norm": 0.9649789333343506, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 54360 + }, + { + "epoch": 3.9044883303411133, + "grad_norm": 0.681077778339386, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 54370 + }, + { + "epoch": 3.9052064631956913, + "grad_norm": 0.8970136046409607, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 54380 + }, + { + "epoch": 3.9059245960502693, + "grad_norm": 0.9155173301696777, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 54390 + }, + { + "epoch": 3.9066427289048473, + "grad_norm": 1.0447794198989868, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 54400 + }, + { + "epoch": 3.9073608617594253, + "grad_norm": 0.7823813557624817, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 54410 + }, + { + "epoch": 3.9080789946140033, + "grad_norm": 0.9289445877075195, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 54420 + }, + { + "epoch": 3.9087971274685818, + "grad_norm": 0.9983111619949341, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 54430 + }, + { + "epoch": 3.9095152603231598, + "grad_norm": 0.7952495813369751, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 54440 + }, + { + "epoch": 3.9102333931777378, + "grad_norm": 0.8045601844787598, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 54450 + }, + { + "epoch": 3.910951526032316, + "grad_norm": 0.936585009098053, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 54460 + }, + { + "epoch": 3.911669658886894, + "grad_norm": 0.745793879032135, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 54470 + }, + { + "epoch": 3.912387791741472, + "grad_norm": 0.9137616157531738, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 54480 + }, + { + "epoch": 3.9131059245960502, + "grad_norm": 0.826316237449646, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 54490 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 0.94313645362854, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 54500 + }, + { + "epoch": 3.9145421903052062, + "grad_norm": 1.045893907546997, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 54510 + }, + { + "epoch": 3.9152603231597847, + "grad_norm": 0.9122704863548279, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 54520 + }, + { + "epoch": 3.9159784560143627, + "grad_norm": 1.0999689102172852, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 54530 + }, + { + "epoch": 3.9166965888689407, + "grad_norm": 0.9281555414199829, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 54540 + }, + { + "epoch": 3.917414721723519, + "grad_norm": 1.1439622640609741, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 54550 + }, + { + "epoch": 3.918132854578097, + "grad_norm": 0.9375617504119873, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 54560 + }, + { + "epoch": 3.918850987432675, + "grad_norm": 0.92906653881073, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 54570 + }, + { + "epoch": 3.919569120287253, + "grad_norm": 1.0840893983840942, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 54580 + }, + { + "epoch": 3.920287253141831, + "grad_norm": 0.8145509362220764, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 54590 + }, + { + "epoch": 3.921005385996409, + "grad_norm": 0.973737895488739, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 54600 + }, + { + "epoch": 3.9217235188509876, + "grad_norm": 0.9302353858947754, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 54610 + }, + { + "epoch": 3.9224416517055656, + "grad_norm": 0.9167897701263428, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 54620 + }, + { + "epoch": 3.9231597845601436, + "grad_norm": 0.8096851706504822, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 54630 + }, + { + "epoch": 3.9238779174147216, + "grad_norm": 0.8006368279457092, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 54640 + }, + { + "epoch": 3.9245960502693, + "grad_norm": 0.7800863981246948, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 54650 + }, + { + "epoch": 3.925314183123878, + "grad_norm": 1.0331560373306274, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 54660 + }, + { + "epoch": 3.926032315978456, + "grad_norm": 1.0057517290115356, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 54670 + }, + { + "epoch": 3.926750448833034, + "grad_norm": 0.8920564651489258, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 54680 + }, + { + "epoch": 3.927468581687612, + "grad_norm": 0.7704599499702454, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 54690 + }, + { + "epoch": 3.92818671454219, + "grad_norm": 0.827032208442688, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 54700 + }, + { + "epoch": 3.9289048473967685, + "grad_norm": 1.0019268989562988, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 54710 + }, + { + "epoch": 3.9296229802513465, + "grad_norm": 0.862033486366272, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 54720 + }, + { + "epoch": 3.9303411131059245, + "grad_norm": 0.8965592980384827, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 54730 + }, + { + "epoch": 3.931059245960503, + "grad_norm": 0.7689077854156494, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 54740 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 0.846276581287384, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 54750 + }, + { + "epoch": 3.932495511669659, + "grad_norm": 0.8932713866233826, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 54760 + }, + { + "epoch": 3.933213644524237, + "grad_norm": 0.9711386561393738, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 54770 + }, + { + "epoch": 3.933931777378815, + "grad_norm": 0.9290250539779663, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 54780 + }, + { + "epoch": 3.934649910233393, + "grad_norm": 1.0897367000579834, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 54790 + }, + { + "epoch": 3.9353680430879714, + "grad_norm": 0.8451842665672302, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 54800 + }, + { + "epoch": 3.9360861759425494, + "grad_norm": 0.8400090336799622, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 54810 + }, + { + "epoch": 3.9368043087971274, + "grad_norm": 0.951383650302887, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 54820 + }, + { + "epoch": 3.937522441651706, + "grad_norm": 0.848838210105896, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 54830 + }, + { + "epoch": 3.938240574506284, + "grad_norm": 0.735763669013977, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 54840 + }, + { + "epoch": 3.938958707360862, + "grad_norm": 0.979037344455719, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 54850 + }, + { + "epoch": 3.93967684021544, + "grad_norm": 0.933674693107605, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 54860 + }, + { + "epoch": 3.940394973070018, + "grad_norm": 0.835593044757843, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 54870 + }, + { + "epoch": 3.941113105924596, + "grad_norm": 1.0034281015396118, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 54880 + }, + { + "epoch": 3.9418312387791743, + "grad_norm": 0.9732975959777832, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 54890 + }, + { + "epoch": 3.9425493716337523, + "grad_norm": 0.9666336178779602, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54900 + }, + { + "epoch": 3.9432675044883303, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 54910 + }, + { + "epoch": 3.9439856373429083, + "grad_norm": 0.8732092976570129, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 54920 + }, + { + "epoch": 3.9447037701974867, + "grad_norm": 1.139453649520874, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 54930 + }, + { + "epoch": 3.9454219030520647, + "grad_norm": 0.9044837951660156, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 54940 + }, + { + "epoch": 3.9461400359066428, + "grad_norm": 1.0496679544448853, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 54950 + }, + { + "epoch": 3.9468581687612208, + "grad_norm": 1.0099035501480103, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 54960 + }, + { + "epoch": 3.9475763016157988, + "grad_norm": 1.0694963932037354, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 54970 + }, + { + "epoch": 3.9482944344703768, + "grad_norm": 1.0012997388839722, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 54980 + }, + { + "epoch": 3.949012567324955, + "grad_norm": 0.8910513520240784, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 54990 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 1.0267579555511475, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 55000 + }, + { + "epoch": 3.950448833034111, + "grad_norm": 0.9786432385444641, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 55010 + }, + { + "epoch": 3.9511669658886897, + "grad_norm": 0.8703538775444031, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55020 + }, + { + "epoch": 3.9518850987432677, + "grad_norm": 0.8970484137535095, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 55030 + }, + { + "epoch": 3.9526032315978457, + "grad_norm": 0.8781577944755554, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 55040 + }, + { + "epoch": 3.9533213644524237, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 55050 + }, + { + "epoch": 3.9540394973070017, + "grad_norm": 0.851926326751709, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 55060 + }, + { + "epoch": 3.9547576301615797, + "grad_norm": 0.8597240447998047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 55070 + }, + { + "epoch": 3.955475763016158, + "grad_norm": 0.9461944699287415, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55080 + }, + { + "epoch": 3.956193895870736, + "grad_norm": 0.7576611042022705, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 55090 + }, + { + "epoch": 3.956912028725314, + "grad_norm": 0.9484710693359375, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 55100 + }, + { + "epoch": 3.957630161579892, + "grad_norm": 0.9487117528915405, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 55110 + }, + { + "epoch": 3.9583482944344706, + "grad_norm": 0.870090663433075, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55120 + }, + { + "epoch": 3.9590664272890486, + "grad_norm": 0.8496458530426025, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 55130 + }, + { + "epoch": 3.9597845601436266, + "grad_norm": 1.0121779441833496, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 55140 + }, + { + "epoch": 3.9605026929982046, + "grad_norm": 0.8912323713302612, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 55150 + }, + { + "epoch": 3.9612208258527826, + "grad_norm": 0.8398444652557373, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 55160 + }, + { + "epoch": 3.961938958707361, + "grad_norm": 0.8046348690986633, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 55170 + }, + { + "epoch": 3.962657091561939, + "grad_norm": 1.0369254350662231, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 55180 + }, + { + "epoch": 3.963375224416517, + "grad_norm": 1.172431230545044, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 55190 + }, + { + "epoch": 3.964093357271095, + "grad_norm": 0.8093554377555847, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 55200 + }, + { + "epoch": 3.9648114901256735, + "grad_norm": 0.8851078748703003, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 55210 + }, + { + "epoch": 3.9655296229802515, + "grad_norm": 0.7494266033172607, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 55220 + }, + { + "epoch": 3.9662477558348295, + "grad_norm": 0.9556898474693298, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 55230 + }, + { + "epoch": 3.9669658886894075, + "grad_norm": 1.016017198562622, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 55240 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 0.8425998091697693, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 55250 + }, + { + "epoch": 3.9684021543985635, + "grad_norm": 0.717673122882843, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 55260 + }, + { + "epoch": 3.969120287253142, + "grad_norm": 0.8366572856903076, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 55270 + }, + { + "epoch": 3.96983842010772, + "grad_norm": 0.8981583118438721, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 55280 + }, + { + "epoch": 3.970556552962298, + "grad_norm": 0.8868781328201294, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 55290 + }, + { + "epoch": 3.9712746858168764, + "grad_norm": 1.0632785558700562, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 55300 + }, + { + "epoch": 3.9719928186714544, + "grad_norm": 0.8813109993934631, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 55310 + }, + { + "epoch": 3.9727109515260324, + "grad_norm": 0.8225542306900024, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 55320 + }, + { + "epoch": 3.9734290843806104, + "grad_norm": 1.1391420364379883, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 55330 + }, + { + "epoch": 3.9741472172351884, + "grad_norm": 1.0371832847595215, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55340 + }, + { + "epoch": 3.9748653500897664, + "grad_norm": 1.0542186498641968, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 55350 + }, + { + "epoch": 3.975583482944345, + "grad_norm": 1.0178009271621704, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 55360 + }, + { + "epoch": 3.976301615798923, + "grad_norm": 0.7927802205085754, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 55370 + }, + { + "epoch": 3.977019748653501, + "grad_norm": 0.9350495934486389, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55380 + }, + { + "epoch": 3.977737881508079, + "grad_norm": 1.0240116119384766, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 55390 + }, + { + "epoch": 3.9784560143626573, + "grad_norm": 1.0279067754745483, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 55400 + }, + { + "epoch": 3.9791741472172353, + "grad_norm": 1.1228227615356445, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 55410 + }, + { + "epoch": 3.9798922800718133, + "grad_norm": 0.9500134587287903, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 55420 + }, + { + "epoch": 3.9806104129263913, + "grad_norm": 0.9229732155799866, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 55430 + }, + { + "epoch": 3.9813285457809693, + "grad_norm": 0.7946729063987732, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 55440 + }, + { + "epoch": 3.9820466786355477, + "grad_norm": 0.9987489581108093, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 55450 + }, + { + "epoch": 3.9827648114901257, + "grad_norm": 0.9670467972755432, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 55460 + }, + { + "epoch": 3.9834829443447037, + "grad_norm": 0.835028350353241, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 55470 + }, + { + "epoch": 3.9842010771992817, + "grad_norm": 0.8678702712059021, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 55480 + }, + { + "epoch": 3.98491921005386, + "grad_norm": 0.8581197261810303, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 55490 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 0.779848039150238, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 55500 + }, + { + "epoch": 3.986355475763016, + "grad_norm": 0.8827589154243469, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 55510 + }, + { + "epoch": 3.987073608617594, + "grad_norm": 1.0108301639556885, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55520 + }, + { + "epoch": 3.987791741472172, + "grad_norm": 0.8506004214286804, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 55530 + }, + { + "epoch": 3.98850987432675, + "grad_norm": 1.0297727584838867, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 55540 + }, + { + "epoch": 3.9892280071813286, + "grad_norm": 0.8579224944114685, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55550 + }, + { + "epoch": 3.9899461400359066, + "grad_norm": 0.8503788113594055, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 55560 + }, + { + "epoch": 3.9906642728904846, + "grad_norm": 1.1144801378250122, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 55570 + }, + { + "epoch": 3.991382405745063, + "grad_norm": 0.8418305516242981, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 55580 + }, + { + "epoch": 3.992100538599641, + "grad_norm": 1.0065871477127075, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 55590 + }, + { + "epoch": 3.992818671454219, + "grad_norm": 0.8160259127616882, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 55600 + }, + { + "epoch": 3.993536804308797, + "grad_norm": 0.8678009510040283, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55610 + }, + { + "epoch": 3.994254937163375, + "grad_norm": 0.863465428352356, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 55620 + }, + { + "epoch": 3.994973070017953, + "grad_norm": 0.9242135286331177, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 55630 + }, + { + "epoch": 3.9956912028725315, + "grad_norm": 1.0285470485687256, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 55640 + }, + { + "epoch": 3.9964093357271095, + "grad_norm": 0.8953320384025574, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 55650 + }, + { + "epoch": 3.9971274685816875, + "grad_norm": 0.915892481803894, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 55660 + }, + { + "epoch": 3.9978456014362656, + "grad_norm": 0.8235118985176086, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 55670 + }, + { + "epoch": 3.998563734290844, + "grad_norm": 1.0178656578063965, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 55680 + }, + { + "epoch": 3.999281867145422, + "grad_norm": 0.9926803708076477, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 55690 + }, + { + "epoch": 4.0, + "grad_norm": 0.9213629961013794, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 55700 + }, + { + "epoch": 4.0, + "eval_loss": 1.1152480840682983, + "eval_runtime": 55.2237, + "eval_samples_per_second": 13.273, + "eval_steps_per_second": 1.666, + "step": 55700 + }, + { + "epoch": 4.000718132854578, + "grad_norm": 1.0820496082305908, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 55710 + }, + { + "epoch": 4.001436265709156, + "grad_norm": 0.9036441445350647, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 55720 + }, + { + "epoch": 4.002154398563734, + "grad_norm": 1.102754831314087, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 55730 + }, + { + "epoch": 4.002872531418312, + "grad_norm": 0.98259437084198, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 55740 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 1.1935845613479614, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 55750 + }, + { + "epoch": 4.004308797127469, + "grad_norm": 0.9925830960273743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 55760 + }, + { + "epoch": 4.005026929982047, + "grad_norm": 1.075087070465088, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 55770 + }, + { + "epoch": 4.005745062836625, + "grad_norm": 0.8746396899223328, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 55780 + }, + { + "epoch": 4.006463195691203, + "grad_norm": 0.7635995745658875, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 55790 + }, + { + "epoch": 4.007181328545781, + "grad_norm": 0.9064885377883911, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 55800 + }, + { + "epoch": 4.007899461400359, + "grad_norm": 1.018478274345398, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 55810 + }, + { + "epoch": 4.008617594254937, + "grad_norm": 0.9797589778900146, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 55820 + }, + { + "epoch": 4.009335727109515, + "grad_norm": 0.7867457866668701, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 55830 + }, + { + "epoch": 4.010053859964093, + "grad_norm": 0.9998070597648621, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 55840 + }, + { + "epoch": 4.010771992818672, + "grad_norm": 0.8656311631202698, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 55850 + }, + { + "epoch": 4.01149012567325, + "grad_norm": 0.945469081401825, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 55860 + }, + { + "epoch": 4.012208258527828, + "grad_norm": 0.8809926509857178, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 55870 + }, + { + "epoch": 4.012926391382406, + "grad_norm": 0.8047897219657898, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 55880 + }, + { + "epoch": 4.013644524236984, + "grad_norm": 1.0563900470733643, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 55890 + }, + { + "epoch": 4.014362657091562, + "grad_norm": 0.8578300476074219, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 55900 + }, + { + "epoch": 4.01508078994614, + "grad_norm": 1.0304765701293945, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 55910 + }, + { + "epoch": 4.015798922800718, + "grad_norm": 0.8087666034698486, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 55920 + }, + { + "epoch": 4.016517055655296, + "grad_norm": 1.0192348957061768, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 55930 + }, + { + "epoch": 4.017235188509875, + "grad_norm": 1.061194658279419, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 55940 + }, + { + "epoch": 4.017953321364453, + "grad_norm": 0.93668133020401, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 55950 + }, + { + "epoch": 4.018671454219031, + "grad_norm": 1.1569286584854126, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 55960 + }, + { + "epoch": 4.019389587073609, + "grad_norm": 0.9853817224502563, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 55970 + }, + { + "epoch": 4.020107719928187, + "grad_norm": 0.851109504699707, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 55980 + }, + { + "epoch": 4.020825852782765, + "grad_norm": 1.053525447845459, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 55990 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 0.8307225704193115, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 56000 + }, + { + "epoch": 4.022262118491921, + "grad_norm": 1.2741150856018066, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 56010 + }, + { + "epoch": 4.022980251346499, + "grad_norm": 0.9708344340324402, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 56020 + }, + { + "epoch": 4.023698384201078, + "grad_norm": 1.265034556388855, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 56030 + }, + { + "epoch": 4.024416517055656, + "grad_norm": 0.9364367723464966, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 56040 + }, + { + "epoch": 4.025134649910234, + "grad_norm": 0.8643592000007629, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 56050 + }, + { + "epoch": 4.025852782764812, + "grad_norm": 0.9742133021354675, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 56060 + }, + { + "epoch": 4.02657091561939, + "grad_norm": 1.1793473958969116, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 56070 + }, + { + "epoch": 4.027289048473968, + "grad_norm": 0.9641149044036865, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 56080 + }, + { + "epoch": 4.028007181328546, + "grad_norm": 0.9426136016845703, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 56090 + }, + { + "epoch": 4.028725314183124, + "grad_norm": 0.9211869835853577, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 56100 + }, + { + "epoch": 4.029443447037702, + "grad_norm": 1.1576565504074097, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56110 + }, + { + "epoch": 4.03016157989228, + "grad_norm": 1.0014013051986694, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 56120 + }, + { + "epoch": 4.0308797127468585, + "grad_norm": 0.9307010769844055, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56130 + }, + { + "epoch": 4.0315978456014365, + "grad_norm": 0.8290148377418518, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 56140 + }, + { + "epoch": 4.0323159784560145, + "grad_norm": 1.0648446083068848, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 56150 + }, + { + "epoch": 4.0330341113105925, + "grad_norm": 1.1545547246932983, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 56160 + }, + { + "epoch": 4.0337522441651705, + "grad_norm": 0.9643545150756836, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 56170 + }, + { + "epoch": 4.0344703770197485, + "grad_norm": 0.8913900256156921, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56180 + }, + { + "epoch": 4.0351885098743265, + "grad_norm": 0.9445754289627075, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 56190 + }, + { + "epoch": 4.0359066427289045, + "grad_norm": 0.9353124499320984, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 56200 + }, + { + "epoch": 4.0366247755834825, + "grad_norm": 1.1780431270599365, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56210 + }, + { + "epoch": 4.037342908438061, + "grad_norm": 0.9208880662918091, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 56220 + }, + { + "epoch": 4.038061041292639, + "grad_norm": 0.9475517272949219, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 56230 + }, + { + "epoch": 4.038779174147217, + "grad_norm": 0.7478583455085754, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 56240 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 1.0026403665542603, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 56250 + }, + { + "epoch": 4.040215439856373, + "grad_norm": 0.9664973020553589, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 56260 + }, + { + "epoch": 4.040933572710951, + "grad_norm": 1.0655616521835327, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 56270 + }, + { + "epoch": 4.041651705565529, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 56280 + }, + { + "epoch": 4.042369838420107, + "grad_norm": 0.7982191443443298, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 56290 + }, + { + "epoch": 4.043087971274685, + "grad_norm": 0.8304495215415955, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 56300 + }, + { + "epoch": 4.043806104129264, + "grad_norm": 0.95123291015625, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56310 + }, + { + "epoch": 4.044524236983842, + "grad_norm": 0.9504102468490601, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 56320 + }, + { + "epoch": 4.04524236983842, + "grad_norm": 0.7432710528373718, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 56330 + }, + { + "epoch": 4.045960502692998, + "grad_norm": 0.9327874183654785, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 56340 + }, + { + "epoch": 4.046678635547576, + "grad_norm": 0.9161670804023743, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 56350 + }, + { + "epoch": 4.047396768402154, + "grad_norm": 0.9371771812438965, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 56360 + }, + { + "epoch": 4.048114901256732, + "grad_norm": 1.0332437753677368, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 56370 + }, + { + "epoch": 4.04883303411131, + "grad_norm": 0.7346320748329163, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 56380 + }, + { + "epoch": 4.049551166965888, + "grad_norm": 0.8247857689857483, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 56390 + }, + { + "epoch": 4.050269299820466, + "grad_norm": 0.925325334072113, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 56400 + }, + { + "epoch": 4.050987432675045, + "grad_norm": 0.7344088554382324, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 56410 + }, + { + "epoch": 4.051705565529623, + "grad_norm": 0.9204918146133423, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 56420 + }, + { + "epoch": 4.052423698384201, + "grad_norm": 0.8273472785949707, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 56430 + }, + { + "epoch": 4.053141831238779, + "grad_norm": 0.9524998068809509, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 56440 + }, + { + "epoch": 4.053859964093357, + "grad_norm": 0.9168205857276917, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 56450 + }, + { + "epoch": 4.054578096947935, + "grad_norm": 0.9634994864463806, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 56460 + }, + { + "epoch": 4.055296229802513, + "grad_norm": 1.2027593851089478, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 56470 + }, + { + "epoch": 4.056014362657091, + "grad_norm": 1.2347805500030518, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 56480 + }, + { + "epoch": 4.056732495511669, + "grad_norm": 0.8621458411216736, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56490 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 0.9194608330726624, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 56500 + }, + { + "epoch": 4.058168761220826, + "grad_norm": 1.0153663158416748, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 56510 + }, + { + "epoch": 4.058886894075404, + "grad_norm": 0.9170986413955688, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 56520 + }, + { + "epoch": 4.059605026929982, + "grad_norm": 1.033057689666748, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 56530 + }, + { + "epoch": 4.06032315978456, + "grad_norm": 1.0125197172164917, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 56540 + }, + { + "epoch": 4.061041292639138, + "grad_norm": 0.9429898262023926, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 56550 + }, + { + "epoch": 4.061759425493716, + "grad_norm": 0.9242179989814758, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56560 + }, + { + "epoch": 4.062477558348294, + "grad_norm": 0.9365091323852539, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 56570 + }, + { + "epoch": 4.063195691202872, + "grad_norm": 0.9148455858230591, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 56580 + }, + { + "epoch": 4.063913824057451, + "grad_norm": 0.8546709418296814, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 56590 + }, + { + "epoch": 4.064631956912029, + "grad_norm": 0.9743902087211609, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 56600 + }, + { + "epoch": 4.065350089766607, + "grad_norm": 1.0599974393844604, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56610 + }, + { + "epoch": 4.066068222621185, + "grad_norm": 0.9677841067314148, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 56620 + }, + { + "epoch": 4.066786355475763, + "grad_norm": 0.8892754316329956, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 56630 + }, + { + "epoch": 4.067504488330341, + "grad_norm": 0.8837814331054688, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 56640 + }, + { + "epoch": 4.068222621184919, + "grad_norm": 0.9284095764160156, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 56650 + }, + { + "epoch": 4.068940754039497, + "grad_norm": 1.0163567066192627, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 56660 + }, + { + "epoch": 4.069658886894075, + "grad_norm": 0.8713456988334656, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 56670 + }, + { + "epoch": 4.070377019748653, + "grad_norm": 0.8356686234474182, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 56680 + }, + { + "epoch": 4.071095152603232, + "grad_norm": 0.8998766541481018, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 56690 + }, + { + "epoch": 4.07181328545781, + "grad_norm": 1.0441967248916626, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 56700 + }, + { + "epoch": 4.072531418312388, + "grad_norm": 0.9313125610351562, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 56710 + }, + { + "epoch": 4.073249551166966, + "grad_norm": 0.9912964701652527, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 56720 + }, + { + "epoch": 4.073967684021544, + "grad_norm": 0.9048459529876709, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56730 + }, + { + "epoch": 4.074685816876122, + "grad_norm": 1.0248944759368896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 56740 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 1.4526786804199219, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 56750 + }, + { + "epoch": 4.076122082585278, + "grad_norm": 0.9813178181648254, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 56760 + }, + { + "epoch": 4.076840215439856, + "grad_norm": 1.0686813592910767, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 56770 + }, + { + "epoch": 4.077558348294435, + "grad_norm": 1.1093482971191406, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 56780 + }, + { + "epoch": 4.078276481149013, + "grad_norm": 0.9377819895744324, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 56790 + }, + { + "epoch": 4.078994614003591, + "grad_norm": 0.8043649196624756, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 56800 + }, + { + "epoch": 4.079712746858169, + "grad_norm": 0.7995415925979614, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 56810 + }, + { + "epoch": 4.080430879712747, + "grad_norm": 1.0076148509979248, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 56820 + }, + { + "epoch": 4.081149012567325, + "grad_norm": 0.8192076683044434, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 56830 + }, + { + "epoch": 4.081867145421903, + "grad_norm": 0.9226266145706177, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 56840 + }, + { + "epoch": 4.082585278276481, + "grad_norm": 0.8877972960472107, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 56850 + }, + { + "epoch": 4.083303411131059, + "grad_norm": 0.9578937888145447, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 56860 + }, + { + "epoch": 4.084021543985638, + "grad_norm": 0.8929167985916138, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 56870 + }, + { + "epoch": 4.084739676840216, + "grad_norm": 1.0015977621078491, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 56880 + }, + { + "epoch": 4.085457809694794, + "grad_norm": 0.9768750667572021, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 56890 + }, + { + "epoch": 4.086175942549372, + "grad_norm": 1.0834569931030273, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 56900 + }, + { + "epoch": 4.08689407540395, + "grad_norm": 0.8761230707168579, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 56910 + }, + { + "epoch": 4.087612208258528, + "grad_norm": 1.027064323425293, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 56920 + }, + { + "epoch": 4.088330341113106, + "grad_norm": 1.130336880683899, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 56930 + }, + { + "epoch": 4.089048473967684, + "grad_norm": 0.8157579898834229, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 56940 + }, + { + "epoch": 4.089766606822262, + "grad_norm": 1.071175217628479, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56950 + }, + { + "epoch": 4.09048473967684, + "grad_norm": 0.9534492492675781, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 56960 + }, + { + "epoch": 4.091202872531419, + "grad_norm": 0.9584037661552429, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 56970 + }, + { + "epoch": 4.091921005385997, + "grad_norm": 1.1513131856918335, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 56980 + }, + { + "epoch": 4.092639138240575, + "grad_norm": 1.0167666673660278, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 56990 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 1.0630987882614136, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57000 + }, + { + "epoch": 4.094075403949731, + "grad_norm": 1.0326893329620361, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 57010 + }, + { + "epoch": 4.094793536804309, + "grad_norm": 0.9701678156852722, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 57020 + }, + { + "epoch": 4.095511669658887, + "grad_norm": 0.839935302734375, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57030 + }, + { + "epoch": 4.096229802513465, + "grad_norm": 0.8995838761329651, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 57040 + }, + { + "epoch": 4.096947935368043, + "grad_norm": 0.8039916157722473, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 57050 + }, + { + "epoch": 4.097666068222622, + "grad_norm": 1.126122236251831, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 57060 + }, + { + "epoch": 4.0983842010772, + "grad_norm": 0.8749837875366211, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 57070 + }, + { + "epoch": 4.099102333931778, + "grad_norm": 0.8630341291427612, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 57080 + }, + { + "epoch": 4.099820466786356, + "grad_norm": 0.8889496922492981, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 57090 + }, + { + "epoch": 4.100538599640934, + "grad_norm": 0.9050310254096985, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 57100 + }, + { + "epoch": 4.101256732495512, + "grad_norm": 0.943072497844696, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 57110 + }, + { + "epoch": 4.10197486535009, + "grad_norm": 0.9031552672386169, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 57120 + }, + { + "epoch": 4.102692998204668, + "grad_norm": 0.939862847328186, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 57130 + }, + { + "epoch": 4.103411131059246, + "grad_norm": 0.8080634474754333, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 57140 + }, + { + "epoch": 4.1041292639138245, + "grad_norm": 0.9181693196296692, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 57150 + }, + { + "epoch": 4.1048473967684025, + "grad_norm": 0.9609217643737793, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 57160 + }, + { + "epoch": 4.1055655296229805, + "grad_norm": 1.1246516704559326, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 57170 + }, + { + "epoch": 4.1062836624775585, + "grad_norm": 1.0616880655288696, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 57180 + }, + { + "epoch": 4.1070017953321365, + "grad_norm": 0.9954505562782288, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 57190 + }, + { + "epoch": 4.1077199281867145, + "grad_norm": 1.0602279901504517, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 57200 + }, + { + "epoch": 4.1084380610412925, + "grad_norm": 0.8984764814376831, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 57210 + }, + { + "epoch": 4.1091561938958705, + "grad_norm": 0.845167875289917, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 57220 + }, + { + "epoch": 4.1098743267504485, + "grad_norm": 0.7901500463485718, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 57230 + }, + { + "epoch": 4.1105924596050265, + "grad_norm": 1.0462526082992554, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 57240 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 0.9098827838897705, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 57250 + }, + { + "epoch": 4.112028725314183, + "grad_norm": 0.9234077334403992, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 57260 + }, + { + "epoch": 4.112746858168761, + "grad_norm": 1.0033560991287231, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 57270 + }, + { + "epoch": 4.113464991023339, + "grad_norm": 1.0620051622390747, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 57280 + }, + { + "epoch": 4.114183123877917, + "grad_norm": 0.8679345846176147, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 57290 + }, + { + "epoch": 4.114901256732495, + "grad_norm": 0.7557345628738403, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 57300 + }, + { + "epoch": 4.115619389587073, + "grad_norm": 0.8970935344696045, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 57310 + }, + { + "epoch": 4.116337522441651, + "grad_norm": 1.0779842138290405, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 57320 + }, + { + "epoch": 4.117055655296229, + "grad_norm": 1.2036106586456299, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 57330 + }, + { + "epoch": 4.117773788150808, + "grad_norm": 0.8337953686714172, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 57340 + }, + { + "epoch": 4.118491921005386, + "grad_norm": 0.9850410223007202, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 57350 + }, + { + "epoch": 4.119210053859964, + "grad_norm": 0.8028770685195923, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 57360 + }, + { + "epoch": 4.119928186714542, + "grad_norm": 0.8693217039108276, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 57370 + }, + { + "epoch": 4.12064631956912, + "grad_norm": 0.8795534372329712, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 57380 + }, + { + "epoch": 4.121364452423698, + "grad_norm": 1.0081543922424316, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 57390 + }, + { + "epoch": 4.122082585278276, + "grad_norm": 0.8776742219924927, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 57400 + }, + { + "epoch": 4.122800718132854, + "grad_norm": 0.8247824311256409, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 57410 + }, + { + "epoch": 4.123518850987432, + "grad_norm": 1.1346335411071777, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 57420 + }, + { + "epoch": 4.124236983842011, + "grad_norm": 1.0671089887619019, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 57430 + }, + { + "epoch": 4.124955116696589, + "grad_norm": 0.8548333048820496, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 57440 + }, + { + "epoch": 4.125673249551167, + "grad_norm": 1.0221573114395142, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 57450 + }, + { + "epoch": 4.126391382405745, + "grad_norm": 0.9746617674827576, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 57460 + }, + { + "epoch": 4.127109515260323, + "grad_norm": 0.8104965090751648, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 57470 + }, + { + "epoch": 4.127827648114901, + "grad_norm": 1.0401487350463867, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 57480 + }, + { + "epoch": 4.128545780969479, + "grad_norm": 0.8828882575035095, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57490 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 1.0121098756790161, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 57500 + }, + { + "epoch": 4.129982046678635, + "grad_norm": 0.8789737820625305, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 57510 + }, + { + "epoch": 4.130700179533213, + "grad_norm": 1.0386744737625122, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 57520 + }, + { + "epoch": 4.131418312387792, + "grad_norm": 1.0092610120773315, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 57530 + }, + { + "epoch": 4.13213644524237, + "grad_norm": 0.8706282377243042, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 57540 + }, + { + "epoch": 4.132854578096948, + "grad_norm": 0.9270507097244263, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 57550 + }, + { + "epoch": 4.133572710951526, + "grad_norm": 1.0303068161010742, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 57560 + }, + { + "epoch": 4.134290843806104, + "grad_norm": 1.1169062852859497, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 57570 + }, + { + "epoch": 4.135008976660682, + "grad_norm": 0.8530599474906921, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 57580 + }, + { + "epoch": 4.13572710951526, + "grad_norm": 1.1395039558410645, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 57590 + }, + { + "epoch": 4.136445242369838, + "grad_norm": 0.8944115042686462, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 57600 + }, + { + "epoch": 4.137163375224416, + "grad_norm": 1.137966275215149, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 57610 + }, + { + "epoch": 4.137881508078995, + "grad_norm": 0.8244962692260742, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 57620 + }, + { + "epoch": 4.138599640933573, + "grad_norm": 1.1935817003250122, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 57630 + }, + { + "epoch": 4.139317773788151, + "grad_norm": 0.9774235486984253, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 57640 + }, + { + "epoch": 4.140035906642729, + "grad_norm": 1.066219449043274, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 57650 + }, + { + "epoch": 4.140754039497307, + "grad_norm": 0.8631396293640137, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 57660 + }, + { + "epoch": 4.141472172351885, + "grad_norm": 0.888410747051239, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 57670 + }, + { + "epoch": 4.142190305206463, + "grad_norm": 1.002642035484314, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 57680 + }, + { + "epoch": 4.142908438061041, + "grad_norm": 1.0092825889587402, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 57690 + }, + { + "epoch": 4.143626570915619, + "grad_norm": 0.9126971364021301, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 57700 + }, + { + "epoch": 4.144344703770198, + "grad_norm": 1.0303562879562378, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 57710 + }, + { + "epoch": 4.145062836624776, + "grad_norm": 1.1230897903442383, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 57720 + }, + { + "epoch": 4.145780969479354, + "grad_norm": 1.0494099855422974, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57730 + }, + { + "epoch": 4.146499102333932, + "grad_norm": 0.9555442333221436, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 57740 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 0.8255124092102051, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 57750 + }, + { + "epoch": 4.147935368043088, + "grad_norm": 1.097853660583496, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 57760 + }, + { + "epoch": 4.148653500897666, + "grad_norm": 1.0272663831710815, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 57770 + }, + { + "epoch": 4.149371633752244, + "grad_norm": 1.022571086883545, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 57780 + }, + { + "epoch": 4.150089766606822, + "grad_norm": 0.964543342590332, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 57790 + }, + { + "epoch": 4.1508078994614, + "grad_norm": 0.9251219034194946, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 57800 + }, + { + "epoch": 4.151526032315979, + "grad_norm": 1.081840991973877, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 57810 + }, + { + "epoch": 4.152244165170557, + "grad_norm": 0.8989445567131042, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57820 + }, + { + "epoch": 4.152962298025135, + "grad_norm": 0.903629720211029, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 57830 + }, + { + "epoch": 4.153680430879713, + "grad_norm": 0.8985397219657898, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 57840 + }, + { + "epoch": 4.154398563734291, + "grad_norm": 1.047778844833374, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 57850 + }, + { + "epoch": 4.155116696588869, + "grad_norm": 0.9804165363311768, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 57860 + }, + { + "epoch": 4.155834829443447, + "grad_norm": 1.187309980392456, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57870 + }, + { + "epoch": 4.156552962298025, + "grad_norm": 0.9854836463928223, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 57880 + }, + { + "epoch": 4.157271095152603, + "grad_norm": 0.8494308590888977, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 57890 + }, + { + "epoch": 4.157989228007182, + "grad_norm": 0.9359684586524963, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 57900 + }, + { + "epoch": 4.15870736086176, + "grad_norm": 0.8971988558769226, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 57910 + }, + { + "epoch": 4.159425493716338, + "grad_norm": 0.8848021030426025, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57920 + }, + { + "epoch": 4.160143626570916, + "grad_norm": 0.982877790927887, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 57930 + }, + { + "epoch": 4.160861759425494, + "grad_norm": 0.8668819069862366, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 57940 + }, + { + "epoch": 4.161579892280072, + "grad_norm": 1.06569504737854, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 57950 + }, + { + "epoch": 4.16229802513465, + "grad_norm": 1.165740728378296, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 57960 + }, + { + "epoch": 4.163016157989228, + "grad_norm": 1.0534512996673584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 57970 + }, + { + "epoch": 4.163734290843806, + "grad_norm": 0.8785330653190613, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 57980 + }, + { + "epoch": 4.164452423698384, + "grad_norm": 1.1244874000549316, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57990 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 0.8839399218559265, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 58000 + }, + { + "epoch": 4.165888689407541, + "grad_norm": 1.0603798627853394, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 58010 + }, + { + "epoch": 4.166606822262119, + "grad_norm": 0.9737853407859802, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 58020 + }, + { + "epoch": 4.167324955116697, + "grad_norm": 1.0650558471679688, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 58030 + }, + { + "epoch": 4.168043087971275, + "grad_norm": 0.7528959512710571, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 58040 + }, + { + "epoch": 4.168761220825853, + "grad_norm": 0.9286156892776489, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 58050 + }, + { + "epoch": 4.169479353680431, + "grad_norm": 1.0225880146026611, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 58060 + }, + { + "epoch": 4.170197486535009, + "grad_norm": 0.9990654587745667, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58070 + }, + { + "epoch": 4.170915619389587, + "grad_norm": 1.052057147026062, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 58080 + }, + { + "epoch": 4.1716337522441655, + "grad_norm": 0.7366801500320435, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 58090 + }, + { + "epoch": 4.1723518850987436, + "grad_norm": 1.0943711996078491, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 58100 + }, + { + "epoch": 4.1730700179533216, + "grad_norm": 1.1297656297683716, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 58110 + }, + { + "epoch": 4.1737881508078996, + "grad_norm": 0.7861461639404297, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 58120 + }, + { + "epoch": 4.174506283662478, + "grad_norm": 0.8643335103988647, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 58130 + }, + { + "epoch": 4.175224416517056, + "grad_norm": 0.957288384437561, + "learning_rate": 0.0002, + "loss": 0.6103, + "step": 58140 + }, + { + "epoch": 4.175942549371634, + "grad_norm": 0.9175366759300232, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 58150 + }, + { + "epoch": 4.176660682226212, + "grad_norm": 1.129935622215271, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 58160 + }, + { + "epoch": 4.17737881508079, + "grad_norm": 0.9683087468147278, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 58170 + }, + { + "epoch": 4.1780969479353685, + "grad_norm": 1.045171856880188, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 58180 + }, + { + "epoch": 4.1788150807899465, + "grad_norm": 0.9858742952346802, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 58190 + }, + { + "epoch": 4.1795332136445245, + "grad_norm": 0.8513413071632385, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 58200 + }, + { + "epoch": 4.1802513464991025, + "grad_norm": 0.9584265947341919, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58210 + }, + { + "epoch": 4.1809694793536805, + "grad_norm": 0.8828920722007751, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 58220 + }, + { + "epoch": 4.1816876122082585, + "grad_norm": 0.9849961400032043, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 58230 + }, + { + "epoch": 4.1824057450628365, + "grad_norm": 1.0601637363433838, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 58240 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 1.2206604480743408, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 58250 + }, + { + "epoch": 4.1838420107719925, + "grad_norm": 1.1768009662628174, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 58260 + }, + { + "epoch": 4.184560143626571, + "grad_norm": 0.9521295428276062, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 58270 + }, + { + "epoch": 4.185278276481149, + "grad_norm": 0.892971932888031, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 58280 + }, + { + "epoch": 4.185996409335727, + "grad_norm": 0.8712016940116882, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 58290 + }, + { + "epoch": 4.186714542190305, + "grad_norm": 1.0190843343734741, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 58300 + }, + { + "epoch": 4.187432675044883, + "grad_norm": 1.0149270296096802, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 58310 + }, + { + "epoch": 4.188150807899461, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 58320 + }, + { + "epoch": 4.188868940754039, + "grad_norm": 0.7892335653305054, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 58330 + }, + { + "epoch": 4.189587073608617, + "grad_norm": 0.9792808890342712, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58340 + }, + { + "epoch": 4.190305206463195, + "grad_norm": 0.9946883320808411, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 58350 + }, + { + "epoch": 4.191023339317773, + "grad_norm": 1.0363789796829224, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 58360 + }, + { + "epoch": 4.191741472172352, + "grad_norm": 0.9285917282104492, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 58370 + }, + { + "epoch": 4.19245960502693, + "grad_norm": 0.9461679458618164, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 58380 + }, + { + "epoch": 4.193177737881508, + "grad_norm": 1.0344175100326538, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 58390 + }, + { + "epoch": 4.193895870736086, + "grad_norm": 0.9530242085456848, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 58400 + }, + { + "epoch": 4.194614003590664, + "grad_norm": 0.9171900749206543, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58410 + }, + { + "epoch": 4.195332136445242, + "grad_norm": 0.8094898462295532, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 58420 + }, + { + "epoch": 4.19605026929982, + "grad_norm": 0.921981930732727, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 58430 + }, + { + "epoch": 4.196768402154398, + "grad_norm": 0.9783532023429871, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 58440 + }, + { + "epoch": 4.197486535008976, + "grad_norm": 1.017805576324463, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 58450 + }, + { + "epoch": 4.198204667863555, + "grad_norm": 0.9244308471679688, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 58460 + }, + { + "epoch": 4.198922800718133, + "grad_norm": 0.9942585229873657, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 58470 + }, + { + "epoch": 4.199640933572711, + "grad_norm": 1.1045037508010864, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 58480 + }, + { + "epoch": 4.200359066427289, + "grad_norm": 0.9483149647712708, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58490 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 1.0807271003723145, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 58500 + }, + { + "epoch": 4.201795332136445, + "grad_norm": 0.7697445750236511, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 58510 + }, + { + "epoch": 4.202513464991023, + "grad_norm": 1.0761178731918335, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 58520 + }, + { + "epoch": 4.203231597845601, + "grad_norm": 0.9992024898529053, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 58530 + }, + { + "epoch": 4.203949730700179, + "grad_norm": 0.8741498589515686, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 58540 + }, + { + "epoch": 4.204667863554757, + "grad_norm": 0.8557528853416443, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 58550 + }, + { + "epoch": 4.205385996409336, + "grad_norm": 0.8853630423545837, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 58560 + }, + { + "epoch": 4.206104129263914, + "grad_norm": 0.9858933687210083, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 58570 + }, + { + "epoch": 4.206822262118492, + "grad_norm": 1.104732871055603, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 58580 + }, + { + "epoch": 4.20754039497307, + "grad_norm": 0.9345462322235107, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58590 + }, + { + "epoch": 4.208258527827648, + "grad_norm": 0.9620407819747925, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 58600 + }, + { + "epoch": 4.208976660682226, + "grad_norm": 0.8546963334083557, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 58610 + }, + { + "epoch": 4.209694793536804, + "grad_norm": 0.8125145435333252, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 58620 + }, + { + "epoch": 4.210412926391382, + "grad_norm": 0.8481138944625854, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 58630 + }, + { + "epoch": 4.21113105924596, + "grad_norm": 0.8884692788124084, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58640 + }, + { + "epoch": 4.211849192100539, + "grad_norm": 1.09279465675354, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 58650 + }, + { + "epoch": 4.212567324955117, + "grad_norm": 0.9806583523750305, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 58660 + }, + { + "epoch": 4.213285457809695, + "grad_norm": 0.9510366916656494, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 58670 + }, + { + "epoch": 4.214003590664273, + "grad_norm": 0.7517459988594055, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 58680 + }, + { + "epoch": 4.214721723518851, + "grad_norm": 1.1134123802185059, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 58690 + }, + { + "epoch": 4.215439856373429, + "grad_norm": 0.8307328820228577, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 58700 + }, + { + "epoch": 4.216157989228007, + "grad_norm": 0.8211639523506165, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 58710 + }, + { + "epoch": 4.216876122082585, + "grad_norm": 1.0749584436416626, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 58720 + }, + { + "epoch": 4.217594254937163, + "grad_norm": 1.1394833326339722, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58730 + }, + { + "epoch": 4.218312387791742, + "grad_norm": 1.05130934715271, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 58740 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 0.7949456572532654, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 58750 + }, + { + "epoch": 4.219748653500898, + "grad_norm": 0.906506359577179, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 58760 + }, + { + "epoch": 4.220466786355476, + "grad_norm": 0.8338989615440369, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 58770 + }, + { + "epoch": 4.221184919210054, + "grad_norm": 0.9325370788574219, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 58780 + }, + { + "epoch": 4.221903052064632, + "grad_norm": 1.0208096504211426, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 58790 + }, + { + "epoch": 4.22262118491921, + "grad_norm": 1.0075920820236206, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 58800 + }, + { + "epoch": 4.223339317773788, + "grad_norm": 0.9858701229095459, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 58810 + }, + { + "epoch": 4.224057450628366, + "grad_norm": 1.0010110139846802, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 58820 + }, + { + "epoch": 4.224775583482945, + "grad_norm": 0.9360540509223938, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 58830 + }, + { + "epoch": 4.225493716337523, + "grad_norm": 0.9021786451339722, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58840 + }, + { + "epoch": 4.226211849192101, + "grad_norm": 1.1778476238250732, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58850 + }, + { + "epoch": 4.226929982046679, + "grad_norm": 1.0061023235321045, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58860 + }, + { + "epoch": 4.227648114901257, + "grad_norm": 0.8839752674102783, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58870 + }, + { + "epoch": 4.228366247755835, + "grad_norm": 1.0078870058059692, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 58880 + }, + { + "epoch": 4.229084380610413, + "grad_norm": 0.8926451206207275, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 58890 + }, + { + "epoch": 4.229802513464991, + "grad_norm": 1.4018772840499878, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 58900 + }, + { + "epoch": 4.230520646319569, + "grad_norm": 0.9911289215087891, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 58910 + }, + { + "epoch": 4.231238779174147, + "grad_norm": 0.9374576807022095, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58920 + }, + { + "epoch": 4.231956912028726, + "grad_norm": 1.179650068283081, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 58930 + }, + { + "epoch": 4.232675044883304, + "grad_norm": 0.9434911012649536, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 58940 + }, + { + "epoch": 4.233393177737882, + "grad_norm": 1.0061911344528198, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 58950 + }, + { + "epoch": 4.23411131059246, + "grad_norm": 0.9663233757019043, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 58960 + }, + { + "epoch": 4.234829443447038, + "grad_norm": 0.8897581696510315, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 58970 + }, + { + "epoch": 4.235547576301616, + "grad_norm": 0.873281717300415, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 58980 + }, + { + "epoch": 4.236265709156194, + "grad_norm": 0.9146949052810669, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 58990 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 0.9381195306777954, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 59000 + }, + { + "epoch": 4.23770197486535, + "grad_norm": 0.9700697064399719, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 59010 + }, + { + "epoch": 4.238420107719929, + "grad_norm": 0.9050154685974121, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 59020 + }, + { + "epoch": 4.239138240574507, + "grad_norm": 0.9901503324508667, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 59030 + }, + { + "epoch": 4.239856373429085, + "grad_norm": 0.9009594321250916, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 59040 + }, + { + "epoch": 4.240574506283663, + "grad_norm": 1.0924968719482422, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59050 + }, + { + "epoch": 4.241292639138241, + "grad_norm": 0.9939947724342346, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 59060 + }, + { + "epoch": 4.242010771992819, + "grad_norm": 1.0577857494354248, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 59070 + }, + { + "epoch": 4.242728904847397, + "grad_norm": 1.0836747884750366, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59080 + }, + { + "epoch": 4.243447037701975, + "grad_norm": 0.97043377161026, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 59090 + }, + { + "epoch": 4.244165170556553, + "grad_norm": 0.7711901664733887, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 59100 + }, + { + "epoch": 4.244883303411131, + "grad_norm": 1.0143170356750488, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 59110 + }, + { + "epoch": 4.2456014362657095, + "grad_norm": 0.9151925444602966, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 59120 + }, + { + "epoch": 4.2463195691202875, + "grad_norm": 0.9252700209617615, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 59130 + }, + { + "epoch": 4.2470377019748655, + "grad_norm": 0.8429408073425293, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 59140 + }, + { + "epoch": 4.2477558348294435, + "grad_norm": 0.9645987153053284, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 59150 + }, + { + "epoch": 4.2484739676840215, + "grad_norm": 0.9949791431427002, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 59160 + }, + { + "epoch": 4.2491921005385995, + "grad_norm": 0.9128350615501404, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 59170 + }, + { + "epoch": 4.2499102333931775, + "grad_norm": 0.7406911849975586, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 59180 + }, + { + "epoch": 4.2506283662477555, + "grad_norm": 1.0237419605255127, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 59190 + }, + { + "epoch": 4.2513464991023335, + "grad_norm": 0.805459201335907, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 59200 + }, + { + "epoch": 4.252064631956912, + "grad_norm": 0.8477254509925842, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59210 + }, + { + "epoch": 4.25278276481149, + "grad_norm": 0.984023928642273, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 59220 + }, + { + "epoch": 4.253500897666068, + "grad_norm": 1.0667484998703003, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59230 + }, + { + "epoch": 4.254219030520646, + "grad_norm": 0.7192284464836121, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 59240 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 0.9557451009750366, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59250 + }, + { + "epoch": 4.255655296229802, + "grad_norm": 0.9209784865379333, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59260 + }, + { + "epoch": 4.25637342908438, + "grad_norm": 0.9785363674163818, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 59270 + }, + { + "epoch": 4.257091561938958, + "grad_norm": 0.910214364528656, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59280 + }, + { + "epoch": 4.257809694793536, + "grad_norm": 0.8945858478546143, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 59290 + }, + { + "epoch": 4.258527827648114, + "grad_norm": 1.0984420776367188, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 59300 + }, + { + "epoch": 4.259245960502693, + "grad_norm": 1.0256640911102295, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59310 + }, + { + "epoch": 4.259964093357271, + "grad_norm": 0.978397786617279, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 59320 + }, + { + "epoch": 4.260682226211849, + "grad_norm": 0.7587000727653503, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 59330 + }, + { + "epoch": 4.261400359066427, + "grad_norm": 0.9384620785713196, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59340 + }, + { + "epoch": 4.262118491921005, + "grad_norm": 0.893992006778717, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 59350 + }, + { + "epoch": 4.262836624775583, + "grad_norm": 1.0231536626815796, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 59360 + }, + { + "epoch": 4.263554757630161, + "grad_norm": 0.9810128211975098, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 59370 + }, + { + "epoch": 4.264272890484739, + "grad_norm": 1.0868116617202759, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 59380 + }, + { + "epoch": 4.264991023339318, + "grad_norm": 1.1433676481246948, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 59390 + }, + { + "epoch": 4.265709156193896, + "grad_norm": 0.9836946725845337, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 59400 + }, + { + "epoch": 4.266427289048474, + "grad_norm": 0.9473603963851929, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 59410 + }, + { + "epoch": 4.267145421903052, + "grad_norm": 0.9066835641860962, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 59420 + }, + { + "epoch": 4.26786355475763, + "grad_norm": 1.0534718036651611, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 59430 + }, + { + "epoch": 4.268581687612208, + "grad_norm": 1.0392775535583496, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 59440 + }, + { + "epoch": 4.269299820466786, + "grad_norm": 1.011472463607788, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 59450 + }, + { + "epoch": 4.270017953321364, + "grad_norm": 1.0704147815704346, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59460 + }, + { + "epoch": 4.270736086175942, + "grad_norm": 0.9349238872528076, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 59470 + }, + { + "epoch": 4.27145421903052, + "grad_norm": 0.8745087385177612, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 59480 + }, + { + "epoch": 4.272172351885099, + "grad_norm": 0.8823763728141785, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 59490 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 1.110912799835205, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 59500 + }, + { + "epoch": 4.273608617594255, + "grad_norm": 1.0000925064086914, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 59510 + }, + { + "epoch": 4.274326750448833, + "grad_norm": 1.1578227281570435, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 59520 + }, + { + "epoch": 4.275044883303411, + "grad_norm": 0.875720202922821, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 59530 + }, + { + "epoch": 4.275763016157989, + "grad_norm": 0.9562238454818726, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 59540 + }, + { + "epoch": 4.276481149012567, + "grad_norm": 0.8384222388267517, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 59550 + }, + { + "epoch": 4.277199281867145, + "grad_norm": 1.2719428539276123, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 59560 + }, + { + "epoch": 4.277917414721723, + "grad_norm": 1.0656434297561646, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 59570 + }, + { + "epoch": 4.278635547576302, + "grad_norm": 1.0766716003417969, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 59580 + }, + { + "epoch": 4.27935368043088, + "grad_norm": 0.8892807960510254, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 59590 + }, + { + "epoch": 4.280071813285458, + "grad_norm": 0.8956300020217896, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 59600 + }, + { + "epoch": 4.280789946140036, + "grad_norm": 0.9562926888465881, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 59610 + }, + { + "epoch": 4.281508078994614, + "grad_norm": 1.009141445159912, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 59620 + }, + { + "epoch": 4.282226211849192, + "grad_norm": 1.0546064376831055, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 59630 + }, + { + "epoch": 4.28294434470377, + "grad_norm": 0.8831254243850708, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 59640 + }, + { + "epoch": 4.283662477558348, + "grad_norm": 0.9560053944587708, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 59650 + }, + { + "epoch": 4.284380610412926, + "grad_norm": 1.030339241027832, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59660 + }, + { + "epoch": 4.285098743267504, + "grad_norm": 1.00662100315094, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 59670 + }, + { + "epoch": 4.285816876122083, + "grad_norm": 1.0759116411209106, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 59680 + }, + { + "epoch": 4.286535008976661, + "grad_norm": 0.9985393285751343, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 59690 + }, + { + "epoch": 4.287253141831239, + "grad_norm": 0.9044474959373474, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59700 + }, + { + "epoch": 4.287971274685817, + "grad_norm": 1.1224442720413208, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 59710 + }, + { + "epoch": 4.288689407540395, + "grad_norm": 0.8436414003372192, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 59720 + }, + { + "epoch": 4.289407540394973, + "grad_norm": 1.0695041418075562, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 59730 + }, + { + "epoch": 4.290125673249551, + "grad_norm": 0.8809951543807983, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 59740 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 1.0213792324066162, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 59750 + }, + { + "epoch": 4.291561938958707, + "grad_norm": 0.9660196900367737, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 59760 + }, + { + "epoch": 4.292280071813286, + "grad_norm": 0.8005787134170532, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 59770 + }, + { + "epoch": 4.292998204667864, + "grad_norm": 1.0016109943389893, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 59780 + }, + { + "epoch": 4.293716337522442, + "grad_norm": 0.9112903475761414, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 59790 + }, + { + "epoch": 4.29443447037702, + "grad_norm": 0.9999852180480957, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 59800 + }, + { + "epoch": 4.295152603231598, + "grad_norm": 0.9323953986167908, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 59810 + }, + { + "epoch": 4.295870736086176, + "grad_norm": 0.903037965297699, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 59820 + }, + { + "epoch": 4.296588868940754, + "grad_norm": 1.2462431192398071, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 59830 + }, + { + "epoch": 4.297307001795332, + "grad_norm": 1.2322230339050293, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 59840 + }, + { + "epoch": 4.29802513464991, + "grad_norm": 0.9584668278694153, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 59850 + }, + { + "epoch": 4.298743267504488, + "grad_norm": 0.9664767980575562, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 59860 + }, + { + "epoch": 4.299461400359067, + "grad_norm": 0.8860437273979187, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 59870 + }, + { + "epoch": 4.300179533213645, + "grad_norm": 1.0825127363204956, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 59880 + }, + { + "epoch": 4.300897666068223, + "grad_norm": 1.1312100887298584, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 59890 + }, + { + "epoch": 4.301615798922801, + "grad_norm": 0.8289751410484314, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 59900 + }, + { + "epoch": 4.302333931777379, + "grad_norm": 0.8990927934646606, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 59910 + }, + { + "epoch": 4.303052064631957, + "grad_norm": 0.9667525887489319, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 59920 + }, + { + "epoch": 4.303770197486535, + "grad_norm": 0.8656060695648193, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 59930 + }, + { + "epoch": 4.304488330341113, + "grad_norm": 0.8909396529197693, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 59940 + }, + { + "epoch": 4.305206463195692, + "grad_norm": 0.9533283114433289, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 59950 + }, + { + "epoch": 4.30592459605027, + "grad_norm": 0.9090739488601685, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 59960 + }, + { + "epoch": 4.306642728904848, + "grad_norm": 1.096656322479248, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 59970 + }, + { + "epoch": 4.307360861759426, + "grad_norm": 1.0392465591430664, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 59980 + }, + { + "epoch": 4.308078994614004, + "grad_norm": 0.8733913898468018, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 59990 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 0.8287094235420227, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 60000 + }, + { + "epoch": 4.30951526032316, + "grad_norm": 0.9267017245292664, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 60010 + }, + { + "epoch": 4.310233393177738, + "grad_norm": 0.9969515800476074, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 60020 + }, + { + "epoch": 4.310951526032316, + "grad_norm": 1.0005015134811401, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 60030 + }, + { + "epoch": 4.311669658886894, + "grad_norm": 1.1215369701385498, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60040 + }, + { + "epoch": 4.312387791741473, + "grad_norm": 1.0434890985488892, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 60050 + }, + { + "epoch": 4.313105924596051, + "grad_norm": 0.967989981174469, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 60060 + }, + { + "epoch": 4.313824057450629, + "grad_norm": 1.007599115371704, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 60070 + }, + { + "epoch": 4.314542190305207, + "grad_norm": 0.9356340765953064, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 60080 + }, + { + "epoch": 4.315260323159785, + "grad_norm": 0.9566757678985596, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 60090 + }, + { + "epoch": 4.315978456014363, + "grad_norm": 1.1066830158233643, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 60100 + }, + { + "epoch": 4.316696588868941, + "grad_norm": 0.9895772933959961, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 60110 + }, + { + "epoch": 4.317414721723519, + "grad_norm": 1.07423734664917, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60120 + }, + { + "epoch": 4.318132854578097, + "grad_norm": 1.0777037143707275, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 60130 + }, + { + "epoch": 4.3188509874326755, + "grad_norm": 1.1475656032562256, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 60140 + }, + { + "epoch": 4.3195691202872535, + "grad_norm": 1.0705864429473877, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 60150 + }, + { + "epoch": 4.3202872531418315, + "grad_norm": 0.8676854968070984, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 60160 + }, + { + "epoch": 4.3210053859964095, + "grad_norm": 0.9488174319267273, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 60170 + }, + { + "epoch": 4.3217235188509875, + "grad_norm": 1.1171153783798218, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 60180 + }, + { + "epoch": 4.3224416517055655, + "grad_norm": 1.091435194015503, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 60190 + }, + { + "epoch": 4.3231597845601435, + "grad_norm": 0.880944013595581, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60200 + }, + { + "epoch": 4.3238779174147215, + "grad_norm": 0.8458809852600098, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 60210 + }, + { + "epoch": 4.3245960502692995, + "grad_norm": 0.7900225520133972, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 60220 + }, + { + "epoch": 4.3253141831238775, + "grad_norm": 0.966742753982544, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 60230 + }, + { + "epoch": 4.326032315978456, + "grad_norm": 0.8948110342025757, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 60240 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 0.8598700165748596, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 60250 + }, + { + "epoch": 4.327468581687612, + "grad_norm": 1.127610206604004, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 60260 + }, + { + "epoch": 4.32818671454219, + "grad_norm": 0.8357340693473816, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 60270 + }, + { + "epoch": 4.328904847396768, + "grad_norm": 0.8771896362304688, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60280 + }, + { + "epoch": 4.329622980251346, + "grad_norm": 0.9202101826667786, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60290 + }, + { + "epoch": 4.330341113105924, + "grad_norm": 1.1427538394927979, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 60300 + }, + { + "epoch": 4.331059245960502, + "grad_norm": 0.8711863160133362, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 60310 + }, + { + "epoch": 4.33177737881508, + "grad_norm": 0.972723662853241, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60320 + }, + { + "epoch": 4.332495511669659, + "grad_norm": 1.1496877670288086, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 60330 + }, + { + "epoch": 4.333213644524237, + "grad_norm": 1.008581519126892, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 60340 + }, + { + "epoch": 4.333931777378815, + "grad_norm": 1.0802706480026245, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 60350 + }, + { + "epoch": 4.334649910233393, + "grad_norm": 0.8394291996955872, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60360 + }, + { + "epoch": 4.335368043087971, + "grad_norm": 0.8355905413627625, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 60370 + }, + { + "epoch": 4.336086175942549, + "grad_norm": 0.9583960175514221, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 60380 + }, + { + "epoch": 4.336804308797127, + "grad_norm": 1.138934850692749, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 60390 + }, + { + "epoch": 4.337522441651705, + "grad_norm": 1.0334709882736206, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 60400 + }, + { + "epoch": 4.338240574506283, + "grad_norm": 0.729686439037323, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 60410 + }, + { + "epoch": 4.338958707360861, + "grad_norm": 0.8735929727554321, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 60420 + }, + { + "epoch": 4.33967684021544, + "grad_norm": 0.9617681503295898, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 60430 + }, + { + "epoch": 4.340394973070018, + "grad_norm": 0.9439655542373657, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 60440 + }, + { + "epoch": 4.341113105924596, + "grad_norm": 0.9275408387184143, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60450 + }, + { + "epoch": 4.341831238779174, + "grad_norm": 1.0693308115005493, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60460 + }, + { + "epoch": 4.342549371633752, + "grad_norm": 0.9234438538551331, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 60470 + }, + { + "epoch": 4.34326750448833, + "grad_norm": 1.1376168727874756, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 60480 + }, + { + "epoch": 4.343985637342908, + "grad_norm": 0.9218108654022217, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 60490 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 1.1467362642288208, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 60500 + }, + { + "epoch": 4.345421903052064, + "grad_norm": 0.9459165930747986, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 60510 + }, + { + "epoch": 4.346140035906643, + "grad_norm": 0.9460827708244324, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 60520 + }, + { + "epoch": 4.346858168761221, + "grad_norm": 1.0845041275024414, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 60530 + }, + { + "epoch": 4.347576301615799, + "grad_norm": 1.082675576210022, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 60540 + }, + { + "epoch": 4.348294434470377, + "grad_norm": 0.8443698883056641, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 60550 + }, + { + "epoch": 4.349012567324955, + "grad_norm": 1.018393874168396, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 60560 + }, + { + "epoch": 4.349730700179533, + "grad_norm": 0.8796373009681702, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 60570 + }, + { + "epoch": 4.350448833034111, + "grad_norm": 1.097942590713501, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 60580 + }, + { + "epoch": 4.351166965888689, + "grad_norm": 0.8750485181808472, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 60590 + }, + { + "epoch": 4.351885098743267, + "grad_norm": 1.0339995622634888, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 60600 + }, + { + "epoch": 4.352603231597846, + "grad_norm": 0.9077731966972351, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 60610 + }, + { + "epoch": 4.353321364452424, + "grad_norm": 1.051321029663086, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 60620 + }, + { + "epoch": 4.354039497307002, + "grad_norm": 1.0018669366836548, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60630 + }, + { + "epoch": 4.35475763016158, + "grad_norm": 1.0349196195602417, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60640 + }, + { + "epoch": 4.355475763016158, + "grad_norm": 1.009589672088623, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 60650 + }, + { + "epoch": 4.356193895870736, + "grad_norm": 1.0463480949401855, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60660 + }, + { + "epoch": 4.356912028725314, + "grad_norm": 0.9815132021903992, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 60670 + }, + { + "epoch": 4.357630161579892, + "grad_norm": 1.0977262258529663, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60680 + }, + { + "epoch": 4.35834829443447, + "grad_norm": 0.8450005054473877, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 60690 + }, + { + "epoch": 4.359066427289049, + "grad_norm": 1.0959078073501587, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 60700 + }, + { + "epoch": 4.359784560143627, + "grad_norm": 0.9155098795890808, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60710 + }, + { + "epoch": 4.360502692998205, + "grad_norm": 0.9267987012863159, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 60720 + }, + { + "epoch": 4.361220825852783, + "grad_norm": 1.177472472190857, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 60730 + }, + { + "epoch": 4.361938958707361, + "grad_norm": 0.8615312576293945, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 60740 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 1.0939710140228271, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 60750 + }, + { + "epoch": 4.363375224416517, + "grad_norm": 1.0928049087524414, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 60760 + }, + { + "epoch": 4.364093357271095, + "grad_norm": 1.0796833038330078, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 60770 + }, + { + "epoch": 4.364811490125673, + "grad_norm": 0.9768339991569519, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 60780 + }, + { + "epoch": 4.365529622980251, + "grad_norm": 0.9082722067832947, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 60790 + }, + { + "epoch": 4.36624775583483, + "grad_norm": 0.9614832997322083, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 60800 + }, + { + "epoch": 4.366965888689408, + "grad_norm": 0.8874651789665222, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 60810 + }, + { + "epoch": 4.367684021543986, + "grad_norm": 0.8810178637504578, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 60820 + }, + { + "epoch": 4.368402154398564, + "grad_norm": 1.0893806219100952, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 60830 + }, + { + "epoch": 4.369120287253142, + "grad_norm": 0.9042278528213501, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 60840 + }, + { + "epoch": 4.36983842010772, + "grad_norm": 1.0832217931747437, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 60850 + }, + { + "epoch": 4.370556552962298, + "grad_norm": 0.9431114792823792, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 60860 + }, + { + "epoch": 4.371274685816876, + "grad_norm": 1.031553030014038, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 60870 + }, + { + "epoch": 4.371992818671454, + "grad_norm": 0.8702824711799622, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60880 + }, + { + "epoch": 4.372710951526033, + "grad_norm": 1.1109199523925781, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 60890 + }, + { + "epoch": 4.373429084380611, + "grad_norm": 0.8369361162185669, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 60900 + }, + { + "epoch": 4.374147217235189, + "grad_norm": 0.988915205001831, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60910 + }, + { + "epoch": 4.374865350089767, + "grad_norm": 0.9365919232368469, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 60920 + }, + { + "epoch": 4.375583482944345, + "grad_norm": 0.9789398908615112, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 60930 + }, + { + "epoch": 4.376301615798923, + "grad_norm": 0.8786931037902832, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 60940 + }, + { + "epoch": 4.377019748653501, + "grad_norm": 0.8891511559486389, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 60950 + }, + { + "epoch": 4.377737881508079, + "grad_norm": 0.9561707377433777, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 60960 + }, + { + "epoch": 4.378456014362657, + "grad_norm": 0.8674200177192688, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 60970 + }, + { + "epoch": 4.379174147217235, + "grad_norm": 0.9285916090011597, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 60980 + }, + { + "epoch": 4.379892280071814, + "grad_norm": 0.9185547232627869, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 60990 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 1.081664800643921, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 61000 + }, + { + "epoch": 4.38132854578097, + "grad_norm": 1.0475854873657227, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 61010 + }, + { + "epoch": 4.382046678635548, + "grad_norm": 1.1519653797149658, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 61020 + }, + { + "epoch": 4.382764811490126, + "grad_norm": 0.8757607936859131, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 61030 + }, + { + "epoch": 4.383482944344704, + "grad_norm": 0.8707934021949768, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 61040 + }, + { + "epoch": 4.384201077199282, + "grad_norm": 1.1807516813278198, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 61050 + }, + { + "epoch": 4.38491921005386, + "grad_norm": 1.0674688816070557, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 61060 + }, + { + "epoch": 4.385637342908438, + "grad_norm": 0.9321209788322449, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 61070 + }, + { + "epoch": 4.3863554757630165, + "grad_norm": 1.0786446332931519, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 61080 + }, + { + "epoch": 4.3870736086175945, + "grad_norm": 0.9733907580375671, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 61090 + }, + { + "epoch": 4.3877917414721725, + "grad_norm": 0.9476010203361511, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 61100 + }, + { + "epoch": 4.3885098743267505, + "grad_norm": 1.1321563720703125, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 61110 + }, + { + "epoch": 4.3892280071813286, + "grad_norm": 0.9379117488861084, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 61120 + }, + { + "epoch": 4.3899461400359066, + "grad_norm": 0.8409728407859802, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 61130 + }, + { + "epoch": 4.3906642728904846, + "grad_norm": 0.8309189081192017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 61140 + }, + { + "epoch": 4.391382405745063, + "grad_norm": 0.8922196626663208, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61150 + }, + { + "epoch": 4.392100538599641, + "grad_norm": 0.8274614214897156, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 61160 + }, + { + "epoch": 4.392818671454219, + "grad_norm": 1.0928618907928467, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 61170 + }, + { + "epoch": 4.3935368043087974, + "grad_norm": 0.9771125316619873, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 61180 + }, + { + "epoch": 4.3942549371633755, + "grad_norm": 0.8844535946846008, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 61190 + }, + { + "epoch": 4.3949730700179535, + "grad_norm": 1.0498822927474976, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 61200 + }, + { + "epoch": 4.3956912028725315, + "grad_norm": 0.9882155060768127, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61210 + }, + { + "epoch": 4.3964093357271095, + "grad_norm": 1.090356707572937, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 61220 + }, + { + "epoch": 4.3971274685816875, + "grad_norm": 1.0908088684082031, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 61230 + }, + { + "epoch": 4.3978456014362655, + "grad_norm": 1.0013501644134521, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 61240 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 1.0916062593460083, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 61250 + }, + { + "epoch": 4.399281867145422, + "grad_norm": 1.0817667245864868, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 61260 + }, + { + "epoch": 4.4, + "grad_norm": 0.9745162129402161, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 61270 + }, + { + "epoch": 4.400718132854578, + "grad_norm": 1.0653400421142578, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 61280 + }, + { + "epoch": 4.401436265709156, + "grad_norm": 1.0082067251205444, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 61290 + }, + { + "epoch": 4.402154398563734, + "grad_norm": 0.7963659167289734, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 61300 + }, + { + "epoch": 4.402872531418312, + "grad_norm": 1.0428845882415771, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 61310 + }, + { + "epoch": 4.40359066427289, + "grad_norm": 0.9205707311630249, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61320 + }, + { + "epoch": 4.404308797127468, + "grad_norm": 1.0103533267974854, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 61330 + }, + { + "epoch": 4.405026929982046, + "grad_norm": 1.113547682762146, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61340 + }, + { + "epoch": 4.405745062836624, + "grad_norm": 1.137488842010498, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 61350 + }, + { + "epoch": 4.406463195691203, + "grad_norm": 1.1284101009368896, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61360 + }, + { + "epoch": 4.407181328545781, + "grad_norm": 0.8010451197624207, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 61370 + }, + { + "epoch": 4.407899461400359, + "grad_norm": 0.8893977403640747, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 61380 + }, + { + "epoch": 4.408617594254937, + "grad_norm": 0.9098272323608398, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 61390 + }, + { + "epoch": 4.409335727109515, + "grad_norm": 1.0613329410552979, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 61400 + }, + { + "epoch": 4.410053859964093, + "grad_norm": 1.0070269107818604, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 61410 + }, + { + "epoch": 4.410771992818671, + "grad_norm": 0.8632227778434753, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 61420 + }, + { + "epoch": 4.411490125673249, + "grad_norm": 1.0183731317520142, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 61430 + }, + { + "epoch": 4.412208258527827, + "grad_norm": 0.9049941897392273, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61440 + }, + { + "epoch": 4.412926391382406, + "grad_norm": 1.0184082984924316, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61450 + }, + { + "epoch": 4.413644524236984, + "grad_norm": 0.9994277358055115, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 61460 + }, + { + "epoch": 4.414362657091562, + "grad_norm": 1.0112420320510864, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 61470 + }, + { + "epoch": 4.41508078994614, + "grad_norm": 0.9751759171485901, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 61480 + }, + { + "epoch": 4.415798922800718, + "grad_norm": 1.047135591506958, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 61490 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 0.886282742023468, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 61500 + }, + { + "epoch": 4.417235188509874, + "grad_norm": 0.971964418888092, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 61510 + }, + { + "epoch": 4.417953321364452, + "grad_norm": 0.9603846073150635, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 61520 + }, + { + "epoch": 4.41867145421903, + "grad_norm": 1.060042142868042, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 61530 + }, + { + "epoch": 4.419389587073608, + "grad_norm": 1.1231369972229004, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61540 + }, + { + "epoch": 4.420107719928187, + "grad_norm": 0.8269591331481934, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 61550 + }, + { + "epoch": 4.420825852782765, + "grad_norm": 1.0341241359710693, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 61560 + }, + { + "epoch": 4.421543985637343, + "grad_norm": 0.7276636958122253, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 61570 + }, + { + "epoch": 4.422262118491921, + "grad_norm": 1.0663669109344482, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 61580 + }, + { + "epoch": 4.422980251346499, + "grad_norm": 0.9764387011528015, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 61590 + }, + { + "epoch": 4.423698384201077, + "grad_norm": 1.0953258275985718, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61600 + }, + { + "epoch": 4.424416517055655, + "grad_norm": 0.8877012729644775, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 61610 + }, + { + "epoch": 4.425134649910233, + "grad_norm": 0.8781440854072571, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 61620 + }, + { + "epoch": 4.425852782764811, + "grad_norm": 0.8333432674407959, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61630 + }, + { + "epoch": 4.42657091561939, + "grad_norm": 0.9647989869117737, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 61640 + }, + { + "epoch": 4.427289048473968, + "grad_norm": 1.0801783800125122, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 61650 + }, + { + "epoch": 4.428007181328546, + "grad_norm": 0.8215882778167725, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61660 + }, + { + "epoch": 4.428725314183124, + "grad_norm": 0.9853931665420532, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 61670 + }, + { + "epoch": 4.429443447037702, + "grad_norm": 0.8658010959625244, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 61680 + }, + { + "epoch": 4.43016157989228, + "grad_norm": 1.124064326286316, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 61690 + }, + { + "epoch": 4.430879712746858, + "grad_norm": 1.009340763092041, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 61700 + }, + { + "epoch": 4.431597845601436, + "grad_norm": 0.8705293536186218, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 61710 + }, + { + "epoch": 4.432315978456014, + "grad_norm": 1.1323511600494385, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 61720 + }, + { + "epoch": 4.433034111310592, + "grad_norm": 1.1203019618988037, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 61730 + }, + { + "epoch": 4.433752244165171, + "grad_norm": 1.1683770418167114, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 61740 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 1.0735899209976196, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 61750 + }, + { + "epoch": 4.435188509874327, + "grad_norm": 1.142496109008789, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 61760 + }, + { + "epoch": 4.435906642728905, + "grad_norm": 1.1157732009887695, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 61770 + }, + { + "epoch": 4.436624775583483, + "grad_norm": 0.8845949172973633, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 61780 + }, + { + "epoch": 4.437342908438061, + "grad_norm": 1.1212759017944336, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 61790 + }, + { + "epoch": 4.438061041292639, + "grad_norm": 0.8832488656044006, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 61800 + }, + { + "epoch": 4.438779174147217, + "grad_norm": 0.9059590101242065, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 61810 + }, + { + "epoch": 4.439497307001796, + "grad_norm": 1.0625685453414917, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61820 + }, + { + "epoch": 4.440215439856374, + "grad_norm": 0.9565598368644714, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 61830 + }, + { + "epoch": 4.440933572710952, + "grad_norm": 0.8975377082824707, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 61840 + }, + { + "epoch": 4.44165170556553, + "grad_norm": 1.0412718057632446, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 61850 + }, + { + "epoch": 4.442369838420108, + "grad_norm": 0.9923529624938965, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 61860 + }, + { + "epoch": 4.443087971274686, + "grad_norm": 1.3025734424591064, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 61870 + }, + { + "epoch": 4.443806104129264, + "grad_norm": 1.0031960010528564, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 61880 + }, + { + "epoch": 4.444524236983842, + "grad_norm": 1.0974701642990112, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 61890 + }, + { + "epoch": 4.44524236983842, + "grad_norm": 1.1044024229049683, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 61900 + }, + { + "epoch": 4.445960502692998, + "grad_norm": 1.0782772302627563, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 61910 + }, + { + "epoch": 4.446678635547577, + "grad_norm": 1.006304383277893, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 61920 + }, + { + "epoch": 4.447396768402155, + "grad_norm": 0.9258833527565002, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 61930 + }, + { + "epoch": 4.448114901256733, + "grad_norm": 0.9888426065444946, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 61940 + }, + { + "epoch": 4.448833034111311, + "grad_norm": 0.9592963457107544, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 61950 + }, + { + "epoch": 4.449551166965889, + "grad_norm": 1.0527986288070679, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 61960 + }, + { + "epoch": 4.450269299820467, + "grad_norm": 0.8613291382789612, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 61970 + }, + { + "epoch": 4.450987432675045, + "grad_norm": 1.1083767414093018, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 61980 + }, + { + "epoch": 4.451705565529623, + "grad_norm": 0.772679328918457, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 61990 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 0.9052274227142334, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 62000 + }, + { + "epoch": 4.45314183123878, + "grad_norm": 1.129667043685913, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 62010 + }, + { + "epoch": 4.453859964093358, + "grad_norm": 0.9994529485702515, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 62020 + }, + { + "epoch": 4.454578096947936, + "grad_norm": 0.982155978679657, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 62030 + }, + { + "epoch": 4.455296229802514, + "grad_norm": 0.9139904975891113, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 62040 + }, + { + "epoch": 4.456014362657092, + "grad_norm": 1.0877810716629028, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 62050 + }, + { + "epoch": 4.45673249551167, + "grad_norm": 1.0535308122634888, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 62060 + }, + { + "epoch": 4.457450628366248, + "grad_norm": 1.0225313901901245, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62070 + }, + { + "epoch": 4.458168761220826, + "grad_norm": 0.8443132042884827, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 62080 + }, + { + "epoch": 4.458886894075404, + "grad_norm": 1.0426654815673828, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 62090 + }, + { + "epoch": 4.459605026929982, + "grad_norm": 1.1110700368881226, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 62100 + }, + { + "epoch": 4.4603231597845605, + "grad_norm": 1.0200893878936768, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62110 + }, + { + "epoch": 4.4610412926391385, + "grad_norm": 0.9102830290794373, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 62120 + }, + { + "epoch": 4.4617594254937165, + "grad_norm": 1.1395094394683838, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 62130 + }, + { + "epoch": 4.4624775583482945, + "grad_norm": 1.1202316284179688, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 62140 + }, + { + "epoch": 4.4631956912028725, + "grad_norm": 1.142580509185791, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 62150 + }, + { + "epoch": 4.4639138240574505, + "grad_norm": 0.9843677878379822, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 62160 + }, + { + "epoch": 4.4646319569120285, + "grad_norm": 1.0351676940917969, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 62170 + }, + { + "epoch": 4.4653500897666065, + "grad_norm": 0.9365093111991882, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 62180 + }, + { + "epoch": 4.4660682226211845, + "grad_norm": 1.041193962097168, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 62190 + }, + { + "epoch": 4.466786355475763, + "grad_norm": 0.9686329960823059, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 62200 + }, + { + "epoch": 4.467504488330341, + "grad_norm": 1.028622031211853, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 62210 + }, + { + "epoch": 4.468222621184919, + "grad_norm": 0.9717516899108887, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 62220 + }, + { + "epoch": 4.468940754039497, + "grad_norm": 1.0467450618743896, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 62230 + }, + { + "epoch": 4.469658886894075, + "grad_norm": 0.943717896938324, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 62240 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 0.909429132938385, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 62250 + }, + { + "epoch": 4.471095152603231, + "grad_norm": 1.0294792652130127, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 62260 + }, + { + "epoch": 4.471813285457809, + "grad_norm": 1.1044281721115112, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 62270 + }, + { + "epoch": 4.472531418312387, + "grad_norm": 1.1555784940719604, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 62280 + }, + { + "epoch": 4.473249551166965, + "grad_norm": 0.9441297650337219, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 62290 + }, + { + "epoch": 4.473967684021544, + "grad_norm": 0.9164380431175232, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 62300 + }, + { + "epoch": 4.474685816876122, + "grad_norm": 1.1139159202575684, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 62310 + }, + { + "epoch": 4.4754039497307, + "grad_norm": 1.0201882123947144, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 62320 + }, + { + "epoch": 4.476122082585278, + "grad_norm": 1.1471681594848633, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 62330 + }, + { + "epoch": 4.476840215439856, + "grad_norm": 1.0333549976348877, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 62340 + }, + { + "epoch": 4.477558348294434, + "grad_norm": 0.8929767608642578, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 62350 + }, + { + "epoch": 4.478276481149012, + "grad_norm": 0.9465752840042114, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 62360 + }, + { + "epoch": 4.47899461400359, + "grad_norm": 1.2155033349990845, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 62370 + }, + { + "epoch": 4.479712746858169, + "grad_norm": 0.7181217074394226, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 62380 + }, + { + "epoch": 4.480430879712747, + "grad_norm": 1.0052744150161743, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 62390 + }, + { + "epoch": 4.481149012567325, + "grad_norm": 0.8522219061851501, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 62400 + }, + { + "epoch": 4.481867145421903, + "grad_norm": 0.8844723105430603, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 62410 + }, + { + "epoch": 4.482585278276481, + "grad_norm": 0.9542465209960938, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 62420 + }, + { + "epoch": 4.483303411131059, + "grad_norm": 0.8963674306869507, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 62430 + }, + { + "epoch": 4.484021543985637, + "grad_norm": 0.8105363845825195, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 62440 + }, + { + "epoch": 4.484739676840215, + "grad_norm": 0.9618421196937561, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 62450 + }, + { + "epoch": 4.485457809694793, + "grad_norm": 1.1931076049804688, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 62460 + }, + { + "epoch": 4.486175942549371, + "grad_norm": 0.7406999468803406, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 62470 + }, + { + "epoch": 4.48689407540395, + "grad_norm": 0.7698216438293457, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 62480 + }, + { + "epoch": 4.487612208258528, + "grad_norm": 0.862271249294281, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 62490 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 1.0025171041488647, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 62500 + }, + { + "epoch": 4.489048473967684, + "grad_norm": 0.8474493622779846, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 62510 + }, + { + "epoch": 4.489766606822262, + "grad_norm": 0.8965697884559631, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 62520 + }, + { + "epoch": 4.49048473967684, + "grad_norm": 1.1276488304138184, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 62530 + }, + { + "epoch": 4.491202872531418, + "grad_norm": 1.0253537893295288, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 62540 + }, + { + "epoch": 4.491921005385996, + "grad_norm": 1.1750596761703491, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 62550 + }, + { + "epoch": 4.492639138240574, + "grad_norm": 0.9951794147491455, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 62560 + }, + { + "epoch": 4.493357271095153, + "grad_norm": 1.2510017156600952, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 62570 + }, + { + "epoch": 4.494075403949731, + "grad_norm": 1.4066375494003296, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 62580 + }, + { + "epoch": 4.494793536804309, + "grad_norm": 0.988175094127655, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 62590 + }, + { + "epoch": 4.495511669658887, + "grad_norm": 1.2049115896224976, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 62600 + }, + { + "epoch": 4.496229802513465, + "grad_norm": 0.962464451789856, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 62610 + }, + { + "epoch": 4.496947935368043, + "grad_norm": 0.9324793815612793, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62620 + }, + { + "epoch": 4.497666068222621, + "grad_norm": 0.9174214005470276, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 62630 + }, + { + "epoch": 4.498384201077199, + "grad_norm": 0.9729902148246765, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 62640 + }, + { + "epoch": 4.499102333931777, + "grad_norm": 1.0190484523773193, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 62650 + }, + { + "epoch": 4.499820466786355, + "grad_norm": 1.1473679542541504, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 62660 + }, + { + "epoch": 4.500538599640934, + "grad_norm": 1.0160558223724365, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 62670 + }, + { + "epoch": 4.501256732495512, + "grad_norm": 0.8083887100219727, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 62680 + }, + { + "epoch": 4.50197486535009, + "grad_norm": 0.941933274269104, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 62690 + }, + { + "epoch": 4.502692998204668, + "grad_norm": 0.9962822794914246, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 62700 + }, + { + "epoch": 4.503411131059246, + "grad_norm": 0.8993943333625793, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 62710 + }, + { + "epoch": 4.504129263913824, + "grad_norm": 0.9438319206237793, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 62720 + }, + { + "epoch": 4.504847396768402, + "grad_norm": 0.7951892018318176, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 62730 + }, + { + "epoch": 4.50556552962298, + "grad_norm": 0.8875413537025452, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 62740 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 0.993819534778595, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 62750 + }, + { + "epoch": 4.507001795332137, + "grad_norm": 0.9177559018135071, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 62760 + }, + { + "epoch": 4.507719928186715, + "grad_norm": 0.8632771968841553, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 62770 + }, + { + "epoch": 4.508438061041293, + "grad_norm": 0.943778395652771, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 62780 + }, + { + "epoch": 4.509156193895871, + "grad_norm": 0.8754997849464417, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 62790 + }, + { + "epoch": 4.509874326750449, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 62800 + }, + { + "epoch": 4.510592459605027, + "grad_norm": 1.1156457662582397, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 62810 + }, + { + "epoch": 4.511310592459605, + "grad_norm": 0.9178887009620667, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 62820 + }, + { + "epoch": 4.512028725314183, + "grad_norm": 0.9520689249038696, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 62830 + }, + { + "epoch": 4.512746858168761, + "grad_norm": 0.8880525231361389, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 62840 + }, + { + "epoch": 4.513464991023339, + "grad_norm": 0.9541497826576233, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 62850 + }, + { + "epoch": 4.514183123877918, + "grad_norm": 1.003766417503357, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 62860 + }, + { + "epoch": 4.514901256732496, + "grad_norm": 0.8844705820083618, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 62870 + }, + { + "epoch": 4.515619389587074, + "grad_norm": 1.1870828866958618, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 62880 + }, + { + "epoch": 4.516337522441652, + "grad_norm": 0.863487184047699, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 62890 + }, + { + "epoch": 4.51705565529623, + "grad_norm": 0.997770369052887, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 62900 + }, + { + "epoch": 4.517773788150808, + "grad_norm": 0.9708612561225891, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 62910 + }, + { + "epoch": 4.518491921005386, + "grad_norm": 1.1381206512451172, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62920 + }, + { + "epoch": 4.519210053859964, + "grad_norm": 1.0386693477630615, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 62930 + }, + { + "epoch": 4.519928186714543, + "grad_norm": 1.1711705923080444, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 62940 + }, + { + "epoch": 4.520646319569121, + "grad_norm": 0.8727447390556335, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 62950 + }, + { + "epoch": 4.521364452423699, + "grad_norm": 0.9215193390846252, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 62960 + }, + { + "epoch": 4.522082585278277, + "grad_norm": 1.005467176437378, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 62970 + }, + { + "epoch": 4.522800718132855, + "grad_norm": 0.8761187791824341, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 62980 + }, + { + "epoch": 4.523518850987433, + "grad_norm": 0.957848310470581, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 62990 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 0.8634148836135864, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 63000 + }, + { + "epoch": 4.524955116696589, + "grad_norm": 0.9557477235794067, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 63010 + }, + { + "epoch": 4.525673249551167, + "grad_norm": 1.017720341682434, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 63020 + }, + { + "epoch": 4.526391382405745, + "grad_norm": 1.0281825065612793, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 63030 + }, + { + "epoch": 4.527109515260323, + "grad_norm": 1.253974437713623, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63040 + }, + { + "epoch": 4.527827648114902, + "grad_norm": 0.8489068150520325, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 63050 + }, + { + "epoch": 4.52854578096948, + "grad_norm": 0.9681686162948608, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 63060 + }, + { + "epoch": 4.529263913824058, + "grad_norm": 1.10277259349823, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 63070 + }, + { + "epoch": 4.529982046678636, + "grad_norm": 0.9469163417816162, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 63080 + }, + { + "epoch": 4.530700179533214, + "grad_norm": 1.1228134632110596, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 63090 + }, + { + "epoch": 4.531418312387792, + "grad_norm": 0.9673212170600891, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 63100 + }, + { + "epoch": 4.53213644524237, + "grad_norm": 1.0221107006072998, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 63110 + }, + { + "epoch": 4.532854578096948, + "grad_norm": 0.826372504234314, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 63120 + }, + { + "epoch": 4.5335727109515265, + "grad_norm": 1.1805331707000732, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 63130 + }, + { + "epoch": 4.5342908438061045, + "grad_norm": 0.9645666480064392, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63140 + }, + { + "epoch": 4.5350089766606825, + "grad_norm": 1.0838309526443481, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 63150 + }, + { + "epoch": 4.5357271095152605, + "grad_norm": 1.061414361000061, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 63160 + }, + { + "epoch": 4.5364452423698385, + "grad_norm": 0.841961145401001, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 63170 + }, + { + "epoch": 4.5371633752244165, + "grad_norm": 1.1220186948776245, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 63180 + }, + { + "epoch": 4.5378815080789945, + "grad_norm": 1.036441445350647, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 63190 + }, + { + "epoch": 4.5385996409335725, + "grad_norm": 0.9089716076850891, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 63200 + }, + { + "epoch": 4.5393177737881505, + "grad_norm": 0.8699982762336731, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 63210 + }, + { + "epoch": 4.5400359066427285, + "grad_norm": 0.8489565253257751, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 63220 + }, + { + "epoch": 4.540754039497307, + "grad_norm": 0.7778416275978088, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 63230 + }, + { + "epoch": 4.541472172351885, + "grad_norm": 1.0625852346420288, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 63240 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 0.8515732884407043, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 63250 + }, + { + "epoch": 4.542908438061041, + "grad_norm": 0.7679561376571655, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 63260 + }, + { + "epoch": 4.543626570915619, + "grad_norm": 0.7358446717262268, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 63270 + }, + { + "epoch": 4.544344703770197, + "grad_norm": 1.0866128206253052, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63280 + }, + { + "epoch": 4.545062836624775, + "grad_norm": 1.0870225429534912, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 63290 + }, + { + "epoch": 4.545780969479353, + "grad_norm": 0.951095461845398, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 63300 + }, + { + "epoch": 4.546499102333931, + "grad_norm": 1.0914306640625, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 63310 + }, + { + "epoch": 4.54721723518851, + "grad_norm": 0.8676106333732605, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 63320 + }, + { + "epoch": 4.547935368043088, + "grad_norm": 1.0129096508026123, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63330 + }, + { + "epoch": 4.548653500897666, + "grad_norm": 0.8710526823997498, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 63340 + }, + { + "epoch": 4.549371633752244, + "grad_norm": 0.7014815807342529, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 63350 + }, + { + "epoch": 4.550089766606822, + "grad_norm": 1.1546777486801147, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 63360 + }, + { + "epoch": 4.5508078994614, + "grad_norm": 0.7464957237243652, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 63370 + }, + { + "epoch": 4.551526032315978, + "grad_norm": 0.9976209998130798, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 63380 + }, + { + "epoch": 4.552244165170556, + "grad_norm": 0.9543681740760803, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 63390 + }, + { + "epoch": 4.552962298025134, + "grad_norm": 1.1498578786849976, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 63400 + }, + { + "epoch": 4.553680430879712, + "grad_norm": 1.0162293910980225, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 63410 + }, + { + "epoch": 4.554398563734291, + "grad_norm": 0.9015304446220398, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 63420 + }, + { + "epoch": 4.555116696588869, + "grad_norm": 1.1639831066131592, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 63430 + }, + { + "epoch": 4.555834829443447, + "grad_norm": 0.9494703412055969, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 63440 + }, + { + "epoch": 4.556552962298025, + "grad_norm": 1.0555956363677979, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 63450 + }, + { + "epoch": 4.557271095152603, + "grad_norm": 0.8513827919960022, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 63460 + }, + { + "epoch": 4.557989228007181, + "grad_norm": 1.0614275932312012, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 63470 + }, + { + "epoch": 4.558707360861759, + "grad_norm": 0.8341137766838074, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 63480 + }, + { + "epoch": 4.559425493716337, + "grad_norm": 1.2136222124099731, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 63490 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 0.8806019425392151, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 63500 + }, + { + "epoch": 4.560861759425494, + "grad_norm": 1.2548854351043701, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 63510 + }, + { + "epoch": 4.561579892280072, + "grad_norm": 1.0162668228149414, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 63520 + }, + { + "epoch": 4.56229802513465, + "grad_norm": 1.0487624406814575, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 63530 + }, + { + "epoch": 4.563016157989228, + "grad_norm": 1.2505502700805664, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 63540 + }, + { + "epoch": 4.563734290843806, + "grad_norm": 0.9930511713027954, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 63550 + }, + { + "epoch": 4.564452423698384, + "grad_norm": 0.8132568001747131, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 63560 + }, + { + "epoch": 4.565170556552962, + "grad_norm": 1.0129177570343018, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63570 + }, + { + "epoch": 4.56588868940754, + "grad_norm": 0.9011693596839905, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 63580 + }, + { + "epoch": 4.566606822262118, + "grad_norm": 0.9161545634269714, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 63590 + }, + { + "epoch": 4.567324955116696, + "grad_norm": 0.8852348327636719, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 63600 + }, + { + "epoch": 4.568043087971275, + "grad_norm": 0.8579391837120056, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63610 + }, + { + "epoch": 4.568761220825853, + "grad_norm": 0.9271050095558167, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 63620 + }, + { + "epoch": 4.569479353680431, + "grad_norm": 0.9881834983825684, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 63630 + }, + { + "epoch": 4.570197486535009, + "grad_norm": 1.0255686044692993, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 63640 + }, + { + "epoch": 4.570915619389587, + "grad_norm": 0.8758876919746399, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 63650 + }, + { + "epoch": 4.571633752244165, + "grad_norm": 1.0134185552597046, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 63660 + }, + { + "epoch": 4.572351885098743, + "grad_norm": 0.8535705208778381, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 63670 + }, + { + "epoch": 4.573070017953321, + "grad_norm": 0.9614834785461426, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63680 + }, + { + "epoch": 4.5737881508079, + "grad_norm": 0.9004243612289429, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 63690 + }, + { + "epoch": 4.574506283662478, + "grad_norm": 0.9563080072402954, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 63700 + }, + { + "epoch": 4.575224416517056, + "grad_norm": 1.024857521057129, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 63710 + }, + { + "epoch": 4.575942549371634, + "grad_norm": 0.9345638155937195, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 63720 + }, + { + "epoch": 4.576660682226212, + "grad_norm": 1.27083158493042, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 63730 + }, + { + "epoch": 4.57737881508079, + "grad_norm": 1.0866559743881226, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 63740 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 0.9253925681114197, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 63750 + }, + { + "epoch": 4.578815080789946, + "grad_norm": 0.8127399682998657, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63760 + }, + { + "epoch": 4.579533213644524, + "grad_norm": 1.0453993082046509, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 63770 + }, + { + "epoch": 4.580251346499102, + "grad_norm": 1.2227544784545898, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 63780 + }, + { + "epoch": 4.580969479353681, + "grad_norm": 1.0207865238189697, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 63790 + }, + { + "epoch": 4.581687612208259, + "grad_norm": 1.030447244644165, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 63800 + }, + { + "epoch": 4.582405745062837, + "grad_norm": 1.0855677127838135, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 63810 + }, + { + "epoch": 4.583123877917415, + "grad_norm": 0.9572556018829346, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 63820 + }, + { + "epoch": 4.583842010771993, + "grad_norm": 0.9061040282249451, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 63830 + }, + { + "epoch": 4.584560143626571, + "grad_norm": 0.9267677068710327, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63840 + }, + { + "epoch": 4.585278276481149, + "grad_norm": 1.070076823234558, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 63850 + }, + { + "epoch": 4.585996409335727, + "grad_norm": 1.045881748199463, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63860 + }, + { + "epoch": 4.586714542190305, + "grad_norm": 0.9190576672554016, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 63870 + }, + { + "epoch": 4.587432675044884, + "grad_norm": 0.9263932704925537, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 63880 + }, + { + "epoch": 4.588150807899462, + "grad_norm": 1.0217589139938354, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 63890 + }, + { + "epoch": 4.58886894075404, + "grad_norm": 0.9200088381767273, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 63900 + }, + { + "epoch": 4.589587073608618, + "grad_norm": 0.9877251386642456, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 63910 + }, + { + "epoch": 4.590305206463196, + "grad_norm": 1.0059093236923218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63920 + }, + { + "epoch": 4.591023339317774, + "grad_norm": 1.2618095874786377, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63930 + }, + { + "epoch": 4.591741472172352, + "grad_norm": 1.1779268980026245, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 63940 + }, + { + "epoch": 4.59245960502693, + "grad_norm": 1.2339502573013306, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 63950 + }, + { + "epoch": 4.593177737881508, + "grad_norm": 0.7488788366317749, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 63960 + }, + { + "epoch": 4.593895870736086, + "grad_norm": 0.8366380929946899, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 63970 + }, + { + "epoch": 4.594614003590665, + "grad_norm": 1.0292677879333496, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 63980 + }, + { + "epoch": 4.595332136445243, + "grad_norm": 0.7938551306724548, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 63990 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 0.7958516478538513, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64000 + }, + { + "epoch": 4.596768402154399, + "grad_norm": 0.9613908529281616, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 64010 + }, + { + "epoch": 4.597486535008977, + "grad_norm": 1.0253773927688599, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 64020 + }, + { + "epoch": 4.598204667863555, + "grad_norm": 1.0560888051986694, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 64030 + }, + { + "epoch": 4.598922800718133, + "grad_norm": 1.1093556880950928, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 64040 + }, + { + "epoch": 4.599640933572711, + "grad_norm": 0.8492098450660706, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 64050 + }, + { + "epoch": 4.6003590664272895, + "grad_norm": 1.0070436000823975, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64060 + }, + { + "epoch": 4.6010771992818675, + "grad_norm": 0.9774282574653625, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 64070 + }, + { + "epoch": 4.6017953321364455, + "grad_norm": 1.0744960308074951, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 64080 + }, + { + "epoch": 4.6025134649910235, + "grad_norm": 1.0101491212844849, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 64090 + }, + { + "epoch": 4.6032315978456015, + "grad_norm": 1.2306591272354126, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 64100 + }, + { + "epoch": 4.6039497307001795, + "grad_norm": 0.9187033176422119, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 64110 + }, + { + "epoch": 4.6046678635547575, + "grad_norm": 0.9178676605224609, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 64120 + }, + { + "epoch": 4.6053859964093355, + "grad_norm": 1.006374716758728, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64130 + }, + { + "epoch": 4.6061041292639135, + "grad_norm": 1.0774449110031128, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 64140 + }, + { + "epoch": 4.6068222621184916, + "grad_norm": 1.0360658168792725, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64150 + }, + { + "epoch": 4.6075403949730696, + "grad_norm": 1.1061090230941772, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 64160 + }, + { + "epoch": 4.608258527827648, + "grad_norm": 1.0320971012115479, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 64170 + }, + { + "epoch": 4.6089766606822264, + "grad_norm": 0.8596988916397095, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 64180 + }, + { + "epoch": 4.6096947935368044, + "grad_norm": 1.1665741205215454, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 64190 + }, + { + "epoch": 4.6104129263913824, + "grad_norm": 0.857207715511322, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64200 + }, + { + "epoch": 4.6111310592459605, + "grad_norm": 1.0088987350463867, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 64210 + }, + { + "epoch": 4.6118491921005385, + "grad_norm": 1.0985605716705322, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 64220 + }, + { + "epoch": 4.6125673249551165, + "grad_norm": 0.9504913687705994, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 64230 + }, + { + "epoch": 4.6132854578096945, + "grad_norm": 0.8415018916130066, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 64240 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 0.9857034087181091, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 64250 + }, + { + "epoch": 4.614721723518851, + "grad_norm": 1.0164235830307007, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 64260 + }, + { + "epoch": 4.615439856373429, + "grad_norm": 0.949481725692749, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 64270 + }, + { + "epoch": 4.616157989228007, + "grad_norm": 0.9526455998420715, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 64280 + }, + { + "epoch": 4.616876122082585, + "grad_norm": 1.1121242046356201, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 64290 + }, + { + "epoch": 4.617594254937163, + "grad_norm": 0.9598871469497681, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 64300 + }, + { + "epoch": 4.618312387791741, + "grad_norm": 1.0406304597854614, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 64310 + }, + { + "epoch": 4.619030520646319, + "grad_norm": 1.1816964149475098, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 64320 + }, + { + "epoch": 4.619748653500897, + "grad_norm": 0.9818326830863953, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 64330 + }, + { + "epoch": 4.620466786355475, + "grad_norm": 0.952017605304718, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 64340 + }, + { + "epoch": 4.621184919210053, + "grad_norm": 1.1263453960418701, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 64350 + }, + { + "epoch": 4.621903052064632, + "grad_norm": 1.1158473491668701, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 64360 + }, + { + "epoch": 4.62262118491921, + "grad_norm": 0.9056766033172607, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64370 + }, + { + "epoch": 4.623339317773788, + "grad_norm": 0.8113203048706055, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 64380 + }, + { + "epoch": 4.624057450628366, + "grad_norm": 0.8646712899208069, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 64390 + }, + { + "epoch": 4.624775583482944, + "grad_norm": 1.0064425468444824, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64400 + }, + { + "epoch": 4.625493716337522, + "grad_norm": 0.9867565631866455, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 64410 + }, + { + "epoch": 4.6262118491921, + "grad_norm": 1.018764615058899, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 64420 + }, + { + "epoch": 4.626929982046678, + "grad_norm": 1.0607863664627075, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 64430 + }, + { + "epoch": 4.627648114901257, + "grad_norm": 1.012825846672058, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 64440 + }, + { + "epoch": 4.628366247755835, + "grad_norm": 0.8441653847694397, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64450 + }, + { + "epoch": 4.629084380610413, + "grad_norm": 0.9819194674491882, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 64460 + }, + { + "epoch": 4.629802513464991, + "grad_norm": 0.925519585609436, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 64470 + }, + { + "epoch": 4.630520646319569, + "grad_norm": 0.9409030079841614, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 64480 + }, + { + "epoch": 4.631238779174147, + "grad_norm": 1.148024559020996, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 64490 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 0.8225533962249756, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 64500 + }, + { + "epoch": 4.632675044883303, + "grad_norm": 0.8806734681129456, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 64510 + }, + { + "epoch": 4.633393177737881, + "grad_norm": 0.9656694531440735, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64520 + }, + { + "epoch": 4.634111310592459, + "grad_norm": 0.9977783560752869, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 64530 + }, + { + "epoch": 4.634829443447038, + "grad_norm": 0.9259420037269592, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 64540 + }, + { + "epoch": 4.635547576301616, + "grad_norm": 1.0215885639190674, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 64550 + }, + { + "epoch": 4.636265709156194, + "grad_norm": 1.1082557439804077, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 64560 + }, + { + "epoch": 4.636983842010772, + "grad_norm": 1.1183207035064697, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 64570 + }, + { + "epoch": 4.63770197486535, + "grad_norm": 0.9914339184761047, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 64580 + }, + { + "epoch": 4.638420107719928, + "grad_norm": 0.8065831661224365, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 64590 + }, + { + "epoch": 4.639138240574506, + "grad_norm": 1.1546721458435059, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 64600 + }, + { + "epoch": 4.639856373429084, + "grad_norm": 1.0395900011062622, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64610 + }, + { + "epoch": 4.640574506283663, + "grad_norm": 0.9957455992698669, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 64620 + }, + { + "epoch": 4.641292639138241, + "grad_norm": 1.069557785987854, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 64630 + }, + { + "epoch": 4.642010771992819, + "grad_norm": 1.005236268043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 64640 + }, + { + "epoch": 4.642728904847397, + "grad_norm": 1.0216304063796997, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 64650 + }, + { + "epoch": 4.643447037701975, + "grad_norm": 0.8567317128181458, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 64660 + }, + { + "epoch": 4.644165170556553, + "grad_norm": 1.0386067628860474, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 64670 + }, + { + "epoch": 4.644883303411131, + "grad_norm": 0.9566055536270142, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 64680 + }, + { + "epoch": 4.645601436265709, + "grad_norm": 1.0990564823150635, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 64690 + }, + { + "epoch": 4.646319569120287, + "grad_norm": 0.9962695240974426, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 64700 + }, + { + "epoch": 4.647037701974865, + "grad_norm": 0.9041377305984497, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 64710 + }, + { + "epoch": 4.647755834829443, + "grad_norm": 0.8611233234405518, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 64720 + }, + { + "epoch": 4.648473967684022, + "grad_norm": 1.1569812297821045, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 64730 + }, + { + "epoch": 4.6491921005386, + "grad_norm": 0.7946197390556335, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 64740 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 0.9612061381340027, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 64750 + }, + { + "epoch": 4.650628366247756, + "grad_norm": 0.9669303297996521, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 64760 + }, + { + "epoch": 4.651346499102334, + "grad_norm": 0.8117775321006775, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 64770 + }, + { + "epoch": 4.652064631956912, + "grad_norm": 1.2326241731643677, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 64780 + }, + { + "epoch": 4.65278276481149, + "grad_norm": 0.7494568228721619, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64790 + }, + { + "epoch": 4.653500897666068, + "grad_norm": 0.8145379424095154, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 64800 + }, + { + "epoch": 4.654219030520647, + "grad_norm": 1.0139610767364502, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 64810 + }, + { + "epoch": 4.654937163375225, + "grad_norm": 0.9887115359306335, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 64820 + }, + { + "epoch": 4.655655296229803, + "grad_norm": 0.9565147161483765, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 64830 + }, + { + "epoch": 4.656373429084381, + "grad_norm": 0.9022467136383057, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 64840 + }, + { + "epoch": 4.657091561938959, + "grad_norm": 1.075003981590271, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 64850 + }, + { + "epoch": 4.657809694793537, + "grad_norm": 0.8705733418464661, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64860 + }, + { + "epoch": 4.658527827648115, + "grad_norm": 1.0826832056045532, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 64870 + }, + { + "epoch": 4.659245960502693, + "grad_norm": 1.1056268215179443, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 64880 + }, + { + "epoch": 4.659964093357271, + "grad_norm": 0.8664149641990662, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 64890 + }, + { + "epoch": 4.660682226211849, + "grad_norm": 0.9487230181694031, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 64900 + }, + { + "epoch": 4.661400359066427, + "grad_norm": 1.0357837677001953, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 64910 + }, + { + "epoch": 4.662118491921006, + "grad_norm": 0.8620632290840149, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 64920 + }, + { + "epoch": 4.662836624775584, + "grad_norm": 1.108986735343933, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 64930 + }, + { + "epoch": 4.663554757630162, + "grad_norm": 0.8017674684524536, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 64940 + }, + { + "epoch": 4.66427289048474, + "grad_norm": 0.882347583770752, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 64950 + }, + { + "epoch": 4.664991023339318, + "grad_norm": 0.9466867446899414, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 64960 + }, + { + "epoch": 4.665709156193896, + "grad_norm": 1.1823636293411255, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 64970 + }, + { + "epoch": 4.666427289048474, + "grad_norm": 0.9535016417503357, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 64980 + }, + { + "epoch": 4.667145421903052, + "grad_norm": 0.9456726312637329, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 64990 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 0.7761920690536499, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 65000 + }, + { + "epoch": 4.668581687612209, + "grad_norm": 1.060357689857483, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 65010 + }, + { + "epoch": 4.669299820466787, + "grad_norm": 0.9083862900733948, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 65020 + }, + { + "epoch": 4.670017953321365, + "grad_norm": 0.8745762705802917, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 65030 + }, + { + "epoch": 4.670736086175943, + "grad_norm": 0.8715422749519348, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 65040 + }, + { + "epoch": 4.671454219030521, + "grad_norm": 0.9407707452774048, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 65050 + }, + { + "epoch": 4.672172351885099, + "grad_norm": 0.8998945355415344, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 65060 + }, + { + "epoch": 4.672890484739677, + "grad_norm": 0.9147891998291016, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 65070 + }, + { + "epoch": 4.673608617594255, + "grad_norm": 1.116614580154419, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 65080 + }, + { + "epoch": 4.674326750448833, + "grad_norm": 1.0764213800430298, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 65090 + }, + { + "epoch": 4.6750448833034115, + "grad_norm": 0.9115945100784302, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 65100 + }, + { + "epoch": 4.6757630161579895, + "grad_norm": 1.001251459121704, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 65110 + }, + { + "epoch": 4.6764811490125675, + "grad_norm": 1.0330020189285278, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65120 + }, + { + "epoch": 4.6771992818671455, + "grad_norm": 0.9083197116851807, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 65130 + }, + { + "epoch": 4.6779174147217235, + "grad_norm": 0.9298770427703857, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 65140 + }, + { + "epoch": 4.6786355475763015, + "grad_norm": 1.0009549856185913, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 65150 + }, + { + "epoch": 4.6793536804308795, + "grad_norm": 0.951389729976654, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 65160 + }, + { + "epoch": 4.6800718132854575, + "grad_norm": 1.151870608329773, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 65170 + }, + { + "epoch": 4.680789946140036, + "grad_norm": 1.0074727535247803, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 65180 + }, + { + "epoch": 4.681508078994614, + "grad_norm": 1.0490152835845947, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 65190 + }, + { + "epoch": 4.682226211849192, + "grad_norm": 0.8967363834381104, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 65200 + }, + { + "epoch": 4.68294434470377, + "grad_norm": 1.2314889430999756, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 65210 + }, + { + "epoch": 4.683662477558348, + "grad_norm": 0.7764074802398682, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 65220 + }, + { + "epoch": 4.684380610412926, + "grad_norm": 1.0587822198867798, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 65230 + }, + { + "epoch": 4.685098743267504, + "grad_norm": 0.916114091873169, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 65240 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 0.9117472767829895, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 65250 + }, + { + "epoch": 4.68653500897666, + "grad_norm": 0.8369293212890625, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 65260 + }, + { + "epoch": 4.687253141831238, + "grad_norm": 0.9700121879577637, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 65270 + }, + { + "epoch": 4.687971274685816, + "grad_norm": 1.0008411407470703, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 65280 + }, + { + "epoch": 4.688689407540395, + "grad_norm": 0.9339549541473389, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 65290 + }, + { + "epoch": 4.689407540394973, + "grad_norm": 0.956701934337616, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 65300 + }, + { + "epoch": 4.690125673249551, + "grad_norm": 1.2042720317840576, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65310 + }, + { + "epoch": 4.690843806104129, + "grad_norm": 0.8679144382476807, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 65320 + }, + { + "epoch": 4.691561938958707, + "grad_norm": 1.2320687770843506, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 65330 + }, + { + "epoch": 4.692280071813285, + "grad_norm": 0.8397238850593567, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 65340 + }, + { + "epoch": 4.692998204667863, + "grad_norm": 0.7850362658500671, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 65350 + }, + { + "epoch": 4.693716337522441, + "grad_norm": 0.9281290173530579, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 65360 + }, + { + "epoch": 4.69443447037702, + "grad_norm": 1.1506335735321045, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 65370 + }, + { + "epoch": 4.695152603231598, + "grad_norm": 1.0910584926605225, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 65380 + }, + { + "epoch": 4.695870736086176, + "grad_norm": 0.8937386274337769, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 65390 + }, + { + "epoch": 4.696588868940754, + "grad_norm": 1.0163888931274414, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 65400 + }, + { + "epoch": 4.697307001795332, + "grad_norm": 1.0290007591247559, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 65410 + }, + { + "epoch": 4.69802513464991, + "grad_norm": 0.9046576023101807, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 65420 + }, + { + "epoch": 4.698743267504488, + "grad_norm": 1.0030237436294556, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 65430 + }, + { + "epoch": 4.699461400359066, + "grad_norm": 0.8196740746498108, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65440 + }, + { + "epoch": 4.700179533213644, + "grad_norm": 0.9036651849746704, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65450 + }, + { + "epoch": 4.700897666068222, + "grad_norm": 1.2080141305923462, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 65460 + }, + { + "epoch": 4.7016157989228, + "grad_norm": 0.8743635416030884, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 65470 + }, + { + "epoch": 4.702333931777379, + "grad_norm": 0.9566192030906677, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 65480 + }, + { + "epoch": 4.703052064631957, + "grad_norm": 1.0505144596099854, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 65490 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 0.8797298073768616, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 65500 + }, + { + "epoch": 4.704488330341113, + "grad_norm": 0.9970770478248596, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 65510 + }, + { + "epoch": 4.705206463195691, + "grad_norm": 1.1743851900100708, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 65520 + }, + { + "epoch": 4.705924596050269, + "grad_norm": 0.9534381031990051, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 65530 + }, + { + "epoch": 4.706642728904847, + "grad_norm": 0.9735581278800964, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 65540 + }, + { + "epoch": 4.707360861759425, + "grad_norm": 1.185352087020874, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 65550 + }, + { + "epoch": 4.708078994614004, + "grad_norm": 0.9383901357650757, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65560 + }, + { + "epoch": 4.708797127468582, + "grad_norm": 1.0194662809371948, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 65570 + }, + { + "epoch": 4.70951526032316, + "grad_norm": 0.8448300361633301, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 65580 + }, + { + "epoch": 4.710233393177738, + "grad_norm": 1.1930629014968872, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 65590 + }, + { + "epoch": 4.710951526032316, + "grad_norm": 1.0038636922836304, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 65600 + }, + { + "epoch": 4.711669658886894, + "grad_norm": 0.8206564784049988, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 65610 + }, + { + "epoch": 4.712387791741472, + "grad_norm": 1.0984861850738525, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 65620 + }, + { + "epoch": 4.71310592459605, + "grad_norm": 1.2891547679901123, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65630 + }, + { + "epoch": 4.713824057450628, + "grad_norm": 0.927062451839447, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 65640 + }, + { + "epoch": 4.714542190305206, + "grad_norm": 0.8647334575653076, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 65650 + }, + { + "epoch": 4.715260323159785, + "grad_norm": 1.1017670631408691, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65660 + }, + { + "epoch": 4.715978456014363, + "grad_norm": 0.9589072465896606, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65670 + }, + { + "epoch": 4.716696588868941, + "grad_norm": 0.9496776461601257, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 65680 + }, + { + "epoch": 4.717414721723519, + "grad_norm": 0.9266180396080017, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65690 + }, + { + "epoch": 4.718132854578097, + "grad_norm": 0.8699696063995361, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 65700 + }, + { + "epoch": 4.718850987432675, + "grad_norm": 1.0444015264511108, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 65710 + }, + { + "epoch": 4.719569120287253, + "grad_norm": 1.0100741386413574, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 65720 + }, + { + "epoch": 4.720287253141831, + "grad_norm": 1.1442630290985107, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 65730 + }, + { + "epoch": 4.721005385996409, + "grad_norm": 0.8937877416610718, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 65740 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 1.0718764066696167, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65750 + }, + { + "epoch": 4.722441651705566, + "grad_norm": 0.8838587999343872, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 65760 + }, + { + "epoch": 4.723159784560144, + "grad_norm": 1.1247940063476562, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 65770 + }, + { + "epoch": 4.723877917414722, + "grad_norm": 0.9491105675697327, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 65780 + }, + { + "epoch": 4.7245960502693, + "grad_norm": 1.0896921157836914, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 65790 + }, + { + "epoch": 4.725314183123878, + "grad_norm": 1.0097380876541138, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 65800 + }, + { + "epoch": 4.726032315978456, + "grad_norm": 0.911763608455658, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 65810 + }, + { + "epoch": 4.726750448833034, + "grad_norm": 1.1295124292373657, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 65820 + }, + { + "epoch": 4.727468581687612, + "grad_norm": 0.7637538313865662, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 65830 + }, + { + "epoch": 4.72818671454219, + "grad_norm": 0.9255306720733643, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 65840 + }, + { + "epoch": 4.728904847396769, + "grad_norm": 0.9847530126571655, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 65850 + }, + { + "epoch": 4.729622980251347, + "grad_norm": 0.9036182761192322, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 65860 + }, + { + "epoch": 4.730341113105925, + "grad_norm": 0.8284199833869934, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 65870 + }, + { + "epoch": 4.731059245960503, + "grad_norm": 1.0142838954925537, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 65880 + }, + { + "epoch": 4.731777378815081, + "grad_norm": 0.9389033913612366, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 65890 + }, + { + "epoch": 4.732495511669659, + "grad_norm": 0.8870056867599487, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65900 + }, + { + "epoch": 4.733213644524237, + "grad_norm": 1.1211678981781006, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 65910 + }, + { + "epoch": 4.733931777378815, + "grad_norm": 0.7796614170074463, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 65920 + }, + { + "epoch": 4.734649910233394, + "grad_norm": 1.0360451936721802, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 65930 + }, + { + "epoch": 4.735368043087972, + "grad_norm": 0.8383482098579407, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 65940 + }, + { + "epoch": 4.73608617594255, + "grad_norm": 0.7985122799873352, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 65950 + }, + { + "epoch": 4.736804308797128, + "grad_norm": 1.0314199924468994, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 65960 + }, + { + "epoch": 4.737522441651706, + "grad_norm": 0.9279016852378845, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 65970 + }, + { + "epoch": 4.738240574506284, + "grad_norm": 1.1046063899993896, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65980 + }, + { + "epoch": 4.738958707360862, + "grad_norm": 0.9075793623924255, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 65990 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 1.0945355892181396, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 66000 + }, + { + "epoch": 4.740394973070018, + "grad_norm": 0.8885519504547119, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 66010 + }, + { + "epoch": 4.741113105924596, + "grad_norm": 0.9312083125114441, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 66020 + }, + { + "epoch": 4.741831238779174, + "grad_norm": 1.1574538946151733, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 66030 + }, + { + "epoch": 4.742549371633753, + "grad_norm": 0.9346209168434143, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 66040 + }, + { + "epoch": 4.743267504488331, + "grad_norm": 0.8935149312019348, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66050 + }, + { + "epoch": 4.743985637342909, + "grad_norm": 0.8958369493484497, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 66060 + }, + { + "epoch": 4.744703770197487, + "grad_norm": 0.9383506774902344, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 66070 + }, + { + "epoch": 4.745421903052065, + "grad_norm": 0.9868947863578796, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 66080 + }, + { + "epoch": 4.746140035906643, + "grad_norm": 1.3417645692825317, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 66090 + }, + { + "epoch": 4.746858168761221, + "grad_norm": 1.070693850517273, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 66100 + }, + { + "epoch": 4.747576301615799, + "grad_norm": 0.8841570019721985, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 66110 + }, + { + "epoch": 4.7482944344703775, + "grad_norm": 0.7963120341300964, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 66120 + }, + { + "epoch": 4.7490125673249555, + "grad_norm": 0.8145691156387329, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 66130 + }, + { + "epoch": 4.7497307001795335, + "grad_norm": 0.9074729681015015, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 66140 + }, + { + "epoch": 4.7504488330341115, + "grad_norm": 0.9129886627197266, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 66150 + }, + { + "epoch": 4.7511669658886895, + "grad_norm": 0.91527259349823, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 66160 + }, + { + "epoch": 4.7518850987432675, + "grad_norm": 0.9569419622421265, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 66170 + }, + { + "epoch": 4.7526032315978455, + "grad_norm": 0.8777104616165161, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 66180 + }, + { + "epoch": 4.7533213644524235, + "grad_norm": 0.9673085808753967, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 66190 + }, + { + "epoch": 4.7540394973070015, + "grad_norm": 1.0683966875076294, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 66200 + }, + { + "epoch": 4.7547576301615795, + "grad_norm": 1.1591907739639282, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 66210 + }, + { + "epoch": 4.755475763016158, + "grad_norm": 1.1973309516906738, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 66220 + }, + { + "epoch": 4.756193895870736, + "grad_norm": 0.8472012281417847, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 66230 + }, + { + "epoch": 4.756912028725314, + "grad_norm": 0.9896261692047119, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 66240 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 0.8498432040214539, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 66250 + }, + { + "epoch": 4.75834829443447, + "grad_norm": 0.9624166488647461, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 66260 + }, + { + "epoch": 4.759066427289048, + "grad_norm": 1.0951786041259766, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 66270 + }, + { + "epoch": 4.759784560143626, + "grad_norm": 0.9863157868385315, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 66280 + }, + { + "epoch": 4.760502692998204, + "grad_norm": 1.0062068700790405, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 66290 + }, + { + "epoch": 4.761220825852782, + "grad_norm": 0.8075495958328247, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 66300 + }, + { + "epoch": 4.761938958707361, + "grad_norm": 0.9617878198623657, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66310 + }, + { + "epoch": 4.762657091561939, + "grad_norm": 1.097091555595398, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 66320 + }, + { + "epoch": 4.763375224416517, + "grad_norm": 1.2713453769683838, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 66330 + }, + { + "epoch": 4.764093357271095, + "grad_norm": 0.9473448991775513, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 66340 + }, + { + "epoch": 4.764811490125673, + "grad_norm": 1.0176854133605957, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 66350 + }, + { + "epoch": 4.765529622980251, + "grad_norm": 1.0486242771148682, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 66360 + }, + { + "epoch": 4.766247755834829, + "grad_norm": 1.249985694885254, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 66370 + }, + { + "epoch": 4.766965888689407, + "grad_norm": 1.283875584602356, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 66380 + }, + { + "epoch": 4.767684021543985, + "grad_norm": 1.0009022951126099, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 66390 + }, + { + "epoch": 4.768402154398563, + "grad_norm": 0.9718021750450134, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 66400 + }, + { + "epoch": 4.769120287253142, + "grad_norm": 1.0865732431411743, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 66410 + }, + { + "epoch": 4.76983842010772, + "grad_norm": 0.9273189306259155, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 66420 + }, + { + "epoch": 4.770556552962298, + "grad_norm": 1.067535638809204, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 66430 + }, + { + "epoch": 4.771274685816876, + "grad_norm": 1.0551011562347412, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 66440 + }, + { + "epoch": 4.771992818671454, + "grad_norm": 1.0336146354675293, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66450 + }, + { + "epoch": 4.772710951526032, + "grad_norm": 0.8738380670547485, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 66460 + }, + { + "epoch": 4.77342908438061, + "grad_norm": 1.1048321723937988, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66470 + }, + { + "epoch": 4.774147217235188, + "grad_norm": 0.8471167683601379, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 66480 + }, + { + "epoch": 4.774865350089767, + "grad_norm": 1.2527031898498535, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 66490 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 1.0056052207946777, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 66500 + }, + { + "epoch": 4.776301615798923, + "grad_norm": 1.142456293106079, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 66510 + }, + { + "epoch": 4.777019748653501, + "grad_norm": 1.1813132762908936, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 66520 + }, + { + "epoch": 4.777737881508079, + "grad_norm": 0.8683654069900513, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 66530 + }, + { + "epoch": 4.778456014362657, + "grad_norm": 1.0577980279922485, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 66540 + }, + { + "epoch": 4.779174147217235, + "grad_norm": 1.077438473701477, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66550 + }, + { + "epoch": 4.779892280071813, + "grad_norm": 1.0107938051223755, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 66560 + }, + { + "epoch": 4.780610412926391, + "grad_norm": 0.8071168065071106, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 66570 + }, + { + "epoch": 4.781328545780969, + "grad_norm": 0.8887564539909363, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 66580 + }, + { + "epoch": 4.782046678635547, + "grad_norm": 0.9823092222213745, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 66590 + }, + { + "epoch": 4.782764811490126, + "grad_norm": 0.9026784300804138, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 66600 + }, + { + "epoch": 4.783482944344704, + "grad_norm": 0.8912792205810547, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66610 + }, + { + "epoch": 4.784201077199282, + "grad_norm": 1.0955979824066162, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 66620 + }, + { + "epoch": 4.78491921005386, + "grad_norm": 0.8614793419837952, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 66630 + }, + { + "epoch": 4.785637342908438, + "grad_norm": 0.7247269153594971, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 66640 + }, + { + "epoch": 4.786355475763016, + "grad_norm": 0.9685400724411011, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 66650 + }, + { + "epoch": 4.787073608617594, + "grad_norm": 0.9219905734062195, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 66660 + }, + { + "epoch": 4.787791741472172, + "grad_norm": 0.9217489361763, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 66670 + }, + { + "epoch": 4.788509874326751, + "grad_norm": 1.13791823387146, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66680 + }, + { + "epoch": 4.789228007181329, + "grad_norm": 0.857542872428894, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 66690 + }, + { + "epoch": 4.789946140035907, + "grad_norm": 0.9886694550514221, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 66700 + }, + { + "epoch": 4.790664272890485, + "grad_norm": 0.987952470779419, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 66710 + }, + { + "epoch": 4.791382405745063, + "grad_norm": 1.051612377166748, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 66720 + }, + { + "epoch": 4.792100538599641, + "grad_norm": 0.9816454648971558, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 66730 + }, + { + "epoch": 4.792818671454219, + "grad_norm": 1.0953829288482666, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 66740 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 0.8720369935035706, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 66750 + }, + { + "epoch": 4.794254937163375, + "grad_norm": 0.8910234570503235, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 66760 + }, + { + "epoch": 4.794973070017953, + "grad_norm": 0.8300510048866272, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 66770 + }, + { + "epoch": 4.795691202872531, + "grad_norm": 0.9380533695220947, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 66780 + }, + { + "epoch": 4.79640933572711, + "grad_norm": 0.8361864686012268, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 66790 + }, + { + "epoch": 4.797127468581688, + "grad_norm": 1.051262617111206, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 66800 + }, + { + "epoch": 4.797845601436266, + "grad_norm": 1.1324400901794434, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 66810 + }, + { + "epoch": 4.798563734290844, + "grad_norm": 0.853903591632843, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 66820 + }, + { + "epoch": 4.799281867145422, + "grad_norm": 0.9949867725372314, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66830 + }, + { + "epoch": 4.8, + "grad_norm": 0.9204033017158508, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 66840 + }, + { + "epoch": 4.800718132854578, + "grad_norm": 0.7461584806442261, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 66850 + }, + { + "epoch": 4.801436265709156, + "grad_norm": 1.1019874811172485, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 66860 + }, + { + "epoch": 4.802154398563735, + "grad_norm": 1.1695797443389893, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 66870 + }, + { + "epoch": 4.802872531418313, + "grad_norm": 1.0902758836746216, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 66880 + }, + { + "epoch": 4.803590664272891, + "grad_norm": 0.8778618574142456, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66890 + }, + { + "epoch": 4.804308797127469, + "grad_norm": 0.905505359172821, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 66900 + }, + { + "epoch": 4.805026929982047, + "grad_norm": 1.0802056789398193, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66910 + }, + { + "epoch": 4.805745062836625, + "grad_norm": 0.7899449467658997, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 66920 + }, + { + "epoch": 4.806463195691203, + "grad_norm": 1.1938519477844238, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66930 + }, + { + "epoch": 4.807181328545781, + "grad_norm": 1.0213780403137207, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 66940 + }, + { + "epoch": 4.807899461400359, + "grad_norm": 0.9925506711006165, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 66950 + }, + { + "epoch": 4.808617594254937, + "grad_norm": 1.0174424648284912, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 66960 + }, + { + "epoch": 4.809335727109516, + "grad_norm": 1.0515072345733643, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 66970 + }, + { + "epoch": 4.810053859964094, + "grad_norm": 1.0161492824554443, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66980 + }, + { + "epoch": 4.810771992818672, + "grad_norm": 0.8421840071678162, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 66990 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 1.0493539571762085, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 67000 + }, + { + "epoch": 4.812208258527828, + "grad_norm": 1.1133309602737427, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 67010 + }, + { + "epoch": 4.812926391382406, + "grad_norm": 0.924017071723938, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 67020 + }, + { + "epoch": 4.813644524236984, + "grad_norm": 1.0568689107894897, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 67030 + }, + { + "epoch": 4.814362657091562, + "grad_norm": 0.989414632320404, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 67040 + }, + { + "epoch": 4.8150807899461405, + "grad_norm": 0.9256827235221863, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 67050 + }, + { + "epoch": 4.8157989228007185, + "grad_norm": 0.9538901448249817, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 67060 + }, + { + "epoch": 4.8165170556552965, + "grad_norm": 1.0373849868774414, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 67070 + }, + { + "epoch": 4.8172351885098745, + "grad_norm": 1.0019729137420654, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 67080 + }, + { + "epoch": 4.8179533213644525, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 67090 + }, + { + "epoch": 4.8186714542190305, + "grad_norm": 1.0008453130722046, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 67100 + }, + { + "epoch": 4.8193895870736085, + "grad_norm": 1.0153851509094238, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 67110 + }, + { + "epoch": 4.8201077199281865, + "grad_norm": 1.0193161964416504, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 67120 + }, + { + "epoch": 4.8208258527827645, + "grad_norm": 1.0204501152038574, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 67130 + }, + { + "epoch": 4.8215439856373425, + "grad_norm": 0.9097670316696167, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 67140 + }, + { + "epoch": 4.8222621184919205, + "grad_norm": 0.9288716912269592, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 67150 + }, + { + "epoch": 4.822980251346499, + "grad_norm": 0.9975850582122803, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 67160 + }, + { + "epoch": 4.823698384201077, + "grad_norm": 0.8502511382102966, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 67170 + }, + { + "epoch": 4.824416517055655, + "grad_norm": 1.0129257440567017, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67180 + }, + { + "epoch": 4.825134649910233, + "grad_norm": 1.0009492635726929, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 67190 + }, + { + "epoch": 4.825852782764811, + "grad_norm": 0.9273321032524109, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 67200 + }, + { + "epoch": 4.8265709156193894, + "grad_norm": 1.0438604354858398, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 67210 + }, + { + "epoch": 4.8272890484739674, + "grad_norm": 1.119573712348938, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 67220 + }, + { + "epoch": 4.8280071813285454, + "grad_norm": 0.9607422351837158, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 67230 + }, + { + "epoch": 4.828725314183124, + "grad_norm": 0.9614062905311584, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 67240 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 1.1017652750015259, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 67250 + }, + { + "epoch": 4.83016157989228, + "grad_norm": 1.0521706342697144, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 67260 + }, + { + "epoch": 4.830879712746858, + "grad_norm": 0.7685959339141846, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 67270 + }, + { + "epoch": 4.831597845601436, + "grad_norm": 0.7894896268844604, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 67280 + }, + { + "epoch": 4.832315978456014, + "grad_norm": 1.0882996320724487, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 67290 + }, + { + "epoch": 4.833034111310592, + "grad_norm": 0.9215409755706787, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 67300 + }, + { + "epoch": 4.83375224416517, + "grad_norm": 0.8660635352134705, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 67310 + }, + { + "epoch": 4.834470377019748, + "grad_norm": 0.980879008769989, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 67320 + }, + { + "epoch": 4.835188509874326, + "grad_norm": 1.0356814861297607, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 67330 + }, + { + "epoch": 4.835906642728904, + "grad_norm": 1.0265507698059082, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 67340 + }, + { + "epoch": 4.836624775583483, + "grad_norm": 1.0659137964248657, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 67350 + }, + { + "epoch": 4.837342908438061, + "grad_norm": 0.9485231637954712, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 67360 + }, + { + "epoch": 4.838061041292639, + "grad_norm": 1.0950140953063965, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 67370 + }, + { + "epoch": 4.838779174147217, + "grad_norm": 0.8907382488250732, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 67380 + }, + { + "epoch": 4.839497307001795, + "grad_norm": 0.9777120351791382, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 67390 + }, + { + "epoch": 4.840215439856373, + "grad_norm": 0.8482252955436707, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 67400 + }, + { + "epoch": 4.840933572710951, + "grad_norm": 0.8505899906158447, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 67410 + }, + { + "epoch": 4.841651705565529, + "grad_norm": 0.8574482798576355, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 67420 + }, + { + "epoch": 4.842369838420108, + "grad_norm": 1.092310905456543, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 67430 + }, + { + "epoch": 4.843087971274686, + "grad_norm": 0.9418560266494751, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 67440 + }, + { + "epoch": 4.843806104129264, + "grad_norm": 1.1310782432556152, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 67450 + }, + { + "epoch": 4.844524236983842, + "grad_norm": 0.9993671774864197, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 67460 + }, + { + "epoch": 4.84524236983842, + "grad_norm": 0.8322528600692749, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 67470 + }, + { + "epoch": 4.845960502692998, + "grad_norm": 0.8488435745239258, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 67480 + }, + { + "epoch": 4.846678635547576, + "grad_norm": 0.8070611357688904, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 67490 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 0.8200163245201111, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 67500 + }, + { + "epoch": 4.848114901256732, + "grad_norm": 0.91901034116745, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 67510 + }, + { + "epoch": 4.84883303411131, + "grad_norm": 1.0938435792922974, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67520 + }, + { + "epoch": 4.849551166965889, + "grad_norm": 0.7926174402236938, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 67530 + }, + { + "epoch": 4.850269299820467, + "grad_norm": 0.9914385676383972, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 67540 + }, + { + "epoch": 4.850987432675045, + "grad_norm": 1.033065915107727, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 67550 + }, + { + "epoch": 4.851705565529623, + "grad_norm": 0.9700239300727844, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 67560 + }, + { + "epoch": 4.852423698384201, + "grad_norm": 0.8550103902816772, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 67570 + }, + { + "epoch": 4.853141831238779, + "grad_norm": 1.0009654760360718, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67580 + }, + { + "epoch": 4.853859964093357, + "grad_norm": 1.0766186714172363, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 67590 + }, + { + "epoch": 4.854578096947935, + "grad_norm": 0.9512220621109009, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 67600 + }, + { + "epoch": 4.855296229802514, + "grad_norm": 0.8434456586837769, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 67610 + }, + { + "epoch": 4.856014362657092, + "grad_norm": 1.0276665687561035, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 67620 + }, + { + "epoch": 4.85673249551167, + "grad_norm": 0.9758516550064087, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 67630 + }, + { + "epoch": 4.857450628366248, + "grad_norm": 0.8988076448440552, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 67640 + }, + { + "epoch": 4.858168761220826, + "grad_norm": 1.0038257837295532, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 67650 + }, + { + "epoch": 4.858886894075404, + "grad_norm": 0.9973093867301941, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 67660 + }, + { + "epoch": 4.859605026929982, + "grad_norm": 0.9754974246025085, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 67670 + }, + { + "epoch": 4.86032315978456, + "grad_norm": 1.1829560995101929, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67680 + }, + { + "epoch": 4.861041292639138, + "grad_norm": 1.1077659130096436, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 67690 + }, + { + "epoch": 4.861759425493716, + "grad_norm": 0.9862872958183289, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 67700 + }, + { + "epoch": 4.862477558348294, + "grad_norm": 0.9826052188873291, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 67710 + }, + { + "epoch": 4.863195691202873, + "grad_norm": 0.940082848072052, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 67720 + }, + { + "epoch": 4.863913824057451, + "grad_norm": 0.895434558391571, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 67730 + }, + { + "epoch": 4.864631956912029, + "grad_norm": 1.1194682121276855, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 67740 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 0.9984544515609741, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 67750 + }, + { + "epoch": 4.866068222621185, + "grad_norm": 1.049224615097046, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 67760 + }, + { + "epoch": 4.866786355475763, + "grad_norm": 1.009515643119812, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 67770 + }, + { + "epoch": 4.867504488330341, + "grad_norm": 1.0336902141571045, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 67780 + }, + { + "epoch": 4.868222621184919, + "grad_norm": 0.9310635924339294, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 67790 + }, + { + "epoch": 4.868940754039498, + "grad_norm": 0.934882640838623, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 67800 + }, + { + "epoch": 4.869658886894076, + "grad_norm": 0.8663495779037476, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 67810 + }, + { + "epoch": 4.870377019748654, + "grad_norm": 1.0085018873214722, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 67820 + }, + { + "epoch": 4.871095152603232, + "grad_norm": 0.896507978439331, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 67830 + }, + { + "epoch": 4.87181328545781, + "grad_norm": 0.925809919834137, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 67840 + }, + { + "epoch": 4.872531418312388, + "grad_norm": 0.8044029474258423, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 67850 + }, + { + "epoch": 4.873249551166966, + "grad_norm": 1.0026800632476807, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 67860 + }, + { + "epoch": 4.873967684021544, + "grad_norm": 0.9577589631080627, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 67870 + }, + { + "epoch": 4.874685816876122, + "grad_norm": 0.8225193619728088, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 67880 + }, + { + "epoch": 4.8754039497307, + "grad_norm": 1.0019139051437378, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 67890 + }, + { + "epoch": 4.876122082585278, + "grad_norm": 0.9282827377319336, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 67900 + }, + { + "epoch": 4.876840215439857, + "grad_norm": 0.8204836249351501, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 67910 + }, + { + "epoch": 4.877558348294435, + "grad_norm": 0.907356321811676, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 67920 + }, + { + "epoch": 4.878276481149013, + "grad_norm": 1.12422776222229, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 67930 + }, + { + "epoch": 4.878994614003591, + "grad_norm": 0.8230205178260803, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 67940 + }, + { + "epoch": 4.879712746858169, + "grad_norm": 1.1588479280471802, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 67950 + }, + { + "epoch": 4.880430879712747, + "grad_norm": 1.1064553260803223, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 67960 + }, + { + "epoch": 4.881149012567325, + "grad_norm": 0.9311534762382507, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 67970 + }, + { + "epoch": 4.881867145421903, + "grad_norm": 0.7575639486312866, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 67980 + }, + { + "epoch": 4.882585278276482, + "grad_norm": 0.9201191067695618, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 67990 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 0.8487658500671387, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 68000 + }, + { + "epoch": 4.884021543985638, + "grad_norm": 0.9645208716392517, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 68010 + }, + { + "epoch": 4.884739676840216, + "grad_norm": 0.8594469428062439, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 68020 + }, + { + "epoch": 4.885457809694794, + "grad_norm": 0.9518412947654724, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 68030 + }, + { + "epoch": 4.886175942549372, + "grad_norm": 1.0934258699417114, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 68040 + }, + { + "epoch": 4.88689407540395, + "grad_norm": 0.988761842250824, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 68050 + }, + { + "epoch": 4.887612208258528, + "grad_norm": 0.7572013735771179, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 68060 + }, + { + "epoch": 4.888330341113106, + "grad_norm": 0.8801929950714111, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 68070 + }, + { + "epoch": 4.889048473967684, + "grad_norm": 1.0080658197402954, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 68080 + }, + { + "epoch": 4.8897666068222625, + "grad_norm": 0.9588785171508789, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 68090 + }, + { + "epoch": 4.8904847396768405, + "grad_norm": 1.0994032621383667, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 68100 + }, + { + "epoch": 4.8912028725314185, + "grad_norm": 0.9851962924003601, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68110 + }, + { + "epoch": 4.8919210053859965, + "grad_norm": 0.9566116333007812, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 68120 + }, + { + "epoch": 4.8926391382405745, + "grad_norm": 0.8708083033561707, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 68130 + }, + { + "epoch": 4.8933572710951525, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 68140 + }, + { + "epoch": 4.8940754039497305, + "grad_norm": 1.047988772392273, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 68150 + }, + { + "epoch": 4.8947935368043085, + "grad_norm": 0.8665831685066223, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68160 + }, + { + "epoch": 4.8955116696588865, + "grad_norm": 0.9313908219337463, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 68170 + }, + { + "epoch": 4.896229802513465, + "grad_norm": 0.9568582773208618, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 68180 + }, + { + "epoch": 4.896947935368043, + "grad_norm": 1.0427594184875488, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 68190 + }, + { + "epoch": 4.897666068222621, + "grad_norm": 0.9132021069526672, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68200 + }, + { + "epoch": 4.898384201077199, + "grad_norm": 0.9597318768501282, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 68210 + }, + { + "epoch": 4.899102333931777, + "grad_norm": 1.0736947059631348, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 68220 + }, + { + "epoch": 4.899820466786355, + "grad_norm": 0.9318404793739319, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 68230 + }, + { + "epoch": 4.900538599640933, + "grad_norm": 0.8594326972961426, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 68240 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 1.1437443494796753, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 68250 + }, + { + "epoch": 4.901974865350089, + "grad_norm": 1.1599408388137817, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 68260 + }, + { + "epoch": 4.902692998204667, + "grad_norm": 1.160628080368042, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 68270 + }, + { + "epoch": 4.903411131059246, + "grad_norm": 1.0147801637649536, + "learning_rate": 0.0002, + "loss": 0.613, + "step": 68280 + }, + { + "epoch": 4.904129263913824, + "grad_norm": 0.8622691631317139, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 68290 + }, + { + "epoch": 4.904847396768402, + "grad_norm": 0.7179980874061584, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 68300 + }, + { + "epoch": 4.90556552962298, + "grad_norm": 1.1705092191696167, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 68310 + }, + { + "epoch": 4.906283662477558, + "grad_norm": 1.1687676906585693, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 68320 + }, + { + "epoch": 4.907001795332136, + "grad_norm": 1.1621531248092651, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 68330 + }, + { + "epoch": 4.907719928186714, + "grad_norm": 1.0241422653198242, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 68340 + }, + { + "epoch": 4.908438061041292, + "grad_norm": 0.943354070186615, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 68350 + }, + { + "epoch": 4.909156193895871, + "grad_norm": 0.8091703653335571, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 68360 + }, + { + "epoch": 4.909874326750449, + "grad_norm": 0.8871228694915771, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 68370 + }, + { + "epoch": 4.910592459605027, + "grad_norm": 1.0951069593429565, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 68380 + }, + { + "epoch": 4.911310592459605, + "grad_norm": 1.1355193853378296, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 68390 + }, + { + "epoch": 4.912028725314183, + "grad_norm": 1.0741122961044312, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 68400 + }, + { + "epoch": 4.912746858168761, + "grad_norm": 0.9285269975662231, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68410 + }, + { + "epoch": 4.913464991023339, + "grad_norm": 1.080695390701294, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 68420 + }, + { + "epoch": 4.914183123877917, + "grad_norm": 0.921331524848938, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 68430 + }, + { + "epoch": 4.914901256732495, + "grad_norm": 0.9763174057006836, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 68440 + }, + { + "epoch": 4.915619389587073, + "grad_norm": 1.1133354902267456, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 68450 + }, + { + "epoch": 4.916337522441651, + "grad_norm": 0.8373502492904663, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 68460 + }, + { + "epoch": 4.91705565529623, + "grad_norm": 0.9192346334457397, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 68470 + }, + { + "epoch": 4.917773788150808, + "grad_norm": 1.0724657773971558, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 68480 + }, + { + "epoch": 4.918491921005386, + "grad_norm": 0.9209843873977661, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 68490 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 0.9201577305793762, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 68500 + }, + { + "epoch": 4.919928186714542, + "grad_norm": 0.8086138963699341, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 68510 + }, + { + "epoch": 4.92064631956912, + "grad_norm": 1.0917785167694092, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 68520 + }, + { + "epoch": 4.921364452423698, + "grad_norm": 0.9287897944450378, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 68530 + }, + { + "epoch": 4.922082585278276, + "grad_norm": 0.9830158948898315, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 68540 + }, + { + "epoch": 4.922800718132855, + "grad_norm": 0.8674678802490234, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 68550 + }, + { + "epoch": 4.923518850987433, + "grad_norm": 0.7996176481246948, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 68560 + }, + { + "epoch": 4.924236983842011, + "grad_norm": 1.1284033060073853, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 68570 + }, + { + "epoch": 4.924955116696589, + "grad_norm": 0.894339919090271, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 68580 + }, + { + "epoch": 4.925673249551167, + "grad_norm": 1.1140280961990356, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68590 + }, + { + "epoch": 4.926391382405745, + "grad_norm": 0.9048344492912292, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 68600 + }, + { + "epoch": 4.927109515260323, + "grad_norm": 0.9380471706390381, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 68610 + }, + { + "epoch": 4.927827648114901, + "grad_norm": 0.8598429560661316, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 68620 + }, + { + "epoch": 4.928545780969479, + "grad_norm": 1.0813355445861816, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 68630 + }, + { + "epoch": 4.929263913824057, + "grad_norm": 0.979053795337677, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 68640 + }, + { + "epoch": 4.929982046678636, + "grad_norm": 0.8194574117660522, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 68650 + }, + { + "epoch": 4.930700179533214, + "grad_norm": 0.8593540787696838, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 68660 + }, + { + "epoch": 4.931418312387792, + "grad_norm": 1.0134016275405884, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 68670 + }, + { + "epoch": 4.93213644524237, + "grad_norm": 1.060586929321289, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 68680 + }, + { + "epoch": 4.932854578096948, + "grad_norm": 0.84132319688797, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 68690 + }, + { + "epoch": 4.933572710951526, + "grad_norm": 1.0767526626586914, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 68700 + }, + { + "epoch": 4.934290843806104, + "grad_norm": 0.8858519792556763, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 68710 + }, + { + "epoch": 4.935008976660682, + "grad_norm": 1.194031000137329, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 68720 + }, + { + "epoch": 4.93572710951526, + "grad_norm": 0.8270226120948792, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68730 + }, + { + "epoch": 4.936445242369839, + "grad_norm": 1.0385973453521729, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 68740 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 0.9062243700027466, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 68750 + }, + { + "epoch": 4.937881508078995, + "grad_norm": 1.0526955127716064, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 68760 + }, + { + "epoch": 4.938599640933573, + "grad_norm": 0.930604100227356, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 68770 + }, + { + "epoch": 4.939317773788151, + "grad_norm": 0.9635265469551086, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 68780 + }, + { + "epoch": 4.940035906642729, + "grad_norm": 0.9825171232223511, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 68790 + }, + { + "epoch": 4.940754039497307, + "grad_norm": 0.9621182680130005, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 68800 + }, + { + "epoch": 4.941472172351885, + "grad_norm": 0.9655307531356812, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 68810 + }, + { + "epoch": 4.942190305206463, + "grad_norm": 1.2948180437088013, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 68820 + }, + { + "epoch": 4.942908438061041, + "grad_norm": 0.9206728339195251, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 68830 + }, + { + "epoch": 4.94362657091562, + "grad_norm": 1.0235631465911865, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 68840 + }, + { + "epoch": 4.944344703770198, + "grad_norm": 1.0542538166046143, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 68850 + }, + { + "epoch": 4.945062836624776, + "grad_norm": 0.9787087440490723, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 68860 + }, + { + "epoch": 4.945780969479354, + "grad_norm": 0.9527219533920288, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 68870 + }, + { + "epoch": 4.946499102333932, + "grad_norm": 1.1525826454162598, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 68880 + }, + { + "epoch": 4.94721723518851, + "grad_norm": 0.8610072731971741, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 68890 + }, + { + "epoch": 4.947935368043088, + "grad_norm": 1.1403616666793823, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 68900 + }, + { + "epoch": 4.948653500897666, + "grad_norm": 1.10334312915802, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 68910 + }, + { + "epoch": 4.949371633752245, + "grad_norm": 0.8633760809898376, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 68920 + }, + { + "epoch": 4.950089766606823, + "grad_norm": 1.1291080713272095, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 68930 + }, + { + "epoch": 4.950807899461401, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 68940 + }, + { + "epoch": 4.951526032315979, + "grad_norm": 0.9207960963249207, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 68950 + }, + { + "epoch": 4.952244165170557, + "grad_norm": 0.9815934300422668, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 68960 + }, + { + "epoch": 4.952962298025135, + "grad_norm": 0.9725701808929443, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 68970 + }, + { + "epoch": 4.953680430879713, + "grad_norm": 0.844926655292511, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 68980 + }, + { + "epoch": 4.954398563734291, + "grad_norm": 0.9898511171340942, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 68990 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 1.1311410665512085, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 69000 + }, + { + "epoch": 4.955834829443447, + "grad_norm": 1.218610405921936, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 69010 + }, + { + "epoch": 4.956552962298025, + "grad_norm": 1.1536420583724976, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 69020 + }, + { + "epoch": 4.957271095152604, + "grad_norm": 1.1857786178588867, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 69030 + }, + { + "epoch": 4.957989228007182, + "grad_norm": 0.9969246983528137, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 69040 + }, + { + "epoch": 4.95870736086176, + "grad_norm": 1.138635277748108, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 69050 + }, + { + "epoch": 4.959425493716338, + "grad_norm": 1.110474705696106, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 69060 + }, + { + "epoch": 4.960143626570916, + "grad_norm": 1.0366318225860596, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 69070 + }, + { + "epoch": 4.960861759425494, + "grad_norm": 0.6927996277809143, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 69080 + }, + { + "epoch": 4.961579892280072, + "grad_norm": 1.0368026494979858, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 69090 + }, + { + "epoch": 4.96229802513465, + "grad_norm": 1.0638312101364136, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 69100 + }, + { + "epoch": 4.9630161579892285, + "grad_norm": 1.0372415781021118, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 69110 + }, + { + "epoch": 4.9637342908438065, + "grad_norm": 0.8257387280464172, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69120 + }, + { + "epoch": 4.9644524236983845, + "grad_norm": 1.0046974420547485, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 69130 + }, + { + "epoch": 4.9651705565529625, + "grad_norm": 1.0139652490615845, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69140 + }, + { + "epoch": 4.9658886894075405, + "grad_norm": 1.0214691162109375, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 69150 + }, + { + "epoch": 4.9666068222621185, + "grad_norm": 1.1042424440383911, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 69160 + }, + { + "epoch": 4.9673249551166965, + "grad_norm": 0.8749067783355713, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 69170 + }, + { + "epoch": 4.9680430879712745, + "grad_norm": 0.9894024133682251, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 69180 + }, + { + "epoch": 4.9687612208258525, + "grad_norm": 1.0218034982681274, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 69190 + }, + { + "epoch": 4.9694793536804305, + "grad_norm": 0.9782929420471191, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 69200 + }, + { + "epoch": 4.9701974865350085, + "grad_norm": 0.9373409748077393, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 69210 + }, + { + "epoch": 4.970915619389587, + "grad_norm": 1.0329546928405762, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 69220 + }, + { + "epoch": 4.971633752244165, + "grad_norm": 0.9746108055114746, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 69230 + }, + { + "epoch": 4.972351885098743, + "grad_norm": 0.9202073216438293, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 69240 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 1.078032374382019, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 69250 + }, + { + "epoch": 4.973788150807899, + "grad_norm": 0.8860024809837341, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 69260 + }, + { + "epoch": 4.974506283662477, + "grad_norm": 0.915212094783783, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 69270 + }, + { + "epoch": 4.975224416517055, + "grad_norm": 1.1192166805267334, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69280 + }, + { + "epoch": 4.975942549371633, + "grad_norm": 0.8387445211410522, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69290 + }, + { + "epoch": 4.976660682226212, + "grad_norm": 1.1210044622421265, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 69300 + }, + { + "epoch": 4.97737881508079, + "grad_norm": 1.0051207542419434, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 69310 + }, + { + "epoch": 4.978096947935368, + "grad_norm": 0.9248682856559753, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 69320 + }, + { + "epoch": 4.978815080789946, + "grad_norm": 0.8265128135681152, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 69330 + }, + { + "epoch": 4.979533213644524, + "grad_norm": 0.9432681798934937, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 69340 + }, + { + "epoch": 4.980251346499102, + "grad_norm": 1.0135977268218994, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 69350 + }, + { + "epoch": 4.98096947935368, + "grad_norm": 0.9857245683670044, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 69360 + }, + { + "epoch": 4.981687612208258, + "grad_norm": 0.9215952157974243, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 69370 + }, + { + "epoch": 4.982405745062836, + "grad_norm": 1.1518077850341797, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 69380 + }, + { + "epoch": 4.983123877917414, + "grad_norm": 0.8836095929145813, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 69390 + }, + { + "epoch": 4.983842010771993, + "grad_norm": 0.8082528710365295, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 69400 + }, + { + "epoch": 4.984560143626571, + "grad_norm": 0.9295604825019836, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 69410 + }, + { + "epoch": 4.985278276481149, + "grad_norm": 1.002057433128357, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 69420 + }, + { + "epoch": 4.985996409335727, + "grad_norm": 0.8127216100692749, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 69430 + }, + { + "epoch": 4.986714542190305, + "grad_norm": 1.058138370513916, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 69440 + }, + { + "epoch": 4.987432675044883, + "grad_norm": 0.8451166749000549, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 69450 + }, + { + "epoch": 4.988150807899461, + "grad_norm": 0.9687268137931824, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 69460 + }, + { + "epoch": 4.988868940754039, + "grad_norm": 1.0342036485671997, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 69470 + }, + { + "epoch": 4.989587073608618, + "grad_norm": 0.9042398929595947, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 69480 + }, + { + "epoch": 4.990305206463196, + "grad_norm": 1.0575438737869263, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 69490 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 0.9364935159683228, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 69500 + }, + { + "epoch": 4.991741472172352, + "grad_norm": 1.0327378511428833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 69510 + }, + { + "epoch": 4.99245960502693, + "grad_norm": 0.815592885017395, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 69520 + }, + { + "epoch": 4.993177737881508, + "grad_norm": 1.0813369750976562, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 69530 + }, + { + "epoch": 4.993895870736086, + "grad_norm": 1.0277023315429688, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 69540 + }, + { + "epoch": 4.994614003590664, + "grad_norm": 1.0291162729263306, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 69550 + }, + { + "epoch": 4.995332136445242, + "grad_norm": 0.8435685634613037, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 69560 + }, + { + "epoch": 4.99605026929982, + "grad_norm": 1.1972291469573975, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 69570 + }, + { + "epoch": 4.996768402154398, + "grad_norm": 0.8114907741546631, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 69580 + }, + { + "epoch": 4.997486535008977, + "grad_norm": 0.8296133875846863, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 69590 + }, + { + "epoch": 4.998204667863555, + "grad_norm": 1.1728706359863281, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 69600 + }, + { + "epoch": 4.998922800718133, + "grad_norm": 0.9586578607559204, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 69610 + }, + { + "epoch": 4.999640933572711, + "grad_norm": 0.9725151062011719, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 69620 + }, + { + "epoch": 5.0, + "eval_loss": 1.133581519126892, + "eval_runtime": 55.2151, + "eval_samples_per_second": 13.275, + "eval_steps_per_second": 1.666, + "step": 69625 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.222089103310848e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-69625/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a047a4dd89b08c50b0e2191d4179ffdb224a3211 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a88f9308ef20b3c4d153d4a957cacde1fa46066edf30eb26c2c605509f4a45e0 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b59deedd1e70b40e2392aa97eff87153c9c9cb0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6628b8a61dca8165ab27b4260143b429e6cc5f9ed4088ad34ca64278ec6c383 +size 55532922 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec1e34fe25b70f7d9f35aa605b196bc5c462bdfc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab978c7ee23cfc662f0d42da7b45d93a548df41020526bfe93c08cc85ee0a39 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c055f80a403296da68796ccb67b060c75cb45c5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e5e91a9ace3d8d8fb92d6af4c7059181b791a614ce519207f20a036473293bc +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b43b8c6fca5067e1a79745a76022a1a19cf04121 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/trainer_state.json @@ -0,0 +1,58566 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 83550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + }, + { + "epoch": 3.000359066427289, + "grad_norm": 0.8630077838897705, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 41780 + }, + { + "epoch": 3.001077199281867, + "grad_norm": 0.8901806473731995, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 41790 + }, + { + "epoch": 3.0017953321364454, + "grad_norm": 0.8291767835617065, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 41800 + }, + { + "epoch": 3.0025134649910235, + "grad_norm": 0.792519211769104, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 3.0032315978456015, + "grad_norm": 1.1330063343048096, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 41820 + }, + { + "epoch": 3.0039497307001795, + "grad_norm": 0.9401350617408752, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 41830 + }, + { + "epoch": 3.0046678635547575, + "grad_norm": 0.8065463304519653, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 41840 + }, + { + "epoch": 3.005385996409336, + "grad_norm": 0.8309979438781738, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 41850 + }, + { + "epoch": 3.006104129263914, + "grad_norm": 0.7432689070701599, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 41860 + }, + { + "epoch": 3.006822262118492, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 41870 + }, + { + "epoch": 3.00754039497307, + "grad_norm": 1.4364255666732788, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 41880 + }, + { + "epoch": 3.008258527827648, + "grad_norm": 0.9023072123527527, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 41890 + }, + { + "epoch": 3.0089766606822264, + "grad_norm": 0.7790587544441223, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 41900 + }, + { + "epoch": 3.0096947935368044, + "grad_norm": 0.9163706302642822, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 41910 + }, + { + "epoch": 3.0104129263913824, + "grad_norm": 0.8147963285446167, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 41920 + }, + { + "epoch": 3.0111310592459604, + "grad_norm": 0.8432748913764954, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 41930 + }, + { + "epoch": 3.011849192100539, + "grad_norm": 0.9216182231903076, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 41940 + }, + { + "epoch": 3.012567324955117, + "grad_norm": 0.62154221534729, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 41950 + }, + { + "epoch": 3.013285457809695, + "grad_norm": 0.8902392387390137, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 41960 + }, + { + "epoch": 3.014003590664273, + "grad_norm": 0.9601083993911743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 41970 + }, + { + "epoch": 3.014721723518851, + "grad_norm": 0.8938809037208557, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 41980 + }, + { + "epoch": 3.0154398563734293, + "grad_norm": 1.0621999502182007, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 41990 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 0.7310585379600525, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 42000 + }, + { + "epoch": 3.0168761220825853, + "grad_norm": 0.8475853800773621, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 42010 + }, + { + "epoch": 3.0175942549371633, + "grad_norm": 0.8509864807128906, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 42020 + }, + { + "epoch": 3.0183123877917413, + "grad_norm": 0.7461876273155212, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 42030 + }, + { + "epoch": 3.0190305206463197, + "grad_norm": 0.7734265327453613, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 42040 + }, + { + "epoch": 3.0197486535008977, + "grad_norm": 0.9056455492973328, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 42050 + }, + { + "epoch": 3.0204667863554757, + "grad_norm": 0.9183889031410217, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 42060 + }, + { + "epoch": 3.0211849192100537, + "grad_norm": 1.0777326822280884, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 42070 + }, + { + "epoch": 3.021903052064632, + "grad_norm": 0.9217308163642883, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 42080 + }, + { + "epoch": 3.02262118491921, + "grad_norm": 0.8220202326774597, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42090 + }, + { + "epoch": 3.023339317773788, + "grad_norm": 0.8454978466033936, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 42100 + }, + { + "epoch": 3.024057450628366, + "grad_norm": 0.8116370439529419, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 42110 + }, + { + "epoch": 3.024775583482944, + "grad_norm": 0.8064935207366943, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 42120 + }, + { + "epoch": 3.0254937163375226, + "grad_norm": 0.9718650579452515, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 42130 + }, + { + "epoch": 3.0262118491921006, + "grad_norm": 0.8817588090896606, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 42140 + }, + { + "epoch": 3.0269299820466786, + "grad_norm": 0.7757318615913391, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 42150 + }, + { + "epoch": 3.0276481149012566, + "grad_norm": 0.7500545382499695, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 42160 + }, + { + "epoch": 3.0283662477558346, + "grad_norm": 0.72913658618927, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 42170 + }, + { + "epoch": 3.029084380610413, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 42180 + }, + { + "epoch": 3.029802513464991, + "grad_norm": 0.7682021856307983, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 42190 + }, + { + "epoch": 3.030520646319569, + "grad_norm": 0.8145958781242371, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 42200 + }, + { + "epoch": 3.031238779174147, + "grad_norm": 1.0546396970748901, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 42210 + }, + { + "epoch": 3.0319569120287255, + "grad_norm": 0.8222804665565491, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 42220 + }, + { + "epoch": 3.0326750448833035, + "grad_norm": 0.8245829343795776, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 42230 + }, + { + "epoch": 3.0333931777378815, + "grad_norm": 0.9059963822364807, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 42240 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 1.026747465133667, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 42250 + }, + { + "epoch": 3.0348294434470375, + "grad_norm": 0.9108404517173767, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42260 + }, + { + "epoch": 3.035547576301616, + "grad_norm": 0.9828516840934753, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 42270 + }, + { + "epoch": 3.036265709156194, + "grad_norm": 0.9664266705513, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 42280 + }, + { + "epoch": 3.036983842010772, + "grad_norm": 0.7577654719352722, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42290 + }, + { + "epoch": 3.03770197486535, + "grad_norm": 0.8331853151321411, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 42300 + }, + { + "epoch": 3.038420107719928, + "grad_norm": 0.8017228245735168, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 42310 + }, + { + "epoch": 3.0391382405745064, + "grad_norm": 1.0316718816757202, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 42320 + }, + { + "epoch": 3.0398563734290844, + "grad_norm": 0.9379803538322449, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 42330 + }, + { + "epoch": 3.0405745062836624, + "grad_norm": 0.7554476857185364, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 42340 + }, + { + "epoch": 3.0412926391382404, + "grad_norm": 0.7377917766571045, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 42350 + }, + { + "epoch": 3.042010771992819, + "grad_norm": 1.0655276775360107, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 42360 + }, + { + "epoch": 3.042728904847397, + "grad_norm": 0.7748511433601379, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 42370 + }, + { + "epoch": 3.043447037701975, + "grad_norm": 0.848649799823761, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 42380 + }, + { + "epoch": 3.044165170556553, + "grad_norm": 0.7754636406898499, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 42390 + }, + { + "epoch": 3.044883303411131, + "grad_norm": 0.8173656463623047, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 42400 + }, + { + "epoch": 3.0456014362657093, + "grad_norm": 0.7881983518600464, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 42410 + }, + { + "epoch": 3.0463195691202873, + "grad_norm": 0.971072256565094, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 42420 + }, + { + "epoch": 3.0470377019748653, + "grad_norm": 0.8400143384933472, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 42430 + }, + { + "epoch": 3.0477558348294433, + "grad_norm": 1.0028647184371948, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 42440 + }, + { + "epoch": 3.0484739676840213, + "grad_norm": 0.9728034734725952, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 42450 + }, + { + "epoch": 3.0491921005386, + "grad_norm": 0.937633752822876, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 42460 + }, + { + "epoch": 3.049910233393178, + "grad_norm": 1.0265642404556274, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 42470 + }, + { + "epoch": 3.050628366247756, + "grad_norm": 0.9733216762542725, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 42480 + }, + { + "epoch": 3.051346499102334, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 42490 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 0.7515231370925903, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 42500 + }, + { + "epoch": 3.0527827648114902, + "grad_norm": 0.9115300178527832, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 42510 + }, + { + "epoch": 3.0535008976660682, + "grad_norm": 0.7403655648231506, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 42520 + }, + { + "epoch": 3.0542190305206462, + "grad_norm": 0.7826810479164124, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 42530 + }, + { + "epoch": 3.0549371633752243, + "grad_norm": 0.8007349371910095, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 42540 + }, + { + "epoch": 3.0556552962298027, + "grad_norm": 0.7975959777832031, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 42550 + }, + { + "epoch": 3.0563734290843807, + "grad_norm": 0.9665228128433228, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42560 + }, + { + "epoch": 3.0570915619389587, + "grad_norm": 0.8386123180389404, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 42570 + }, + { + "epoch": 3.0578096947935367, + "grad_norm": 0.7437782287597656, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 42580 + }, + { + "epoch": 3.0585278276481147, + "grad_norm": 0.8360698223114014, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 42590 + }, + { + "epoch": 3.059245960502693, + "grad_norm": 0.8982073664665222, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42600 + }, + { + "epoch": 3.059964093357271, + "grad_norm": 0.9425758719444275, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 42610 + }, + { + "epoch": 3.060682226211849, + "grad_norm": 0.8567131161689758, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42620 + }, + { + "epoch": 3.061400359066427, + "grad_norm": 0.9322942495346069, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 42630 + }, + { + "epoch": 3.0621184919210056, + "grad_norm": 0.8283235430717468, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 42640 + }, + { + "epoch": 3.0628366247755836, + "grad_norm": 0.8457967638969421, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 42650 + }, + { + "epoch": 3.0635547576301616, + "grad_norm": 0.8205100893974304, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42660 + }, + { + "epoch": 3.0642728904847396, + "grad_norm": 0.8385181427001953, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 42670 + }, + { + "epoch": 3.0649910233393176, + "grad_norm": 1.2959390878677368, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 42680 + }, + { + "epoch": 3.065709156193896, + "grad_norm": 0.7150540351867676, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 42690 + }, + { + "epoch": 3.066427289048474, + "grad_norm": 0.6647360920906067, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 42700 + }, + { + "epoch": 3.067145421903052, + "grad_norm": 0.9148316979408264, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 42710 + }, + { + "epoch": 3.06786355475763, + "grad_norm": 0.8606209754943848, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 42720 + }, + { + "epoch": 3.068581687612208, + "grad_norm": 1.4255632162094116, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42730 + }, + { + "epoch": 3.0692998204667865, + "grad_norm": 0.9131710529327393, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 42740 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 0.9560360908508301, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 42750 + }, + { + "epoch": 3.0707360861759425, + "grad_norm": 0.9278100728988647, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42760 + }, + { + "epoch": 3.0714542190305205, + "grad_norm": 0.7258471846580505, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 42770 + }, + { + "epoch": 3.072172351885099, + "grad_norm": 1.1537690162658691, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 42780 + }, + { + "epoch": 3.072890484739677, + "grad_norm": 0.8562588691711426, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 42790 + }, + { + "epoch": 3.073608617594255, + "grad_norm": 1.0271626710891724, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 42800 + }, + { + "epoch": 3.074326750448833, + "grad_norm": 0.85148024559021, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 42810 + }, + { + "epoch": 3.075044883303411, + "grad_norm": 0.805772602558136, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 42820 + }, + { + "epoch": 3.0757630161579894, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 42830 + }, + { + "epoch": 3.0764811490125674, + "grad_norm": 0.7997274994850159, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 42840 + }, + { + "epoch": 3.0771992818671454, + "grad_norm": 0.8739321231842041, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 42850 + }, + { + "epoch": 3.0779174147217234, + "grad_norm": 0.833951473236084, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 42860 + }, + { + "epoch": 3.0786355475763014, + "grad_norm": 0.8813839554786682, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 42870 + }, + { + "epoch": 3.07935368043088, + "grad_norm": 0.9020521640777588, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 42880 + }, + { + "epoch": 3.080071813285458, + "grad_norm": 0.888148844242096, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 42890 + }, + { + "epoch": 3.080789946140036, + "grad_norm": 0.8110589385032654, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 42900 + }, + { + "epoch": 3.081508078994614, + "grad_norm": 0.818738579750061, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 42910 + }, + { + "epoch": 3.082226211849192, + "grad_norm": 0.9607479572296143, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 42920 + }, + { + "epoch": 3.0829443447037703, + "grad_norm": 0.8162698745727539, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 42930 + }, + { + "epoch": 3.0836624775583483, + "grad_norm": 0.8170801997184753, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 42940 + }, + { + "epoch": 3.0843806104129263, + "grad_norm": 0.9250763654708862, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 42950 + }, + { + "epoch": 3.0850987432675043, + "grad_norm": 0.898097813129425, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 42960 + }, + { + "epoch": 3.0858168761220828, + "grad_norm": 0.9398433566093445, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 42970 + }, + { + "epoch": 3.0865350089766608, + "grad_norm": 1.052808165550232, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 42980 + }, + { + "epoch": 3.087253141831239, + "grad_norm": 0.8974723219871521, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 42990 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 0.7517408728599548, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 43000 + }, + { + "epoch": 3.088689407540395, + "grad_norm": 0.8054485321044922, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 43010 + }, + { + "epoch": 3.0894075403949732, + "grad_norm": 0.9896154999732971, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 43020 + }, + { + "epoch": 3.0901256732495512, + "grad_norm": 0.7887356281280518, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 43030 + }, + { + "epoch": 3.0908438061041292, + "grad_norm": 1.0119125843048096, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 43040 + }, + { + "epoch": 3.0915619389587072, + "grad_norm": 0.8753892779350281, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 43050 + }, + { + "epoch": 3.0922800718132857, + "grad_norm": 0.8322654962539673, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43060 + }, + { + "epoch": 3.0929982046678637, + "grad_norm": 1.0605992078781128, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 43070 + }, + { + "epoch": 3.0937163375224417, + "grad_norm": 0.8783912062644958, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 43080 + }, + { + "epoch": 3.0944344703770197, + "grad_norm": 0.8839107751846313, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 43090 + }, + { + "epoch": 3.0951526032315977, + "grad_norm": 1.1655086278915405, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 43100 + }, + { + "epoch": 3.095870736086176, + "grad_norm": 0.7051523327827454, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 43110 + }, + { + "epoch": 3.096588868940754, + "grad_norm": 0.7793807983398438, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43120 + }, + { + "epoch": 3.097307001795332, + "grad_norm": 0.8352194428443909, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 43130 + }, + { + "epoch": 3.09802513464991, + "grad_norm": 0.9684847593307495, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 43140 + }, + { + "epoch": 3.098743267504488, + "grad_norm": 1.1106340885162354, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 43150 + }, + { + "epoch": 3.0994614003590666, + "grad_norm": 0.7814911603927612, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 43160 + }, + { + "epoch": 3.1001795332136446, + "grad_norm": 0.7923110723495483, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 43170 + }, + { + "epoch": 3.1008976660682226, + "grad_norm": 0.87022864818573, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 43180 + }, + { + "epoch": 3.1016157989228006, + "grad_norm": 0.9352855682373047, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 43190 + }, + { + "epoch": 3.1023339317773786, + "grad_norm": 0.8548445105552673, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 43200 + }, + { + "epoch": 3.103052064631957, + "grad_norm": 0.9576025009155273, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 43210 + }, + { + "epoch": 3.103770197486535, + "grad_norm": 0.7430430054664612, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 43220 + }, + { + "epoch": 3.104488330341113, + "grad_norm": 0.9619144797325134, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 43230 + }, + { + "epoch": 3.105206463195691, + "grad_norm": 0.8622338771820068, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 43240 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 0.853489339351654, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43250 + }, + { + "epoch": 3.1066427289048475, + "grad_norm": 0.9253206849098206, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 43260 + }, + { + "epoch": 3.1073608617594255, + "grad_norm": 0.9700671434402466, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 43270 + }, + { + "epoch": 3.1080789946140035, + "grad_norm": 1.0550731420516968, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 43280 + }, + { + "epoch": 3.1087971274685815, + "grad_norm": 0.939452052116394, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 43290 + }, + { + "epoch": 3.10951526032316, + "grad_norm": 0.8855276107788086, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 43300 + }, + { + "epoch": 3.110233393177738, + "grad_norm": 0.92197185754776, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 43310 + }, + { + "epoch": 3.110951526032316, + "grad_norm": 0.8825578689575195, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 43320 + }, + { + "epoch": 3.111669658886894, + "grad_norm": 0.9964608550071716, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 43330 + }, + { + "epoch": 3.1123877917414724, + "grad_norm": 0.9070520401000977, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 43340 + }, + { + "epoch": 3.1131059245960504, + "grad_norm": 0.9699633717536926, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 43350 + }, + { + "epoch": 3.1138240574506284, + "grad_norm": 0.7384091019630432, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 43360 + }, + { + "epoch": 3.1145421903052064, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 43370 + }, + { + "epoch": 3.1152603231597844, + "grad_norm": 0.8906524181365967, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 43380 + }, + { + "epoch": 3.115978456014363, + "grad_norm": 0.8850129246711731, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 43390 + }, + { + "epoch": 3.116696588868941, + "grad_norm": 0.7091860771179199, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 43400 + }, + { + "epoch": 3.117414721723519, + "grad_norm": 0.8992764949798584, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 43410 + }, + { + "epoch": 3.118132854578097, + "grad_norm": 0.9166698455810547, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43420 + }, + { + "epoch": 3.118850987432675, + "grad_norm": 1.1195749044418335, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 43430 + }, + { + "epoch": 3.1195691202872533, + "grad_norm": 0.9414069652557373, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 43440 + }, + { + "epoch": 3.1202872531418313, + "grad_norm": 0.7641217112541199, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 43450 + }, + { + "epoch": 3.1210053859964093, + "grad_norm": 1.2659285068511963, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 43460 + }, + { + "epoch": 3.1217235188509873, + "grad_norm": 0.9968213438987732, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 43470 + }, + { + "epoch": 3.1224416517055653, + "grad_norm": 0.8819042444229126, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 43480 + }, + { + "epoch": 3.1231597845601438, + "grad_norm": 0.9124775528907776, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 43490 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 0.868354082107544, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 3.1245960502692998, + "grad_norm": 0.7367526292800903, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 43510 + }, + { + "epoch": 3.1253141831238778, + "grad_norm": 0.7553679943084717, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43520 + }, + { + "epoch": 3.126032315978456, + "grad_norm": 0.7970008850097656, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 43530 + }, + { + "epoch": 3.126750448833034, + "grad_norm": 0.9117488861083984, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 43540 + }, + { + "epoch": 3.127468581687612, + "grad_norm": 0.8004103899002075, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 43550 + }, + { + "epoch": 3.12818671454219, + "grad_norm": 0.736518919467926, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 43560 + }, + { + "epoch": 3.128904847396768, + "grad_norm": 0.8568395376205444, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 43570 + }, + { + "epoch": 3.1296229802513467, + "grad_norm": 0.9344052672386169, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 43580 + }, + { + "epoch": 3.1303411131059247, + "grad_norm": 0.7986525297164917, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 43590 + }, + { + "epoch": 3.1310592459605027, + "grad_norm": 0.8283242583274841, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 43600 + }, + { + "epoch": 3.1317773788150807, + "grad_norm": 0.6534292101860046, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 43610 + }, + { + "epoch": 3.132495511669659, + "grad_norm": 0.9585428833961487, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 43620 + }, + { + "epoch": 3.133213644524237, + "grad_norm": 0.8299157023429871, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 43630 + }, + { + "epoch": 3.133931777378815, + "grad_norm": 0.9050052762031555, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 43640 + }, + { + "epoch": 3.134649910233393, + "grad_norm": 1.0457062721252441, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 43650 + }, + { + "epoch": 3.135368043087971, + "grad_norm": 0.907691240310669, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 43660 + }, + { + "epoch": 3.1360861759425496, + "grad_norm": 0.8868935108184814, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 43670 + }, + { + "epoch": 3.1368043087971276, + "grad_norm": 0.8585456609725952, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 43680 + }, + { + "epoch": 3.1375224416517056, + "grad_norm": 1.0402741432189941, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 43690 + }, + { + "epoch": 3.1382405745062836, + "grad_norm": 1.0866798162460327, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 43700 + }, + { + "epoch": 3.1389587073608616, + "grad_norm": 0.7637296915054321, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 43710 + }, + { + "epoch": 3.13967684021544, + "grad_norm": 0.755235493183136, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 43720 + }, + { + "epoch": 3.140394973070018, + "grad_norm": 0.7258853316307068, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 43730 + }, + { + "epoch": 3.141113105924596, + "grad_norm": 1.0425268411636353, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 43740 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 43750 + }, + { + "epoch": 3.142549371633752, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 43760 + }, + { + "epoch": 3.1432675044883305, + "grad_norm": 0.9879246354103088, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 43770 + }, + { + "epoch": 3.1439856373429085, + "grad_norm": 0.7853389382362366, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 43780 + }, + { + "epoch": 3.1447037701974865, + "grad_norm": 1.0245232582092285, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 43790 + }, + { + "epoch": 3.1454219030520645, + "grad_norm": 0.8486390113830566, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 43800 + }, + { + "epoch": 3.146140035906643, + "grad_norm": 0.8536406755447388, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 43810 + }, + { + "epoch": 3.146858168761221, + "grad_norm": 0.9653734564781189, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 43820 + }, + { + "epoch": 3.147576301615799, + "grad_norm": 0.8292608857154846, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 43830 + }, + { + "epoch": 3.148294434470377, + "grad_norm": 1.147524118423462, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 43840 + }, + { + "epoch": 3.149012567324955, + "grad_norm": 0.9317546486854553, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 43850 + }, + { + "epoch": 3.1497307001795334, + "grad_norm": 0.8651045560836792, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 43860 + }, + { + "epoch": 3.1504488330341114, + "grad_norm": 0.8718969225883484, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 43870 + }, + { + "epoch": 3.1511669658886894, + "grad_norm": 1.0140702724456787, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 43880 + }, + { + "epoch": 3.1518850987432674, + "grad_norm": 0.75941401720047, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43890 + }, + { + "epoch": 3.152603231597846, + "grad_norm": 0.6618940234184265, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 43900 + }, + { + "epoch": 3.153321364452424, + "grad_norm": 1.0013338327407837, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 43910 + }, + { + "epoch": 3.154039497307002, + "grad_norm": 0.8735299706459045, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 43920 + }, + { + "epoch": 3.15475763016158, + "grad_norm": 1.141914963722229, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 43930 + }, + { + "epoch": 3.155475763016158, + "grad_norm": 1.0916038751602173, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 43940 + }, + { + "epoch": 3.1561938958707363, + "grad_norm": 0.7042547464370728, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 43950 + }, + { + "epoch": 3.1569120287253143, + "grad_norm": 0.9885236620903015, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 43960 + }, + { + "epoch": 3.1576301615798923, + "grad_norm": 0.8083009719848633, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 43970 + }, + { + "epoch": 3.1583482944344703, + "grad_norm": 1.082627296447754, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 43980 + }, + { + "epoch": 3.1590664272890483, + "grad_norm": 0.9293290376663208, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 43990 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 0.861003041267395, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 44000 + }, + { + "epoch": 3.1605026929982047, + "grad_norm": 0.9565994143486023, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 44010 + }, + { + "epoch": 3.1612208258527827, + "grad_norm": 0.9609305262565613, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 44020 + }, + { + "epoch": 3.1619389587073607, + "grad_norm": 0.847830593585968, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 44030 + }, + { + "epoch": 3.1626570915619387, + "grad_norm": 0.852357804775238, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 44040 + }, + { + "epoch": 3.163375224416517, + "grad_norm": 0.8634562492370605, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44050 + }, + { + "epoch": 3.164093357271095, + "grad_norm": 1.0259950160980225, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 44060 + }, + { + "epoch": 3.164811490125673, + "grad_norm": 0.9615250825881958, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 44070 + }, + { + "epoch": 3.165529622980251, + "grad_norm": 0.9892165660858154, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 44080 + }, + { + "epoch": 3.1662477558348296, + "grad_norm": 0.8827354907989502, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 44090 + }, + { + "epoch": 3.1669658886894076, + "grad_norm": 0.9258168339729309, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 44100 + }, + { + "epoch": 3.1676840215439857, + "grad_norm": 0.7983399033546448, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 44110 + }, + { + "epoch": 3.1684021543985637, + "grad_norm": 0.9917809963226318, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 44120 + }, + { + "epoch": 3.1691202872531417, + "grad_norm": 1.058927297592163, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44130 + }, + { + "epoch": 3.16983842010772, + "grad_norm": 1.0095895528793335, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44140 + }, + { + "epoch": 3.170556552962298, + "grad_norm": 0.9032495617866516, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 44150 + }, + { + "epoch": 3.171274685816876, + "grad_norm": 0.9391272664070129, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 44160 + }, + { + "epoch": 3.171992818671454, + "grad_norm": 0.990755558013916, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44170 + }, + { + "epoch": 3.172710951526032, + "grad_norm": 0.9310759902000427, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 44180 + }, + { + "epoch": 3.1734290843806106, + "grad_norm": 0.7698856592178345, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 44190 + }, + { + "epoch": 3.1741472172351886, + "grad_norm": 0.7735867500305176, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 44200 + }, + { + "epoch": 3.1748653500897666, + "grad_norm": 1.1447525024414062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 44210 + }, + { + "epoch": 3.1755834829443446, + "grad_norm": 0.8667060136795044, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 44220 + }, + { + "epoch": 3.176301615798923, + "grad_norm": 0.8596829771995544, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 44230 + }, + { + "epoch": 3.177019748653501, + "grad_norm": 0.8607654571533203, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 44240 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 0.9346948266029358, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 44250 + }, + { + "epoch": 3.178456014362657, + "grad_norm": 0.852344810962677, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 44260 + }, + { + "epoch": 3.179174147217235, + "grad_norm": 0.9260450005531311, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 44270 + }, + { + "epoch": 3.1798922800718135, + "grad_norm": 0.924053430557251, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 44280 + }, + { + "epoch": 3.1806104129263915, + "grad_norm": 1.001965045928955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 44290 + }, + { + "epoch": 3.1813285457809695, + "grad_norm": 0.943215012550354, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44300 + }, + { + "epoch": 3.1820466786355475, + "grad_norm": 1.006977915763855, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 44310 + }, + { + "epoch": 3.1827648114901255, + "grad_norm": 0.9768950343132019, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 44320 + }, + { + "epoch": 3.183482944344704, + "grad_norm": 0.9297489523887634, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 44330 + }, + { + "epoch": 3.184201077199282, + "grad_norm": 0.9110919237136841, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 44340 + }, + { + "epoch": 3.18491921005386, + "grad_norm": 0.9821381568908691, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 44350 + }, + { + "epoch": 3.185637342908438, + "grad_norm": 0.8451243042945862, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 44360 + }, + { + "epoch": 3.1863554757630164, + "grad_norm": 0.9676638245582581, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 44370 + }, + { + "epoch": 3.1870736086175944, + "grad_norm": 0.9826035499572754, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 44380 + }, + { + "epoch": 3.1877917414721724, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 44390 + }, + { + "epoch": 3.1885098743267504, + "grad_norm": 0.7766330242156982, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 44400 + }, + { + "epoch": 3.1892280071813284, + "grad_norm": 0.9302349090576172, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 44410 + }, + { + "epoch": 3.189946140035907, + "grad_norm": 0.8335331082344055, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 44420 + }, + { + "epoch": 3.190664272890485, + "grad_norm": 0.6722736358642578, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 44430 + }, + { + "epoch": 3.191382405745063, + "grad_norm": 0.9047536849975586, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 44440 + }, + { + "epoch": 3.192100538599641, + "grad_norm": 0.9653822183609009, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 44450 + }, + { + "epoch": 3.192818671454219, + "grad_norm": 0.7750703692436218, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 44460 + }, + { + "epoch": 3.1935368043087973, + "grad_norm": 0.7767539024353027, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 44470 + }, + { + "epoch": 3.1942549371633753, + "grad_norm": 0.8597778081893921, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44480 + }, + { + "epoch": 3.1949730700179533, + "grad_norm": 1.1711493730545044, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 44490 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 0.9025220274925232, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 44500 + }, + { + "epoch": 3.1964093357271093, + "grad_norm": 0.8084979057312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44510 + }, + { + "epoch": 3.1971274685816877, + "grad_norm": 0.8475074172019958, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44520 + }, + { + "epoch": 3.1978456014362657, + "grad_norm": 0.9915644526481628, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 44530 + }, + { + "epoch": 3.1985637342908437, + "grad_norm": 0.992231547832489, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 44540 + }, + { + "epoch": 3.1992818671454217, + "grad_norm": 0.9804556369781494, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 44550 + }, + { + "epoch": 3.2, + "grad_norm": 1.045558214187622, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 44560 + }, + { + "epoch": 3.200718132854578, + "grad_norm": 1.0880261659622192, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 44570 + }, + { + "epoch": 3.201436265709156, + "grad_norm": 0.9511138200759888, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44580 + }, + { + "epoch": 3.202154398563734, + "grad_norm": 0.9115344882011414, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 44590 + }, + { + "epoch": 3.202872531418312, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 44600 + }, + { + "epoch": 3.2035906642728906, + "grad_norm": 0.8209697604179382, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44610 + }, + { + "epoch": 3.2043087971274686, + "grad_norm": 0.9220197796821594, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44620 + }, + { + "epoch": 3.2050269299820466, + "grad_norm": 0.8859700560569763, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 44630 + }, + { + "epoch": 3.2057450628366246, + "grad_norm": 0.9772757291793823, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 44640 + }, + { + "epoch": 3.206463195691203, + "grad_norm": 0.9385574460029602, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 44650 + }, + { + "epoch": 3.207181328545781, + "grad_norm": 0.839958906173706, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 44660 + }, + { + "epoch": 3.207899461400359, + "grad_norm": 0.860478401184082, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 44670 + }, + { + "epoch": 3.208617594254937, + "grad_norm": 0.846886396408081, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 44680 + }, + { + "epoch": 3.209335727109515, + "grad_norm": 0.8591006398200989, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 44690 + }, + { + "epoch": 3.2100538599640935, + "grad_norm": 0.9236023426055908, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 44700 + }, + { + "epoch": 3.2107719928186715, + "grad_norm": 0.7348999977111816, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44710 + }, + { + "epoch": 3.2114901256732495, + "grad_norm": 1.0041730403900146, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 44720 + }, + { + "epoch": 3.2122082585278275, + "grad_norm": 0.8382687568664551, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 44730 + }, + { + "epoch": 3.2129263913824055, + "grad_norm": 0.8253511190414429, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 44740 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 0.9589242935180664, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 44750 + }, + { + "epoch": 3.214362657091562, + "grad_norm": 0.8938157558441162, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 44760 + }, + { + "epoch": 3.21508078994614, + "grad_norm": 1.0085135698318481, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 44770 + }, + { + "epoch": 3.215798922800718, + "grad_norm": 0.8647134304046631, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 44780 + }, + { + "epoch": 3.216517055655296, + "grad_norm": 1.09453284740448, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 44790 + }, + { + "epoch": 3.2172351885098744, + "grad_norm": 0.8710666298866272, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 44800 + }, + { + "epoch": 3.2179533213644524, + "grad_norm": 0.8080880641937256, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 44810 + }, + { + "epoch": 3.2186714542190304, + "grad_norm": 1.0440675020217896, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 44820 + }, + { + "epoch": 3.2193895870736084, + "grad_norm": 1.1036376953125, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 44830 + }, + { + "epoch": 3.220107719928187, + "grad_norm": 0.8783546686172485, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44840 + }, + { + "epoch": 3.220825852782765, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 44850 + }, + { + "epoch": 3.221543985637343, + "grad_norm": 1.0099157094955444, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 44860 + }, + { + "epoch": 3.222262118491921, + "grad_norm": 1.054928183555603, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 44870 + }, + { + "epoch": 3.222980251346499, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 44880 + }, + { + "epoch": 3.2236983842010773, + "grad_norm": 0.9730798602104187, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 44890 + }, + { + "epoch": 3.2244165170556554, + "grad_norm": 0.7911382913589478, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 44900 + }, + { + "epoch": 3.2251346499102334, + "grad_norm": 0.9574400782585144, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 44910 + }, + { + "epoch": 3.2258527827648114, + "grad_norm": 0.8101068139076233, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 44920 + }, + { + "epoch": 3.22657091561939, + "grad_norm": 0.754146933555603, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 44930 + }, + { + "epoch": 3.227289048473968, + "grad_norm": 0.7471939921379089, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 44940 + }, + { + "epoch": 3.228007181328546, + "grad_norm": 1.0040855407714844, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 44950 + }, + { + "epoch": 3.228725314183124, + "grad_norm": 1.0016074180603027, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 44960 + }, + { + "epoch": 3.229443447037702, + "grad_norm": 1.0432976484298706, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 44970 + }, + { + "epoch": 3.2301615798922803, + "grad_norm": 0.8517055511474609, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 44980 + }, + { + "epoch": 3.2308797127468583, + "grad_norm": 0.9174178242683411, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 44990 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 0.9733774065971375, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 45000 + }, + { + "epoch": 3.2323159784560143, + "grad_norm": 0.9074714779853821, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 45010 + }, + { + "epoch": 3.2330341113105923, + "grad_norm": 0.8802759051322937, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 45020 + }, + { + "epoch": 3.2337522441651707, + "grad_norm": 1.0620871782302856, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 45030 + }, + { + "epoch": 3.2344703770197487, + "grad_norm": 0.8069542050361633, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 45040 + }, + { + "epoch": 3.2351885098743267, + "grad_norm": 0.9139137864112854, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 45050 + }, + { + "epoch": 3.2359066427289047, + "grad_norm": 0.8936411142349243, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 45060 + }, + { + "epoch": 3.2366247755834827, + "grad_norm": 0.9098079204559326, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 45070 + }, + { + "epoch": 3.237342908438061, + "grad_norm": 1.062953233718872, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45080 + }, + { + "epoch": 3.238061041292639, + "grad_norm": 0.8656470775604248, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 45090 + }, + { + "epoch": 3.238779174147217, + "grad_norm": 0.9299449920654297, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 45100 + }, + { + "epoch": 3.239497307001795, + "grad_norm": 1.0102022886276245, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 45110 + }, + { + "epoch": 3.2402154398563736, + "grad_norm": 0.8074561953544617, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 45120 + }, + { + "epoch": 3.2409335727109516, + "grad_norm": 1.044105887413025, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 45130 + }, + { + "epoch": 3.2416517055655296, + "grad_norm": 0.8742762207984924, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 45140 + }, + { + "epoch": 3.2423698384201076, + "grad_norm": 0.8240015506744385, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 45150 + }, + { + "epoch": 3.2430879712746856, + "grad_norm": 0.8438951373100281, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 45160 + }, + { + "epoch": 3.243806104129264, + "grad_norm": 1.02358877658844, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 45170 + }, + { + "epoch": 3.244524236983842, + "grad_norm": 0.8824774026870728, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 45180 + }, + { + "epoch": 3.24524236983842, + "grad_norm": 0.971015989780426, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 45190 + }, + { + "epoch": 3.245960502692998, + "grad_norm": 0.9282383918762207, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 45200 + }, + { + "epoch": 3.2466786355475765, + "grad_norm": 0.7908362746238708, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 45210 + }, + { + "epoch": 3.2473967684021545, + "grad_norm": 1.0721662044525146, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 45220 + }, + { + "epoch": 3.2481149012567325, + "grad_norm": 0.9516810774803162, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 45230 + }, + { + "epoch": 3.2488330341113105, + "grad_norm": 0.7914131283760071, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 45240 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 0.8492292761802673, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 45250 + }, + { + "epoch": 3.250269299820467, + "grad_norm": 0.8880114555358887, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 45260 + }, + { + "epoch": 3.250987432675045, + "grad_norm": 0.7808310985565186, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 45270 + }, + { + "epoch": 3.251705565529623, + "grad_norm": 0.8566828966140747, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 45280 + }, + { + "epoch": 3.252423698384201, + "grad_norm": 0.7929658889770508, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45290 + }, + { + "epoch": 3.253141831238779, + "grad_norm": 0.678207516670227, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 45300 + }, + { + "epoch": 3.2538599640933574, + "grad_norm": 0.9963029623031616, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45310 + }, + { + "epoch": 3.2545780969479354, + "grad_norm": 0.835304856300354, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 45320 + }, + { + "epoch": 3.2552962298025134, + "grad_norm": 0.7281617522239685, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 45330 + }, + { + "epoch": 3.2560143626570914, + "grad_norm": 1.244890570640564, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 45340 + }, + { + "epoch": 3.2567324955116694, + "grad_norm": 0.8372750282287598, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 45350 + }, + { + "epoch": 3.257450628366248, + "grad_norm": 1.0029667615890503, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 45360 + }, + { + "epoch": 3.258168761220826, + "grad_norm": 0.8561908602714539, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 45370 + }, + { + "epoch": 3.258886894075404, + "grad_norm": 1.0058085918426514, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 45380 + }, + { + "epoch": 3.259605026929982, + "grad_norm": 0.7768221497535706, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 45390 + }, + { + "epoch": 3.2603231597845603, + "grad_norm": 0.8443793058395386, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 45400 + }, + { + "epoch": 3.2610412926391383, + "grad_norm": 1.0140392780303955, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 45410 + }, + { + "epoch": 3.2617594254937163, + "grad_norm": 0.8397058248519897, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 45420 + }, + { + "epoch": 3.2624775583482943, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 45430 + }, + { + "epoch": 3.2631956912028723, + "grad_norm": 1.0279473066329956, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 45440 + }, + { + "epoch": 3.263913824057451, + "grad_norm": 1.207457184791565, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 45450 + }, + { + "epoch": 3.264631956912029, + "grad_norm": 0.8121998906135559, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 45460 + }, + { + "epoch": 3.265350089766607, + "grad_norm": 1.037733554840088, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 45470 + }, + { + "epoch": 3.266068222621185, + "grad_norm": 0.9305754899978638, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 45480 + }, + { + "epoch": 3.2667863554757632, + "grad_norm": 0.9733602404594421, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 45500 + }, + { + "epoch": 3.2682226211849192, + "grad_norm": 0.8601692318916321, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45510 + }, + { + "epoch": 3.2689407540394972, + "grad_norm": 0.7921277284622192, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 45520 + }, + { + "epoch": 3.2696588868940752, + "grad_norm": 0.8324153423309326, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 45530 + }, + { + "epoch": 3.2703770197486537, + "grad_norm": 0.85141521692276, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 45540 + }, + { + "epoch": 3.2710951526032317, + "grad_norm": 0.9399608373641968, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 45550 + }, + { + "epoch": 3.2718132854578097, + "grad_norm": 0.9829166531562805, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 45560 + }, + { + "epoch": 3.2725314183123877, + "grad_norm": 0.9936266541481018, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 45570 + }, + { + "epoch": 3.2732495511669657, + "grad_norm": 1.036165714263916, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 45580 + }, + { + "epoch": 3.273967684021544, + "grad_norm": 0.8988680243492126, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45590 + }, + { + "epoch": 3.274685816876122, + "grad_norm": 0.9173405766487122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 45600 + }, + { + "epoch": 3.2754039497307, + "grad_norm": 0.9967324733734131, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 45610 + }, + { + "epoch": 3.276122082585278, + "grad_norm": 0.9097777009010315, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 45620 + }, + { + "epoch": 3.276840215439856, + "grad_norm": 1.0559430122375488, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 45630 + }, + { + "epoch": 3.2775583482944346, + "grad_norm": 0.9583360552787781, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 45640 + }, + { + "epoch": 3.2782764811490126, + "grad_norm": 0.7630334496498108, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 45650 + }, + { + "epoch": 3.2789946140035906, + "grad_norm": 0.9955230355262756, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 45660 + }, + { + "epoch": 3.2797127468581686, + "grad_norm": 0.8685793876647949, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45670 + }, + { + "epoch": 3.280430879712747, + "grad_norm": 0.919913113117218, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 45680 + }, + { + "epoch": 3.281149012567325, + "grad_norm": 0.826144814491272, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 45690 + }, + { + "epoch": 3.281867145421903, + "grad_norm": 0.9750179052352905, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 45700 + }, + { + "epoch": 3.282585278276481, + "grad_norm": 0.7931897640228271, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 45710 + }, + { + "epoch": 3.283303411131059, + "grad_norm": 1.0380089282989502, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 45720 + }, + { + "epoch": 3.2840215439856375, + "grad_norm": 0.8220566511154175, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 45730 + }, + { + "epoch": 3.2847396768402155, + "grad_norm": 0.9688239693641663, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 45740 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 0.8760311603546143, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 45750 + }, + { + "epoch": 3.2861759425493715, + "grad_norm": 0.8103382587432861, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 45760 + }, + { + "epoch": 3.28689407540395, + "grad_norm": 0.8835865259170532, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 45770 + }, + { + "epoch": 3.287612208258528, + "grad_norm": 0.9021160006523132, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45780 + }, + { + "epoch": 3.288330341113106, + "grad_norm": 0.8182386159896851, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 45790 + }, + { + "epoch": 3.289048473967684, + "grad_norm": 0.8555024862289429, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45800 + }, + { + "epoch": 3.289766606822262, + "grad_norm": 1.0982348918914795, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 45810 + }, + { + "epoch": 3.2904847396768404, + "grad_norm": 1.06246817111969, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 45820 + }, + { + "epoch": 3.2912028725314184, + "grad_norm": 1.1727149486541748, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 45830 + }, + { + "epoch": 3.2919210053859964, + "grad_norm": 0.8224700093269348, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 45840 + }, + { + "epoch": 3.2926391382405744, + "grad_norm": 0.8195698261260986, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 45850 + }, + { + "epoch": 3.2933572710951524, + "grad_norm": 0.8424476981163025, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 45860 + }, + { + "epoch": 3.294075403949731, + "grad_norm": 0.9804632067680359, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 45870 + }, + { + "epoch": 3.294793536804309, + "grad_norm": 0.8701804876327515, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 45880 + }, + { + "epoch": 3.295511669658887, + "grad_norm": 0.8876864910125732, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 45890 + }, + { + "epoch": 3.296229802513465, + "grad_norm": 1.0105448961257935, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 45900 + }, + { + "epoch": 3.296947935368043, + "grad_norm": 0.847017228603363, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 45910 + }, + { + "epoch": 3.2976660682226213, + "grad_norm": 0.7610297799110413, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 45920 + }, + { + "epoch": 3.2983842010771993, + "grad_norm": 0.7272670269012451, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 45930 + }, + { + "epoch": 3.2991023339317773, + "grad_norm": 0.8243510127067566, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 45940 + }, + { + "epoch": 3.2998204667863553, + "grad_norm": 1.0113074779510498, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 45950 + }, + { + "epoch": 3.3005385996409338, + "grad_norm": 0.8578087687492371, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 45960 + }, + { + "epoch": 3.3012567324955118, + "grad_norm": 0.9511606097221375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 45970 + }, + { + "epoch": 3.3019748653500898, + "grad_norm": 0.8612566590309143, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 45980 + }, + { + "epoch": 3.3026929982046678, + "grad_norm": 0.8702331185340881, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 45990 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 1.0229583978652954, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 46000 + }, + { + "epoch": 3.304129263913824, + "grad_norm": 1.1775577068328857, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 46010 + }, + { + "epoch": 3.3048473967684022, + "grad_norm": 0.9922171831130981, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 46020 + }, + { + "epoch": 3.3055655296229802, + "grad_norm": 0.8246880769729614, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 46030 + }, + { + "epoch": 3.3062836624775582, + "grad_norm": 0.9351653456687927, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 46040 + }, + { + "epoch": 3.3070017953321367, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 46050 + }, + { + "epoch": 3.3077199281867147, + "grad_norm": 0.9753885269165039, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 46060 + }, + { + "epoch": 3.3084380610412927, + "grad_norm": 0.8532425165176392, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 46070 + }, + { + "epoch": 3.3091561938958707, + "grad_norm": 0.9722012877464294, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 46080 + }, + { + "epoch": 3.3098743267504487, + "grad_norm": 0.8950021266937256, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 46090 + }, + { + "epoch": 3.3105924596050267, + "grad_norm": 0.8536333441734314, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46100 + }, + { + "epoch": 3.311310592459605, + "grad_norm": 0.9423946738243103, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46110 + }, + { + "epoch": 3.312028725314183, + "grad_norm": 0.8573169112205505, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 46120 + }, + { + "epoch": 3.312746858168761, + "grad_norm": 1.0122376680374146, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 46130 + }, + { + "epoch": 3.313464991023339, + "grad_norm": 0.7492560744285583, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 46140 + }, + { + "epoch": 3.3141831238779176, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 46150 + }, + { + "epoch": 3.3149012567324956, + "grad_norm": 1.1191970109939575, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 46160 + }, + { + "epoch": 3.3156193895870736, + "grad_norm": 0.9847373962402344, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 46170 + }, + { + "epoch": 3.3163375224416516, + "grad_norm": 0.7315911054611206, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 46180 + }, + { + "epoch": 3.3170556552962296, + "grad_norm": 0.8267890214920044, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 46190 + }, + { + "epoch": 3.317773788150808, + "grad_norm": 0.8898099064826965, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 46200 + }, + { + "epoch": 3.318491921005386, + "grad_norm": 0.8525369167327881, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 46210 + }, + { + "epoch": 3.319210053859964, + "grad_norm": 0.8074760437011719, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 46220 + }, + { + "epoch": 3.319928186714542, + "grad_norm": 0.8473616242408752, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 46230 + }, + { + "epoch": 3.3206463195691205, + "grad_norm": 0.8678314089775085, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 46240 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 0.8718782067298889, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 46250 + }, + { + "epoch": 3.3220825852782765, + "grad_norm": 0.9384858012199402, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 46260 + }, + { + "epoch": 3.3228007181328545, + "grad_norm": 0.9295032620429993, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 46270 + }, + { + "epoch": 3.3235188509874325, + "grad_norm": 0.9472482800483704, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 46280 + }, + { + "epoch": 3.324236983842011, + "grad_norm": 0.7970638275146484, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 46290 + }, + { + "epoch": 3.324955116696589, + "grad_norm": 0.9508723020553589, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 46300 + }, + { + "epoch": 3.325673249551167, + "grad_norm": 0.9153636693954468, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 46310 + }, + { + "epoch": 3.326391382405745, + "grad_norm": 0.7890323400497437, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 46320 + }, + { + "epoch": 3.3271095152603234, + "grad_norm": 0.8711825609207153, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46330 + }, + { + "epoch": 3.3278276481149014, + "grad_norm": 0.9938926696777344, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 46340 + }, + { + "epoch": 3.3285457809694794, + "grad_norm": 0.8497524857521057, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 46350 + }, + { + "epoch": 3.3292639138240574, + "grad_norm": 0.9191650748252869, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 46360 + }, + { + "epoch": 3.3299820466786354, + "grad_norm": 0.8974085450172424, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 46370 + }, + { + "epoch": 3.3307001795332134, + "grad_norm": 0.9928934574127197, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 46380 + }, + { + "epoch": 3.331418312387792, + "grad_norm": 0.9011030197143555, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46390 + }, + { + "epoch": 3.33213644524237, + "grad_norm": 0.898594856262207, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 46400 + }, + { + "epoch": 3.332854578096948, + "grad_norm": 0.7506672143936157, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 46410 + }, + { + "epoch": 3.333572710951526, + "grad_norm": 0.9239172339439392, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 46420 + }, + { + "epoch": 3.3342908438061043, + "grad_norm": 1.0749682188034058, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46430 + }, + { + "epoch": 3.3350089766606823, + "grad_norm": 0.9262617230415344, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 46440 + }, + { + "epoch": 3.3357271095152603, + "grad_norm": 0.8681274056434631, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 46450 + }, + { + "epoch": 3.3364452423698383, + "grad_norm": 0.9558620452880859, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 46460 + }, + { + "epoch": 3.3371633752244163, + "grad_norm": 0.8907097578048706, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 46470 + }, + { + "epoch": 3.3378815080789948, + "grad_norm": 1.0941565036773682, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 46480 + }, + { + "epoch": 3.3385996409335728, + "grad_norm": 0.8971590995788574, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 46490 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 1.0315606594085693, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 46500 + }, + { + "epoch": 3.3400359066427288, + "grad_norm": 0.7717124223709106, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 46510 + }, + { + "epoch": 3.340754039497307, + "grad_norm": 0.8060970902442932, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 46520 + }, + { + "epoch": 3.341472172351885, + "grad_norm": 0.969510018825531, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 46530 + }, + { + "epoch": 3.342190305206463, + "grad_norm": 0.8837248682975769, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 46540 + }, + { + "epoch": 3.342908438061041, + "grad_norm": 0.9561076164245605, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 46550 + }, + { + "epoch": 3.343626570915619, + "grad_norm": 0.8529208898544312, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 46560 + }, + { + "epoch": 3.3443447037701977, + "grad_norm": 1.1300519704818726, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 46570 + }, + { + "epoch": 3.3450628366247757, + "grad_norm": 0.8330956101417542, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46580 + }, + { + "epoch": 3.3457809694793537, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 46590 + }, + { + "epoch": 3.3464991023339317, + "grad_norm": 1.0470821857452393, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 46600 + }, + { + "epoch": 3.34721723518851, + "grad_norm": 0.9933704137802124, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46610 + }, + { + "epoch": 3.347935368043088, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 46620 + }, + { + "epoch": 3.348653500897666, + "grad_norm": 0.9746946692466736, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46630 + }, + { + "epoch": 3.349371633752244, + "grad_norm": 0.8607267141342163, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46640 + }, + { + "epoch": 3.350089766606822, + "grad_norm": 0.800335705280304, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 46650 + }, + { + "epoch": 3.3508078994614, + "grad_norm": 1.0083239078521729, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 46660 + }, + { + "epoch": 3.3515260323159786, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 46670 + }, + { + "epoch": 3.3522441651705566, + "grad_norm": 0.9378824234008789, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46680 + }, + { + "epoch": 3.3529622980251346, + "grad_norm": 0.8490564227104187, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 46690 + }, + { + "epoch": 3.3536804308797126, + "grad_norm": 1.0415582656860352, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 46700 + }, + { + "epoch": 3.354398563734291, + "grad_norm": 0.8514367938041687, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 46710 + }, + { + "epoch": 3.355116696588869, + "grad_norm": 0.7691360712051392, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 46720 + }, + { + "epoch": 3.355834829443447, + "grad_norm": 0.8345438241958618, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 46730 + }, + { + "epoch": 3.356552962298025, + "grad_norm": 1.023492693901062, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 46740 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 46750 + }, + { + "epoch": 3.3579892280071815, + "grad_norm": 0.9029248356819153, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 46760 + }, + { + "epoch": 3.3587073608617595, + "grad_norm": 0.9109513759613037, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 46770 + }, + { + "epoch": 3.3594254937163375, + "grad_norm": 0.7757390141487122, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 46780 + }, + { + "epoch": 3.3601436265709155, + "grad_norm": 0.794035792350769, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46790 + }, + { + "epoch": 3.360861759425494, + "grad_norm": 0.8211429715156555, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 46800 + }, + { + "epoch": 3.361579892280072, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46810 + }, + { + "epoch": 3.36229802513465, + "grad_norm": 0.9392538070678711, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 46820 + }, + { + "epoch": 3.363016157989228, + "grad_norm": 0.8297873139381409, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 46830 + }, + { + "epoch": 3.363734290843806, + "grad_norm": 0.9158190488815308, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 46840 + }, + { + "epoch": 3.3644524236983844, + "grad_norm": 1.1449424028396606, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 46850 + }, + { + "epoch": 3.3651705565529624, + "grad_norm": 0.8718444108963013, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 46860 + }, + { + "epoch": 3.3658886894075404, + "grad_norm": 0.7744014263153076, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 46870 + }, + { + "epoch": 3.3666068222621184, + "grad_norm": 0.8392460942268372, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 46880 + }, + { + "epoch": 3.367324955116697, + "grad_norm": 1.0424989461898804, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 46890 + }, + { + "epoch": 3.368043087971275, + "grad_norm": 1.4696359634399414, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 46900 + }, + { + "epoch": 3.368761220825853, + "grad_norm": 0.9298201203346252, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46910 + }, + { + "epoch": 3.369479353680431, + "grad_norm": 0.8965262770652771, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 46920 + }, + { + "epoch": 3.370197486535009, + "grad_norm": 0.9395381808280945, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 46930 + }, + { + "epoch": 3.370915619389587, + "grad_norm": 0.9069047570228577, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 46940 + }, + { + "epoch": 3.3716337522441653, + "grad_norm": 0.9208605885505676, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46950 + }, + { + "epoch": 3.3723518850987433, + "grad_norm": 0.9493077397346497, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 46960 + }, + { + "epoch": 3.3730700179533213, + "grad_norm": 1.0804208517074585, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 46970 + }, + { + "epoch": 3.3737881508078993, + "grad_norm": 0.9465714693069458, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 46980 + }, + { + "epoch": 3.3745062836624777, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 46990 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 1.0199357271194458, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 47000 + }, + { + "epoch": 3.3759425493716337, + "grad_norm": 0.8999426960945129, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 47010 + }, + { + "epoch": 3.3766606822262117, + "grad_norm": 0.8923690319061279, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 47020 + }, + { + "epoch": 3.3773788150807897, + "grad_norm": 0.7459347248077393, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 47030 + }, + { + "epoch": 3.378096947935368, + "grad_norm": 0.7702858448028564, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 47040 + }, + { + "epoch": 3.378815080789946, + "grad_norm": 0.8296625018119812, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 47050 + }, + { + "epoch": 3.379533213644524, + "grad_norm": 1.2952555418014526, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47060 + }, + { + "epoch": 3.380251346499102, + "grad_norm": 0.7778869271278381, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 47070 + }, + { + "epoch": 3.3809694793536806, + "grad_norm": 0.9151549339294434, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 47080 + }, + { + "epoch": 3.3816876122082586, + "grad_norm": 0.7883925437927246, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 47090 + }, + { + "epoch": 3.3824057450628366, + "grad_norm": 0.9602295756340027, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 47100 + }, + { + "epoch": 3.3831238779174146, + "grad_norm": 0.7953121066093445, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47110 + }, + { + "epoch": 3.3838420107719926, + "grad_norm": 1.110148549079895, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 47120 + }, + { + "epoch": 3.384560143626571, + "grad_norm": 0.9359608888626099, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 47130 + }, + { + "epoch": 3.385278276481149, + "grad_norm": 0.7877762317657471, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 47140 + }, + { + "epoch": 3.385996409335727, + "grad_norm": 0.8586933016777039, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47150 + }, + { + "epoch": 3.386714542190305, + "grad_norm": 0.8920878767967224, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 47160 + }, + { + "epoch": 3.3874326750448835, + "grad_norm": 0.9692603349685669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 47170 + }, + { + "epoch": 3.3881508078994615, + "grad_norm": 0.9038610458374023, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 47180 + }, + { + "epoch": 3.3888689407540395, + "grad_norm": 1.6299188137054443, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 47190 + }, + { + "epoch": 3.3895870736086176, + "grad_norm": 0.9704291820526123, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 47200 + }, + { + "epoch": 3.3903052064631956, + "grad_norm": 0.9503401517868042, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 47210 + }, + { + "epoch": 3.3910233393177736, + "grad_norm": 1.0051378011703491, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 47220 + }, + { + "epoch": 3.391741472172352, + "grad_norm": 0.7336357235908508, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 47230 + }, + { + "epoch": 3.39245960502693, + "grad_norm": 0.9847398996353149, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47240 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 0.8100917339324951, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 47250 + }, + { + "epoch": 3.393895870736086, + "grad_norm": 0.9752838611602783, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 47260 + }, + { + "epoch": 3.3946140035906645, + "grad_norm": 0.9400623440742493, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 47270 + }, + { + "epoch": 3.3953321364452425, + "grad_norm": 0.7310057878494263, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 47280 + }, + { + "epoch": 3.3960502692998205, + "grad_norm": 0.8898789286613464, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 47290 + }, + { + "epoch": 3.3967684021543985, + "grad_norm": 1.0157585144042969, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 47300 + }, + { + "epoch": 3.3974865350089765, + "grad_norm": 0.9108527898788452, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 47310 + }, + { + "epoch": 3.398204667863555, + "grad_norm": 0.9796249270439148, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 47320 + }, + { + "epoch": 3.398922800718133, + "grad_norm": 0.8176435232162476, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 47330 + }, + { + "epoch": 3.399640933572711, + "grad_norm": 0.9981188178062439, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 47340 + }, + { + "epoch": 3.400359066427289, + "grad_norm": 0.9774404764175415, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47350 + }, + { + "epoch": 3.4010771992818674, + "grad_norm": 0.8624991774559021, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 47360 + }, + { + "epoch": 3.4017953321364454, + "grad_norm": 0.9191665053367615, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 47370 + }, + { + "epoch": 3.4025134649910234, + "grad_norm": 0.7971290946006775, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 47380 + }, + { + "epoch": 3.4032315978456014, + "grad_norm": 0.8336732983589172, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 47390 + }, + { + "epoch": 3.4039497307001794, + "grad_norm": 0.7730334401130676, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 47400 + }, + { + "epoch": 3.404667863554758, + "grad_norm": 0.8559145927429199, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 47410 + }, + { + "epoch": 3.405385996409336, + "grad_norm": 1.0261447429656982, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 47420 + }, + { + "epoch": 3.406104129263914, + "grad_norm": 0.9931781888008118, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 47430 + }, + { + "epoch": 3.406822262118492, + "grad_norm": 0.8971807360649109, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 47440 + }, + { + "epoch": 3.4075403949730703, + "grad_norm": 0.8886999487876892, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 47450 + }, + { + "epoch": 3.4082585278276483, + "grad_norm": 0.9551735520362854, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 47460 + }, + { + "epoch": 3.4089766606822263, + "grad_norm": 0.9066859483718872, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 47470 + }, + { + "epoch": 3.4096947935368043, + "grad_norm": 0.9192125201225281, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 47480 + }, + { + "epoch": 3.4104129263913823, + "grad_norm": 0.9332839250564575, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 47490 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 0.745563805103302, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47500 + }, + { + "epoch": 3.4118491921005387, + "grad_norm": 0.6843905448913574, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 47510 + }, + { + "epoch": 3.4125673249551167, + "grad_norm": 0.8063111305236816, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 47520 + }, + { + "epoch": 3.4132854578096947, + "grad_norm": 0.9666593670845032, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 47530 + }, + { + "epoch": 3.4140035906642727, + "grad_norm": 0.8112747073173523, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47540 + }, + { + "epoch": 3.414721723518851, + "grad_norm": 0.820807933807373, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 47550 + }, + { + "epoch": 3.415439856373429, + "grad_norm": 0.8476285338401794, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 47560 + }, + { + "epoch": 3.416157989228007, + "grad_norm": 1.0232552289962769, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47570 + }, + { + "epoch": 3.416876122082585, + "grad_norm": 0.8749372363090515, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 47580 + }, + { + "epoch": 3.417594254937163, + "grad_norm": 0.8117937445640564, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 47590 + }, + { + "epoch": 3.4183123877917416, + "grad_norm": 0.9010460376739502, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 47600 + }, + { + "epoch": 3.4190305206463196, + "grad_norm": 0.8955527544021606, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 47610 + }, + { + "epoch": 3.4197486535008976, + "grad_norm": 0.884186327457428, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 47620 + }, + { + "epoch": 3.4204667863554756, + "grad_norm": 0.8995241522789001, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 47630 + }, + { + "epoch": 3.421184919210054, + "grad_norm": 1.0627013444900513, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47640 + }, + { + "epoch": 3.421903052064632, + "grad_norm": 0.8619979619979858, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 47650 + }, + { + "epoch": 3.42262118491921, + "grad_norm": 0.9682498574256897, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 47660 + }, + { + "epoch": 3.423339317773788, + "grad_norm": 0.9614400863647461, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 47670 + }, + { + "epoch": 3.424057450628366, + "grad_norm": 0.7986962795257568, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 47680 + }, + { + "epoch": 3.4247755834829445, + "grad_norm": 0.8255957961082458, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 47690 + }, + { + "epoch": 3.4254937163375225, + "grad_norm": 0.9139757752418518, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 47700 + }, + { + "epoch": 3.4262118491921005, + "grad_norm": 0.8086292743682861, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 47710 + }, + { + "epoch": 3.4269299820466785, + "grad_norm": 0.8852273225784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 47720 + }, + { + "epoch": 3.427648114901257, + "grad_norm": 0.7568784356117249, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 47730 + }, + { + "epoch": 3.428366247755835, + "grad_norm": 0.8933039903640747, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 47740 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 0.8101669549942017, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 47750 + }, + { + "epoch": 3.429802513464991, + "grad_norm": 0.7021054625511169, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 47760 + }, + { + "epoch": 3.430520646319569, + "grad_norm": 0.8282538652420044, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 47770 + }, + { + "epoch": 3.431238779174147, + "grad_norm": 0.8168348670005798, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 47780 + }, + { + "epoch": 3.4319569120287254, + "grad_norm": 0.9504001140594482, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 47790 + }, + { + "epoch": 3.4326750448833034, + "grad_norm": 0.7500190734863281, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47800 + }, + { + "epoch": 3.4333931777378814, + "grad_norm": 0.8645710945129395, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 47810 + }, + { + "epoch": 3.4341113105924594, + "grad_norm": 0.8088704943656921, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 47820 + }, + { + "epoch": 3.434829443447038, + "grad_norm": 0.9981673955917358, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 47830 + }, + { + "epoch": 3.435547576301616, + "grad_norm": 0.9363315105438232, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 47840 + }, + { + "epoch": 3.436265709156194, + "grad_norm": 0.8471030592918396, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 47850 + }, + { + "epoch": 3.436983842010772, + "grad_norm": 0.9447668790817261, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 47860 + }, + { + "epoch": 3.43770197486535, + "grad_norm": 0.9494127631187439, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 47870 + }, + { + "epoch": 3.4384201077199283, + "grad_norm": 0.8340432643890381, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47880 + }, + { + "epoch": 3.4391382405745063, + "grad_norm": 0.8466387987136841, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 47890 + }, + { + "epoch": 3.4398563734290843, + "grad_norm": 0.9498962759971619, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47900 + }, + { + "epoch": 3.4405745062836623, + "grad_norm": 0.8490501046180725, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 47910 + }, + { + "epoch": 3.441292639138241, + "grad_norm": 0.9506490230560303, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 47920 + }, + { + "epoch": 3.442010771992819, + "grad_norm": 0.7944257855415344, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 47930 + }, + { + "epoch": 3.442728904847397, + "grad_norm": 0.9725518226623535, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 47940 + }, + { + "epoch": 3.443447037701975, + "grad_norm": 0.7823024392127991, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47950 + }, + { + "epoch": 3.444165170556553, + "grad_norm": 0.810565173625946, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 47960 + }, + { + "epoch": 3.4448833034111312, + "grad_norm": 0.9809024333953857, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 47970 + }, + { + "epoch": 3.4456014362657092, + "grad_norm": 0.8818578720092773, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 47980 + }, + { + "epoch": 3.4463195691202873, + "grad_norm": 0.9843092560768127, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 47990 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 0.916313886642456, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 48000 + }, + { + "epoch": 3.4477558348294433, + "grad_norm": 0.908442497253418, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 48010 + }, + { + "epoch": 3.4484739676840217, + "grad_norm": 0.9880178570747375, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 48020 + }, + { + "epoch": 3.4491921005385997, + "grad_norm": 0.9276854991912842, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 48030 + }, + { + "epoch": 3.4499102333931777, + "grad_norm": 1.0879448652267456, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 48040 + }, + { + "epoch": 3.4506283662477557, + "grad_norm": 0.7430389523506165, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 48050 + }, + { + "epoch": 3.4513464991023337, + "grad_norm": 1.0880072116851807, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 48060 + }, + { + "epoch": 3.452064631956912, + "grad_norm": 1.0424141883850098, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 48070 + }, + { + "epoch": 3.45278276481149, + "grad_norm": 0.926330029964447, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 48080 + }, + { + "epoch": 3.453500897666068, + "grad_norm": 0.8911219239234924, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 48090 + }, + { + "epoch": 3.454219030520646, + "grad_norm": 0.8727201223373413, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 48100 + }, + { + "epoch": 3.4549371633752246, + "grad_norm": 0.8573940396308899, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48110 + }, + { + "epoch": 3.4556552962298026, + "grad_norm": 1.0427064895629883, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 48120 + }, + { + "epoch": 3.4563734290843806, + "grad_norm": 0.8688231706619263, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 48130 + }, + { + "epoch": 3.4570915619389586, + "grad_norm": 0.8856009244918823, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 48140 + }, + { + "epoch": 3.4578096947935366, + "grad_norm": 0.9535353183746338, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 48150 + }, + { + "epoch": 3.458527827648115, + "grad_norm": 0.9466010928153992, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 48160 + }, + { + "epoch": 3.459245960502693, + "grad_norm": 0.9783535599708557, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 48170 + }, + { + "epoch": 3.459964093357271, + "grad_norm": 0.8010456562042236, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 48180 + }, + { + "epoch": 3.460682226211849, + "grad_norm": 0.8928955793380737, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 48190 + }, + { + "epoch": 3.4614003590664275, + "grad_norm": 0.7565838694572449, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 48200 + }, + { + "epoch": 3.4621184919210055, + "grad_norm": 1.0044180154800415, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 48210 + }, + { + "epoch": 3.4628366247755835, + "grad_norm": 0.8161038160324097, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 48220 + }, + { + "epoch": 3.4635547576301615, + "grad_norm": 1.1000211238861084, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 48230 + }, + { + "epoch": 3.4642728904847395, + "grad_norm": 0.7942240238189697, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 48240 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 0.7546432018280029, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 48250 + }, + { + "epoch": 3.465709156193896, + "grad_norm": 0.7705255150794983, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 48260 + }, + { + "epoch": 3.466427289048474, + "grad_norm": 0.7958067059516907, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 48270 + }, + { + "epoch": 3.467145421903052, + "grad_norm": 0.9199120402336121, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48280 + }, + { + "epoch": 3.46786355475763, + "grad_norm": 1.118672251701355, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 48290 + }, + { + "epoch": 3.4685816876122084, + "grad_norm": 0.9161015748977661, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 48300 + }, + { + "epoch": 3.4692998204667864, + "grad_norm": 1.1086218357086182, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 48310 + }, + { + "epoch": 3.4700179533213644, + "grad_norm": 1.0123368501663208, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 48320 + }, + { + "epoch": 3.4707360861759424, + "grad_norm": 0.7380602359771729, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 48330 + }, + { + "epoch": 3.4714542190305204, + "grad_norm": 0.8967105150222778, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 48340 + }, + { + "epoch": 3.472172351885099, + "grad_norm": 1.0134044885635376, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48350 + }, + { + "epoch": 3.472890484739677, + "grad_norm": 1.080815076828003, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 48360 + }, + { + "epoch": 3.473608617594255, + "grad_norm": 1.151721477508545, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 48370 + }, + { + "epoch": 3.474326750448833, + "grad_norm": 0.9436505436897278, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 48380 + }, + { + "epoch": 3.4750448833034113, + "grad_norm": 0.9154609441757202, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 48390 + }, + { + "epoch": 3.4757630161579893, + "grad_norm": 0.8943037986755371, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 48400 + }, + { + "epoch": 3.4764811490125673, + "grad_norm": 0.936988115310669, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 48410 + }, + { + "epoch": 3.4771992818671453, + "grad_norm": 0.826960027217865, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 48420 + }, + { + "epoch": 3.4779174147217233, + "grad_norm": 1.0487587451934814, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 48430 + }, + { + "epoch": 3.478635547576302, + "grad_norm": 0.729163646697998, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 48440 + }, + { + "epoch": 3.47935368043088, + "grad_norm": 0.8156948089599609, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 48450 + }, + { + "epoch": 3.480071813285458, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 48460 + }, + { + "epoch": 3.480789946140036, + "grad_norm": 0.9632692337036133, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 48470 + }, + { + "epoch": 3.4815080789946142, + "grad_norm": 1.0950212478637695, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 48480 + }, + { + "epoch": 3.4822262118491922, + "grad_norm": 0.8574318885803223, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 48490 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 0.8552606701850891, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 48500 + }, + { + "epoch": 3.4836624775583482, + "grad_norm": 0.9698445200920105, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 48510 + }, + { + "epoch": 3.4843806104129262, + "grad_norm": 0.9427815675735474, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 48520 + }, + { + "epoch": 3.4850987432675042, + "grad_norm": 0.7902070879936218, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 48530 + }, + { + "epoch": 3.4858168761220827, + "grad_norm": 1.0300066471099854, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 48540 + }, + { + "epoch": 3.4865350089766607, + "grad_norm": 1.1688778400421143, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 48550 + }, + { + "epoch": 3.4872531418312387, + "grad_norm": 1.0012071132659912, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 48560 + }, + { + "epoch": 3.4879712746858167, + "grad_norm": 1.112094759941101, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 48570 + }, + { + "epoch": 3.488689407540395, + "grad_norm": 0.8547284603118896, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 48580 + }, + { + "epoch": 3.489407540394973, + "grad_norm": 0.8827278017997742, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 48590 + }, + { + "epoch": 3.490125673249551, + "grad_norm": 0.9255490303039551, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 48600 + }, + { + "epoch": 3.490843806104129, + "grad_norm": 0.8000030517578125, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 48610 + }, + { + "epoch": 3.491561938958707, + "grad_norm": 0.9327391386032104, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 48620 + }, + { + "epoch": 3.4922800718132856, + "grad_norm": 0.9004138708114624, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 48630 + }, + { + "epoch": 3.4929982046678636, + "grad_norm": 0.9886971116065979, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 48640 + }, + { + "epoch": 3.4937163375224416, + "grad_norm": 0.9890487194061279, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 48650 + }, + { + "epoch": 3.4944344703770196, + "grad_norm": 0.7024438977241516, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 48660 + }, + { + "epoch": 3.495152603231598, + "grad_norm": 0.8397303223609924, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 48670 + }, + { + "epoch": 3.495870736086176, + "grad_norm": 0.9120950698852539, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 48680 + }, + { + "epoch": 3.496588868940754, + "grad_norm": 1.057299017906189, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48690 + }, + { + "epoch": 3.497307001795332, + "grad_norm": 0.821325957775116, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 48700 + }, + { + "epoch": 3.49802513464991, + "grad_norm": 1.0029970407485962, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 48710 + }, + { + "epoch": 3.4987432675044885, + "grad_norm": 0.9483712911605835, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 48720 + }, + { + "epoch": 3.4994614003590665, + "grad_norm": 0.9637855291366577, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 48730 + }, + { + "epoch": 3.5001795332136445, + "grad_norm": 0.6848894357681274, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 48740 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 0.7848573327064514, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 48750 + }, + { + "epoch": 3.501615798922801, + "grad_norm": 1.0341308116912842, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 48760 + }, + { + "epoch": 3.502333931777379, + "grad_norm": 0.8858218193054199, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 48770 + }, + { + "epoch": 3.503052064631957, + "grad_norm": 0.8366939425468445, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 48780 + }, + { + "epoch": 3.503770197486535, + "grad_norm": 0.7926092147827148, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 48790 + }, + { + "epoch": 3.504488330341113, + "grad_norm": 0.8503843545913696, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 48800 + }, + { + "epoch": 3.505206463195691, + "grad_norm": 0.8867869973182678, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 48810 + }, + { + "epoch": 3.5059245960502694, + "grad_norm": 1.0336930751800537, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 48820 + }, + { + "epoch": 3.5066427289048474, + "grad_norm": 0.8564051985740662, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 48830 + }, + { + "epoch": 3.5073608617594254, + "grad_norm": 0.9202605485916138, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 48840 + }, + { + "epoch": 3.508078994614004, + "grad_norm": 0.8838639855384827, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 48850 + }, + { + "epoch": 3.508797127468582, + "grad_norm": 0.8975196480751038, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48860 + }, + { + "epoch": 3.50951526032316, + "grad_norm": 0.8842370510101318, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 48870 + }, + { + "epoch": 3.510233393177738, + "grad_norm": 0.9195886254310608, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 48880 + }, + { + "epoch": 3.510951526032316, + "grad_norm": 0.986130952835083, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 48890 + }, + { + "epoch": 3.511669658886894, + "grad_norm": 0.8119593858718872, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 48900 + }, + { + "epoch": 3.5123877917414723, + "grad_norm": 0.9027136564254761, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 48910 + }, + { + "epoch": 3.5131059245960503, + "grad_norm": 0.8560537099838257, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 48920 + }, + { + "epoch": 3.5138240574506283, + "grad_norm": 0.7073559165000916, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 48930 + }, + { + "epoch": 3.5145421903052063, + "grad_norm": 0.8753304481506348, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 48940 + }, + { + "epoch": 3.5152603231597848, + "grad_norm": 0.9151145815849304, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 48950 + }, + { + "epoch": 3.5159784560143628, + "grad_norm": 0.7794315814971924, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 48960 + }, + { + "epoch": 3.5166965888689408, + "grad_norm": 0.9226023554801941, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 48970 + }, + { + "epoch": 3.5174147217235188, + "grad_norm": 0.8442051410675049, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48980 + }, + { + "epoch": 3.5181328545780968, + "grad_norm": 0.9769423007965088, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 48990 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 0.740347146987915, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 49000 + }, + { + "epoch": 3.519569120287253, + "grad_norm": 0.8963457345962524, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 49010 + }, + { + "epoch": 3.520287253141831, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 49020 + }, + { + "epoch": 3.521005385996409, + "grad_norm": 1.0486022233963013, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 49030 + }, + { + "epoch": 3.5217235188509877, + "grad_norm": 0.95393967628479, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 49040 + }, + { + "epoch": 3.5224416517055657, + "grad_norm": 0.8261157274246216, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49050 + }, + { + "epoch": 3.5231597845601437, + "grad_norm": 0.9321704506874084, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 49060 + }, + { + "epoch": 3.5238779174147217, + "grad_norm": 1.2596088647842407, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 49070 + }, + { + "epoch": 3.5245960502692997, + "grad_norm": 0.8584637641906738, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 49080 + }, + { + "epoch": 3.5253141831238777, + "grad_norm": 0.850520670413971, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 49090 + }, + { + "epoch": 3.526032315978456, + "grad_norm": 0.8915920257568359, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 49100 + }, + { + "epoch": 3.526750448833034, + "grad_norm": 0.9070239067077637, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 49110 + }, + { + "epoch": 3.527468581687612, + "grad_norm": 0.699878990650177, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 49120 + }, + { + "epoch": 3.5281867145421906, + "grad_norm": 0.9003779888153076, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 49130 + }, + { + "epoch": 3.5289048473967686, + "grad_norm": 0.7886711955070496, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 49140 + }, + { + "epoch": 3.5296229802513466, + "grad_norm": 0.7368922233581543, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 49150 + }, + { + "epoch": 3.5303411131059246, + "grad_norm": 0.8585197329521179, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 49160 + }, + { + "epoch": 3.5310592459605026, + "grad_norm": 1.0205435752868652, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 49170 + }, + { + "epoch": 3.5317773788150806, + "grad_norm": 0.8756650686264038, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 49180 + }, + { + "epoch": 3.532495511669659, + "grad_norm": 1.0278643369674683, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 49190 + }, + { + "epoch": 3.533213644524237, + "grad_norm": 0.8641911745071411, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 49200 + }, + { + "epoch": 3.533931777378815, + "grad_norm": 0.8730159401893616, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 49210 + }, + { + "epoch": 3.534649910233393, + "grad_norm": 0.918637216091156, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 49220 + }, + { + "epoch": 3.5353680430879715, + "grad_norm": 1.0467222929000854, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 49230 + }, + { + "epoch": 3.5360861759425495, + "grad_norm": 1.005009412765503, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 49240 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 0.9775063395500183, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 49250 + }, + { + "epoch": 3.5375224416517055, + "grad_norm": 0.8198322057723999, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 49260 + }, + { + "epoch": 3.5382405745062835, + "grad_norm": 0.8184829354286194, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 49270 + }, + { + "epoch": 3.5389587073608615, + "grad_norm": 0.9520270824432373, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 49280 + }, + { + "epoch": 3.53967684021544, + "grad_norm": 0.7816803455352783, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 49290 + }, + { + "epoch": 3.540394973070018, + "grad_norm": 0.6915702819824219, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 49300 + }, + { + "epoch": 3.541113105924596, + "grad_norm": 0.8282375931739807, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 49310 + }, + { + "epoch": 3.5418312387791744, + "grad_norm": 1.0797513723373413, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 49320 + }, + { + "epoch": 3.5425493716337524, + "grad_norm": 0.868671715259552, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 49330 + }, + { + "epoch": 3.5432675044883304, + "grad_norm": 0.8534455895423889, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 49340 + }, + { + "epoch": 3.5439856373429084, + "grad_norm": 0.816411554813385, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 49350 + }, + { + "epoch": 3.5447037701974864, + "grad_norm": 0.7813423275947571, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 49360 + }, + { + "epoch": 3.5454219030520644, + "grad_norm": 0.8002013564109802, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 49370 + }, + { + "epoch": 3.546140035906643, + "grad_norm": 0.9740113615989685, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 49380 + }, + { + "epoch": 3.546858168761221, + "grad_norm": 0.9046127200126648, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 49390 + }, + { + "epoch": 3.547576301615799, + "grad_norm": 0.8635150194168091, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 49400 + }, + { + "epoch": 3.5482944344703773, + "grad_norm": 0.9488558769226074, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 49410 + }, + { + "epoch": 3.5490125673249553, + "grad_norm": 0.9637090563774109, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 49420 + }, + { + "epoch": 3.5497307001795333, + "grad_norm": 1.042245626449585, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 49430 + }, + { + "epoch": 3.5504488330341113, + "grad_norm": 0.9076175689697266, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 49440 + }, + { + "epoch": 3.5511669658886893, + "grad_norm": 0.8480596542358398, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 49450 + }, + { + "epoch": 3.5518850987432673, + "grad_norm": 0.8483007550239563, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 49460 + }, + { + "epoch": 3.5526032315978457, + "grad_norm": 0.7855815887451172, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 49470 + }, + { + "epoch": 3.5533213644524237, + "grad_norm": 0.8435823917388916, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 49480 + }, + { + "epoch": 3.5540394973070017, + "grad_norm": 0.8613026142120361, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 49490 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 0.9654812812805176, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 49500 + }, + { + "epoch": 3.555475763016158, + "grad_norm": 0.8888838887214661, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 49510 + }, + { + "epoch": 3.556193895870736, + "grad_norm": 0.7718146443367004, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49520 + }, + { + "epoch": 3.556912028725314, + "grad_norm": 0.9487382173538208, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 49530 + }, + { + "epoch": 3.557630161579892, + "grad_norm": 0.9256559610366821, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 49540 + }, + { + "epoch": 3.55834829443447, + "grad_norm": 0.8879945874214172, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 49550 + }, + { + "epoch": 3.559066427289048, + "grad_norm": 0.8498744368553162, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 49560 + }, + { + "epoch": 3.5597845601436267, + "grad_norm": 0.9550948143005371, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 49570 + }, + { + "epoch": 3.5605026929982047, + "grad_norm": 0.8386164903640747, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 49580 + }, + { + "epoch": 3.5612208258527827, + "grad_norm": 0.925573468208313, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 49590 + }, + { + "epoch": 3.561938958707361, + "grad_norm": 0.8867112398147583, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 49600 + }, + { + "epoch": 3.562657091561939, + "grad_norm": 0.7638537883758545, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 49610 + }, + { + "epoch": 3.563375224416517, + "grad_norm": 0.9491845965385437, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 49620 + }, + { + "epoch": 3.564093357271095, + "grad_norm": 0.8384189605712891, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 49630 + }, + { + "epoch": 3.564811490125673, + "grad_norm": 0.8850575089454651, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 49640 + }, + { + "epoch": 3.565529622980251, + "grad_norm": 1.020916223526001, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 49650 + }, + { + "epoch": 3.5662477558348296, + "grad_norm": 0.9298280477523804, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 49660 + }, + { + "epoch": 3.5669658886894076, + "grad_norm": 0.9795742034912109, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 49670 + }, + { + "epoch": 3.5676840215439856, + "grad_norm": 0.9401193261146545, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 49680 + }, + { + "epoch": 3.568402154398564, + "grad_norm": 1.0383585691452026, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49690 + }, + { + "epoch": 3.569120287253142, + "grad_norm": 0.8370866179466248, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 49700 + }, + { + "epoch": 3.56983842010772, + "grad_norm": 0.8207486271858215, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 49710 + }, + { + "epoch": 3.570556552962298, + "grad_norm": 0.8551223278045654, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49720 + }, + { + "epoch": 3.571274685816876, + "grad_norm": 0.8041176199913025, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 49730 + }, + { + "epoch": 3.571992818671454, + "grad_norm": 0.9862527847290039, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 49740 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 0.7557165622711182, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 49750 + }, + { + "epoch": 3.5734290843806105, + "grad_norm": 1.0908563137054443, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 49760 + }, + { + "epoch": 3.5741472172351885, + "grad_norm": 0.7245369553565979, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 49770 + }, + { + "epoch": 3.5748653500897665, + "grad_norm": 0.7851184010505676, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 49780 + }, + { + "epoch": 3.575583482944345, + "grad_norm": 0.9443599581718445, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 49790 + }, + { + "epoch": 3.576301615798923, + "grad_norm": 1.021196961402893, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 49800 + }, + { + "epoch": 3.577019748653501, + "grad_norm": 0.9099196195602417, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 49810 + }, + { + "epoch": 3.577737881508079, + "grad_norm": 0.9397716522216797, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 49820 + }, + { + "epoch": 3.578456014362657, + "grad_norm": 0.9214922785758972, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 49830 + }, + { + "epoch": 3.579174147217235, + "grad_norm": 1.0053879022598267, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 49840 + }, + { + "epoch": 3.5798922800718134, + "grad_norm": 0.9415460228919983, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 49850 + }, + { + "epoch": 3.5806104129263914, + "grad_norm": 1.0807833671569824, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 49860 + }, + { + "epoch": 3.5813285457809694, + "grad_norm": 1.0070871114730835, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 49870 + }, + { + "epoch": 3.582046678635548, + "grad_norm": 0.9707024693489075, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 49880 + }, + { + "epoch": 3.582764811490126, + "grad_norm": 0.9979593753814697, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 49890 + }, + { + "epoch": 3.583482944344704, + "grad_norm": 0.7238648533821106, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 49900 + }, + { + "epoch": 3.584201077199282, + "grad_norm": 0.8168631792068481, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 49910 + }, + { + "epoch": 3.58491921005386, + "grad_norm": 0.8156409859657288, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 49920 + }, + { + "epoch": 3.585637342908438, + "grad_norm": 0.9256414175033569, + "learning_rate": 0.0002, + "loss": 0.6248, + "step": 49930 + }, + { + "epoch": 3.5863554757630163, + "grad_norm": 1.0090070962905884, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 49940 + }, + { + "epoch": 3.5870736086175943, + "grad_norm": 0.8257701992988586, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 49950 + }, + { + "epoch": 3.5877917414721723, + "grad_norm": 0.9189013242721558, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 49960 + }, + { + "epoch": 3.5885098743267507, + "grad_norm": 0.8497788310050964, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 49970 + }, + { + "epoch": 3.5892280071813287, + "grad_norm": 0.9596505761146545, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 49980 + }, + { + "epoch": 3.5899461400359067, + "grad_norm": 0.8773331642150879, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 49990 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 0.8952302932739258, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50000 + }, + { + "epoch": 3.5913824057450627, + "grad_norm": 0.7713809609413147, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 50010 + }, + { + "epoch": 3.5921005385996407, + "grad_norm": 1.0151346921920776, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 50020 + }, + { + "epoch": 3.592818671454219, + "grad_norm": 0.8793733716011047, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 50030 + }, + { + "epoch": 3.593536804308797, + "grad_norm": 0.8881325721740723, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 50040 + }, + { + "epoch": 3.594254937163375, + "grad_norm": 0.9346749782562256, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 50050 + }, + { + "epoch": 3.594973070017953, + "grad_norm": 0.8705052137374878, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 50060 + }, + { + "epoch": 3.5956912028725316, + "grad_norm": 1.039197564125061, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 50070 + }, + { + "epoch": 3.5964093357271096, + "grad_norm": 0.7053273320198059, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 50080 + }, + { + "epoch": 3.5971274685816876, + "grad_norm": 0.8268665671348572, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 50090 + }, + { + "epoch": 3.5978456014362656, + "grad_norm": 0.8921764492988586, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 50100 + }, + { + "epoch": 3.5985637342908436, + "grad_norm": 0.9756084680557251, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 50110 + }, + { + "epoch": 3.5992818671454216, + "grad_norm": 0.9275530576705933, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 50120 + }, + { + "epoch": 3.6, + "grad_norm": 0.9030009508132935, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 50130 + }, + { + "epoch": 3.600718132854578, + "grad_norm": 0.7805638909339905, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 50140 + }, + { + "epoch": 3.601436265709156, + "grad_norm": 0.7627325057983398, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 50150 + }, + { + "epoch": 3.6021543985637345, + "grad_norm": 0.7809714078903198, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 50160 + }, + { + "epoch": 3.6028725314183125, + "grad_norm": 0.7910378575325012, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 50170 + }, + { + "epoch": 3.6035906642728905, + "grad_norm": 1.004438042640686, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 50180 + }, + { + "epoch": 3.6043087971274685, + "grad_norm": 0.825969934463501, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 50190 + }, + { + "epoch": 3.6050269299820465, + "grad_norm": 0.8866565227508545, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 50200 + }, + { + "epoch": 3.6057450628366245, + "grad_norm": 0.8920543193817139, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 50210 + }, + { + "epoch": 3.606463195691203, + "grad_norm": 1.106584906578064, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 50220 + }, + { + "epoch": 3.607181328545781, + "grad_norm": 0.916607677936554, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 50230 + }, + { + "epoch": 3.607899461400359, + "grad_norm": 0.8014767169952393, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 50240 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 0.9556822776794434, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 50250 + }, + { + "epoch": 3.6093357271095154, + "grad_norm": 0.9630016684532166, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50260 + }, + { + "epoch": 3.6100538599640934, + "grad_norm": 0.9862125515937805, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 50270 + }, + { + "epoch": 3.6107719928186714, + "grad_norm": 1.0043333768844604, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 50280 + }, + { + "epoch": 3.6114901256732495, + "grad_norm": 0.9255319833755493, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 50290 + }, + { + "epoch": 3.6122082585278275, + "grad_norm": 1.012023687362671, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 50300 + }, + { + "epoch": 3.612926391382406, + "grad_norm": 1.0701122283935547, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50310 + }, + { + "epoch": 3.613644524236984, + "grad_norm": 0.8270810842514038, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 50320 + }, + { + "epoch": 3.614362657091562, + "grad_norm": 0.8881328105926514, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 50330 + }, + { + "epoch": 3.61508078994614, + "grad_norm": 0.9536844491958618, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 50340 + }, + { + "epoch": 3.6157989228007184, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 50350 + }, + { + "epoch": 3.6165170556552964, + "grad_norm": 0.834591805934906, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50360 + }, + { + "epoch": 3.6172351885098744, + "grad_norm": 0.903752863407135, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 50370 + }, + { + "epoch": 3.6179533213644524, + "grad_norm": 0.9148632884025574, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 50380 + }, + { + "epoch": 3.6186714542190304, + "grad_norm": 0.9280176162719727, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 50390 + }, + { + "epoch": 3.6193895870736084, + "grad_norm": 0.9524136781692505, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 50400 + }, + { + "epoch": 3.620107719928187, + "grad_norm": 1.1751197576522827, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 50410 + }, + { + "epoch": 3.620825852782765, + "grad_norm": 1.032279133796692, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 50420 + }, + { + "epoch": 3.621543985637343, + "grad_norm": 0.790741503238678, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 50430 + }, + { + "epoch": 3.6222621184919213, + "grad_norm": 0.9584221243858337, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 50440 + }, + { + "epoch": 3.6229802513464993, + "grad_norm": 0.7792508006095886, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 50450 + }, + { + "epoch": 3.6236983842010773, + "grad_norm": 0.8273448944091797, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 50460 + }, + { + "epoch": 3.6244165170556553, + "grad_norm": 0.8001132607460022, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 50470 + }, + { + "epoch": 3.6251346499102333, + "grad_norm": 1.077109694480896, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 50480 + }, + { + "epoch": 3.6258527827648113, + "grad_norm": 1.111274003982544, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 50490 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 50500 + }, + { + "epoch": 3.6272890484739677, + "grad_norm": 0.9217049479484558, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 50510 + }, + { + "epoch": 3.6280071813285457, + "grad_norm": 0.9362251162528992, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 50520 + }, + { + "epoch": 3.6287253141831237, + "grad_norm": 0.9435479044914246, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 50530 + }, + { + "epoch": 3.629443447037702, + "grad_norm": 0.7748915553092957, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 50540 + }, + { + "epoch": 3.63016157989228, + "grad_norm": 0.8238945007324219, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 50550 + }, + { + "epoch": 3.630879712746858, + "grad_norm": 0.8421505093574524, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 50560 + }, + { + "epoch": 3.631597845601436, + "grad_norm": 1.0272293090820312, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 50570 + }, + { + "epoch": 3.632315978456014, + "grad_norm": 0.7643818259239197, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 50580 + }, + { + "epoch": 3.6330341113105926, + "grad_norm": 0.9756225347518921, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 50590 + }, + { + "epoch": 3.6337522441651706, + "grad_norm": 0.9311570525169373, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 50600 + }, + { + "epoch": 3.6344703770197486, + "grad_norm": 0.8829827904701233, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 50610 + }, + { + "epoch": 3.6351885098743266, + "grad_norm": 0.9473454356193542, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 50620 + }, + { + "epoch": 3.635906642728905, + "grad_norm": 1.1023668050765991, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 50630 + }, + { + "epoch": 3.636624775583483, + "grad_norm": 0.8490299582481384, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 50640 + }, + { + "epoch": 3.637342908438061, + "grad_norm": 1.1129392385482788, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 50650 + }, + { + "epoch": 3.638061041292639, + "grad_norm": 1.0334501266479492, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 50660 + }, + { + "epoch": 3.638779174147217, + "grad_norm": 0.8397296667098999, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 50670 + }, + { + "epoch": 3.639497307001795, + "grad_norm": 0.7984256744384766, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 50680 + }, + { + "epoch": 3.6402154398563735, + "grad_norm": 1.1182054281234741, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 50690 + }, + { + "epoch": 3.6409335727109515, + "grad_norm": 0.8743279576301575, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 50700 + }, + { + "epoch": 3.6416517055655295, + "grad_norm": 0.9101628661155701, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 50710 + }, + { + "epoch": 3.642369838420108, + "grad_norm": 0.8866934180259705, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 50720 + }, + { + "epoch": 3.643087971274686, + "grad_norm": 0.863945484161377, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 50730 + }, + { + "epoch": 3.643806104129264, + "grad_norm": 1.0845744609832764, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 50740 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 0.8610911965370178, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 50750 + }, + { + "epoch": 3.64524236983842, + "grad_norm": 0.8502625226974487, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 50760 + }, + { + "epoch": 3.645960502692998, + "grad_norm": 0.847372829914093, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 50770 + }, + { + "epoch": 3.6466786355475764, + "grad_norm": 0.8649292588233948, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 50780 + }, + { + "epoch": 3.6473967684021544, + "grad_norm": 0.8742905855178833, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 50790 + }, + { + "epoch": 3.6481149012567324, + "grad_norm": 0.9546048641204834, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 50800 + }, + { + "epoch": 3.6488330341113104, + "grad_norm": 0.7893161773681641, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 50810 + }, + { + "epoch": 3.649551166965889, + "grad_norm": 0.9350247979164124, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 50820 + }, + { + "epoch": 3.650269299820467, + "grad_norm": 0.772149384021759, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 50830 + }, + { + "epoch": 3.650987432675045, + "grad_norm": 0.8281718492507935, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 50840 + }, + { + "epoch": 3.651705565529623, + "grad_norm": 0.8063850402832031, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 50850 + }, + { + "epoch": 3.652423698384201, + "grad_norm": 0.8101351261138916, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 50860 + }, + { + "epoch": 3.6531418312387793, + "grad_norm": 0.8747833371162415, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 50870 + }, + { + "epoch": 3.6538599640933573, + "grad_norm": 0.9634656310081482, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 50880 + }, + { + "epoch": 3.6545780969479353, + "grad_norm": 1.1646045446395874, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 50890 + }, + { + "epoch": 3.6552962298025133, + "grad_norm": 0.8538454174995422, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 3.656014362657092, + "grad_norm": 0.7639184594154358, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 50910 + }, + { + "epoch": 3.65673249551167, + "grad_norm": 0.8750212788581848, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 50920 + }, + { + "epoch": 3.657450628366248, + "grad_norm": 0.9161198735237122, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 50930 + }, + { + "epoch": 3.658168761220826, + "grad_norm": 0.7987924814224243, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 50940 + }, + { + "epoch": 3.658886894075404, + "grad_norm": 0.8939290642738342, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 50950 + }, + { + "epoch": 3.659605026929982, + "grad_norm": 0.9803797602653503, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 50960 + }, + { + "epoch": 3.6603231597845602, + "grad_norm": 1.2423512935638428, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 50970 + }, + { + "epoch": 3.6610412926391382, + "grad_norm": 1.0023225545883179, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 50980 + }, + { + "epoch": 3.6617594254937162, + "grad_norm": 0.9066677689552307, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 50990 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 0.8906226754188538, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 51000 + }, + { + "epoch": 3.6631956912028727, + "grad_norm": 0.7449954152107239, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51010 + }, + { + "epoch": 3.6639138240574507, + "grad_norm": 0.812612771987915, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 51020 + }, + { + "epoch": 3.6646319569120287, + "grad_norm": 0.861818253993988, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 51030 + }, + { + "epoch": 3.6653500897666067, + "grad_norm": 0.849726676940918, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 51040 + }, + { + "epoch": 3.6660682226211847, + "grad_norm": 0.9738494753837585, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 51050 + }, + { + "epoch": 3.666786355475763, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 51060 + }, + { + "epoch": 3.667504488330341, + "grad_norm": 0.9725563526153564, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 51070 + }, + { + "epoch": 3.668222621184919, + "grad_norm": 0.9366095066070557, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51080 + }, + { + "epoch": 3.668940754039497, + "grad_norm": 0.8012986779212952, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 51090 + }, + { + "epoch": 3.6696588868940756, + "grad_norm": 1.0646892786026, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51100 + }, + { + "epoch": 3.6703770197486536, + "grad_norm": 0.7245157361030579, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 51110 + }, + { + "epoch": 3.6710951526032316, + "grad_norm": 0.6938936114311218, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 51120 + }, + { + "epoch": 3.6718132854578096, + "grad_norm": 0.8461366295814514, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 51130 + }, + { + "epoch": 3.6725314183123876, + "grad_norm": 0.8392583131790161, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 51140 + }, + { + "epoch": 3.673249551166966, + "grad_norm": 0.7245259284973145, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 51150 + }, + { + "epoch": 3.673967684021544, + "grad_norm": 1.0742167234420776, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 51160 + }, + { + "epoch": 3.674685816876122, + "grad_norm": 0.9553889036178589, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 51170 + }, + { + "epoch": 3.6754039497307, + "grad_norm": 0.8713715672492981, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 51180 + }, + { + "epoch": 3.6761220825852785, + "grad_norm": 0.7499800324440002, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 51190 + }, + { + "epoch": 3.6768402154398565, + "grad_norm": 1.1118139028549194, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 51200 + }, + { + "epoch": 3.6775583482944345, + "grad_norm": 0.8146613836288452, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 51210 + }, + { + "epoch": 3.6782764811490125, + "grad_norm": 0.9331285357475281, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 51220 + }, + { + "epoch": 3.6789946140035905, + "grad_norm": 1.0497597455978394, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 51230 + }, + { + "epoch": 3.6797127468581685, + "grad_norm": 0.879814863204956, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51240 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 0.9896606802940369, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 51250 + }, + { + "epoch": 3.681149012567325, + "grad_norm": 0.928236186504364, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 51260 + }, + { + "epoch": 3.681867145421903, + "grad_norm": 0.8436732292175293, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 51270 + }, + { + "epoch": 3.6825852782764814, + "grad_norm": 0.93634432554245, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51280 + }, + { + "epoch": 3.6833034111310594, + "grad_norm": 0.8477143049240112, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 51290 + }, + { + "epoch": 3.6840215439856374, + "grad_norm": 0.8720934987068176, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 51300 + }, + { + "epoch": 3.6847396768402154, + "grad_norm": 0.7322931289672852, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 51310 + }, + { + "epoch": 3.6854578096947934, + "grad_norm": 1.0064427852630615, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 51320 + }, + { + "epoch": 3.6861759425493714, + "grad_norm": 1.0197817087173462, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 51330 + }, + { + "epoch": 3.68689407540395, + "grad_norm": 0.8764060139656067, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 51340 + }, + { + "epoch": 3.687612208258528, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 51350 + }, + { + "epoch": 3.688330341113106, + "grad_norm": 0.8389105200767517, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 51360 + }, + { + "epoch": 3.689048473967684, + "grad_norm": 0.9215750694274902, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 51370 + }, + { + "epoch": 3.6897666068222623, + "grad_norm": 0.8444913625717163, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 51380 + }, + { + "epoch": 3.6904847396768403, + "grad_norm": 0.9635153412818909, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 51390 + }, + { + "epoch": 3.6912028725314183, + "grad_norm": 1.0397378206253052, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 51400 + }, + { + "epoch": 3.6919210053859963, + "grad_norm": 0.9154748320579529, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 51410 + }, + { + "epoch": 3.6926391382405743, + "grad_norm": 0.906445324420929, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 51420 + }, + { + "epoch": 3.6933572710951523, + "grad_norm": 0.9237992763519287, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 51430 + }, + { + "epoch": 3.6940754039497308, + "grad_norm": 0.8796338438987732, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 51440 + }, + { + "epoch": 3.6947935368043088, + "grad_norm": 0.8613203763961792, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 51450 + }, + { + "epoch": 3.6955116696588868, + "grad_norm": 0.7957607507705688, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 51460 + }, + { + "epoch": 3.6962298025134652, + "grad_norm": 0.9183711409568787, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 51470 + }, + { + "epoch": 3.6969479353680432, + "grad_norm": 1.0108308792114258, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 51480 + }, + { + "epoch": 3.6976660682226212, + "grad_norm": 0.7768247127532959, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 51490 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 1.0051485300064087, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 51500 + }, + { + "epoch": 3.6991023339317772, + "grad_norm": 0.82451993227005, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 51510 + }, + { + "epoch": 3.6998204667863552, + "grad_norm": 0.9542286992073059, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 51520 + }, + { + "epoch": 3.7005385996409337, + "grad_norm": 0.693890392780304, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 51530 + }, + { + "epoch": 3.7012567324955117, + "grad_norm": 0.9068924784660339, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 51540 + }, + { + "epoch": 3.7019748653500897, + "grad_norm": 0.8694922924041748, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 51550 + }, + { + "epoch": 3.702692998204668, + "grad_norm": 0.941081702709198, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 51560 + }, + { + "epoch": 3.703411131059246, + "grad_norm": 0.7385984659194946, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 51570 + }, + { + "epoch": 3.704129263913824, + "grad_norm": 1.0399216413497925, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51580 + }, + { + "epoch": 3.704847396768402, + "grad_norm": 0.9802294969558716, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 51590 + }, + { + "epoch": 3.70556552962298, + "grad_norm": 1.0409669876098633, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51600 + }, + { + "epoch": 3.706283662477558, + "grad_norm": 0.8972786068916321, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 51610 + }, + { + "epoch": 3.7070017953321366, + "grad_norm": 1.1916245222091675, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 51620 + }, + { + "epoch": 3.7077199281867146, + "grad_norm": 0.9545385241508484, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 51630 + }, + { + "epoch": 3.7084380610412926, + "grad_norm": 1.0773427486419678, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 51640 + }, + { + "epoch": 3.7091561938958706, + "grad_norm": 1.0856024026870728, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 51650 + }, + { + "epoch": 3.709874326750449, + "grad_norm": 0.7678500413894653, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51660 + }, + { + "epoch": 3.710592459605027, + "grad_norm": 0.7276270985603333, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 51670 + }, + { + "epoch": 3.711310592459605, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 51680 + }, + { + "epoch": 3.712028725314183, + "grad_norm": 0.9037614464759827, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 51690 + }, + { + "epoch": 3.712746858168761, + "grad_norm": 0.9223412275314331, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51700 + }, + { + "epoch": 3.713464991023339, + "grad_norm": 0.8812923431396484, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 51710 + }, + { + "epoch": 3.7141831238779175, + "grad_norm": 0.8242456912994385, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 51720 + }, + { + "epoch": 3.7149012567324955, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 51730 + }, + { + "epoch": 3.7156193895870735, + "grad_norm": 0.8624704480171204, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 51740 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 0.9138273596763611, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51750 + }, + { + "epoch": 3.71705565529623, + "grad_norm": 0.8088571429252625, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 51760 + }, + { + "epoch": 3.717773788150808, + "grad_norm": 0.882808268070221, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 51770 + }, + { + "epoch": 3.718491921005386, + "grad_norm": 0.9368035197257996, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 51780 + }, + { + "epoch": 3.719210053859964, + "grad_norm": 0.8341794013977051, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 51790 + }, + { + "epoch": 3.719928186714542, + "grad_norm": 0.8692073225975037, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 51800 + }, + { + "epoch": 3.7206463195691204, + "grad_norm": 0.7566918730735779, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 51810 + }, + { + "epoch": 3.7213644524236984, + "grad_norm": 1.113138198852539, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 51820 + }, + { + "epoch": 3.7220825852782764, + "grad_norm": 0.8793158531188965, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 51830 + }, + { + "epoch": 3.722800718132855, + "grad_norm": 0.8856439590454102, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 51840 + }, + { + "epoch": 3.723518850987433, + "grad_norm": 1.0182029008865356, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 51850 + }, + { + "epoch": 3.724236983842011, + "grad_norm": 1.1177181005477905, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 51860 + }, + { + "epoch": 3.724955116696589, + "grad_norm": 0.6600990295410156, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 51870 + }, + { + "epoch": 3.725673249551167, + "grad_norm": 1.0563536882400513, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 51880 + }, + { + "epoch": 3.726391382405745, + "grad_norm": 1.1067734956741333, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 51890 + }, + { + "epoch": 3.7271095152603233, + "grad_norm": 1.0204616785049438, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 51900 + }, + { + "epoch": 3.7278276481149013, + "grad_norm": 0.8647155165672302, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51910 + }, + { + "epoch": 3.7285457809694793, + "grad_norm": 1.0754971504211426, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 51920 + }, + { + "epoch": 3.7292639138240573, + "grad_norm": 1.0448992252349854, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 51930 + }, + { + "epoch": 3.7299820466786358, + "grad_norm": 0.963434100151062, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 51940 + }, + { + "epoch": 3.7307001795332138, + "grad_norm": 0.8112701773643494, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51950 + }, + { + "epoch": 3.7314183123877918, + "grad_norm": 0.7975119948387146, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 51960 + }, + { + "epoch": 3.7321364452423698, + "grad_norm": 0.7953376173973083, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 51970 + }, + { + "epoch": 3.7328545780969478, + "grad_norm": 0.9519981741905212, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 51980 + }, + { + "epoch": 3.7335727109515258, + "grad_norm": 0.8705791234970093, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 51990 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 0.870205283164978, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 52000 + }, + { + "epoch": 3.735008976660682, + "grad_norm": 0.9558930993080139, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 52010 + }, + { + "epoch": 3.73572710951526, + "grad_norm": 0.9330434799194336, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 52020 + }, + { + "epoch": 3.7364452423698387, + "grad_norm": 0.783620297908783, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 52030 + }, + { + "epoch": 3.7371633752244167, + "grad_norm": 0.7575166821479797, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52040 + }, + { + "epoch": 3.7378815080789947, + "grad_norm": 1.0592705011367798, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 52050 + }, + { + "epoch": 3.7385996409335727, + "grad_norm": 0.9309433102607727, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 52060 + }, + { + "epoch": 3.7393177737881507, + "grad_norm": 0.972861647605896, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 52070 + }, + { + "epoch": 3.7400359066427287, + "grad_norm": 0.9318740963935852, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 52080 + }, + { + "epoch": 3.740754039497307, + "grad_norm": 0.7938477396965027, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 52090 + }, + { + "epoch": 3.741472172351885, + "grad_norm": 1.1515966653823853, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 52100 + }, + { + "epoch": 3.742190305206463, + "grad_norm": 1.076869010925293, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 52110 + }, + { + "epoch": 3.7429084380610416, + "grad_norm": 0.8516066670417786, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 52120 + }, + { + "epoch": 3.7436265709156196, + "grad_norm": 0.6853429079055786, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 52130 + }, + { + "epoch": 3.7443447037701976, + "grad_norm": 0.8179695010185242, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52140 + }, + { + "epoch": 3.7450628366247756, + "grad_norm": 0.8395232558250427, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 52150 + }, + { + "epoch": 3.7457809694793536, + "grad_norm": 1.0178003311157227, + "learning_rate": 0.0002, + "loss": 0.6902, + "step": 52160 + }, + { + "epoch": 3.7464991023339316, + "grad_norm": 1.1801023483276367, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 52170 + }, + { + "epoch": 3.74721723518851, + "grad_norm": 0.8215751647949219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 52180 + }, + { + "epoch": 3.747935368043088, + "grad_norm": 1.17083740234375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 52190 + }, + { + "epoch": 3.748653500897666, + "grad_norm": 0.9230290651321411, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 52200 + }, + { + "epoch": 3.749371633752244, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 52210 + }, + { + "epoch": 3.7500897666068225, + "grad_norm": 0.9690840244293213, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 52220 + }, + { + "epoch": 3.7508078994614005, + "grad_norm": 1.0022395849227905, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 52230 + }, + { + "epoch": 3.7515260323159785, + "grad_norm": 1.0489065647125244, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 52240 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 0.7880696058273315, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 52250 + }, + { + "epoch": 3.7529622980251345, + "grad_norm": 1.0255829095840454, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 52260 + }, + { + "epoch": 3.7536804308797125, + "grad_norm": 0.8470141291618347, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 52270 + }, + { + "epoch": 3.754398563734291, + "grad_norm": 0.9040523171424866, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 52280 + }, + { + "epoch": 3.755116696588869, + "grad_norm": 0.9564392566680908, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 52290 + }, + { + "epoch": 3.755834829443447, + "grad_norm": 0.907857358455658, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 52300 + }, + { + "epoch": 3.7565529622980254, + "grad_norm": 0.8929873704910278, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 52310 + }, + { + "epoch": 3.7572710951526034, + "grad_norm": 0.854434072971344, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 52320 + }, + { + "epoch": 3.7579892280071814, + "grad_norm": 0.8744779229164124, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 52330 + }, + { + "epoch": 3.7587073608617594, + "grad_norm": 0.9022667407989502, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52340 + }, + { + "epoch": 3.7594254937163374, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52350 + }, + { + "epoch": 3.7601436265709154, + "grad_norm": 1.0228430032730103, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 52360 + }, + { + "epoch": 3.760861759425494, + "grad_norm": 0.8593528270721436, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 52370 + }, + { + "epoch": 3.761579892280072, + "grad_norm": 0.9435563087463379, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 52380 + }, + { + "epoch": 3.76229802513465, + "grad_norm": 0.7545679807662964, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52390 + }, + { + "epoch": 3.7630161579892283, + "grad_norm": 0.9411585927009583, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52400 + }, + { + "epoch": 3.7637342908438063, + "grad_norm": 0.9764377474784851, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 52410 + }, + { + "epoch": 3.7644524236983843, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 52420 + }, + { + "epoch": 3.7651705565529623, + "grad_norm": 0.8765230774879456, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52430 + }, + { + "epoch": 3.7658886894075403, + "grad_norm": 0.9275036454200745, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 52440 + }, + { + "epoch": 3.7666068222621183, + "grad_norm": 0.967410147190094, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 52450 + }, + { + "epoch": 3.7673249551166967, + "grad_norm": 0.7738949060440063, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 52460 + }, + { + "epoch": 3.7680430879712747, + "grad_norm": 1.0828070640563965, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 52470 + }, + { + "epoch": 3.7687612208258527, + "grad_norm": 0.9570213556289673, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 52480 + }, + { + "epoch": 3.7694793536804307, + "grad_norm": 1.0688215494155884, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 52490 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 0.7970073223114014, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 52500 + }, + { + "epoch": 3.770915619389587, + "grad_norm": 0.7132976651191711, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 52510 + }, + { + "epoch": 3.771633752244165, + "grad_norm": 1.152268648147583, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 52520 + }, + { + "epoch": 3.772351885098743, + "grad_norm": 0.8645235896110535, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52530 + }, + { + "epoch": 3.773070017953321, + "grad_norm": 0.7725570201873779, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 52540 + }, + { + "epoch": 3.773788150807899, + "grad_norm": 0.9718102812767029, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 52550 + }, + { + "epoch": 3.7745062836624776, + "grad_norm": 0.7568017840385437, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 52560 + }, + { + "epoch": 3.7752244165170556, + "grad_norm": 0.9578912854194641, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 52570 + }, + { + "epoch": 3.7759425493716336, + "grad_norm": 0.8657314777374268, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 52580 + }, + { + "epoch": 3.776660682226212, + "grad_norm": 0.7564393281936646, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 52590 + }, + { + "epoch": 3.77737881508079, + "grad_norm": 0.7631160616874695, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 52600 + }, + { + "epoch": 3.778096947935368, + "grad_norm": 1.1852056980133057, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 52610 + }, + { + "epoch": 3.778815080789946, + "grad_norm": 1.0620790719985962, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 52620 + }, + { + "epoch": 3.779533213644524, + "grad_norm": 0.8677777647972107, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 52630 + }, + { + "epoch": 3.780251346499102, + "grad_norm": 0.9913218021392822, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 52640 + }, + { + "epoch": 3.7809694793536806, + "grad_norm": 0.9868429899215698, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 52650 + }, + { + "epoch": 3.7816876122082586, + "grad_norm": 0.8791782259941101, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 52660 + }, + { + "epoch": 3.7824057450628366, + "grad_norm": 0.9503955245018005, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 52670 + }, + { + "epoch": 3.7831238779174146, + "grad_norm": 0.8647131323814392, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 52680 + }, + { + "epoch": 3.783842010771993, + "grad_norm": 0.9819629788398743, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52690 + }, + { + "epoch": 3.784560143626571, + "grad_norm": 0.8548610210418701, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 52700 + }, + { + "epoch": 3.785278276481149, + "grad_norm": 0.8706230521202087, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 52710 + }, + { + "epoch": 3.785996409335727, + "grad_norm": 1.0032461881637573, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52720 + }, + { + "epoch": 3.786714542190305, + "grad_norm": 1.0578246116638184, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 52730 + }, + { + "epoch": 3.7874326750448835, + "grad_norm": 0.9854007363319397, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52740 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 0.8389187455177307, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 52750 + }, + { + "epoch": 3.7888689407540395, + "grad_norm": 0.9192399978637695, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 52760 + }, + { + "epoch": 3.7895870736086175, + "grad_norm": 0.9518283605575562, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 52770 + }, + { + "epoch": 3.790305206463196, + "grad_norm": 1.1296825408935547, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52780 + }, + { + "epoch": 3.791023339317774, + "grad_norm": 1.0589144229888916, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 52790 + }, + { + "epoch": 3.791741472172352, + "grad_norm": 0.8954343199729919, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 52800 + }, + { + "epoch": 3.79245960502693, + "grad_norm": 0.8283370733261108, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 52810 + }, + { + "epoch": 3.793177737881508, + "grad_norm": 0.910642683506012, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 52820 + }, + { + "epoch": 3.793895870736086, + "grad_norm": 0.9255108833312988, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 52830 + }, + { + "epoch": 3.7946140035906644, + "grad_norm": 0.8773723244667053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 52840 + }, + { + "epoch": 3.7953321364452424, + "grad_norm": 0.8454240560531616, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 52850 + }, + { + "epoch": 3.7960502692998204, + "grad_norm": 0.7636052966117859, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 52860 + }, + { + "epoch": 3.796768402154399, + "grad_norm": 0.9358382821083069, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 52870 + }, + { + "epoch": 3.797486535008977, + "grad_norm": 0.9662801623344421, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 52880 + }, + { + "epoch": 3.798204667863555, + "grad_norm": 0.995907187461853, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 52890 + }, + { + "epoch": 3.798922800718133, + "grad_norm": 0.8700127005577087, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 52900 + }, + { + "epoch": 3.799640933572711, + "grad_norm": 0.8987792134284973, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 52910 + }, + { + "epoch": 3.800359066427289, + "grad_norm": 0.9753904938697815, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 52920 + }, + { + "epoch": 3.8010771992818673, + "grad_norm": 0.7873555421829224, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 52930 + }, + { + "epoch": 3.8017953321364453, + "grad_norm": 0.8177929520606995, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 52940 + }, + { + "epoch": 3.8025134649910233, + "grad_norm": 0.8865532279014587, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 52950 + }, + { + "epoch": 3.8032315978456013, + "grad_norm": 0.9113775491714478, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 52960 + }, + { + "epoch": 3.8039497307001797, + "grad_norm": 0.9424585700035095, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 52970 + }, + { + "epoch": 3.8046678635547577, + "grad_norm": 0.8347237706184387, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 52980 + }, + { + "epoch": 3.8053859964093357, + "grad_norm": 0.826863169670105, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 52990 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 0.7313310503959656, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 53000 + }, + { + "epoch": 3.8068222621184917, + "grad_norm": 0.8352667093276978, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 53010 + }, + { + "epoch": 3.80754039497307, + "grad_norm": 0.748461127281189, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 53020 + }, + { + "epoch": 3.808258527827648, + "grad_norm": 0.943256139755249, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 53030 + }, + { + "epoch": 3.808976660682226, + "grad_norm": 1.0448410511016846, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 53040 + }, + { + "epoch": 3.809694793536804, + "grad_norm": 0.9047636985778809, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 53050 + }, + { + "epoch": 3.8104129263913826, + "grad_norm": 0.8594381213188171, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 53060 + }, + { + "epoch": 3.8111310592459606, + "grad_norm": 0.7593536972999573, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 53070 + }, + { + "epoch": 3.8118491921005386, + "grad_norm": 0.7189019918441772, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 53080 + }, + { + "epoch": 3.8125673249551166, + "grad_norm": 0.8569809198379517, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53090 + }, + { + "epoch": 3.8132854578096946, + "grad_norm": 0.923378050327301, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53100 + }, + { + "epoch": 3.8140035906642726, + "grad_norm": 0.9088824391365051, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 53110 + }, + { + "epoch": 3.814721723518851, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 53120 + }, + { + "epoch": 3.815439856373429, + "grad_norm": 0.8389552235603333, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 53130 + }, + { + "epoch": 3.816157989228007, + "grad_norm": 0.7940975427627563, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 53140 + }, + { + "epoch": 3.8168761220825855, + "grad_norm": 0.8389907479286194, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 53150 + }, + { + "epoch": 3.8175942549371635, + "grad_norm": 0.774206280708313, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 53160 + }, + { + "epoch": 3.8183123877917415, + "grad_norm": 1.189447283744812, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 53170 + }, + { + "epoch": 3.8190305206463195, + "grad_norm": 0.9875882863998413, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 53180 + }, + { + "epoch": 3.8197486535008975, + "grad_norm": 0.9205945134162903, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 53190 + }, + { + "epoch": 3.8204667863554755, + "grad_norm": 0.8312796354293823, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 53200 + }, + { + "epoch": 3.821184919210054, + "grad_norm": 0.9755756855010986, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 53210 + }, + { + "epoch": 3.821903052064632, + "grad_norm": 1.0722965002059937, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53220 + }, + { + "epoch": 3.82262118491921, + "grad_norm": 0.7720510959625244, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 53230 + }, + { + "epoch": 3.823339317773788, + "grad_norm": 1.020147681236267, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 53240 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 0.8241816759109497, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53250 + }, + { + "epoch": 3.8247755834829444, + "grad_norm": 0.8939895629882812, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 53260 + }, + { + "epoch": 3.8254937163375224, + "grad_norm": 1.010852336883545, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 53270 + }, + { + "epoch": 3.8262118491921004, + "grad_norm": 0.8201420307159424, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 53280 + }, + { + "epoch": 3.8269299820466784, + "grad_norm": 0.8797973990440369, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 53290 + }, + { + "epoch": 3.827648114901257, + "grad_norm": 0.9034950137138367, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 53300 + }, + { + "epoch": 3.828366247755835, + "grad_norm": 0.926802933216095, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 53310 + }, + { + "epoch": 3.829084380610413, + "grad_norm": 1.0205509662628174, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 53320 + }, + { + "epoch": 3.829802513464991, + "grad_norm": 0.9524099230766296, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 53330 + }, + { + "epoch": 3.8305206463195693, + "grad_norm": 0.9692625999450684, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 53340 + }, + { + "epoch": 3.8312387791741473, + "grad_norm": 0.7255275845527649, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 53350 + }, + { + "epoch": 3.8319569120287253, + "grad_norm": 0.7199059724807739, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53360 + }, + { + "epoch": 3.8326750448833034, + "grad_norm": 1.004464864730835, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 53370 + }, + { + "epoch": 3.8333931777378814, + "grad_norm": 0.9092583060264587, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53380 + }, + { + "epoch": 3.8341113105924594, + "grad_norm": 0.945091724395752, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 53390 + }, + { + "epoch": 3.834829443447038, + "grad_norm": 0.7980135679244995, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 53400 + }, + { + "epoch": 3.835547576301616, + "grad_norm": 0.7812868356704712, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 53410 + }, + { + "epoch": 3.836265709156194, + "grad_norm": 0.8957077860832214, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53420 + }, + { + "epoch": 3.8369838420107722, + "grad_norm": 0.9119600653648376, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 53430 + }, + { + "epoch": 3.8377019748653503, + "grad_norm": 0.8208187222480774, + "learning_rate": 0.0002, + "loss": 0.7346, + "step": 53440 + }, + { + "epoch": 3.8384201077199283, + "grad_norm": 0.7930439114570618, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 53450 + }, + { + "epoch": 3.8391382405745063, + "grad_norm": 0.8937777280807495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 53460 + }, + { + "epoch": 3.8398563734290843, + "grad_norm": 0.7583796977996826, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 53470 + }, + { + "epoch": 3.8405745062836623, + "grad_norm": 1.0735969543457031, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 53480 + }, + { + "epoch": 3.8412926391382407, + "grad_norm": 1.1106033325195312, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 53490 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 1.092631220817566, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 53500 + }, + { + "epoch": 3.8427289048473967, + "grad_norm": 0.9961787462234497, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 53510 + }, + { + "epoch": 3.8434470377019747, + "grad_norm": 0.833831250667572, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 53520 + }, + { + "epoch": 3.844165170556553, + "grad_norm": 1.0000009536743164, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 53530 + }, + { + "epoch": 3.844883303411131, + "grad_norm": 0.9784213304519653, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 53540 + }, + { + "epoch": 3.845601436265709, + "grad_norm": 0.8582558035850525, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 53550 + }, + { + "epoch": 3.846319569120287, + "grad_norm": 0.8267415761947632, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 53560 + }, + { + "epoch": 3.847037701974865, + "grad_norm": 0.8783000111579895, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 53570 + }, + { + "epoch": 3.8477558348294436, + "grad_norm": 0.9866999983787537, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 53580 + }, + { + "epoch": 3.8484739676840216, + "grad_norm": 0.8459296226501465, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 53590 + }, + { + "epoch": 3.8491921005385996, + "grad_norm": 0.9804834723472595, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 53600 + }, + { + "epoch": 3.8499102333931776, + "grad_norm": 0.951074481010437, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 53610 + }, + { + "epoch": 3.850628366247756, + "grad_norm": 0.8020104169845581, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 53620 + }, + { + "epoch": 3.851346499102334, + "grad_norm": 0.9296963214874268, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 53630 + }, + { + "epoch": 3.852064631956912, + "grad_norm": 0.8983652591705322, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 53640 + }, + { + "epoch": 3.85278276481149, + "grad_norm": 1.031858205795288, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 53650 + }, + { + "epoch": 3.853500897666068, + "grad_norm": 0.8943952918052673, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 53660 + }, + { + "epoch": 3.854219030520646, + "grad_norm": 1.0072312355041504, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 53670 + }, + { + "epoch": 3.8549371633752245, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 53680 + }, + { + "epoch": 3.8556552962298025, + "grad_norm": 0.834223210811615, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 53690 + }, + { + "epoch": 3.8563734290843805, + "grad_norm": 0.9872867465019226, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 53700 + }, + { + "epoch": 3.857091561938959, + "grad_norm": 0.7999459505081177, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53710 + }, + { + "epoch": 3.857809694793537, + "grad_norm": 0.717722475528717, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 53720 + }, + { + "epoch": 3.858527827648115, + "grad_norm": 1.0675442218780518, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 53730 + }, + { + "epoch": 3.859245960502693, + "grad_norm": 0.9789777398109436, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 53740 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 0.9318669438362122, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 53750 + }, + { + "epoch": 3.860682226211849, + "grad_norm": 0.9848631024360657, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 53760 + }, + { + "epoch": 3.8614003590664274, + "grad_norm": 0.8754391670227051, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 53770 + }, + { + "epoch": 3.8621184919210054, + "grad_norm": 0.9024585485458374, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 53780 + }, + { + "epoch": 3.8628366247755834, + "grad_norm": 0.8974794745445251, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 53790 + }, + { + "epoch": 3.8635547576301614, + "grad_norm": 0.8342790603637695, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 53800 + }, + { + "epoch": 3.86427289048474, + "grad_norm": 0.8177682757377625, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 53810 + }, + { + "epoch": 3.864991023339318, + "grad_norm": 1.0259089469909668, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 53820 + }, + { + "epoch": 3.865709156193896, + "grad_norm": 1.042290210723877, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 53830 + }, + { + "epoch": 3.866427289048474, + "grad_norm": 0.7316540479660034, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 53840 + }, + { + "epoch": 3.867145421903052, + "grad_norm": 0.9384970664978027, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53850 + }, + { + "epoch": 3.86786355475763, + "grad_norm": 0.9273143410682678, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53860 + }, + { + "epoch": 3.8685816876122083, + "grad_norm": 1.1183570623397827, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 53870 + }, + { + "epoch": 3.8692998204667863, + "grad_norm": 0.9455275535583496, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 53880 + }, + { + "epoch": 3.8700179533213643, + "grad_norm": 0.8702114820480347, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 53890 + }, + { + "epoch": 3.870736086175943, + "grad_norm": 0.8751053214073181, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53900 + }, + { + "epoch": 3.871454219030521, + "grad_norm": 0.9793110489845276, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 53910 + }, + { + "epoch": 3.872172351885099, + "grad_norm": 0.9705014824867249, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 53920 + }, + { + "epoch": 3.872890484739677, + "grad_norm": 1.051504373550415, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 53930 + }, + { + "epoch": 3.873608617594255, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 53940 + }, + { + "epoch": 3.874326750448833, + "grad_norm": 0.7828099727630615, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 53950 + }, + { + "epoch": 3.8750448833034112, + "grad_norm": 0.86341792345047, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 53960 + }, + { + "epoch": 3.8757630161579892, + "grad_norm": 1.114670991897583, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 53970 + }, + { + "epoch": 3.8764811490125672, + "grad_norm": 0.8559519052505493, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 53980 + }, + { + "epoch": 3.8771992818671457, + "grad_norm": 1.0518953800201416, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 53990 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 0.7157500982284546, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 54000 + }, + { + "epoch": 3.8786355475763017, + "grad_norm": 0.8390372395515442, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 54010 + }, + { + "epoch": 3.8793536804308797, + "grad_norm": 0.8486756086349487, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 54020 + }, + { + "epoch": 3.8800718132854577, + "grad_norm": 0.8361587524414062, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 54030 + }, + { + "epoch": 3.8807899461400357, + "grad_norm": 0.9490554928779602, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 54040 + }, + { + "epoch": 3.881508078994614, + "grad_norm": 1.0311323404312134, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 54050 + }, + { + "epoch": 3.882226211849192, + "grad_norm": 0.84800124168396, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54060 + }, + { + "epoch": 3.88294434470377, + "grad_norm": 0.8940879702568054, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 54070 + }, + { + "epoch": 3.883662477558348, + "grad_norm": 0.985542356967926, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 54080 + }, + { + "epoch": 3.8843806104129266, + "grad_norm": 0.8846475481987, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 54090 + }, + { + "epoch": 3.8850987432675046, + "grad_norm": 0.9186338186264038, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 54100 + }, + { + "epoch": 3.8858168761220826, + "grad_norm": 1.106598973274231, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 54110 + }, + { + "epoch": 3.8865350089766606, + "grad_norm": 0.8167300224304199, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 54120 + }, + { + "epoch": 3.8872531418312386, + "grad_norm": 0.9153622984886169, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 54130 + }, + { + "epoch": 3.8879712746858166, + "grad_norm": 0.8464475274085999, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 54140 + }, + { + "epoch": 3.888689407540395, + "grad_norm": 0.8889452815055847, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 54150 + }, + { + "epoch": 3.889407540394973, + "grad_norm": 0.7861065864562988, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 54160 + }, + { + "epoch": 3.890125673249551, + "grad_norm": 0.882674515247345, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 54170 + }, + { + "epoch": 3.8908438061041295, + "grad_norm": 0.8503835201263428, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 54180 + }, + { + "epoch": 3.8915619389587075, + "grad_norm": 0.888455331325531, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 54190 + }, + { + "epoch": 3.8922800718132855, + "grad_norm": 1.0473699569702148, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 54200 + }, + { + "epoch": 3.8929982046678635, + "grad_norm": 0.9548208713531494, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 54210 + }, + { + "epoch": 3.8937163375224415, + "grad_norm": 0.9158754944801331, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 54220 + }, + { + "epoch": 3.8944344703770195, + "grad_norm": 0.9001154899597168, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54230 + }, + { + "epoch": 3.895152603231598, + "grad_norm": 0.9736626148223877, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54240 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 0.8809846043586731, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 54250 + }, + { + "epoch": 3.896588868940754, + "grad_norm": 0.887583315372467, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 54260 + }, + { + "epoch": 3.8973070017953324, + "grad_norm": 0.8395712971687317, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 54270 + }, + { + "epoch": 3.8980251346499104, + "grad_norm": 0.8391315937042236, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 54280 + }, + { + "epoch": 3.8987432675044884, + "grad_norm": 0.8210049271583557, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54290 + }, + { + "epoch": 3.8994614003590664, + "grad_norm": 1.1364530324935913, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54300 + }, + { + "epoch": 3.9001795332136444, + "grad_norm": 0.7712056636810303, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 54310 + }, + { + "epoch": 3.9008976660682224, + "grad_norm": 0.9466049671173096, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 54320 + }, + { + "epoch": 3.901615798922801, + "grad_norm": 1.0367140769958496, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 54330 + }, + { + "epoch": 3.902333931777379, + "grad_norm": 1.0168321132659912, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 54340 + }, + { + "epoch": 3.903052064631957, + "grad_norm": 0.7830407619476318, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 54350 + }, + { + "epoch": 3.903770197486535, + "grad_norm": 0.9649789333343506, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 54360 + }, + { + "epoch": 3.9044883303411133, + "grad_norm": 0.681077778339386, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 54370 + }, + { + "epoch": 3.9052064631956913, + "grad_norm": 0.8970136046409607, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 54380 + }, + { + "epoch": 3.9059245960502693, + "grad_norm": 0.9155173301696777, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 54390 + }, + { + "epoch": 3.9066427289048473, + "grad_norm": 1.0447794198989868, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 54400 + }, + { + "epoch": 3.9073608617594253, + "grad_norm": 0.7823813557624817, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 54410 + }, + { + "epoch": 3.9080789946140033, + "grad_norm": 0.9289445877075195, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 54420 + }, + { + "epoch": 3.9087971274685818, + "grad_norm": 0.9983111619949341, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 54430 + }, + { + "epoch": 3.9095152603231598, + "grad_norm": 0.7952495813369751, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 54440 + }, + { + "epoch": 3.9102333931777378, + "grad_norm": 0.8045601844787598, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 54450 + }, + { + "epoch": 3.910951526032316, + "grad_norm": 0.936585009098053, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 54460 + }, + { + "epoch": 3.911669658886894, + "grad_norm": 0.745793879032135, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 54470 + }, + { + "epoch": 3.912387791741472, + "grad_norm": 0.9137616157531738, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 54480 + }, + { + "epoch": 3.9131059245960502, + "grad_norm": 0.826316237449646, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 54490 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 0.94313645362854, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 54500 + }, + { + "epoch": 3.9145421903052062, + "grad_norm": 1.045893907546997, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 54510 + }, + { + "epoch": 3.9152603231597847, + "grad_norm": 0.9122704863548279, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 54520 + }, + { + "epoch": 3.9159784560143627, + "grad_norm": 1.0999689102172852, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 54530 + }, + { + "epoch": 3.9166965888689407, + "grad_norm": 0.9281555414199829, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 54540 + }, + { + "epoch": 3.917414721723519, + "grad_norm": 1.1439622640609741, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 54550 + }, + { + "epoch": 3.918132854578097, + "grad_norm": 0.9375617504119873, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 54560 + }, + { + "epoch": 3.918850987432675, + "grad_norm": 0.92906653881073, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 54570 + }, + { + "epoch": 3.919569120287253, + "grad_norm": 1.0840893983840942, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 54580 + }, + { + "epoch": 3.920287253141831, + "grad_norm": 0.8145509362220764, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 54590 + }, + { + "epoch": 3.921005385996409, + "grad_norm": 0.973737895488739, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 54600 + }, + { + "epoch": 3.9217235188509876, + "grad_norm": 0.9302353858947754, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 54610 + }, + { + "epoch": 3.9224416517055656, + "grad_norm": 0.9167897701263428, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 54620 + }, + { + "epoch": 3.9231597845601436, + "grad_norm": 0.8096851706504822, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 54630 + }, + { + "epoch": 3.9238779174147216, + "grad_norm": 0.8006368279457092, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 54640 + }, + { + "epoch": 3.9245960502693, + "grad_norm": 0.7800863981246948, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 54650 + }, + { + "epoch": 3.925314183123878, + "grad_norm": 1.0331560373306274, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 54660 + }, + { + "epoch": 3.926032315978456, + "grad_norm": 1.0057517290115356, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 54670 + }, + { + "epoch": 3.926750448833034, + "grad_norm": 0.8920564651489258, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 54680 + }, + { + "epoch": 3.927468581687612, + "grad_norm": 0.7704599499702454, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 54690 + }, + { + "epoch": 3.92818671454219, + "grad_norm": 0.827032208442688, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 54700 + }, + { + "epoch": 3.9289048473967685, + "grad_norm": 1.0019268989562988, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 54710 + }, + { + "epoch": 3.9296229802513465, + "grad_norm": 0.862033486366272, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 54720 + }, + { + "epoch": 3.9303411131059245, + "grad_norm": 0.8965592980384827, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 54730 + }, + { + "epoch": 3.931059245960503, + "grad_norm": 0.7689077854156494, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 54740 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 0.846276581287384, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 54750 + }, + { + "epoch": 3.932495511669659, + "grad_norm": 0.8932713866233826, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 54760 + }, + { + "epoch": 3.933213644524237, + "grad_norm": 0.9711386561393738, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 54770 + }, + { + "epoch": 3.933931777378815, + "grad_norm": 0.9290250539779663, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 54780 + }, + { + "epoch": 3.934649910233393, + "grad_norm": 1.0897367000579834, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 54790 + }, + { + "epoch": 3.9353680430879714, + "grad_norm": 0.8451842665672302, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 54800 + }, + { + "epoch": 3.9360861759425494, + "grad_norm": 0.8400090336799622, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 54810 + }, + { + "epoch": 3.9368043087971274, + "grad_norm": 0.951383650302887, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 54820 + }, + { + "epoch": 3.937522441651706, + "grad_norm": 0.848838210105896, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 54830 + }, + { + "epoch": 3.938240574506284, + "grad_norm": 0.735763669013977, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 54840 + }, + { + "epoch": 3.938958707360862, + "grad_norm": 0.979037344455719, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 54850 + }, + { + "epoch": 3.93967684021544, + "grad_norm": 0.933674693107605, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 54860 + }, + { + "epoch": 3.940394973070018, + "grad_norm": 0.835593044757843, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 54870 + }, + { + "epoch": 3.941113105924596, + "grad_norm": 1.0034281015396118, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 54880 + }, + { + "epoch": 3.9418312387791743, + "grad_norm": 0.9732975959777832, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 54890 + }, + { + "epoch": 3.9425493716337523, + "grad_norm": 0.9666336178779602, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54900 + }, + { + "epoch": 3.9432675044883303, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 54910 + }, + { + "epoch": 3.9439856373429083, + "grad_norm": 0.8732092976570129, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 54920 + }, + { + "epoch": 3.9447037701974867, + "grad_norm": 1.139453649520874, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 54930 + }, + { + "epoch": 3.9454219030520647, + "grad_norm": 0.9044837951660156, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 54940 + }, + { + "epoch": 3.9461400359066428, + "grad_norm": 1.0496679544448853, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 54950 + }, + { + "epoch": 3.9468581687612208, + "grad_norm": 1.0099035501480103, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 54960 + }, + { + "epoch": 3.9475763016157988, + "grad_norm": 1.0694963932037354, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 54970 + }, + { + "epoch": 3.9482944344703768, + "grad_norm": 1.0012997388839722, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 54980 + }, + { + "epoch": 3.949012567324955, + "grad_norm": 0.8910513520240784, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 54990 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 1.0267579555511475, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 55000 + }, + { + "epoch": 3.950448833034111, + "grad_norm": 0.9786432385444641, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 55010 + }, + { + "epoch": 3.9511669658886897, + "grad_norm": 0.8703538775444031, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55020 + }, + { + "epoch": 3.9518850987432677, + "grad_norm": 0.8970484137535095, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 55030 + }, + { + "epoch": 3.9526032315978457, + "grad_norm": 0.8781577944755554, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 55040 + }, + { + "epoch": 3.9533213644524237, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 55050 + }, + { + "epoch": 3.9540394973070017, + "grad_norm": 0.851926326751709, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 55060 + }, + { + "epoch": 3.9547576301615797, + "grad_norm": 0.8597240447998047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 55070 + }, + { + "epoch": 3.955475763016158, + "grad_norm": 0.9461944699287415, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55080 + }, + { + "epoch": 3.956193895870736, + "grad_norm": 0.7576611042022705, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 55090 + }, + { + "epoch": 3.956912028725314, + "grad_norm": 0.9484710693359375, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 55100 + }, + { + "epoch": 3.957630161579892, + "grad_norm": 0.9487117528915405, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 55110 + }, + { + "epoch": 3.9583482944344706, + "grad_norm": 0.870090663433075, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55120 + }, + { + "epoch": 3.9590664272890486, + "grad_norm": 0.8496458530426025, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 55130 + }, + { + "epoch": 3.9597845601436266, + "grad_norm": 1.0121779441833496, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 55140 + }, + { + "epoch": 3.9605026929982046, + "grad_norm": 0.8912323713302612, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 55150 + }, + { + "epoch": 3.9612208258527826, + "grad_norm": 0.8398444652557373, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 55160 + }, + { + "epoch": 3.961938958707361, + "grad_norm": 0.8046348690986633, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 55170 + }, + { + "epoch": 3.962657091561939, + "grad_norm": 1.0369254350662231, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 55180 + }, + { + "epoch": 3.963375224416517, + "grad_norm": 1.172431230545044, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 55190 + }, + { + "epoch": 3.964093357271095, + "grad_norm": 0.8093554377555847, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 55200 + }, + { + "epoch": 3.9648114901256735, + "grad_norm": 0.8851078748703003, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 55210 + }, + { + "epoch": 3.9655296229802515, + "grad_norm": 0.7494266033172607, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 55220 + }, + { + "epoch": 3.9662477558348295, + "grad_norm": 0.9556898474693298, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 55230 + }, + { + "epoch": 3.9669658886894075, + "grad_norm": 1.016017198562622, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 55240 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 0.8425998091697693, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 55250 + }, + { + "epoch": 3.9684021543985635, + "grad_norm": 0.717673122882843, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 55260 + }, + { + "epoch": 3.969120287253142, + "grad_norm": 0.8366572856903076, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 55270 + }, + { + "epoch": 3.96983842010772, + "grad_norm": 0.8981583118438721, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 55280 + }, + { + "epoch": 3.970556552962298, + "grad_norm": 0.8868781328201294, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 55290 + }, + { + "epoch": 3.9712746858168764, + "grad_norm": 1.0632785558700562, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 55300 + }, + { + "epoch": 3.9719928186714544, + "grad_norm": 0.8813109993934631, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 55310 + }, + { + "epoch": 3.9727109515260324, + "grad_norm": 0.8225542306900024, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 55320 + }, + { + "epoch": 3.9734290843806104, + "grad_norm": 1.1391420364379883, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 55330 + }, + { + "epoch": 3.9741472172351884, + "grad_norm": 1.0371832847595215, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55340 + }, + { + "epoch": 3.9748653500897664, + "grad_norm": 1.0542186498641968, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 55350 + }, + { + "epoch": 3.975583482944345, + "grad_norm": 1.0178009271621704, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 55360 + }, + { + "epoch": 3.976301615798923, + "grad_norm": 0.7927802205085754, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 55370 + }, + { + "epoch": 3.977019748653501, + "grad_norm": 0.9350495934486389, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55380 + }, + { + "epoch": 3.977737881508079, + "grad_norm": 1.0240116119384766, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 55390 + }, + { + "epoch": 3.9784560143626573, + "grad_norm": 1.0279067754745483, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 55400 + }, + { + "epoch": 3.9791741472172353, + "grad_norm": 1.1228227615356445, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 55410 + }, + { + "epoch": 3.9798922800718133, + "grad_norm": 0.9500134587287903, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 55420 + }, + { + "epoch": 3.9806104129263913, + "grad_norm": 0.9229732155799866, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 55430 + }, + { + "epoch": 3.9813285457809693, + "grad_norm": 0.7946729063987732, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 55440 + }, + { + "epoch": 3.9820466786355477, + "grad_norm": 0.9987489581108093, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 55450 + }, + { + "epoch": 3.9827648114901257, + "grad_norm": 0.9670467972755432, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 55460 + }, + { + "epoch": 3.9834829443447037, + "grad_norm": 0.835028350353241, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 55470 + }, + { + "epoch": 3.9842010771992817, + "grad_norm": 0.8678702712059021, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 55480 + }, + { + "epoch": 3.98491921005386, + "grad_norm": 0.8581197261810303, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 55490 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 0.779848039150238, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 55500 + }, + { + "epoch": 3.986355475763016, + "grad_norm": 0.8827589154243469, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 55510 + }, + { + "epoch": 3.987073608617594, + "grad_norm": 1.0108301639556885, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55520 + }, + { + "epoch": 3.987791741472172, + "grad_norm": 0.8506004214286804, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 55530 + }, + { + "epoch": 3.98850987432675, + "grad_norm": 1.0297727584838867, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 55540 + }, + { + "epoch": 3.9892280071813286, + "grad_norm": 0.8579224944114685, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55550 + }, + { + "epoch": 3.9899461400359066, + "grad_norm": 0.8503788113594055, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 55560 + }, + { + "epoch": 3.9906642728904846, + "grad_norm": 1.1144801378250122, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 55570 + }, + { + "epoch": 3.991382405745063, + "grad_norm": 0.8418305516242981, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 55580 + }, + { + "epoch": 3.992100538599641, + "grad_norm": 1.0065871477127075, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 55590 + }, + { + "epoch": 3.992818671454219, + "grad_norm": 0.8160259127616882, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 55600 + }, + { + "epoch": 3.993536804308797, + "grad_norm": 0.8678009510040283, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55610 + }, + { + "epoch": 3.994254937163375, + "grad_norm": 0.863465428352356, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 55620 + }, + { + "epoch": 3.994973070017953, + "grad_norm": 0.9242135286331177, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 55630 + }, + { + "epoch": 3.9956912028725315, + "grad_norm": 1.0285470485687256, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 55640 + }, + { + "epoch": 3.9964093357271095, + "grad_norm": 0.8953320384025574, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 55650 + }, + { + "epoch": 3.9971274685816875, + "grad_norm": 0.915892481803894, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 55660 + }, + { + "epoch": 3.9978456014362656, + "grad_norm": 0.8235118985176086, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 55670 + }, + { + "epoch": 3.998563734290844, + "grad_norm": 1.0178656578063965, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 55680 + }, + { + "epoch": 3.999281867145422, + "grad_norm": 0.9926803708076477, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 55690 + }, + { + "epoch": 4.0, + "grad_norm": 0.9213629961013794, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 55700 + }, + { + "epoch": 4.0, + "eval_loss": 1.1152480840682983, + "eval_runtime": 55.2237, + "eval_samples_per_second": 13.273, + "eval_steps_per_second": 1.666, + "step": 55700 + }, + { + "epoch": 4.000718132854578, + "grad_norm": 1.0820496082305908, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 55710 + }, + { + "epoch": 4.001436265709156, + "grad_norm": 0.9036441445350647, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 55720 + }, + { + "epoch": 4.002154398563734, + "grad_norm": 1.102754831314087, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 55730 + }, + { + "epoch": 4.002872531418312, + "grad_norm": 0.98259437084198, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 55740 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 1.1935845613479614, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 55750 + }, + { + "epoch": 4.004308797127469, + "grad_norm": 0.9925830960273743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 55760 + }, + { + "epoch": 4.005026929982047, + "grad_norm": 1.075087070465088, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 55770 + }, + { + "epoch": 4.005745062836625, + "grad_norm": 0.8746396899223328, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 55780 + }, + { + "epoch": 4.006463195691203, + "grad_norm": 0.7635995745658875, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 55790 + }, + { + "epoch": 4.007181328545781, + "grad_norm": 0.9064885377883911, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 55800 + }, + { + "epoch": 4.007899461400359, + "grad_norm": 1.018478274345398, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 55810 + }, + { + "epoch": 4.008617594254937, + "grad_norm": 0.9797589778900146, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 55820 + }, + { + "epoch": 4.009335727109515, + "grad_norm": 0.7867457866668701, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 55830 + }, + { + "epoch": 4.010053859964093, + "grad_norm": 0.9998070597648621, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 55840 + }, + { + "epoch": 4.010771992818672, + "grad_norm": 0.8656311631202698, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 55850 + }, + { + "epoch": 4.01149012567325, + "grad_norm": 0.945469081401825, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 55860 + }, + { + "epoch": 4.012208258527828, + "grad_norm": 0.8809926509857178, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 55870 + }, + { + "epoch": 4.012926391382406, + "grad_norm": 0.8047897219657898, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 55880 + }, + { + "epoch": 4.013644524236984, + "grad_norm": 1.0563900470733643, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 55890 + }, + { + "epoch": 4.014362657091562, + "grad_norm": 0.8578300476074219, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 55900 + }, + { + "epoch": 4.01508078994614, + "grad_norm": 1.0304765701293945, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 55910 + }, + { + "epoch": 4.015798922800718, + "grad_norm": 0.8087666034698486, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 55920 + }, + { + "epoch": 4.016517055655296, + "grad_norm": 1.0192348957061768, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 55930 + }, + { + "epoch": 4.017235188509875, + "grad_norm": 1.061194658279419, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 55940 + }, + { + "epoch": 4.017953321364453, + "grad_norm": 0.93668133020401, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 55950 + }, + { + "epoch": 4.018671454219031, + "grad_norm": 1.1569286584854126, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 55960 + }, + { + "epoch": 4.019389587073609, + "grad_norm": 0.9853817224502563, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 55970 + }, + { + "epoch": 4.020107719928187, + "grad_norm": 0.851109504699707, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 55980 + }, + { + "epoch": 4.020825852782765, + "grad_norm": 1.053525447845459, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 55990 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 0.8307225704193115, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 56000 + }, + { + "epoch": 4.022262118491921, + "grad_norm": 1.2741150856018066, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 56010 + }, + { + "epoch": 4.022980251346499, + "grad_norm": 0.9708344340324402, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 56020 + }, + { + "epoch": 4.023698384201078, + "grad_norm": 1.265034556388855, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 56030 + }, + { + "epoch": 4.024416517055656, + "grad_norm": 0.9364367723464966, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 56040 + }, + { + "epoch": 4.025134649910234, + "grad_norm": 0.8643592000007629, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 56050 + }, + { + "epoch": 4.025852782764812, + "grad_norm": 0.9742133021354675, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 56060 + }, + { + "epoch": 4.02657091561939, + "grad_norm": 1.1793473958969116, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 56070 + }, + { + "epoch": 4.027289048473968, + "grad_norm": 0.9641149044036865, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 56080 + }, + { + "epoch": 4.028007181328546, + "grad_norm": 0.9426136016845703, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 56090 + }, + { + "epoch": 4.028725314183124, + "grad_norm": 0.9211869835853577, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 56100 + }, + { + "epoch": 4.029443447037702, + "grad_norm": 1.1576565504074097, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56110 + }, + { + "epoch": 4.03016157989228, + "grad_norm": 1.0014013051986694, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 56120 + }, + { + "epoch": 4.0308797127468585, + "grad_norm": 0.9307010769844055, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56130 + }, + { + "epoch": 4.0315978456014365, + "grad_norm": 0.8290148377418518, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 56140 + }, + { + "epoch": 4.0323159784560145, + "grad_norm": 1.0648446083068848, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 56150 + }, + { + "epoch": 4.0330341113105925, + "grad_norm": 1.1545547246932983, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 56160 + }, + { + "epoch": 4.0337522441651705, + "grad_norm": 0.9643545150756836, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 56170 + }, + { + "epoch": 4.0344703770197485, + "grad_norm": 0.8913900256156921, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56180 + }, + { + "epoch": 4.0351885098743265, + "grad_norm": 0.9445754289627075, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 56190 + }, + { + "epoch": 4.0359066427289045, + "grad_norm": 0.9353124499320984, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 56200 + }, + { + "epoch": 4.0366247755834825, + "grad_norm": 1.1780431270599365, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56210 + }, + { + "epoch": 4.037342908438061, + "grad_norm": 0.9208880662918091, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 56220 + }, + { + "epoch": 4.038061041292639, + "grad_norm": 0.9475517272949219, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 56230 + }, + { + "epoch": 4.038779174147217, + "grad_norm": 0.7478583455085754, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 56240 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 1.0026403665542603, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 56250 + }, + { + "epoch": 4.040215439856373, + "grad_norm": 0.9664973020553589, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 56260 + }, + { + "epoch": 4.040933572710951, + "grad_norm": 1.0655616521835327, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 56270 + }, + { + "epoch": 4.041651705565529, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 56280 + }, + { + "epoch": 4.042369838420107, + "grad_norm": 0.7982191443443298, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 56290 + }, + { + "epoch": 4.043087971274685, + "grad_norm": 0.8304495215415955, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 56300 + }, + { + "epoch": 4.043806104129264, + "grad_norm": 0.95123291015625, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56310 + }, + { + "epoch": 4.044524236983842, + "grad_norm": 0.9504102468490601, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 56320 + }, + { + "epoch": 4.04524236983842, + "grad_norm": 0.7432710528373718, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 56330 + }, + { + "epoch": 4.045960502692998, + "grad_norm": 0.9327874183654785, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 56340 + }, + { + "epoch": 4.046678635547576, + "grad_norm": 0.9161670804023743, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 56350 + }, + { + "epoch": 4.047396768402154, + "grad_norm": 0.9371771812438965, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 56360 + }, + { + "epoch": 4.048114901256732, + "grad_norm": 1.0332437753677368, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 56370 + }, + { + "epoch": 4.04883303411131, + "grad_norm": 0.7346320748329163, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 56380 + }, + { + "epoch": 4.049551166965888, + "grad_norm": 0.8247857689857483, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 56390 + }, + { + "epoch": 4.050269299820466, + "grad_norm": 0.925325334072113, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 56400 + }, + { + "epoch": 4.050987432675045, + "grad_norm": 0.7344088554382324, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 56410 + }, + { + "epoch": 4.051705565529623, + "grad_norm": 0.9204918146133423, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 56420 + }, + { + "epoch": 4.052423698384201, + "grad_norm": 0.8273472785949707, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 56430 + }, + { + "epoch": 4.053141831238779, + "grad_norm": 0.9524998068809509, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 56440 + }, + { + "epoch": 4.053859964093357, + "grad_norm": 0.9168205857276917, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 56450 + }, + { + "epoch": 4.054578096947935, + "grad_norm": 0.9634994864463806, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 56460 + }, + { + "epoch": 4.055296229802513, + "grad_norm": 1.2027593851089478, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 56470 + }, + { + "epoch": 4.056014362657091, + "grad_norm": 1.2347805500030518, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 56480 + }, + { + "epoch": 4.056732495511669, + "grad_norm": 0.8621458411216736, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56490 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 0.9194608330726624, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 56500 + }, + { + "epoch": 4.058168761220826, + "grad_norm": 1.0153663158416748, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 56510 + }, + { + "epoch": 4.058886894075404, + "grad_norm": 0.9170986413955688, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 56520 + }, + { + "epoch": 4.059605026929982, + "grad_norm": 1.033057689666748, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 56530 + }, + { + "epoch": 4.06032315978456, + "grad_norm": 1.0125197172164917, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 56540 + }, + { + "epoch": 4.061041292639138, + "grad_norm": 0.9429898262023926, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 56550 + }, + { + "epoch": 4.061759425493716, + "grad_norm": 0.9242179989814758, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56560 + }, + { + "epoch": 4.062477558348294, + "grad_norm": 0.9365091323852539, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 56570 + }, + { + "epoch": 4.063195691202872, + "grad_norm": 0.9148455858230591, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 56580 + }, + { + "epoch": 4.063913824057451, + "grad_norm": 0.8546709418296814, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 56590 + }, + { + "epoch": 4.064631956912029, + "grad_norm": 0.9743902087211609, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 56600 + }, + { + "epoch": 4.065350089766607, + "grad_norm": 1.0599974393844604, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56610 + }, + { + "epoch": 4.066068222621185, + "grad_norm": 0.9677841067314148, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 56620 + }, + { + "epoch": 4.066786355475763, + "grad_norm": 0.8892754316329956, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 56630 + }, + { + "epoch": 4.067504488330341, + "grad_norm": 0.8837814331054688, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 56640 + }, + { + "epoch": 4.068222621184919, + "grad_norm": 0.9284095764160156, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 56650 + }, + { + "epoch": 4.068940754039497, + "grad_norm": 1.0163567066192627, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 56660 + }, + { + "epoch": 4.069658886894075, + "grad_norm": 0.8713456988334656, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 56670 + }, + { + "epoch": 4.070377019748653, + "grad_norm": 0.8356686234474182, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 56680 + }, + { + "epoch": 4.071095152603232, + "grad_norm": 0.8998766541481018, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 56690 + }, + { + "epoch": 4.07181328545781, + "grad_norm": 1.0441967248916626, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 56700 + }, + { + "epoch": 4.072531418312388, + "grad_norm": 0.9313125610351562, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 56710 + }, + { + "epoch": 4.073249551166966, + "grad_norm": 0.9912964701652527, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 56720 + }, + { + "epoch": 4.073967684021544, + "grad_norm": 0.9048459529876709, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56730 + }, + { + "epoch": 4.074685816876122, + "grad_norm": 1.0248944759368896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 56740 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 1.4526786804199219, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 56750 + }, + { + "epoch": 4.076122082585278, + "grad_norm": 0.9813178181648254, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 56760 + }, + { + "epoch": 4.076840215439856, + "grad_norm": 1.0686813592910767, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 56770 + }, + { + "epoch": 4.077558348294435, + "grad_norm": 1.1093482971191406, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 56780 + }, + { + "epoch": 4.078276481149013, + "grad_norm": 0.9377819895744324, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 56790 + }, + { + "epoch": 4.078994614003591, + "grad_norm": 0.8043649196624756, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 56800 + }, + { + "epoch": 4.079712746858169, + "grad_norm": 0.7995415925979614, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 56810 + }, + { + "epoch": 4.080430879712747, + "grad_norm": 1.0076148509979248, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 56820 + }, + { + "epoch": 4.081149012567325, + "grad_norm": 0.8192076683044434, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 56830 + }, + { + "epoch": 4.081867145421903, + "grad_norm": 0.9226266145706177, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 56840 + }, + { + "epoch": 4.082585278276481, + "grad_norm": 0.8877972960472107, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 56850 + }, + { + "epoch": 4.083303411131059, + "grad_norm": 0.9578937888145447, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 56860 + }, + { + "epoch": 4.084021543985638, + "grad_norm": 0.8929167985916138, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 56870 + }, + { + "epoch": 4.084739676840216, + "grad_norm": 1.0015977621078491, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 56880 + }, + { + "epoch": 4.085457809694794, + "grad_norm": 0.9768750667572021, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 56890 + }, + { + "epoch": 4.086175942549372, + "grad_norm": 1.0834569931030273, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 56900 + }, + { + "epoch": 4.08689407540395, + "grad_norm": 0.8761230707168579, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 56910 + }, + { + "epoch": 4.087612208258528, + "grad_norm": 1.027064323425293, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 56920 + }, + { + "epoch": 4.088330341113106, + "grad_norm": 1.130336880683899, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 56930 + }, + { + "epoch": 4.089048473967684, + "grad_norm": 0.8157579898834229, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 56940 + }, + { + "epoch": 4.089766606822262, + "grad_norm": 1.071175217628479, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56950 + }, + { + "epoch": 4.09048473967684, + "grad_norm": 0.9534492492675781, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 56960 + }, + { + "epoch": 4.091202872531419, + "grad_norm": 0.9584037661552429, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 56970 + }, + { + "epoch": 4.091921005385997, + "grad_norm": 1.1513131856918335, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 56980 + }, + { + "epoch": 4.092639138240575, + "grad_norm": 1.0167666673660278, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 56990 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 1.0630987882614136, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57000 + }, + { + "epoch": 4.094075403949731, + "grad_norm": 1.0326893329620361, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 57010 + }, + { + "epoch": 4.094793536804309, + "grad_norm": 0.9701678156852722, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 57020 + }, + { + "epoch": 4.095511669658887, + "grad_norm": 0.839935302734375, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57030 + }, + { + "epoch": 4.096229802513465, + "grad_norm": 0.8995838761329651, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 57040 + }, + { + "epoch": 4.096947935368043, + "grad_norm": 0.8039916157722473, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 57050 + }, + { + "epoch": 4.097666068222622, + "grad_norm": 1.126122236251831, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 57060 + }, + { + "epoch": 4.0983842010772, + "grad_norm": 0.8749837875366211, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 57070 + }, + { + "epoch": 4.099102333931778, + "grad_norm": 0.8630341291427612, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 57080 + }, + { + "epoch": 4.099820466786356, + "grad_norm": 0.8889496922492981, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 57090 + }, + { + "epoch": 4.100538599640934, + "grad_norm": 0.9050310254096985, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 57100 + }, + { + "epoch": 4.101256732495512, + "grad_norm": 0.943072497844696, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 57110 + }, + { + "epoch": 4.10197486535009, + "grad_norm": 0.9031552672386169, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 57120 + }, + { + "epoch": 4.102692998204668, + "grad_norm": 0.939862847328186, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 57130 + }, + { + "epoch": 4.103411131059246, + "grad_norm": 0.8080634474754333, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 57140 + }, + { + "epoch": 4.1041292639138245, + "grad_norm": 0.9181693196296692, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 57150 + }, + { + "epoch": 4.1048473967684025, + "grad_norm": 0.9609217643737793, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 57160 + }, + { + "epoch": 4.1055655296229805, + "grad_norm": 1.1246516704559326, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 57170 + }, + { + "epoch": 4.1062836624775585, + "grad_norm": 1.0616880655288696, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 57180 + }, + { + "epoch": 4.1070017953321365, + "grad_norm": 0.9954505562782288, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 57190 + }, + { + "epoch": 4.1077199281867145, + "grad_norm": 1.0602279901504517, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 57200 + }, + { + "epoch": 4.1084380610412925, + "grad_norm": 0.8984764814376831, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 57210 + }, + { + "epoch": 4.1091561938958705, + "grad_norm": 0.845167875289917, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 57220 + }, + { + "epoch": 4.1098743267504485, + "grad_norm": 0.7901500463485718, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 57230 + }, + { + "epoch": 4.1105924596050265, + "grad_norm": 1.0462526082992554, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 57240 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 0.9098827838897705, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 57250 + }, + { + "epoch": 4.112028725314183, + "grad_norm": 0.9234077334403992, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 57260 + }, + { + "epoch": 4.112746858168761, + "grad_norm": 1.0033560991287231, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 57270 + }, + { + "epoch": 4.113464991023339, + "grad_norm": 1.0620051622390747, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 57280 + }, + { + "epoch": 4.114183123877917, + "grad_norm": 0.8679345846176147, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 57290 + }, + { + "epoch": 4.114901256732495, + "grad_norm": 0.7557345628738403, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 57300 + }, + { + "epoch": 4.115619389587073, + "grad_norm": 0.8970935344696045, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 57310 + }, + { + "epoch": 4.116337522441651, + "grad_norm": 1.0779842138290405, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 57320 + }, + { + "epoch": 4.117055655296229, + "grad_norm": 1.2036106586456299, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 57330 + }, + { + "epoch": 4.117773788150808, + "grad_norm": 0.8337953686714172, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 57340 + }, + { + "epoch": 4.118491921005386, + "grad_norm": 0.9850410223007202, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 57350 + }, + { + "epoch": 4.119210053859964, + "grad_norm": 0.8028770685195923, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 57360 + }, + { + "epoch": 4.119928186714542, + "grad_norm": 0.8693217039108276, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 57370 + }, + { + "epoch": 4.12064631956912, + "grad_norm": 0.8795534372329712, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 57380 + }, + { + "epoch": 4.121364452423698, + "grad_norm": 1.0081543922424316, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 57390 + }, + { + "epoch": 4.122082585278276, + "grad_norm": 0.8776742219924927, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 57400 + }, + { + "epoch": 4.122800718132854, + "grad_norm": 0.8247824311256409, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 57410 + }, + { + "epoch": 4.123518850987432, + "grad_norm": 1.1346335411071777, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 57420 + }, + { + "epoch": 4.124236983842011, + "grad_norm": 1.0671089887619019, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 57430 + }, + { + "epoch": 4.124955116696589, + "grad_norm": 0.8548333048820496, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 57440 + }, + { + "epoch": 4.125673249551167, + "grad_norm": 1.0221573114395142, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 57450 + }, + { + "epoch": 4.126391382405745, + "grad_norm": 0.9746617674827576, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 57460 + }, + { + "epoch": 4.127109515260323, + "grad_norm": 0.8104965090751648, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 57470 + }, + { + "epoch": 4.127827648114901, + "grad_norm": 1.0401487350463867, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 57480 + }, + { + "epoch": 4.128545780969479, + "grad_norm": 0.8828882575035095, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57490 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 1.0121098756790161, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 57500 + }, + { + "epoch": 4.129982046678635, + "grad_norm": 0.8789737820625305, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 57510 + }, + { + "epoch": 4.130700179533213, + "grad_norm": 1.0386744737625122, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 57520 + }, + { + "epoch": 4.131418312387792, + "grad_norm": 1.0092610120773315, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 57530 + }, + { + "epoch": 4.13213644524237, + "grad_norm": 0.8706282377243042, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 57540 + }, + { + "epoch": 4.132854578096948, + "grad_norm": 0.9270507097244263, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 57550 + }, + { + "epoch": 4.133572710951526, + "grad_norm": 1.0303068161010742, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 57560 + }, + { + "epoch": 4.134290843806104, + "grad_norm": 1.1169062852859497, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 57570 + }, + { + "epoch": 4.135008976660682, + "grad_norm": 0.8530599474906921, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 57580 + }, + { + "epoch": 4.13572710951526, + "grad_norm": 1.1395039558410645, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 57590 + }, + { + "epoch": 4.136445242369838, + "grad_norm": 0.8944115042686462, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 57600 + }, + { + "epoch": 4.137163375224416, + "grad_norm": 1.137966275215149, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 57610 + }, + { + "epoch": 4.137881508078995, + "grad_norm": 0.8244962692260742, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 57620 + }, + { + "epoch": 4.138599640933573, + "grad_norm": 1.1935817003250122, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 57630 + }, + { + "epoch": 4.139317773788151, + "grad_norm": 0.9774235486984253, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 57640 + }, + { + "epoch": 4.140035906642729, + "grad_norm": 1.066219449043274, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 57650 + }, + { + "epoch": 4.140754039497307, + "grad_norm": 0.8631396293640137, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 57660 + }, + { + "epoch": 4.141472172351885, + "grad_norm": 0.888410747051239, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 57670 + }, + { + "epoch": 4.142190305206463, + "grad_norm": 1.002642035484314, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 57680 + }, + { + "epoch": 4.142908438061041, + "grad_norm": 1.0092825889587402, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 57690 + }, + { + "epoch": 4.143626570915619, + "grad_norm": 0.9126971364021301, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 57700 + }, + { + "epoch": 4.144344703770198, + "grad_norm": 1.0303562879562378, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 57710 + }, + { + "epoch": 4.145062836624776, + "grad_norm": 1.1230897903442383, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 57720 + }, + { + "epoch": 4.145780969479354, + "grad_norm": 1.0494099855422974, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57730 + }, + { + "epoch": 4.146499102333932, + "grad_norm": 0.9555442333221436, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 57740 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 0.8255124092102051, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 57750 + }, + { + "epoch": 4.147935368043088, + "grad_norm": 1.097853660583496, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 57760 + }, + { + "epoch": 4.148653500897666, + "grad_norm": 1.0272663831710815, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 57770 + }, + { + "epoch": 4.149371633752244, + "grad_norm": 1.022571086883545, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 57780 + }, + { + "epoch": 4.150089766606822, + "grad_norm": 0.964543342590332, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 57790 + }, + { + "epoch": 4.1508078994614, + "grad_norm": 0.9251219034194946, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 57800 + }, + { + "epoch": 4.151526032315979, + "grad_norm": 1.081840991973877, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 57810 + }, + { + "epoch": 4.152244165170557, + "grad_norm": 0.8989445567131042, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57820 + }, + { + "epoch": 4.152962298025135, + "grad_norm": 0.903629720211029, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 57830 + }, + { + "epoch": 4.153680430879713, + "grad_norm": 0.8985397219657898, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 57840 + }, + { + "epoch": 4.154398563734291, + "grad_norm": 1.047778844833374, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 57850 + }, + { + "epoch": 4.155116696588869, + "grad_norm": 0.9804165363311768, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 57860 + }, + { + "epoch": 4.155834829443447, + "grad_norm": 1.187309980392456, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57870 + }, + { + "epoch": 4.156552962298025, + "grad_norm": 0.9854836463928223, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 57880 + }, + { + "epoch": 4.157271095152603, + "grad_norm": 0.8494308590888977, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 57890 + }, + { + "epoch": 4.157989228007182, + "grad_norm": 0.9359684586524963, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 57900 + }, + { + "epoch": 4.15870736086176, + "grad_norm": 0.8971988558769226, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 57910 + }, + { + "epoch": 4.159425493716338, + "grad_norm": 0.8848021030426025, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57920 + }, + { + "epoch": 4.160143626570916, + "grad_norm": 0.982877790927887, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 57930 + }, + { + "epoch": 4.160861759425494, + "grad_norm": 0.8668819069862366, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 57940 + }, + { + "epoch": 4.161579892280072, + "grad_norm": 1.06569504737854, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 57950 + }, + { + "epoch": 4.16229802513465, + "grad_norm": 1.165740728378296, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 57960 + }, + { + "epoch": 4.163016157989228, + "grad_norm": 1.0534512996673584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 57970 + }, + { + "epoch": 4.163734290843806, + "grad_norm": 0.8785330653190613, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 57980 + }, + { + "epoch": 4.164452423698384, + "grad_norm": 1.1244874000549316, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57990 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 0.8839399218559265, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 58000 + }, + { + "epoch": 4.165888689407541, + "grad_norm": 1.0603798627853394, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 58010 + }, + { + "epoch": 4.166606822262119, + "grad_norm": 0.9737853407859802, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 58020 + }, + { + "epoch": 4.167324955116697, + "grad_norm": 1.0650558471679688, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 58030 + }, + { + "epoch": 4.168043087971275, + "grad_norm": 0.7528959512710571, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 58040 + }, + { + "epoch": 4.168761220825853, + "grad_norm": 0.9286156892776489, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 58050 + }, + { + "epoch": 4.169479353680431, + "grad_norm": 1.0225880146026611, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 58060 + }, + { + "epoch": 4.170197486535009, + "grad_norm": 0.9990654587745667, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58070 + }, + { + "epoch": 4.170915619389587, + "grad_norm": 1.052057147026062, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 58080 + }, + { + "epoch": 4.1716337522441655, + "grad_norm": 0.7366801500320435, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 58090 + }, + { + "epoch": 4.1723518850987436, + "grad_norm": 1.0943711996078491, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 58100 + }, + { + "epoch": 4.1730700179533216, + "grad_norm": 1.1297656297683716, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 58110 + }, + { + "epoch": 4.1737881508078996, + "grad_norm": 0.7861461639404297, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 58120 + }, + { + "epoch": 4.174506283662478, + "grad_norm": 0.8643335103988647, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 58130 + }, + { + "epoch": 4.175224416517056, + "grad_norm": 0.957288384437561, + "learning_rate": 0.0002, + "loss": 0.6103, + "step": 58140 + }, + { + "epoch": 4.175942549371634, + "grad_norm": 0.9175366759300232, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 58150 + }, + { + "epoch": 4.176660682226212, + "grad_norm": 1.129935622215271, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 58160 + }, + { + "epoch": 4.17737881508079, + "grad_norm": 0.9683087468147278, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 58170 + }, + { + "epoch": 4.1780969479353685, + "grad_norm": 1.045171856880188, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 58180 + }, + { + "epoch": 4.1788150807899465, + "grad_norm": 0.9858742952346802, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 58190 + }, + { + "epoch": 4.1795332136445245, + "grad_norm": 0.8513413071632385, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 58200 + }, + { + "epoch": 4.1802513464991025, + "grad_norm": 0.9584265947341919, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58210 + }, + { + "epoch": 4.1809694793536805, + "grad_norm": 0.8828920722007751, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 58220 + }, + { + "epoch": 4.1816876122082585, + "grad_norm": 0.9849961400032043, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 58230 + }, + { + "epoch": 4.1824057450628365, + "grad_norm": 1.0601637363433838, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 58240 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 1.2206604480743408, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 58250 + }, + { + "epoch": 4.1838420107719925, + "grad_norm": 1.1768009662628174, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 58260 + }, + { + "epoch": 4.184560143626571, + "grad_norm": 0.9521295428276062, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 58270 + }, + { + "epoch": 4.185278276481149, + "grad_norm": 0.892971932888031, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 58280 + }, + { + "epoch": 4.185996409335727, + "grad_norm": 0.8712016940116882, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 58290 + }, + { + "epoch": 4.186714542190305, + "grad_norm": 1.0190843343734741, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 58300 + }, + { + "epoch": 4.187432675044883, + "grad_norm": 1.0149270296096802, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 58310 + }, + { + "epoch": 4.188150807899461, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 58320 + }, + { + "epoch": 4.188868940754039, + "grad_norm": 0.7892335653305054, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 58330 + }, + { + "epoch": 4.189587073608617, + "grad_norm": 0.9792808890342712, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58340 + }, + { + "epoch": 4.190305206463195, + "grad_norm": 0.9946883320808411, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 58350 + }, + { + "epoch": 4.191023339317773, + "grad_norm": 1.0363789796829224, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 58360 + }, + { + "epoch": 4.191741472172352, + "grad_norm": 0.9285917282104492, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 58370 + }, + { + "epoch": 4.19245960502693, + "grad_norm": 0.9461679458618164, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 58380 + }, + { + "epoch": 4.193177737881508, + "grad_norm": 1.0344175100326538, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 58390 + }, + { + "epoch": 4.193895870736086, + "grad_norm": 0.9530242085456848, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 58400 + }, + { + "epoch": 4.194614003590664, + "grad_norm": 0.9171900749206543, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58410 + }, + { + "epoch": 4.195332136445242, + "grad_norm": 0.8094898462295532, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 58420 + }, + { + "epoch": 4.19605026929982, + "grad_norm": 0.921981930732727, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 58430 + }, + { + "epoch": 4.196768402154398, + "grad_norm": 0.9783532023429871, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 58440 + }, + { + "epoch": 4.197486535008976, + "grad_norm": 1.017805576324463, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 58450 + }, + { + "epoch": 4.198204667863555, + "grad_norm": 0.9244308471679688, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 58460 + }, + { + "epoch": 4.198922800718133, + "grad_norm": 0.9942585229873657, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 58470 + }, + { + "epoch": 4.199640933572711, + "grad_norm": 1.1045037508010864, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 58480 + }, + { + "epoch": 4.200359066427289, + "grad_norm": 0.9483149647712708, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58490 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 1.0807271003723145, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 58500 + }, + { + "epoch": 4.201795332136445, + "grad_norm": 0.7697445750236511, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 58510 + }, + { + "epoch": 4.202513464991023, + "grad_norm": 1.0761178731918335, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 58520 + }, + { + "epoch": 4.203231597845601, + "grad_norm": 0.9992024898529053, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 58530 + }, + { + "epoch": 4.203949730700179, + "grad_norm": 0.8741498589515686, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 58540 + }, + { + "epoch": 4.204667863554757, + "grad_norm": 0.8557528853416443, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 58550 + }, + { + "epoch": 4.205385996409336, + "grad_norm": 0.8853630423545837, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 58560 + }, + { + "epoch": 4.206104129263914, + "grad_norm": 0.9858933687210083, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 58570 + }, + { + "epoch": 4.206822262118492, + "grad_norm": 1.104732871055603, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 58580 + }, + { + "epoch": 4.20754039497307, + "grad_norm": 0.9345462322235107, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58590 + }, + { + "epoch": 4.208258527827648, + "grad_norm": 0.9620407819747925, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 58600 + }, + { + "epoch": 4.208976660682226, + "grad_norm": 0.8546963334083557, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 58610 + }, + { + "epoch": 4.209694793536804, + "grad_norm": 0.8125145435333252, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 58620 + }, + { + "epoch": 4.210412926391382, + "grad_norm": 0.8481138944625854, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 58630 + }, + { + "epoch": 4.21113105924596, + "grad_norm": 0.8884692788124084, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58640 + }, + { + "epoch": 4.211849192100539, + "grad_norm": 1.09279465675354, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 58650 + }, + { + "epoch": 4.212567324955117, + "grad_norm": 0.9806583523750305, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 58660 + }, + { + "epoch": 4.213285457809695, + "grad_norm": 0.9510366916656494, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 58670 + }, + { + "epoch": 4.214003590664273, + "grad_norm": 0.7517459988594055, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 58680 + }, + { + "epoch": 4.214721723518851, + "grad_norm": 1.1134123802185059, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 58690 + }, + { + "epoch": 4.215439856373429, + "grad_norm": 0.8307328820228577, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 58700 + }, + { + "epoch": 4.216157989228007, + "grad_norm": 0.8211639523506165, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 58710 + }, + { + "epoch": 4.216876122082585, + "grad_norm": 1.0749584436416626, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 58720 + }, + { + "epoch": 4.217594254937163, + "grad_norm": 1.1394833326339722, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58730 + }, + { + "epoch": 4.218312387791742, + "grad_norm": 1.05130934715271, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 58740 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 0.7949456572532654, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 58750 + }, + { + "epoch": 4.219748653500898, + "grad_norm": 0.906506359577179, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 58760 + }, + { + "epoch": 4.220466786355476, + "grad_norm": 0.8338989615440369, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 58770 + }, + { + "epoch": 4.221184919210054, + "grad_norm": 0.9325370788574219, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 58780 + }, + { + "epoch": 4.221903052064632, + "grad_norm": 1.0208096504211426, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 58790 + }, + { + "epoch": 4.22262118491921, + "grad_norm": 1.0075920820236206, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 58800 + }, + { + "epoch": 4.223339317773788, + "grad_norm": 0.9858701229095459, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 58810 + }, + { + "epoch": 4.224057450628366, + "grad_norm": 1.0010110139846802, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 58820 + }, + { + "epoch": 4.224775583482945, + "grad_norm": 0.9360540509223938, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 58830 + }, + { + "epoch": 4.225493716337523, + "grad_norm": 0.9021786451339722, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58840 + }, + { + "epoch": 4.226211849192101, + "grad_norm": 1.1778476238250732, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58850 + }, + { + "epoch": 4.226929982046679, + "grad_norm": 1.0061023235321045, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58860 + }, + { + "epoch": 4.227648114901257, + "grad_norm": 0.8839752674102783, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58870 + }, + { + "epoch": 4.228366247755835, + "grad_norm": 1.0078870058059692, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 58880 + }, + { + "epoch": 4.229084380610413, + "grad_norm": 0.8926451206207275, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 58890 + }, + { + "epoch": 4.229802513464991, + "grad_norm": 1.4018772840499878, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 58900 + }, + { + "epoch": 4.230520646319569, + "grad_norm": 0.9911289215087891, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 58910 + }, + { + "epoch": 4.231238779174147, + "grad_norm": 0.9374576807022095, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58920 + }, + { + "epoch": 4.231956912028726, + "grad_norm": 1.179650068283081, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 58930 + }, + { + "epoch": 4.232675044883304, + "grad_norm": 0.9434911012649536, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 58940 + }, + { + "epoch": 4.233393177737882, + "grad_norm": 1.0061911344528198, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 58950 + }, + { + "epoch": 4.23411131059246, + "grad_norm": 0.9663233757019043, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 58960 + }, + { + "epoch": 4.234829443447038, + "grad_norm": 0.8897581696510315, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 58970 + }, + { + "epoch": 4.235547576301616, + "grad_norm": 0.873281717300415, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 58980 + }, + { + "epoch": 4.236265709156194, + "grad_norm": 0.9146949052810669, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 58990 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 0.9381195306777954, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 59000 + }, + { + "epoch": 4.23770197486535, + "grad_norm": 0.9700697064399719, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 59010 + }, + { + "epoch": 4.238420107719929, + "grad_norm": 0.9050154685974121, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 59020 + }, + { + "epoch": 4.239138240574507, + "grad_norm": 0.9901503324508667, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 59030 + }, + { + "epoch": 4.239856373429085, + "grad_norm": 0.9009594321250916, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 59040 + }, + { + "epoch": 4.240574506283663, + "grad_norm": 1.0924968719482422, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59050 + }, + { + "epoch": 4.241292639138241, + "grad_norm": 0.9939947724342346, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 59060 + }, + { + "epoch": 4.242010771992819, + "grad_norm": 1.0577857494354248, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 59070 + }, + { + "epoch": 4.242728904847397, + "grad_norm": 1.0836747884750366, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59080 + }, + { + "epoch": 4.243447037701975, + "grad_norm": 0.97043377161026, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 59090 + }, + { + "epoch": 4.244165170556553, + "grad_norm": 0.7711901664733887, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 59100 + }, + { + "epoch": 4.244883303411131, + "grad_norm": 1.0143170356750488, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 59110 + }, + { + "epoch": 4.2456014362657095, + "grad_norm": 0.9151925444602966, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 59120 + }, + { + "epoch": 4.2463195691202875, + "grad_norm": 0.9252700209617615, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 59130 + }, + { + "epoch": 4.2470377019748655, + "grad_norm": 0.8429408073425293, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 59140 + }, + { + "epoch": 4.2477558348294435, + "grad_norm": 0.9645987153053284, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 59150 + }, + { + "epoch": 4.2484739676840215, + "grad_norm": 0.9949791431427002, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 59160 + }, + { + "epoch": 4.2491921005385995, + "grad_norm": 0.9128350615501404, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 59170 + }, + { + "epoch": 4.2499102333931775, + "grad_norm": 0.7406911849975586, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 59180 + }, + { + "epoch": 4.2506283662477555, + "grad_norm": 1.0237419605255127, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 59190 + }, + { + "epoch": 4.2513464991023335, + "grad_norm": 0.805459201335907, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 59200 + }, + { + "epoch": 4.252064631956912, + "grad_norm": 0.8477254509925842, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59210 + }, + { + "epoch": 4.25278276481149, + "grad_norm": 0.984023928642273, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 59220 + }, + { + "epoch": 4.253500897666068, + "grad_norm": 1.0667484998703003, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59230 + }, + { + "epoch": 4.254219030520646, + "grad_norm": 0.7192284464836121, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 59240 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 0.9557451009750366, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59250 + }, + { + "epoch": 4.255655296229802, + "grad_norm": 0.9209784865379333, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59260 + }, + { + "epoch": 4.25637342908438, + "grad_norm": 0.9785363674163818, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 59270 + }, + { + "epoch": 4.257091561938958, + "grad_norm": 0.910214364528656, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59280 + }, + { + "epoch": 4.257809694793536, + "grad_norm": 0.8945858478546143, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 59290 + }, + { + "epoch": 4.258527827648114, + "grad_norm": 1.0984420776367188, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 59300 + }, + { + "epoch": 4.259245960502693, + "grad_norm": 1.0256640911102295, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59310 + }, + { + "epoch": 4.259964093357271, + "grad_norm": 0.978397786617279, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 59320 + }, + { + "epoch": 4.260682226211849, + "grad_norm": 0.7587000727653503, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 59330 + }, + { + "epoch": 4.261400359066427, + "grad_norm": 0.9384620785713196, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59340 + }, + { + "epoch": 4.262118491921005, + "grad_norm": 0.893992006778717, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 59350 + }, + { + "epoch": 4.262836624775583, + "grad_norm": 1.0231536626815796, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 59360 + }, + { + "epoch": 4.263554757630161, + "grad_norm": 0.9810128211975098, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 59370 + }, + { + "epoch": 4.264272890484739, + "grad_norm": 1.0868116617202759, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 59380 + }, + { + "epoch": 4.264991023339318, + "grad_norm": 1.1433676481246948, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 59390 + }, + { + "epoch": 4.265709156193896, + "grad_norm": 0.9836946725845337, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 59400 + }, + { + "epoch": 4.266427289048474, + "grad_norm": 0.9473603963851929, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 59410 + }, + { + "epoch": 4.267145421903052, + "grad_norm": 0.9066835641860962, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 59420 + }, + { + "epoch": 4.26786355475763, + "grad_norm": 1.0534718036651611, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 59430 + }, + { + "epoch": 4.268581687612208, + "grad_norm": 1.0392775535583496, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 59440 + }, + { + "epoch": 4.269299820466786, + "grad_norm": 1.011472463607788, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 59450 + }, + { + "epoch": 4.270017953321364, + "grad_norm": 1.0704147815704346, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59460 + }, + { + "epoch": 4.270736086175942, + "grad_norm": 0.9349238872528076, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 59470 + }, + { + "epoch": 4.27145421903052, + "grad_norm": 0.8745087385177612, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 59480 + }, + { + "epoch": 4.272172351885099, + "grad_norm": 0.8823763728141785, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 59490 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 1.110912799835205, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 59500 + }, + { + "epoch": 4.273608617594255, + "grad_norm": 1.0000925064086914, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 59510 + }, + { + "epoch": 4.274326750448833, + "grad_norm": 1.1578227281570435, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 59520 + }, + { + "epoch": 4.275044883303411, + "grad_norm": 0.875720202922821, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 59530 + }, + { + "epoch": 4.275763016157989, + "grad_norm": 0.9562238454818726, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 59540 + }, + { + "epoch": 4.276481149012567, + "grad_norm": 0.8384222388267517, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 59550 + }, + { + "epoch": 4.277199281867145, + "grad_norm": 1.2719428539276123, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 59560 + }, + { + "epoch": 4.277917414721723, + "grad_norm": 1.0656434297561646, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 59570 + }, + { + "epoch": 4.278635547576302, + "grad_norm": 1.0766716003417969, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 59580 + }, + { + "epoch": 4.27935368043088, + "grad_norm": 0.8892807960510254, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 59590 + }, + { + "epoch": 4.280071813285458, + "grad_norm": 0.8956300020217896, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 59600 + }, + { + "epoch": 4.280789946140036, + "grad_norm": 0.9562926888465881, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 59610 + }, + { + "epoch": 4.281508078994614, + "grad_norm": 1.009141445159912, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 59620 + }, + { + "epoch": 4.282226211849192, + "grad_norm": 1.0546064376831055, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 59630 + }, + { + "epoch": 4.28294434470377, + "grad_norm": 0.8831254243850708, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 59640 + }, + { + "epoch": 4.283662477558348, + "grad_norm": 0.9560053944587708, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 59650 + }, + { + "epoch": 4.284380610412926, + "grad_norm": 1.030339241027832, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59660 + }, + { + "epoch": 4.285098743267504, + "grad_norm": 1.00662100315094, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 59670 + }, + { + "epoch": 4.285816876122083, + "grad_norm": 1.0759116411209106, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 59680 + }, + { + "epoch": 4.286535008976661, + "grad_norm": 0.9985393285751343, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 59690 + }, + { + "epoch": 4.287253141831239, + "grad_norm": 0.9044474959373474, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59700 + }, + { + "epoch": 4.287971274685817, + "grad_norm": 1.1224442720413208, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 59710 + }, + { + "epoch": 4.288689407540395, + "grad_norm": 0.8436414003372192, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 59720 + }, + { + "epoch": 4.289407540394973, + "grad_norm": 1.0695041418075562, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 59730 + }, + { + "epoch": 4.290125673249551, + "grad_norm": 0.8809951543807983, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 59740 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 1.0213792324066162, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 59750 + }, + { + "epoch": 4.291561938958707, + "grad_norm": 0.9660196900367737, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 59760 + }, + { + "epoch": 4.292280071813286, + "grad_norm": 0.8005787134170532, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 59770 + }, + { + "epoch": 4.292998204667864, + "grad_norm": 1.0016109943389893, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 59780 + }, + { + "epoch": 4.293716337522442, + "grad_norm": 0.9112903475761414, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 59790 + }, + { + "epoch": 4.29443447037702, + "grad_norm": 0.9999852180480957, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 59800 + }, + { + "epoch": 4.295152603231598, + "grad_norm": 0.9323953986167908, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 59810 + }, + { + "epoch": 4.295870736086176, + "grad_norm": 0.903037965297699, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 59820 + }, + { + "epoch": 4.296588868940754, + "grad_norm": 1.2462431192398071, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 59830 + }, + { + "epoch": 4.297307001795332, + "grad_norm": 1.2322230339050293, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 59840 + }, + { + "epoch": 4.29802513464991, + "grad_norm": 0.9584668278694153, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 59850 + }, + { + "epoch": 4.298743267504488, + "grad_norm": 0.9664767980575562, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 59860 + }, + { + "epoch": 4.299461400359067, + "grad_norm": 0.8860437273979187, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 59870 + }, + { + "epoch": 4.300179533213645, + "grad_norm": 1.0825127363204956, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 59880 + }, + { + "epoch": 4.300897666068223, + "grad_norm": 1.1312100887298584, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 59890 + }, + { + "epoch": 4.301615798922801, + "grad_norm": 0.8289751410484314, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 59900 + }, + { + "epoch": 4.302333931777379, + "grad_norm": 0.8990927934646606, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 59910 + }, + { + "epoch": 4.303052064631957, + "grad_norm": 0.9667525887489319, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 59920 + }, + { + "epoch": 4.303770197486535, + "grad_norm": 0.8656060695648193, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 59930 + }, + { + "epoch": 4.304488330341113, + "grad_norm": 0.8909396529197693, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 59940 + }, + { + "epoch": 4.305206463195692, + "grad_norm": 0.9533283114433289, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 59950 + }, + { + "epoch": 4.30592459605027, + "grad_norm": 0.9090739488601685, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 59960 + }, + { + "epoch": 4.306642728904848, + "grad_norm": 1.096656322479248, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 59970 + }, + { + "epoch": 4.307360861759426, + "grad_norm": 1.0392465591430664, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 59980 + }, + { + "epoch": 4.308078994614004, + "grad_norm": 0.8733913898468018, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 59990 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 0.8287094235420227, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 60000 + }, + { + "epoch": 4.30951526032316, + "grad_norm": 0.9267017245292664, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 60010 + }, + { + "epoch": 4.310233393177738, + "grad_norm": 0.9969515800476074, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 60020 + }, + { + "epoch": 4.310951526032316, + "grad_norm": 1.0005015134811401, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 60030 + }, + { + "epoch": 4.311669658886894, + "grad_norm": 1.1215369701385498, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60040 + }, + { + "epoch": 4.312387791741473, + "grad_norm": 1.0434890985488892, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 60050 + }, + { + "epoch": 4.313105924596051, + "grad_norm": 0.967989981174469, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 60060 + }, + { + "epoch": 4.313824057450629, + "grad_norm": 1.007599115371704, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 60070 + }, + { + "epoch": 4.314542190305207, + "grad_norm": 0.9356340765953064, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 60080 + }, + { + "epoch": 4.315260323159785, + "grad_norm": 0.9566757678985596, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 60090 + }, + { + "epoch": 4.315978456014363, + "grad_norm": 1.1066830158233643, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 60100 + }, + { + "epoch": 4.316696588868941, + "grad_norm": 0.9895772933959961, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 60110 + }, + { + "epoch": 4.317414721723519, + "grad_norm": 1.07423734664917, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60120 + }, + { + "epoch": 4.318132854578097, + "grad_norm": 1.0777037143707275, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 60130 + }, + { + "epoch": 4.3188509874326755, + "grad_norm": 1.1475656032562256, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 60140 + }, + { + "epoch": 4.3195691202872535, + "grad_norm": 1.0705864429473877, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 60150 + }, + { + "epoch": 4.3202872531418315, + "grad_norm": 0.8676854968070984, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 60160 + }, + { + "epoch": 4.3210053859964095, + "grad_norm": 0.9488174319267273, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 60170 + }, + { + "epoch": 4.3217235188509875, + "grad_norm": 1.1171153783798218, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 60180 + }, + { + "epoch": 4.3224416517055655, + "grad_norm": 1.091435194015503, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 60190 + }, + { + "epoch": 4.3231597845601435, + "grad_norm": 0.880944013595581, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60200 + }, + { + "epoch": 4.3238779174147215, + "grad_norm": 0.8458809852600098, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 60210 + }, + { + "epoch": 4.3245960502692995, + "grad_norm": 0.7900225520133972, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 60220 + }, + { + "epoch": 4.3253141831238775, + "grad_norm": 0.966742753982544, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 60230 + }, + { + "epoch": 4.326032315978456, + "grad_norm": 0.8948110342025757, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 60240 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 0.8598700165748596, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 60250 + }, + { + "epoch": 4.327468581687612, + "grad_norm": 1.127610206604004, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 60260 + }, + { + "epoch": 4.32818671454219, + "grad_norm": 0.8357340693473816, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 60270 + }, + { + "epoch": 4.328904847396768, + "grad_norm": 0.8771896362304688, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60280 + }, + { + "epoch": 4.329622980251346, + "grad_norm": 0.9202101826667786, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60290 + }, + { + "epoch": 4.330341113105924, + "grad_norm": 1.1427538394927979, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 60300 + }, + { + "epoch": 4.331059245960502, + "grad_norm": 0.8711863160133362, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 60310 + }, + { + "epoch": 4.33177737881508, + "grad_norm": 0.972723662853241, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60320 + }, + { + "epoch": 4.332495511669659, + "grad_norm": 1.1496877670288086, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 60330 + }, + { + "epoch": 4.333213644524237, + "grad_norm": 1.008581519126892, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 60340 + }, + { + "epoch": 4.333931777378815, + "grad_norm": 1.0802706480026245, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 60350 + }, + { + "epoch": 4.334649910233393, + "grad_norm": 0.8394291996955872, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60360 + }, + { + "epoch": 4.335368043087971, + "grad_norm": 0.8355905413627625, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 60370 + }, + { + "epoch": 4.336086175942549, + "grad_norm": 0.9583960175514221, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 60380 + }, + { + "epoch": 4.336804308797127, + "grad_norm": 1.138934850692749, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 60390 + }, + { + "epoch": 4.337522441651705, + "grad_norm": 1.0334709882736206, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 60400 + }, + { + "epoch": 4.338240574506283, + "grad_norm": 0.729686439037323, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 60410 + }, + { + "epoch": 4.338958707360861, + "grad_norm": 0.8735929727554321, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 60420 + }, + { + "epoch": 4.33967684021544, + "grad_norm": 0.9617681503295898, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 60430 + }, + { + "epoch": 4.340394973070018, + "grad_norm": 0.9439655542373657, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 60440 + }, + { + "epoch": 4.341113105924596, + "grad_norm": 0.9275408387184143, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60450 + }, + { + "epoch": 4.341831238779174, + "grad_norm": 1.0693308115005493, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60460 + }, + { + "epoch": 4.342549371633752, + "grad_norm": 0.9234438538551331, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 60470 + }, + { + "epoch": 4.34326750448833, + "grad_norm": 1.1376168727874756, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 60480 + }, + { + "epoch": 4.343985637342908, + "grad_norm": 0.9218108654022217, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 60490 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 1.1467362642288208, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 60500 + }, + { + "epoch": 4.345421903052064, + "grad_norm": 0.9459165930747986, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 60510 + }, + { + "epoch": 4.346140035906643, + "grad_norm": 0.9460827708244324, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 60520 + }, + { + "epoch": 4.346858168761221, + "grad_norm": 1.0845041275024414, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 60530 + }, + { + "epoch": 4.347576301615799, + "grad_norm": 1.082675576210022, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 60540 + }, + { + "epoch": 4.348294434470377, + "grad_norm": 0.8443698883056641, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 60550 + }, + { + "epoch": 4.349012567324955, + "grad_norm": 1.018393874168396, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 60560 + }, + { + "epoch": 4.349730700179533, + "grad_norm": 0.8796373009681702, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 60570 + }, + { + "epoch": 4.350448833034111, + "grad_norm": 1.097942590713501, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 60580 + }, + { + "epoch": 4.351166965888689, + "grad_norm": 0.8750485181808472, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 60590 + }, + { + "epoch": 4.351885098743267, + "grad_norm": 1.0339995622634888, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 60600 + }, + { + "epoch": 4.352603231597846, + "grad_norm": 0.9077731966972351, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 60610 + }, + { + "epoch": 4.353321364452424, + "grad_norm": 1.051321029663086, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 60620 + }, + { + "epoch": 4.354039497307002, + "grad_norm": 1.0018669366836548, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60630 + }, + { + "epoch": 4.35475763016158, + "grad_norm": 1.0349196195602417, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60640 + }, + { + "epoch": 4.355475763016158, + "grad_norm": 1.009589672088623, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 60650 + }, + { + "epoch": 4.356193895870736, + "grad_norm": 1.0463480949401855, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60660 + }, + { + "epoch": 4.356912028725314, + "grad_norm": 0.9815132021903992, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 60670 + }, + { + "epoch": 4.357630161579892, + "grad_norm": 1.0977262258529663, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60680 + }, + { + "epoch": 4.35834829443447, + "grad_norm": 0.8450005054473877, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 60690 + }, + { + "epoch": 4.359066427289049, + "grad_norm": 1.0959078073501587, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 60700 + }, + { + "epoch": 4.359784560143627, + "grad_norm": 0.9155098795890808, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60710 + }, + { + "epoch": 4.360502692998205, + "grad_norm": 0.9267987012863159, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 60720 + }, + { + "epoch": 4.361220825852783, + "grad_norm": 1.177472472190857, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 60730 + }, + { + "epoch": 4.361938958707361, + "grad_norm": 0.8615312576293945, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 60740 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 1.0939710140228271, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 60750 + }, + { + "epoch": 4.363375224416517, + "grad_norm": 1.0928049087524414, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 60760 + }, + { + "epoch": 4.364093357271095, + "grad_norm": 1.0796833038330078, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 60770 + }, + { + "epoch": 4.364811490125673, + "grad_norm": 0.9768339991569519, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 60780 + }, + { + "epoch": 4.365529622980251, + "grad_norm": 0.9082722067832947, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 60790 + }, + { + "epoch": 4.36624775583483, + "grad_norm": 0.9614832997322083, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 60800 + }, + { + "epoch": 4.366965888689408, + "grad_norm": 0.8874651789665222, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 60810 + }, + { + "epoch": 4.367684021543986, + "grad_norm": 0.8810178637504578, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 60820 + }, + { + "epoch": 4.368402154398564, + "grad_norm": 1.0893806219100952, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 60830 + }, + { + "epoch": 4.369120287253142, + "grad_norm": 0.9042278528213501, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 60840 + }, + { + "epoch": 4.36983842010772, + "grad_norm": 1.0832217931747437, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 60850 + }, + { + "epoch": 4.370556552962298, + "grad_norm": 0.9431114792823792, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 60860 + }, + { + "epoch": 4.371274685816876, + "grad_norm": 1.031553030014038, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 60870 + }, + { + "epoch": 4.371992818671454, + "grad_norm": 0.8702824711799622, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60880 + }, + { + "epoch": 4.372710951526033, + "grad_norm": 1.1109199523925781, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 60890 + }, + { + "epoch": 4.373429084380611, + "grad_norm": 0.8369361162185669, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 60900 + }, + { + "epoch": 4.374147217235189, + "grad_norm": 0.988915205001831, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60910 + }, + { + "epoch": 4.374865350089767, + "grad_norm": 0.9365919232368469, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 60920 + }, + { + "epoch": 4.375583482944345, + "grad_norm": 0.9789398908615112, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 60930 + }, + { + "epoch": 4.376301615798923, + "grad_norm": 0.8786931037902832, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 60940 + }, + { + "epoch": 4.377019748653501, + "grad_norm": 0.8891511559486389, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 60950 + }, + { + "epoch": 4.377737881508079, + "grad_norm": 0.9561707377433777, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 60960 + }, + { + "epoch": 4.378456014362657, + "grad_norm": 0.8674200177192688, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 60970 + }, + { + "epoch": 4.379174147217235, + "grad_norm": 0.9285916090011597, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 60980 + }, + { + "epoch": 4.379892280071814, + "grad_norm": 0.9185547232627869, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 60990 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 1.081664800643921, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 61000 + }, + { + "epoch": 4.38132854578097, + "grad_norm": 1.0475854873657227, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 61010 + }, + { + "epoch": 4.382046678635548, + "grad_norm": 1.1519653797149658, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 61020 + }, + { + "epoch": 4.382764811490126, + "grad_norm": 0.8757607936859131, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 61030 + }, + { + "epoch": 4.383482944344704, + "grad_norm": 0.8707934021949768, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 61040 + }, + { + "epoch": 4.384201077199282, + "grad_norm": 1.1807516813278198, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 61050 + }, + { + "epoch": 4.38491921005386, + "grad_norm": 1.0674688816070557, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 61060 + }, + { + "epoch": 4.385637342908438, + "grad_norm": 0.9321209788322449, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 61070 + }, + { + "epoch": 4.3863554757630165, + "grad_norm": 1.0786446332931519, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 61080 + }, + { + "epoch": 4.3870736086175945, + "grad_norm": 0.9733907580375671, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 61090 + }, + { + "epoch": 4.3877917414721725, + "grad_norm": 0.9476010203361511, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 61100 + }, + { + "epoch": 4.3885098743267505, + "grad_norm": 1.1321563720703125, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 61110 + }, + { + "epoch": 4.3892280071813286, + "grad_norm": 0.9379117488861084, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 61120 + }, + { + "epoch": 4.3899461400359066, + "grad_norm": 0.8409728407859802, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 61130 + }, + { + "epoch": 4.3906642728904846, + "grad_norm": 0.8309189081192017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 61140 + }, + { + "epoch": 4.391382405745063, + "grad_norm": 0.8922196626663208, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61150 + }, + { + "epoch": 4.392100538599641, + "grad_norm": 0.8274614214897156, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 61160 + }, + { + "epoch": 4.392818671454219, + "grad_norm": 1.0928618907928467, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 61170 + }, + { + "epoch": 4.3935368043087974, + "grad_norm": 0.9771125316619873, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 61180 + }, + { + "epoch": 4.3942549371633755, + "grad_norm": 0.8844535946846008, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 61190 + }, + { + "epoch": 4.3949730700179535, + "grad_norm": 1.0498822927474976, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 61200 + }, + { + "epoch": 4.3956912028725315, + "grad_norm": 0.9882155060768127, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61210 + }, + { + "epoch": 4.3964093357271095, + "grad_norm": 1.090356707572937, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 61220 + }, + { + "epoch": 4.3971274685816875, + "grad_norm": 1.0908088684082031, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 61230 + }, + { + "epoch": 4.3978456014362655, + "grad_norm": 1.0013501644134521, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 61240 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 1.0916062593460083, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 61250 + }, + { + "epoch": 4.399281867145422, + "grad_norm": 1.0817667245864868, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 61260 + }, + { + "epoch": 4.4, + "grad_norm": 0.9745162129402161, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 61270 + }, + { + "epoch": 4.400718132854578, + "grad_norm": 1.0653400421142578, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 61280 + }, + { + "epoch": 4.401436265709156, + "grad_norm": 1.0082067251205444, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 61290 + }, + { + "epoch": 4.402154398563734, + "grad_norm": 0.7963659167289734, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 61300 + }, + { + "epoch": 4.402872531418312, + "grad_norm": 1.0428845882415771, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 61310 + }, + { + "epoch": 4.40359066427289, + "grad_norm": 0.9205707311630249, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61320 + }, + { + "epoch": 4.404308797127468, + "grad_norm": 1.0103533267974854, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 61330 + }, + { + "epoch": 4.405026929982046, + "grad_norm": 1.113547682762146, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61340 + }, + { + "epoch": 4.405745062836624, + "grad_norm": 1.137488842010498, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 61350 + }, + { + "epoch": 4.406463195691203, + "grad_norm": 1.1284101009368896, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61360 + }, + { + "epoch": 4.407181328545781, + "grad_norm": 0.8010451197624207, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 61370 + }, + { + "epoch": 4.407899461400359, + "grad_norm": 0.8893977403640747, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 61380 + }, + { + "epoch": 4.408617594254937, + "grad_norm": 0.9098272323608398, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 61390 + }, + { + "epoch": 4.409335727109515, + "grad_norm": 1.0613329410552979, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 61400 + }, + { + "epoch": 4.410053859964093, + "grad_norm": 1.0070269107818604, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 61410 + }, + { + "epoch": 4.410771992818671, + "grad_norm": 0.8632227778434753, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 61420 + }, + { + "epoch": 4.411490125673249, + "grad_norm": 1.0183731317520142, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 61430 + }, + { + "epoch": 4.412208258527827, + "grad_norm": 0.9049941897392273, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61440 + }, + { + "epoch": 4.412926391382406, + "grad_norm": 1.0184082984924316, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61450 + }, + { + "epoch": 4.413644524236984, + "grad_norm": 0.9994277358055115, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 61460 + }, + { + "epoch": 4.414362657091562, + "grad_norm": 1.0112420320510864, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 61470 + }, + { + "epoch": 4.41508078994614, + "grad_norm": 0.9751759171485901, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 61480 + }, + { + "epoch": 4.415798922800718, + "grad_norm": 1.047135591506958, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 61490 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 0.886282742023468, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 61500 + }, + { + "epoch": 4.417235188509874, + "grad_norm": 0.971964418888092, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 61510 + }, + { + "epoch": 4.417953321364452, + "grad_norm": 0.9603846073150635, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 61520 + }, + { + "epoch": 4.41867145421903, + "grad_norm": 1.060042142868042, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 61530 + }, + { + "epoch": 4.419389587073608, + "grad_norm": 1.1231369972229004, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61540 + }, + { + "epoch": 4.420107719928187, + "grad_norm": 0.8269591331481934, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 61550 + }, + { + "epoch": 4.420825852782765, + "grad_norm": 1.0341241359710693, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 61560 + }, + { + "epoch": 4.421543985637343, + "grad_norm": 0.7276636958122253, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 61570 + }, + { + "epoch": 4.422262118491921, + "grad_norm": 1.0663669109344482, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 61580 + }, + { + "epoch": 4.422980251346499, + "grad_norm": 0.9764387011528015, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 61590 + }, + { + "epoch": 4.423698384201077, + "grad_norm": 1.0953258275985718, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61600 + }, + { + "epoch": 4.424416517055655, + "grad_norm": 0.8877012729644775, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 61610 + }, + { + "epoch": 4.425134649910233, + "grad_norm": 0.8781440854072571, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 61620 + }, + { + "epoch": 4.425852782764811, + "grad_norm": 0.8333432674407959, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61630 + }, + { + "epoch": 4.42657091561939, + "grad_norm": 0.9647989869117737, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 61640 + }, + { + "epoch": 4.427289048473968, + "grad_norm": 1.0801783800125122, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 61650 + }, + { + "epoch": 4.428007181328546, + "grad_norm": 0.8215882778167725, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61660 + }, + { + "epoch": 4.428725314183124, + "grad_norm": 0.9853931665420532, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 61670 + }, + { + "epoch": 4.429443447037702, + "grad_norm": 0.8658010959625244, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 61680 + }, + { + "epoch": 4.43016157989228, + "grad_norm": 1.124064326286316, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 61690 + }, + { + "epoch": 4.430879712746858, + "grad_norm": 1.009340763092041, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 61700 + }, + { + "epoch": 4.431597845601436, + "grad_norm": 0.8705293536186218, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 61710 + }, + { + "epoch": 4.432315978456014, + "grad_norm": 1.1323511600494385, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 61720 + }, + { + "epoch": 4.433034111310592, + "grad_norm": 1.1203019618988037, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 61730 + }, + { + "epoch": 4.433752244165171, + "grad_norm": 1.1683770418167114, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 61740 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 1.0735899209976196, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 61750 + }, + { + "epoch": 4.435188509874327, + "grad_norm": 1.142496109008789, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 61760 + }, + { + "epoch": 4.435906642728905, + "grad_norm": 1.1157732009887695, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 61770 + }, + { + "epoch": 4.436624775583483, + "grad_norm": 0.8845949172973633, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 61780 + }, + { + "epoch": 4.437342908438061, + "grad_norm": 1.1212759017944336, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 61790 + }, + { + "epoch": 4.438061041292639, + "grad_norm": 0.8832488656044006, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 61800 + }, + { + "epoch": 4.438779174147217, + "grad_norm": 0.9059590101242065, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 61810 + }, + { + "epoch": 4.439497307001796, + "grad_norm": 1.0625685453414917, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61820 + }, + { + "epoch": 4.440215439856374, + "grad_norm": 0.9565598368644714, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 61830 + }, + { + "epoch": 4.440933572710952, + "grad_norm": 0.8975377082824707, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 61840 + }, + { + "epoch": 4.44165170556553, + "grad_norm": 1.0412718057632446, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 61850 + }, + { + "epoch": 4.442369838420108, + "grad_norm": 0.9923529624938965, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 61860 + }, + { + "epoch": 4.443087971274686, + "grad_norm": 1.3025734424591064, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 61870 + }, + { + "epoch": 4.443806104129264, + "grad_norm": 1.0031960010528564, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 61880 + }, + { + "epoch": 4.444524236983842, + "grad_norm": 1.0974701642990112, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 61890 + }, + { + "epoch": 4.44524236983842, + "grad_norm": 1.1044024229049683, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 61900 + }, + { + "epoch": 4.445960502692998, + "grad_norm": 1.0782772302627563, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 61910 + }, + { + "epoch": 4.446678635547577, + "grad_norm": 1.006304383277893, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 61920 + }, + { + "epoch": 4.447396768402155, + "grad_norm": 0.9258833527565002, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 61930 + }, + { + "epoch": 4.448114901256733, + "grad_norm": 0.9888426065444946, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 61940 + }, + { + "epoch": 4.448833034111311, + "grad_norm": 0.9592963457107544, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 61950 + }, + { + "epoch": 4.449551166965889, + "grad_norm": 1.0527986288070679, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 61960 + }, + { + "epoch": 4.450269299820467, + "grad_norm": 0.8613291382789612, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 61970 + }, + { + "epoch": 4.450987432675045, + "grad_norm": 1.1083767414093018, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 61980 + }, + { + "epoch": 4.451705565529623, + "grad_norm": 0.772679328918457, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 61990 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 0.9052274227142334, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 62000 + }, + { + "epoch": 4.45314183123878, + "grad_norm": 1.129667043685913, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 62010 + }, + { + "epoch": 4.453859964093358, + "grad_norm": 0.9994529485702515, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 62020 + }, + { + "epoch": 4.454578096947936, + "grad_norm": 0.982155978679657, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 62030 + }, + { + "epoch": 4.455296229802514, + "grad_norm": 0.9139904975891113, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 62040 + }, + { + "epoch": 4.456014362657092, + "grad_norm": 1.0877810716629028, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 62050 + }, + { + "epoch": 4.45673249551167, + "grad_norm": 1.0535308122634888, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 62060 + }, + { + "epoch": 4.457450628366248, + "grad_norm": 1.0225313901901245, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62070 + }, + { + "epoch": 4.458168761220826, + "grad_norm": 0.8443132042884827, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 62080 + }, + { + "epoch": 4.458886894075404, + "grad_norm": 1.0426654815673828, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 62090 + }, + { + "epoch": 4.459605026929982, + "grad_norm": 1.1110700368881226, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 62100 + }, + { + "epoch": 4.4603231597845605, + "grad_norm": 1.0200893878936768, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62110 + }, + { + "epoch": 4.4610412926391385, + "grad_norm": 0.9102830290794373, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 62120 + }, + { + "epoch": 4.4617594254937165, + "grad_norm": 1.1395094394683838, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 62130 + }, + { + "epoch": 4.4624775583482945, + "grad_norm": 1.1202316284179688, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 62140 + }, + { + "epoch": 4.4631956912028725, + "grad_norm": 1.142580509185791, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 62150 + }, + { + "epoch": 4.4639138240574505, + "grad_norm": 0.9843677878379822, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 62160 + }, + { + "epoch": 4.4646319569120285, + "grad_norm": 1.0351676940917969, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 62170 + }, + { + "epoch": 4.4653500897666065, + "grad_norm": 0.9365093111991882, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 62180 + }, + { + "epoch": 4.4660682226211845, + "grad_norm": 1.041193962097168, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 62190 + }, + { + "epoch": 4.466786355475763, + "grad_norm": 0.9686329960823059, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 62200 + }, + { + "epoch": 4.467504488330341, + "grad_norm": 1.028622031211853, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 62210 + }, + { + "epoch": 4.468222621184919, + "grad_norm": 0.9717516899108887, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 62220 + }, + { + "epoch": 4.468940754039497, + "grad_norm": 1.0467450618743896, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 62230 + }, + { + "epoch": 4.469658886894075, + "grad_norm": 0.943717896938324, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 62240 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 0.909429132938385, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 62250 + }, + { + "epoch": 4.471095152603231, + "grad_norm": 1.0294792652130127, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 62260 + }, + { + "epoch": 4.471813285457809, + "grad_norm": 1.1044281721115112, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 62270 + }, + { + "epoch": 4.472531418312387, + "grad_norm": 1.1555784940719604, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 62280 + }, + { + "epoch": 4.473249551166965, + "grad_norm": 0.9441297650337219, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 62290 + }, + { + "epoch": 4.473967684021544, + "grad_norm": 0.9164380431175232, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 62300 + }, + { + "epoch": 4.474685816876122, + "grad_norm": 1.1139159202575684, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 62310 + }, + { + "epoch": 4.4754039497307, + "grad_norm": 1.0201882123947144, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 62320 + }, + { + "epoch": 4.476122082585278, + "grad_norm": 1.1471681594848633, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 62330 + }, + { + "epoch": 4.476840215439856, + "grad_norm": 1.0333549976348877, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 62340 + }, + { + "epoch": 4.477558348294434, + "grad_norm": 0.8929767608642578, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 62350 + }, + { + "epoch": 4.478276481149012, + "grad_norm": 0.9465752840042114, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 62360 + }, + { + "epoch": 4.47899461400359, + "grad_norm": 1.2155033349990845, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 62370 + }, + { + "epoch": 4.479712746858169, + "grad_norm": 0.7181217074394226, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 62380 + }, + { + "epoch": 4.480430879712747, + "grad_norm": 1.0052744150161743, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 62390 + }, + { + "epoch": 4.481149012567325, + "grad_norm": 0.8522219061851501, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 62400 + }, + { + "epoch": 4.481867145421903, + "grad_norm": 0.8844723105430603, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 62410 + }, + { + "epoch": 4.482585278276481, + "grad_norm": 0.9542465209960938, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 62420 + }, + { + "epoch": 4.483303411131059, + "grad_norm": 0.8963674306869507, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 62430 + }, + { + "epoch": 4.484021543985637, + "grad_norm": 0.8105363845825195, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 62440 + }, + { + "epoch": 4.484739676840215, + "grad_norm": 0.9618421196937561, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 62450 + }, + { + "epoch": 4.485457809694793, + "grad_norm": 1.1931076049804688, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 62460 + }, + { + "epoch": 4.486175942549371, + "grad_norm": 0.7406999468803406, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 62470 + }, + { + "epoch": 4.48689407540395, + "grad_norm": 0.7698216438293457, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 62480 + }, + { + "epoch": 4.487612208258528, + "grad_norm": 0.862271249294281, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 62490 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 1.0025171041488647, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 62500 + }, + { + "epoch": 4.489048473967684, + "grad_norm": 0.8474493622779846, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 62510 + }, + { + "epoch": 4.489766606822262, + "grad_norm": 0.8965697884559631, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 62520 + }, + { + "epoch": 4.49048473967684, + "grad_norm": 1.1276488304138184, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 62530 + }, + { + "epoch": 4.491202872531418, + "grad_norm": 1.0253537893295288, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 62540 + }, + { + "epoch": 4.491921005385996, + "grad_norm": 1.1750596761703491, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 62550 + }, + { + "epoch": 4.492639138240574, + "grad_norm": 0.9951794147491455, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 62560 + }, + { + "epoch": 4.493357271095153, + "grad_norm": 1.2510017156600952, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 62570 + }, + { + "epoch": 4.494075403949731, + "grad_norm": 1.4066375494003296, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 62580 + }, + { + "epoch": 4.494793536804309, + "grad_norm": 0.988175094127655, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 62590 + }, + { + "epoch": 4.495511669658887, + "grad_norm": 1.2049115896224976, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 62600 + }, + { + "epoch": 4.496229802513465, + "grad_norm": 0.962464451789856, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 62610 + }, + { + "epoch": 4.496947935368043, + "grad_norm": 0.9324793815612793, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62620 + }, + { + "epoch": 4.497666068222621, + "grad_norm": 0.9174214005470276, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 62630 + }, + { + "epoch": 4.498384201077199, + "grad_norm": 0.9729902148246765, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 62640 + }, + { + "epoch": 4.499102333931777, + "grad_norm": 1.0190484523773193, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 62650 + }, + { + "epoch": 4.499820466786355, + "grad_norm": 1.1473679542541504, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 62660 + }, + { + "epoch": 4.500538599640934, + "grad_norm": 1.0160558223724365, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 62670 + }, + { + "epoch": 4.501256732495512, + "grad_norm": 0.8083887100219727, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 62680 + }, + { + "epoch": 4.50197486535009, + "grad_norm": 0.941933274269104, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 62690 + }, + { + "epoch": 4.502692998204668, + "grad_norm": 0.9962822794914246, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 62700 + }, + { + "epoch": 4.503411131059246, + "grad_norm": 0.8993943333625793, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 62710 + }, + { + "epoch": 4.504129263913824, + "grad_norm": 0.9438319206237793, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 62720 + }, + { + "epoch": 4.504847396768402, + "grad_norm": 0.7951892018318176, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 62730 + }, + { + "epoch": 4.50556552962298, + "grad_norm": 0.8875413537025452, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 62740 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 0.993819534778595, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 62750 + }, + { + "epoch": 4.507001795332137, + "grad_norm": 0.9177559018135071, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 62760 + }, + { + "epoch": 4.507719928186715, + "grad_norm": 0.8632771968841553, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 62770 + }, + { + "epoch": 4.508438061041293, + "grad_norm": 0.943778395652771, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 62780 + }, + { + "epoch": 4.509156193895871, + "grad_norm": 0.8754997849464417, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 62790 + }, + { + "epoch": 4.509874326750449, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 62800 + }, + { + "epoch": 4.510592459605027, + "grad_norm": 1.1156457662582397, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 62810 + }, + { + "epoch": 4.511310592459605, + "grad_norm": 0.9178887009620667, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 62820 + }, + { + "epoch": 4.512028725314183, + "grad_norm": 0.9520689249038696, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 62830 + }, + { + "epoch": 4.512746858168761, + "grad_norm": 0.8880525231361389, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 62840 + }, + { + "epoch": 4.513464991023339, + "grad_norm": 0.9541497826576233, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 62850 + }, + { + "epoch": 4.514183123877918, + "grad_norm": 1.003766417503357, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 62860 + }, + { + "epoch": 4.514901256732496, + "grad_norm": 0.8844705820083618, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 62870 + }, + { + "epoch": 4.515619389587074, + "grad_norm": 1.1870828866958618, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 62880 + }, + { + "epoch": 4.516337522441652, + "grad_norm": 0.863487184047699, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 62890 + }, + { + "epoch": 4.51705565529623, + "grad_norm": 0.997770369052887, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 62900 + }, + { + "epoch": 4.517773788150808, + "grad_norm": 0.9708612561225891, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 62910 + }, + { + "epoch": 4.518491921005386, + "grad_norm": 1.1381206512451172, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62920 + }, + { + "epoch": 4.519210053859964, + "grad_norm": 1.0386693477630615, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 62930 + }, + { + "epoch": 4.519928186714543, + "grad_norm": 1.1711705923080444, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 62940 + }, + { + "epoch": 4.520646319569121, + "grad_norm": 0.8727447390556335, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 62950 + }, + { + "epoch": 4.521364452423699, + "grad_norm": 0.9215193390846252, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 62960 + }, + { + "epoch": 4.522082585278277, + "grad_norm": 1.005467176437378, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 62970 + }, + { + "epoch": 4.522800718132855, + "grad_norm": 0.8761187791824341, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 62980 + }, + { + "epoch": 4.523518850987433, + "grad_norm": 0.957848310470581, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 62990 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 0.8634148836135864, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 63000 + }, + { + "epoch": 4.524955116696589, + "grad_norm": 0.9557477235794067, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 63010 + }, + { + "epoch": 4.525673249551167, + "grad_norm": 1.017720341682434, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 63020 + }, + { + "epoch": 4.526391382405745, + "grad_norm": 1.0281825065612793, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 63030 + }, + { + "epoch": 4.527109515260323, + "grad_norm": 1.253974437713623, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63040 + }, + { + "epoch": 4.527827648114902, + "grad_norm": 0.8489068150520325, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 63050 + }, + { + "epoch": 4.52854578096948, + "grad_norm": 0.9681686162948608, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 63060 + }, + { + "epoch": 4.529263913824058, + "grad_norm": 1.10277259349823, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 63070 + }, + { + "epoch": 4.529982046678636, + "grad_norm": 0.9469163417816162, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 63080 + }, + { + "epoch": 4.530700179533214, + "grad_norm": 1.1228134632110596, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 63090 + }, + { + "epoch": 4.531418312387792, + "grad_norm": 0.9673212170600891, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 63100 + }, + { + "epoch": 4.53213644524237, + "grad_norm": 1.0221107006072998, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 63110 + }, + { + "epoch": 4.532854578096948, + "grad_norm": 0.826372504234314, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 63120 + }, + { + "epoch": 4.5335727109515265, + "grad_norm": 1.1805331707000732, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 63130 + }, + { + "epoch": 4.5342908438061045, + "grad_norm": 0.9645666480064392, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63140 + }, + { + "epoch": 4.5350089766606825, + "grad_norm": 1.0838309526443481, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 63150 + }, + { + "epoch": 4.5357271095152605, + "grad_norm": 1.061414361000061, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 63160 + }, + { + "epoch": 4.5364452423698385, + "grad_norm": 0.841961145401001, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 63170 + }, + { + "epoch": 4.5371633752244165, + "grad_norm": 1.1220186948776245, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 63180 + }, + { + "epoch": 4.5378815080789945, + "grad_norm": 1.036441445350647, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 63190 + }, + { + "epoch": 4.5385996409335725, + "grad_norm": 0.9089716076850891, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 63200 + }, + { + "epoch": 4.5393177737881505, + "grad_norm": 0.8699982762336731, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 63210 + }, + { + "epoch": 4.5400359066427285, + "grad_norm": 0.8489565253257751, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 63220 + }, + { + "epoch": 4.540754039497307, + "grad_norm": 0.7778416275978088, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 63230 + }, + { + "epoch": 4.541472172351885, + "grad_norm": 1.0625852346420288, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 63240 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 0.8515732884407043, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 63250 + }, + { + "epoch": 4.542908438061041, + "grad_norm": 0.7679561376571655, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 63260 + }, + { + "epoch": 4.543626570915619, + "grad_norm": 0.7358446717262268, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 63270 + }, + { + "epoch": 4.544344703770197, + "grad_norm": 1.0866128206253052, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63280 + }, + { + "epoch": 4.545062836624775, + "grad_norm": 1.0870225429534912, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 63290 + }, + { + "epoch": 4.545780969479353, + "grad_norm": 0.951095461845398, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 63300 + }, + { + "epoch": 4.546499102333931, + "grad_norm": 1.0914306640625, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 63310 + }, + { + "epoch": 4.54721723518851, + "grad_norm": 0.8676106333732605, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 63320 + }, + { + "epoch": 4.547935368043088, + "grad_norm": 1.0129096508026123, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63330 + }, + { + "epoch": 4.548653500897666, + "grad_norm": 0.8710526823997498, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 63340 + }, + { + "epoch": 4.549371633752244, + "grad_norm": 0.7014815807342529, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 63350 + }, + { + "epoch": 4.550089766606822, + "grad_norm": 1.1546777486801147, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 63360 + }, + { + "epoch": 4.5508078994614, + "grad_norm": 0.7464957237243652, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 63370 + }, + { + "epoch": 4.551526032315978, + "grad_norm": 0.9976209998130798, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 63380 + }, + { + "epoch": 4.552244165170556, + "grad_norm": 0.9543681740760803, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 63390 + }, + { + "epoch": 4.552962298025134, + "grad_norm": 1.1498578786849976, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 63400 + }, + { + "epoch": 4.553680430879712, + "grad_norm": 1.0162293910980225, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 63410 + }, + { + "epoch": 4.554398563734291, + "grad_norm": 0.9015304446220398, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 63420 + }, + { + "epoch": 4.555116696588869, + "grad_norm": 1.1639831066131592, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 63430 + }, + { + "epoch": 4.555834829443447, + "grad_norm": 0.9494703412055969, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 63440 + }, + { + "epoch": 4.556552962298025, + "grad_norm": 1.0555956363677979, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 63450 + }, + { + "epoch": 4.557271095152603, + "grad_norm": 0.8513827919960022, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 63460 + }, + { + "epoch": 4.557989228007181, + "grad_norm": 1.0614275932312012, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 63470 + }, + { + "epoch": 4.558707360861759, + "grad_norm": 0.8341137766838074, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 63480 + }, + { + "epoch": 4.559425493716337, + "grad_norm": 1.2136222124099731, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 63490 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 0.8806019425392151, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 63500 + }, + { + "epoch": 4.560861759425494, + "grad_norm": 1.2548854351043701, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 63510 + }, + { + "epoch": 4.561579892280072, + "grad_norm": 1.0162668228149414, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 63520 + }, + { + "epoch": 4.56229802513465, + "grad_norm": 1.0487624406814575, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 63530 + }, + { + "epoch": 4.563016157989228, + "grad_norm": 1.2505502700805664, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 63540 + }, + { + "epoch": 4.563734290843806, + "grad_norm": 0.9930511713027954, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 63550 + }, + { + "epoch": 4.564452423698384, + "grad_norm": 0.8132568001747131, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 63560 + }, + { + "epoch": 4.565170556552962, + "grad_norm": 1.0129177570343018, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63570 + }, + { + "epoch": 4.56588868940754, + "grad_norm": 0.9011693596839905, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 63580 + }, + { + "epoch": 4.566606822262118, + "grad_norm": 0.9161545634269714, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 63590 + }, + { + "epoch": 4.567324955116696, + "grad_norm": 0.8852348327636719, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 63600 + }, + { + "epoch": 4.568043087971275, + "grad_norm": 0.8579391837120056, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63610 + }, + { + "epoch": 4.568761220825853, + "grad_norm": 0.9271050095558167, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 63620 + }, + { + "epoch": 4.569479353680431, + "grad_norm": 0.9881834983825684, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 63630 + }, + { + "epoch": 4.570197486535009, + "grad_norm": 1.0255686044692993, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 63640 + }, + { + "epoch": 4.570915619389587, + "grad_norm": 0.8758876919746399, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 63650 + }, + { + "epoch": 4.571633752244165, + "grad_norm": 1.0134185552597046, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 63660 + }, + { + "epoch": 4.572351885098743, + "grad_norm": 0.8535705208778381, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 63670 + }, + { + "epoch": 4.573070017953321, + "grad_norm": 0.9614834785461426, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63680 + }, + { + "epoch": 4.5737881508079, + "grad_norm": 0.9004243612289429, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 63690 + }, + { + "epoch": 4.574506283662478, + "grad_norm": 0.9563080072402954, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 63700 + }, + { + "epoch": 4.575224416517056, + "grad_norm": 1.024857521057129, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 63710 + }, + { + "epoch": 4.575942549371634, + "grad_norm": 0.9345638155937195, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 63720 + }, + { + "epoch": 4.576660682226212, + "grad_norm": 1.27083158493042, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 63730 + }, + { + "epoch": 4.57737881508079, + "grad_norm": 1.0866559743881226, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 63740 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 0.9253925681114197, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 63750 + }, + { + "epoch": 4.578815080789946, + "grad_norm": 0.8127399682998657, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63760 + }, + { + "epoch": 4.579533213644524, + "grad_norm": 1.0453993082046509, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 63770 + }, + { + "epoch": 4.580251346499102, + "grad_norm": 1.2227544784545898, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 63780 + }, + { + "epoch": 4.580969479353681, + "grad_norm": 1.0207865238189697, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 63790 + }, + { + "epoch": 4.581687612208259, + "grad_norm": 1.030447244644165, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 63800 + }, + { + "epoch": 4.582405745062837, + "grad_norm": 1.0855677127838135, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 63810 + }, + { + "epoch": 4.583123877917415, + "grad_norm": 0.9572556018829346, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 63820 + }, + { + "epoch": 4.583842010771993, + "grad_norm": 0.9061040282249451, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 63830 + }, + { + "epoch": 4.584560143626571, + "grad_norm": 0.9267677068710327, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63840 + }, + { + "epoch": 4.585278276481149, + "grad_norm": 1.070076823234558, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 63850 + }, + { + "epoch": 4.585996409335727, + "grad_norm": 1.045881748199463, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63860 + }, + { + "epoch": 4.586714542190305, + "grad_norm": 0.9190576672554016, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 63870 + }, + { + "epoch": 4.587432675044884, + "grad_norm": 0.9263932704925537, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 63880 + }, + { + "epoch": 4.588150807899462, + "grad_norm": 1.0217589139938354, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 63890 + }, + { + "epoch": 4.58886894075404, + "grad_norm": 0.9200088381767273, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 63900 + }, + { + "epoch": 4.589587073608618, + "grad_norm": 0.9877251386642456, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 63910 + }, + { + "epoch": 4.590305206463196, + "grad_norm": 1.0059093236923218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63920 + }, + { + "epoch": 4.591023339317774, + "grad_norm": 1.2618095874786377, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63930 + }, + { + "epoch": 4.591741472172352, + "grad_norm": 1.1779268980026245, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 63940 + }, + { + "epoch": 4.59245960502693, + "grad_norm": 1.2339502573013306, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 63950 + }, + { + "epoch": 4.593177737881508, + "grad_norm": 0.7488788366317749, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 63960 + }, + { + "epoch": 4.593895870736086, + "grad_norm": 0.8366380929946899, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 63970 + }, + { + "epoch": 4.594614003590665, + "grad_norm": 1.0292677879333496, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 63980 + }, + { + "epoch": 4.595332136445243, + "grad_norm": 0.7938551306724548, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 63990 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 0.7958516478538513, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64000 + }, + { + "epoch": 4.596768402154399, + "grad_norm": 0.9613908529281616, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 64010 + }, + { + "epoch": 4.597486535008977, + "grad_norm": 1.0253773927688599, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 64020 + }, + { + "epoch": 4.598204667863555, + "grad_norm": 1.0560888051986694, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 64030 + }, + { + "epoch": 4.598922800718133, + "grad_norm": 1.1093556880950928, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 64040 + }, + { + "epoch": 4.599640933572711, + "grad_norm": 0.8492098450660706, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 64050 + }, + { + "epoch": 4.6003590664272895, + "grad_norm": 1.0070436000823975, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64060 + }, + { + "epoch": 4.6010771992818675, + "grad_norm": 0.9774282574653625, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 64070 + }, + { + "epoch": 4.6017953321364455, + "grad_norm": 1.0744960308074951, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 64080 + }, + { + "epoch": 4.6025134649910235, + "grad_norm": 1.0101491212844849, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 64090 + }, + { + "epoch": 4.6032315978456015, + "grad_norm": 1.2306591272354126, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 64100 + }, + { + "epoch": 4.6039497307001795, + "grad_norm": 0.9187033176422119, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 64110 + }, + { + "epoch": 4.6046678635547575, + "grad_norm": 0.9178676605224609, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 64120 + }, + { + "epoch": 4.6053859964093355, + "grad_norm": 1.006374716758728, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64130 + }, + { + "epoch": 4.6061041292639135, + "grad_norm": 1.0774449110031128, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 64140 + }, + { + "epoch": 4.6068222621184916, + "grad_norm": 1.0360658168792725, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64150 + }, + { + "epoch": 4.6075403949730696, + "grad_norm": 1.1061090230941772, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 64160 + }, + { + "epoch": 4.608258527827648, + "grad_norm": 1.0320971012115479, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 64170 + }, + { + "epoch": 4.6089766606822264, + "grad_norm": 0.8596988916397095, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 64180 + }, + { + "epoch": 4.6096947935368044, + "grad_norm": 1.1665741205215454, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 64190 + }, + { + "epoch": 4.6104129263913824, + "grad_norm": 0.857207715511322, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64200 + }, + { + "epoch": 4.6111310592459605, + "grad_norm": 1.0088987350463867, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 64210 + }, + { + "epoch": 4.6118491921005385, + "grad_norm": 1.0985605716705322, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 64220 + }, + { + "epoch": 4.6125673249551165, + "grad_norm": 0.9504913687705994, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 64230 + }, + { + "epoch": 4.6132854578096945, + "grad_norm": 0.8415018916130066, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 64240 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 0.9857034087181091, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 64250 + }, + { + "epoch": 4.614721723518851, + "grad_norm": 1.0164235830307007, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 64260 + }, + { + "epoch": 4.615439856373429, + "grad_norm": 0.949481725692749, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 64270 + }, + { + "epoch": 4.616157989228007, + "grad_norm": 0.9526455998420715, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 64280 + }, + { + "epoch": 4.616876122082585, + "grad_norm": 1.1121242046356201, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 64290 + }, + { + "epoch": 4.617594254937163, + "grad_norm": 0.9598871469497681, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 64300 + }, + { + "epoch": 4.618312387791741, + "grad_norm": 1.0406304597854614, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 64310 + }, + { + "epoch": 4.619030520646319, + "grad_norm": 1.1816964149475098, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 64320 + }, + { + "epoch": 4.619748653500897, + "grad_norm": 0.9818326830863953, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 64330 + }, + { + "epoch": 4.620466786355475, + "grad_norm": 0.952017605304718, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 64340 + }, + { + "epoch": 4.621184919210053, + "grad_norm": 1.1263453960418701, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 64350 + }, + { + "epoch": 4.621903052064632, + "grad_norm": 1.1158473491668701, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 64360 + }, + { + "epoch": 4.62262118491921, + "grad_norm": 0.9056766033172607, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64370 + }, + { + "epoch": 4.623339317773788, + "grad_norm": 0.8113203048706055, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 64380 + }, + { + "epoch": 4.624057450628366, + "grad_norm": 0.8646712899208069, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 64390 + }, + { + "epoch": 4.624775583482944, + "grad_norm": 1.0064425468444824, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64400 + }, + { + "epoch": 4.625493716337522, + "grad_norm": 0.9867565631866455, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 64410 + }, + { + "epoch": 4.6262118491921, + "grad_norm": 1.018764615058899, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 64420 + }, + { + "epoch": 4.626929982046678, + "grad_norm": 1.0607863664627075, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 64430 + }, + { + "epoch": 4.627648114901257, + "grad_norm": 1.012825846672058, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 64440 + }, + { + "epoch": 4.628366247755835, + "grad_norm": 0.8441653847694397, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64450 + }, + { + "epoch": 4.629084380610413, + "grad_norm": 0.9819194674491882, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 64460 + }, + { + "epoch": 4.629802513464991, + "grad_norm": 0.925519585609436, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 64470 + }, + { + "epoch": 4.630520646319569, + "grad_norm": 0.9409030079841614, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 64480 + }, + { + "epoch": 4.631238779174147, + "grad_norm": 1.148024559020996, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 64490 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 0.8225533962249756, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 64500 + }, + { + "epoch": 4.632675044883303, + "grad_norm": 0.8806734681129456, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 64510 + }, + { + "epoch": 4.633393177737881, + "grad_norm": 0.9656694531440735, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64520 + }, + { + "epoch": 4.634111310592459, + "grad_norm": 0.9977783560752869, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 64530 + }, + { + "epoch": 4.634829443447038, + "grad_norm": 0.9259420037269592, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 64540 + }, + { + "epoch": 4.635547576301616, + "grad_norm": 1.0215885639190674, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 64550 + }, + { + "epoch": 4.636265709156194, + "grad_norm": 1.1082557439804077, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 64560 + }, + { + "epoch": 4.636983842010772, + "grad_norm": 1.1183207035064697, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 64570 + }, + { + "epoch": 4.63770197486535, + "grad_norm": 0.9914339184761047, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 64580 + }, + { + "epoch": 4.638420107719928, + "grad_norm": 0.8065831661224365, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 64590 + }, + { + "epoch": 4.639138240574506, + "grad_norm": 1.1546721458435059, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 64600 + }, + { + "epoch": 4.639856373429084, + "grad_norm": 1.0395900011062622, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64610 + }, + { + "epoch": 4.640574506283663, + "grad_norm": 0.9957455992698669, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 64620 + }, + { + "epoch": 4.641292639138241, + "grad_norm": 1.069557785987854, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 64630 + }, + { + "epoch": 4.642010771992819, + "grad_norm": 1.005236268043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 64640 + }, + { + "epoch": 4.642728904847397, + "grad_norm": 1.0216304063796997, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 64650 + }, + { + "epoch": 4.643447037701975, + "grad_norm": 0.8567317128181458, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 64660 + }, + { + "epoch": 4.644165170556553, + "grad_norm": 1.0386067628860474, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 64670 + }, + { + "epoch": 4.644883303411131, + "grad_norm": 0.9566055536270142, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 64680 + }, + { + "epoch": 4.645601436265709, + "grad_norm": 1.0990564823150635, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 64690 + }, + { + "epoch": 4.646319569120287, + "grad_norm": 0.9962695240974426, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 64700 + }, + { + "epoch": 4.647037701974865, + "grad_norm": 0.9041377305984497, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 64710 + }, + { + "epoch": 4.647755834829443, + "grad_norm": 0.8611233234405518, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 64720 + }, + { + "epoch": 4.648473967684022, + "grad_norm": 1.1569812297821045, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 64730 + }, + { + "epoch": 4.6491921005386, + "grad_norm": 0.7946197390556335, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 64740 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 0.9612061381340027, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 64750 + }, + { + "epoch": 4.650628366247756, + "grad_norm": 0.9669303297996521, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 64760 + }, + { + "epoch": 4.651346499102334, + "grad_norm": 0.8117775321006775, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 64770 + }, + { + "epoch": 4.652064631956912, + "grad_norm": 1.2326241731643677, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 64780 + }, + { + "epoch": 4.65278276481149, + "grad_norm": 0.7494568228721619, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64790 + }, + { + "epoch": 4.653500897666068, + "grad_norm": 0.8145379424095154, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 64800 + }, + { + "epoch": 4.654219030520647, + "grad_norm": 1.0139610767364502, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 64810 + }, + { + "epoch": 4.654937163375225, + "grad_norm": 0.9887115359306335, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 64820 + }, + { + "epoch": 4.655655296229803, + "grad_norm": 0.9565147161483765, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 64830 + }, + { + "epoch": 4.656373429084381, + "grad_norm": 0.9022467136383057, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 64840 + }, + { + "epoch": 4.657091561938959, + "grad_norm": 1.075003981590271, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 64850 + }, + { + "epoch": 4.657809694793537, + "grad_norm": 0.8705733418464661, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64860 + }, + { + "epoch": 4.658527827648115, + "grad_norm": 1.0826832056045532, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 64870 + }, + { + "epoch": 4.659245960502693, + "grad_norm": 1.1056268215179443, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 64880 + }, + { + "epoch": 4.659964093357271, + "grad_norm": 0.8664149641990662, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 64890 + }, + { + "epoch": 4.660682226211849, + "grad_norm": 0.9487230181694031, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 64900 + }, + { + "epoch": 4.661400359066427, + "grad_norm": 1.0357837677001953, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 64910 + }, + { + "epoch": 4.662118491921006, + "grad_norm": 0.8620632290840149, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 64920 + }, + { + "epoch": 4.662836624775584, + "grad_norm": 1.108986735343933, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 64930 + }, + { + "epoch": 4.663554757630162, + "grad_norm": 0.8017674684524536, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 64940 + }, + { + "epoch": 4.66427289048474, + "grad_norm": 0.882347583770752, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 64950 + }, + { + "epoch": 4.664991023339318, + "grad_norm": 0.9466867446899414, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 64960 + }, + { + "epoch": 4.665709156193896, + "grad_norm": 1.1823636293411255, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 64970 + }, + { + "epoch": 4.666427289048474, + "grad_norm": 0.9535016417503357, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 64980 + }, + { + "epoch": 4.667145421903052, + "grad_norm": 0.9456726312637329, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 64990 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 0.7761920690536499, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 65000 + }, + { + "epoch": 4.668581687612209, + "grad_norm": 1.060357689857483, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 65010 + }, + { + "epoch": 4.669299820466787, + "grad_norm": 0.9083862900733948, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 65020 + }, + { + "epoch": 4.670017953321365, + "grad_norm": 0.8745762705802917, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 65030 + }, + { + "epoch": 4.670736086175943, + "grad_norm": 0.8715422749519348, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 65040 + }, + { + "epoch": 4.671454219030521, + "grad_norm": 0.9407707452774048, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 65050 + }, + { + "epoch": 4.672172351885099, + "grad_norm": 0.8998945355415344, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 65060 + }, + { + "epoch": 4.672890484739677, + "grad_norm": 0.9147891998291016, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 65070 + }, + { + "epoch": 4.673608617594255, + "grad_norm": 1.116614580154419, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 65080 + }, + { + "epoch": 4.674326750448833, + "grad_norm": 1.0764213800430298, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 65090 + }, + { + "epoch": 4.6750448833034115, + "grad_norm": 0.9115945100784302, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 65100 + }, + { + "epoch": 4.6757630161579895, + "grad_norm": 1.001251459121704, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 65110 + }, + { + "epoch": 4.6764811490125675, + "grad_norm": 1.0330020189285278, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65120 + }, + { + "epoch": 4.6771992818671455, + "grad_norm": 0.9083197116851807, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 65130 + }, + { + "epoch": 4.6779174147217235, + "grad_norm": 0.9298770427703857, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 65140 + }, + { + "epoch": 4.6786355475763015, + "grad_norm": 1.0009549856185913, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 65150 + }, + { + "epoch": 4.6793536804308795, + "grad_norm": 0.951389729976654, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 65160 + }, + { + "epoch": 4.6800718132854575, + "grad_norm": 1.151870608329773, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 65170 + }, + { + "epoch": 4.680789946140036, + "grad_norm": 1.0074727535247803, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 65180 + }, + { + "epoch": 4.681508078994614, + "grad_norm": 1.0490152835845947, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 65190 + }, + { + "epoch": 4.682226211849192, + "grad_norm": 0.8967363834381104, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 65200 + }, + { + "epoch": 4.68294434470377, + "grad_norm": 1.2314889430999756, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 65210 + }, + { + "epoch": 4.683662477558348, + "grad_norm": 0.7764074802398682, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 65220 + }, + { + "epoch": 4.684380610412926, + "grad_norm": 1.0587822198867798, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 65230 + }, + { + "epoch": 4.685098743267504, + "grad_norm": 0.916114091873169, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 65240 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 0.9117472767829895, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 65250 + }, + { + "epoch": 4.68653500897666, + "grad_norm": 0.8369293212890625, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 65260 + }, + { + "epoch": 4.687253141831238, + "grad_norm": 0.9700121879577637, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 65270 + }, + { + "epoch": 4.687971274685816, + "grad_norm": 1.0008411407470703, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 65280 + }, + { + "epoch": 4.688689407540395, + "grad_norm": 0.9339549541473389, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 65290 + }, + { + "epoch": 4.689407540394973, + "grad_norm": 0.956701934337616, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 65300 + }, + { + "epoch": 4.690125673249551, + "grad_norm": 1.2042720317840576, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65310 + }, + { + "epoch": 4.690843806104129, + "grad_norm": 0.8679144382476807, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 65320 + }, + { + "epoch": 4.691561938958707, + "grad_norm": 1.2320687770843506, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 65330 + }, + { + "epoch": 4.692280071813285, + "grad_norm": 0.8397238850593567, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 65340 + }, + { + "epoch": 4.692998204667863, + "grad_norm": 0.7850362658500671, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 65350 + }, + { + "epoch": 4.693716337522441, + "grad_norm": 0.9281290173530579, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 65360 + }, + { + "epoch": 4.69443447037702, + "grad_norm": 1.1506335735321045, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 65370 + }, + { + "epoch": 4.695152603231598, + "grad_norm": 1.0910584926605225, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 65380 + }, + { + "epoch": 4.695870736086176, + "grad_norm": 0.8937386274337769, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 65390 + }, + { + "epoch": 4.696588868940754, + "grad_norm": 1.0163888931274414, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 65400 + }, + { + "epoch": 4.697307001795332, + "grad_norm": 1.0290007591247559, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 65410 + }, + { + "epoch": 4.69802513464991, + "grad_norm": 0.9046576023101807, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 65420 + }, + { + "epoch": 4.698743267504488, + "grad_norm": 1.0030237436294556, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 65430 + }, + { + "epoch": 4.699461400359066, + "grad_norm": 0.8196740746498108, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65440 + }, + { + "epoch": 4.700179533213644, + "grad_norm": 0.9036651849746704, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65450 + }, + { + "epoch": 4.700897666068222, + "grad_norm": 1.2080141305923462, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 65460 + }, + { + "epoch": 4.7016157989228, + "grad_norm": 0.8743635416030884, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 65470 + }, + { + "epoch": 4.702333931777379, + "grad_norm": 0.9566192030906677, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 65480 + }, + { + "epoch": 4.703052064631957, + "grad_norm": 1.0505144596099854, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 65490 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 0.8797298073768616, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 65500 + }, + { + "epoch": 4.704488330341113, + "grad_norm": 0.9970770478248596, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 65510 + }, + { + "epoch": 4.705206463195691, + "grad_norm": 1.1743851900100708, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 65520 + }, + { + "epoch": 4.705924596050269, + "grad_norm": 0.9534381031990051, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 65530 + }, + { + "epoch": 4.706642728904847, + "grad_norm": 0.9735581278800964, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 65540 + }, + { + "epoch": 4.707360861759425, + "grad_norm": 1.185352087020874, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 65550 + }, + { + "epoch": 4.708078994614004, + "grad_norm": 0.9383901357650757, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65560 + }, + { + "epoch": 4.708797127468582, + "grad_norm": 1.0194662809371948, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 65570 + }, + { + "epoch": 4.70951526032316, + "grad_norm": 0.8448300361633301, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 65580 + }, + { + "epoch": 4.710233393177738, + "grad_norm": 1.1930629014968872, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 65590 + }, + { + "epoch": 4.710951526032316, + "grad_norm": 1.0038636922836304, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 65600 + }, + { + "epoch": 4.711669658886894, + "grad_norm": 0.8206564784049988, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 65610 + }, + { + "epoch": 4.712387791741472, + "grad_norm": 1.0984861850738525, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 65620 + }, + { + "epoch": 4.71310592459605, + "grad_norm": 1.2891547679901123, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65630 + }, + { + "epoch": 4.713824057450628, + "grad_norm": 0.927062451839447, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 65640 + }, + { + "epoch": 4.714542190305206, + "grad_norm": 0.8647334575653076, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 65650 + }, + { + "epoch": 4.715260323159785, + "grad_norm": 1.1017670631408691, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65660 + }, + { + "epoch": 4.715978456014363, + "grad_norm": 0.9589072465896606, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65670 + }, + { + "epoch": 4.716696588868941, + "grad_norm": 0.9496776461601257, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 65680 + }, + { + "epoch": 4.717414721723519, + "grad_norm": 0.9266180396080017, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65690 + }, + { + "epoch": 4.718132854578097, + "grad_norm": 0.8699696063995361, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 65700 + }, + { + "epoch": 4.718850987432675, + "grad_norm": 1.0444015264511108, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 65710 + }, + { + "epoch": 4.719569120287253, + "grad_norm": 1.0100741386413574, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 65720 + }, + { + "epoch": 4.720287253141831, + "grad_norm": 1.1442630290985107, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 65730 + }, + { + "epoch": 4.721005385996409, + "grad_norm": 0.8937877416610718, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 65740 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 1.0718764066696167, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65750 + }, + { + "epoch": 4.722441651705566, + "grad_norm": 0.8838587999343872, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 65760 + }, + { + "epoch": 4.723159784560144, + "grad_norm": 1.1247940063476562, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 65770 + }, + { + "epoch": 4.723877917414722, + "grad_norm": 0.9491105675697327, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 65780 + }, + { + "epoch": 4.7245960502693, + "grad_norm": 1.0896921157836914, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 65790 + }, + { + "epoch": 4.725314183123878, + "grad_norm": 1.0097380876541138, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 65800 + }, + { + "epoch": 4.726032315978456, + "grad_norm": 0.911763608455658, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 65810 + }, + { + "epoch": 4.726750448833034, + "grad_norm": 1.1295124292373657, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 65820 + }, + { + "epoch": 4.727468581687612, + "grad_norm": 0.7637538313865662, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 65830 + }, + { + "epoch": 4.72818671454219, + "grad_norm": 0.9255306720733643, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 65840 + }, + { + "epoch": 4.728904847396769, + "grad_norm": 0.9847530126571655, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 65850 + }, + { + "epoch": 4.729622980251347, + "grad_norm": 0.9036182761192322, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 65860 + }, + { + "epoch": 4.730341113105925, + "grad_norm": 0.8284199833869934, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 65870 + }, + { + "epoch": 4.731059245960503, + "grad_norm": 1.0142838954925537, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 65880 + }, + { + "epoch": 4.731777378815081, + "grad_norm": 0.9389033913612366, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 65890 + }, + { + "epoch": 4.732495511669659, + "grad_norm": 0.8870056867599487, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65900 + }, + { + "epoch": 4.733213644524237, + "grad_norm": 1.1211678981781006, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 65910 + }, + { + "epoch": 4.733931777378815, + "grad_norm": 0.7796614170074463, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 65920 + }, + { + "epoch": 4.734649910233394, + "grad_norm": 1.0360451936721802, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 65930 + }, + { + "epoch": 4.735368043087972, + "grad_norm": 0.8383482098579407, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 65940 + }, + { + "epoch": 4.73608617594255, + "grad_norm": 0.7985122799873352, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 65950 + }, + { + "epoch": 4.736804308797128, + "grad_norm": 1.0314199924468994, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 65960 + }, + { + "epoch": 4.737522441651706, + "grad_norm": 0.9279016852378845, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 65970 + }, + { + "epoch": 4.738240574506284, + "grad_norm": 1.1046063899993896, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65980 + }, + { + "epoch": 4.738958707360862, + "grad_norm": 0.9075793623924255, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 65990 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 1.0945355892181396, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 66000 + }, + { + "epoch": 4.740394973070018, + "grad_norm": 0.8885519504547119, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 66010 + }, + { + "epoch": 4.741113105924596, + "grad_norm": 0.9312083125114441, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 66020 + }, + { + "epoch": 4.741831238779174, + "grad_norm": 1.1574538946151733, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 66030 + }, + { + "epoch": 4.742549371633753, + "grad_norm": 0.9346209168434143, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 66040 + }, + { + "epoch": 4.743267504488331, + "grad_norm": 0.8935149312019348, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66050 + }, + { + "epoch": 4.743985637342909, + "grad_norm": 0.8958369493484497, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 66060 + }, + { + "epoch": 4.744703770197487, + "grad_norm": 0.9383506774902344, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 66070 + }, + { + "epoch": 4.745421903052065, + "grad_norm": 0.9868947863578796, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 66080 + }, + { + "epoch": 4.746140035906643, + "grad_norm": 1.3417645692825317, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 66090 + }, + { + "epoch": 4.746858168761221, + "grad_norm": 1.070693850517273, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 66100 + }, + { + "epoch": 4.747576301615799, + "grad_norm": 0.8841570019721985, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 66110 + }, + { + "epoch": 4.7482944344703775, + "grad_norm": 0.7963120341300964, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 66120 + }, + { + "epoch": 4.7490125673249555, + "grad_norm": 0.8145691156387329, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 66130 + }, + { + "epoch": 4.7497307001795335, + "grad_norm": 0.9074729681015015, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 66140 + }, + { + "epoch": 4.7504488330341115, + "grad_norm": 0.9129886627197266, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 66150 + }, + { + "epoch": 4.7511669658886895, + "grad_norm": 0.91527259349823, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 66160 + }, + { + "epoch": 4.7518850987432675, + "grad_norm": 0.9569419622421265, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 66170 + }, + { + "epoch": 4.7526032315978455, + "grad_norm": 0.8777104616165161, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 66180 + }, + { + "epoch": 4.7533213644524235, + "grad_norm": 0.9673085808753967, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 66190 + }, + { + "epoch": 4.7540394973070015, + "grad_norm": 1.0683966875076294, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 66200 + }, + { + "epoch": 4.7547576301615795, + "grad_norm": 1.1591907739639282, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 66210 + }, + { + "epoch": 4.755475763016158, + "grad_norm": 1.1973309516906738, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 66220 + }, + { + "epoch": 4.756193895870736, + "grad_norm": 0.8472012281417847, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 66230 + }, + { + "epoch": 4.756912028725314, + "grad_norm": 0.9896261692047119, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 66240 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 0.8498432040214539, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 66250 + }, + { + "epoch": 4.75834829443447, + "grad_norm": 0.9624166488647461, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 66260 + }, + { + "epoch": 4.759066427289048, + "grad_norm": 1.0951786041259766, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 66270 + }, + { + "epoch": 4.759784560143626, + "grad_norm": 0.9863157868385315, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 66280 + }, + { + "epoch": 4.760502692998204, + "grad_norm": 1.0062068700790405, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 66290 + }, + { + "epoch": 4.761220825852782, + "grad_norm": 0.8075495958328247, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 66300 + }, + { + "epoch": 4.761938958707361, + "grad_norm": 0.9617878198623657, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66310 + }, + { + "epoch": 4.762657091561939, + "grad_norm": 1.097091555595398, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 66320 + }, + { + "epoch": 4.763375224416517, + "grad_norm": 1.2713453769683838, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 66330 + }, + { + "epoch": 4.764093357271095, + "grad_norm": 0.9473448991775513, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 66340 + }, + { + "epoch": 4.764811490125673, + "grad_norm": 1.0176854133605957, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 66350 + }, + { + "epoch": 4.765529622980251, + "grad_norm": 1.0486242771148682, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 66360 + }, + { + "epoch": 4.766247755834829, + "grad_norm": 1.249985694885254, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 66370 + }, + { + "epoch": 4.766965888689407, + "grad_norm": 1.283875584602356, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 66380 + }, + { + "epoch": 4.767684021543985, + "grad_norm": 1.0009022951126099, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 66390 + }, + { + "epoch": 4.768402154398563, + "grad_norm": 0.9718021750450134, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 66400 + }, + { + "epoch": 4.769120287253142, + "grad_norm": 1.0865732431411743, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 66410 + }, + { + "epoch": 4.76983842010772, + "grad_norm": 0.9273189306259155, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 66420 + }, + { + "epoch": 4.770556552962298, + "grad_norm": 1.067535638809204, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 66430 + }, + { + "epoch": 4.771274685816876, + "grad_norm": 1.0551011562347412, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 66440 + }, + { + "epoch": 4.771992818671454, + "grad_norm": 1.0336146354675293, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66450 + }, + { + "epoch": 4.772710951526032, + "grad_norm": 0.8738380670547485, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 66460 + }, + { + "epoch": 4.77342908438061, + "grad_norm": 1.1048321723937988, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66470 + }, + { + "epoch": 4.774147217235188, + "grad_norm": 0.8471167683601379, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 66480 + }, + { + "epoch": 4.774865350089767, + "grad_norm": 1.2527031898498535, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 66490 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 1.0056052207946777, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 66500 + }, + { + "epoch": 4.776301615798923, + "grad_norm": 1.142456293106079, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 66510 + }, + { + "epoch": 4.777019748653501, + "grad_norm": 1.1813132762908936, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 66520 + }, + { + "epoch": 4.777737881508079, + "grad_norm": 0.8683654069900513, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 66530 + }, + { + "epoch": 4.778456014362657, + "grad_norm": 1.0577980279922485, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 66540 + }, + { + "epoch": 4.779174147217235, + "grad_norm": 1.077438473701477, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66550 + }, + { + "epoch": 4.779892280071813, + "grad_norm": 1.0107938051223755, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 66560 + }, + { + "epoch": 4.780610412926391, + "grad_norm": 0.8071168065071106, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 66570 + }, + { + "epoch": 4.781328545780969, + "grad_norm": 0.8887564539909363, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 66580 + }, + { + "epoch": 4.782046678635547, + "grad_norm": 0.9823092222213745, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 66590 + }, + { + "epoch": 4.782764811490126, + "grad_norm": 0.9026784300804138, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 66600 + }, + { + "epoch": 4.783482944344704, + "grad_norm": 0.8912792205810547, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66610 + }, + { + "epoch": 4.784201077199282, + "grad_norm": 1.0955979824066162, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 66620 + }, + { + "epoch": 4.78491921005386, + "grad_norm": 0.8614793419837952, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 66630 + }, + { + "epoch": 4.785637342908438, + "grad_norm": 0.7247269153594971, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 66640 + }, + { + "epoch": 4.786355475763016, + "grad_norm": 0.9685400724411011, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 66650 + }, + { + "epoch": 4.787073608617594, + "grad_norm": 0.9219905734062195, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 66660 + }, + { + "epoch": 4.787791741472172, + "grad_norm": 0.9217489361763, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 66670 + }, + { + "epoch": 4.788509874326751, + "grad_norm": 1.13791823387146, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66680 + }, + { + "epoch": 4.789228007181329, + "grad_norm": 0.857542872428894, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 66690 + }, + { + "epoch": 4.789946140035907, + "grad_norm": 0.9886694550514221, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 66700 + }, + { + "epoch": 4.790664272890485, + "grad_norm": 0.987952470779419, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 66710 + }, + { + "epoch": 4.791382405745063, + "grad_norm": 1.051612377166748, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 66720 + }, + { + "epoch": 4.792100538599641, + "grad_norm": 0.9816454648971558, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 66730 + }, + { + "epoch": 4.792818671454219, + "grad_norm": 1.0953829288482666, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 66740 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 0.8720369935035706, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 66750 + }, + { + "epoch": 4.794254937163375, + "grad_norm": 0.8910234570503235, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 66760 + }, + { + "epoch": 4.794973070017953, + "grad_norm": 0.8300510048866272, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 66770 + }, + { + "epoch": 4.795691202872531, + "grad_norm": 0.9380533695220947, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 66780 + }, + { + "epoch": 4.79640933572711, + "grad_norm": 0.8361864686012268, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 66790 + }, + { + "epoch": 4.797127468581688, + "grad_norm": 1.051262617111206, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 66800 + }, + { + "epoch": 4.797845601436266, + "grad_norm": 1.1324400901794434, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 66810 + }, + { + "epoch": 4.798563734290844, + "grad_norm": 0.853903591632843, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 66820 + }, + { + "epoch": 4.799281867145422, + "grad_norm": 0.9949867725372314, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66830 + }, + { + "epoch": 4.8, + "grad_norm": 0.9204033017158508, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 66840 + }, + { + "epoch": 4.800718132854578, + "grad_norm": 0.7461584806442261, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 66850 + }, + { + "epoch": 4.801436265709156, + "grad_norm": 1.1019874811172485, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 66860 + }, + { + "epoch": 4.802154398563735, + "grad_norm": 1.1695797443389893, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 66870 + }, + { + "epoch": 4.802872531418313, + "grad_norm": 1.0902758836746216, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 66880 + }, + { + "epoch": 4.803590664272891, + "grad_norm": 0.8778618574142456, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66890 + }, + { + "epoch": 4.804308797127469, + "grad_norm": 0.905505359172821, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 66900 + }, + { + "epoch": 4.805026929982047, + "grad_norm": 1.0802056789398193, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66910 + }, + { + "epoch": 4.805745062836625, + "grad_norm": 0.7899449467658997, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 66920 + }, + { + "epoch": 4.806463195691203, + "grad_norm": 1.1938519477844238, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66930 + }, + { + "epoch": 4.807181328545781, + "grad_norm": 1.0213780403137207, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 66940 + }, + { + "epoch": 4.807899461400359, + "grad_norm": 0.9925506711006165, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 66950 + }, + { + "epoch": 4.808617594254937, + "grad_norm": 1.0174424648284912, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 66960 + }, + { + "epoch": 4.809335727109516, + "grad_norm": 1.0515072345733643, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 66970 + }, + { + "epoch": 4.810053859964094, + "grad_norm": 1.0161492824554443, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66980 + }, + { + "epoch": 4.810771992818672, + "grad_norm": 0.8421840071678162, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 66990 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 1.0493539571762085, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 67000 + }, + { + "epoch": 4.812208258527828, + "grad_norm": 1.1133309602737427, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 67010 + }, + { + "epoch": 4.812926391382406, + "grad_norm": 0.924017071723938, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 67020 + }, + { + "epoch": 4.813644524236984, + "grad_norm": 1.0568689107894897, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 67030 + }, + { + "epoch": 4.814362657091562, + "grad_norm": 0.989414632320404, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 67040 + }, + { + "epoch": 4.8150807899461405, + "grad_norm": 0.9256827235221863, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 67050 + }, + { + "epoch": 4.8157989228007185, + "grad_norm": 0.9538901448249817, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 67060 + }, + { + "epoch": 4.8165170556552965, + "grad_norm": 1.0373849868774414, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 67070 + }, + { + "epoch": 4.8172351885098745, + "grad_norm": 1.0019729137420654, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 67080 + }, + { + "epoch": 4.8179533213644525, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 67090 + }, + { + "epoch": 4.8186714542190305, + "grad_norm": 1.0008453130722046, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 67100 + }, + { + "epoch": 4.8193895870736085, + "grad_norm": 1.0153851509094238, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 67110 + }, + { + "epoch": 4.8201077199281865, + "grad_norm": 1.0193161964416504, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 67120 + }, + { + "epoch": 4.8208258527827645, + "grad_norm": 1.0204501152038574, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 67130 + }, + { + "epoch": 4.8215439856373425, + "grad_norm": 0.9097670316696167, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 67140 + }, + { + "epoch": 4.8222621184919205, + "grad_norm": 0.9288716912269592, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 67150 + }, + { + "epoch": 4.822980251346499, + "grad_norm": 0.9975850582122803, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 67160 + }, + { + "epoch": 4.823698384201077, + "grad_norm": 0.8502511382102966, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 67170 + }, + { + "epoch": 4.824416517055655, + "grad_norm": 1.0129257440567017, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67180 + }, + { + "epoch": 4.825134649910233, + "grad_norm": 1.0009492635726929, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 67190 + }, + { + "epoch": 4.825852782764811, + "grad_norm": 0.9273321032524109, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 67200 + }, + { + "epoch": 4.8265709156193894, + "grad_norm": 1.0438604354858398, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 67210 + }, + { + "epoch": 4.8272890484739674, + "grad_norm": 1.119573712348938, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 67220 + }, + { + "epoch": 4.8280071813285454, + "grad_norm": 0.9607422351837158, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 67230 + }, + { + "epoch": 4.828725314183124, + "grad_norm": 0.9614062905311584, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 67240 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 1.1017652750015259, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 67250 + }, + { + "epoch": 4.83016157989228, + "grad_norm": 1.0521706342697144, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 67260 + }, + { + "epoch": 4.830879712746858, + "grad_norm": 0.7685959339141846, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 67270 + }, + { + "epoch": 4.831597845601436, + "grad_norm": 0.7894896268844604, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 67280 + }, + { + "epoch": 4.832315978456014, + "grad_norm": 1.0882996320724487, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 67290 + }, + { + "epoch": 4.833034111310592, + "grad_norm": 0.9215409755706787, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 67300 + }, + { + "epoch": 4.83375224416517, + "grad_norm": 0.8660635352134705, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 67310 + }, + { + "epoch": 4.834470377019748, + "grad_norm": 0.980879008769989, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 67320 + }, + { + "epoch": 4.835188509874326, + "grad_norm": 1.0356814861297607, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 67330 + }, + { + "epoch": 4.835906642728904, + "grad_norm": 1.0265507698059082, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 67340 + }, + { + "epoch": 4.836624775583483, + "grad_norm": 1.0659137964248657, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 67350 + }, + { + "epoch": 4.837342908438061, + "grad_norm": 0.9485231637954712, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 67360 + }, + { + "epoch": 4.838061041292639, + "grad_norm": 1.0950140953063965, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 67370 + }, + { + "epoch": 4.838779174147217, + "grad_norm": 0.8907382488250732, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 67380 + }, + { + "epoch": 4.839497307001795, + "grad_norm": 0.9777120351791382, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 67390 + }, + { + "epoch": 4.840215439856373, + "grad_norm": 0.8482252955436707, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 67400 + }, + { + "epoch": 4.840933572710951, + "grad_norm": 0.8505899906158447, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 67410 + }, + { + "epoch": 4.841651705565529, + "grad_norm": 0.8574482798576355, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 67420 + }, + { + "epoch": 4.842369838420108, + "grad_norm": 1.092310905456543, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 67430 + }, + { + "epoch": 4.843087971274686, + "grad_norm": 0.9418560266494751, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 67440 + }, + { + "epoch": 4.843806104129264, + "grad_norm": 1.1310782432556152, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 67450 + }, + { + "epoch": 4.844524236983842, + "grad_norm": 0.9993671774864197, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 67460 + }, + { + "epoch": 4.84524236983842, + "grad_norm": 0.8322528600692749, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 67470 + }, + { + "epoch": 4.845960502692998, + "grad_norm": 0.8488435745239258, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 67480 + }, + { + "epoch": 4.846678635547576, + "grad_norm": 0.8070611357688904, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 67490 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 0.8200163245201111, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 67500 + }, + { + "epoch": 4.848114901256732, + "grad_norm": 0.91901034116745, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 67510 + }, + { + "epoch": 4.84883303411131, + "grad_norm": 1.0938435792922974, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67520 + }, + { + "epoch": 4.849551166965889, + "grad_norm": 0.7926174402236938, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 67530 + }, + { + "epoch": 4.850269299820467, + "grad_norm": 0.9914385676383972, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 67540 + }, + { + "epoch": 4.850987432675045, + "grad_norm": 1.033065915107727, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 67550 + }, + { + "epoch": 4.851705565529623, + "grad_norm": 0.9700239300727844, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 67560 + }, + { + "epoch": 4.852423698384201, + "grad_norm": 0.8550103902816772, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 67570 + }, + { + "epoch": 4.853141831238779, + "grad_norm": 1.0009654760360718, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67580 + }, + { + "epoch": 4.853859964093357, + "grad_norm": 1.0766186714172363, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 67590 + }, + { + "epoch": 4.854578096947935, + "grad_norm": 0.9512220621109009, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 67600 + }, + { + "epoch": 4.855296229802514, + "grad_norm": 0.8434456586837769, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 67610 + }, + { + "epoch": 4.856014362657092, + "grad_norm": 1.0276665687561035, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 67620 + }, + { + "epoch": 4.85673249551167, + "grad_norm": 0.9758516550064087, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 67630 + }, + { + "epoch": 4.857450628366248, + "grad_norm": 0.8988076448440552, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 67640 + }, + { + "epoch": 4.858168761220826, + "grad_norm": 1.0038257837295532, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 67650 + }, + { + "epoch": 4.858886894075404, + "grad_norm": 0.9973093867301941, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 67660 + }, + { + "epoch": 4.859605026929982, + "grad_norm": 0.9754974246025085, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 67670 + }, + { + "epoch": 4.86032315978456, + "grad_norm": 1.1829560995101929, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67680 + }, + { + "epoch": 4.861041292639138, + "grad_norm": 1.1077659130096436, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 67690 + }, + { + "epoch": 4.861759425493716, + "grad_norm": 0.9862872958183289, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 67700 + }, + { + "epoch": 4.862477558348294, + "grad_norm": 0.9826052188873291, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 67710 + }, + { + "epoch": 4.863195691202873, + "grad_norm": 0.940082848072052, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 67720 + }, + { + "epoch": 4.863913824057451, + "grad_norm": 0.895434558391571, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 67730 + }, + { + "epoch": 4.864631956912029, + "grad_norm": 1.1194682121276855, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 67740 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 0.9984544515609741, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 67750 + }, + { + "epoch": 4.866068222621185, + "grad_norm": 1.049224615097046, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 67760 + }, + { + "epoch": 4.866786355475763, + "grad_norm": 1.009515643119812, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 67770 + }, + { + "epoch": 4.867504488330341, + "grad_norm": 1.0336902141571045, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 67780 + }, + { + "epoch": 4.868222621184919, + "grad_norm": 0.9310635924339294, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 67790 + }, + { + "epoch": 4.868940754039498, + "grad_norm": 0.934882640838623, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 67800 + }, + { + "epoch": 4.869658886894076, + "grad_norm": 0.8663495779037476, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 67810 + }, + { + "epoch": 4.870377019748654, + "grad_norm": 1.0085018873214722, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 67820 + }, + { + "epoch": 4.871095152603232, + "grad_norm": 0.896507978439331, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 67830 + }, + { + "epoch": 4.87181328545781, + "grad_norm": 0.925809919834137, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 67840 + }, + { + "epoch": 4.872531418312388, + "grad_norm": 0.8044029474258423, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 67850 + }, + { + "epoch": 4.873249551166966, + "grad_norm": 1.0026800632476807, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 67860 + }, + { + "epoch": 4.873967684021544, + "grad_norm": 0.9577589631080627, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 67870 + }, + { + "epoch": 4.874685816876122, + "grad_norm": 0.8225193619728088, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 67880 + }, + { + "epoch": 4.8754039497307, + "grad_norm": 1.0019139051437378, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 67890 + }, + { + "epoch": 4.876122082585278, + "grad_norm": 0.9282827377319336, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 67900 + }, + { + "epoch": 4.876840215439857, + "grad_norm": 0.8204836249351501, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 67910 + }, + { + "epoch": 4.877558348294435, + "grad_norm": 0.907356321811676, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 67920 + }, + { + "epoch": 4.878276481149013, + "grad_norm": 1.12422776222229, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 67930 + }, + { + "epoch": 4.878994614003591, + "grad_norm": 0.8230205178260803, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 67940 + }, + { + "epoch": 4.879712746858169, + "grad_norm": 1.1588479280471802, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 67950 + }, + { + "epoch": 4.880430879712747, + "grad_norm": 1.1064553260803223, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 67960 + }, + { + "epoch": 4.881149012567325, + "grad_norm": 0.9311534762382507, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 67970 + }, + { + "epoch": 4.881867145421903, + "grad_norm": 0.7575639486312866, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 67980 + }, + { + "epoch": 4.882585278276482, + "grad_norm": 0.9201191067695618, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 67990 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 0.8487658500671387, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 68000 + }, + { + "epoch": 4.884021543985638, + "grad_norm": 0.9645208716392517, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 68010 + }, + { + "epoch": 4.884739676840216, + "grad_norm": 0.8594469428062439, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 68020 + }, + { + "epoch": 4.885457809694794, + "grad_norm": 0.9518412947654724, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 68030 + }, + { + "epoch": 4.886175942549372, + "grad_norm": 1.0934258699417114, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 68040 + }, + { + "epoch": 4.88689407540395, + "grad_norm": 0.988761842250824, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 68050 + }, + { + "epoch": 4.887612208258528, + "grad_norm": 0.7572013735771179, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 68060 + }, + { + "epoch": 4.888330341113106, + "grad_norm": 0.8801929950714111, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 68070 + }, + { + "epoch": 4.889048473967684, + "grad_norm": 1.0080658197402954, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 68080 + }, + { + "epoch": 4.8897666068222625, + "grad_norm": 0.9588785171508789, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 68090 + }, + { + "epoch": 4.8904847396768405, + "grad_norm": 1.0994032621383667, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 68100 + }, + { + "epoch": 4.8912028725314185, + "grad_norm": 0.9851962924003601, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68110 + }, + { + "epoch": 4.8919210053859965, + "grad_norm": 0.9566116333007812, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 68120 + }, + { + "epoch": 4.8926391382405745, + "grad_norm": 0.8708083033561707, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 68130 + }, + { + "epoch": 4.8933572710951525, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 68140 + }, + { + "epoch": 4.8940754039497305, + "grad_norm": 1.047988772392273, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 68150 + }, + { + "epoch": 4.8947935368043085, + "grad_norm": 0.8665831685066223, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68160 + }, + { + "epoch": 4.8955116696588865, + "grad_norm": 0.9313908219337463, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 68170 + }, + { + "epoch": 4.896229802513465, + "grad_norm": 0.9568582773208618, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 68180 + }, + { + "epoch": 4.896947935368043, + "grad_norm": 1.0427594184875488, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 68190 + }, + { + "epoch": 4.897666068222621, + "grad_norm": 0.9132021069526672, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68200 + }, + { + "epoch": 4.898384201077199, + "grad_norm": 0.9597318768501282, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 68210 + }, + { + "epoch": 4.899102333931777, + "grad_norm": 1.0736947059631348, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 68220 + }, + { + "epoch": 4.899820466786355, + "grad_norm": 0.9318404793739319, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 68230 + }, + { + "epoch": 4.900538599640933, + "grad_norm": 0.8594326972961426, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 68240 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 1.1437443494796753, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 68250 + }, + { + "epoch": 4.901974865350089, + "grad_norm": 1.1599408388137817, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 68260 + }, + { + "epoch": 4.902692998204667, + "grad_norm": 1.160628080368042, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 68270 + }, + { + "epoch": 4.903411131059246, + "grad_norm": 1.0147801637649536, + "learning_rate": 0.0002, + "loss": 0.613, + "step": 68280 + }, + { + "epoch": 4.904129263913824, + "grad_norm": 0.8622691631317139, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 68290 + }, + { + "epoch": 4.904847396768402, + "grad_norm": 0.7179980874061584, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 68300 + }, + { + "epoch": 4.90556552962298, + "grad_norm": 1.1705092191696167, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 68310 + }, + { + "epoch": 4.906283662477558, + "grad_norm": 1.1687676906585693, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 68320 + }, + { + "epoch": 4.907001795332136, + "grad_norm": 1.1621531248092651, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 68330 + }, + { + "epoch": 4.907719928186714, + "grad_norm": 1.0241422653198242, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 68340 + }, + { + "epoch": 4.908438061041292, + "grad_norm": 0.943354070186615, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 68350 + }, + { + "epoch": 4.909156193895871, + "grad_norm": 0.8091703653335571, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 68360 + }, + { + "epoch": 4.909874326750449, + "grad_norm": 0.8871228694915771, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 68370 + }, + { + "epoch": 4.910592459605027, + "grad_norm": 1.0951069593429565, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 68380 + }, + { + "epoch": 4.911310592459605, + "grad_norm": 1.1355193853378296, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 68390 + }, + { + "epoch": 4.912028725314183, + "grad_norm": 1.0741122961044312, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 68400 + }, + { + "epoch": 4.912746858168761, + "grad_norm": 0.9285269975662231, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68410 + }, + { + "epoch": 4.913464991023339, + "grad_norm": 1.080695390701294, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 68420 + }, + { + "epoch": 4.914183123877917, + "grad_norm": 0.921331524848938, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 68430 + }, + { + "epoch": 4.914901256732495, + "grad_norm": 0.9763174057006836, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 68440 + }, + { + "epoch": 4.915619389587073, + "grad_norm": 1.1133354902267456, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 68450 + }, + { + "epoch": 4.916337522441651, + "grad_norm": 0.8373502492904663, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 68460 + }, + { + "epoch": 4.91705565529623, + "grad_norm": 0.9192346334457397, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 68470 + }, + { + "epoch": 4.917773788150808, + "grad_norm": 1.0724657773971558, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 68480 + }, + { + "epoch": 4.918491921005386, + "grad_norm": 0.9209843873977661, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 68490 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 0.9201577305793762, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 68500 + }, + { + "epoch": 4.919928186714542, + "grad_norm": 0.8086138963699341, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 68510 + }, + { + "epoch": 4.92064631956912, + "grad_norm": 1.0917785167694092, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 68520 + }, + { + "epoch": 4.921364452423698, + "grad_norm": 0.9287897944450378, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 68530 + }, + { + "epoch": 4.922082585278276, + "grad_norm": 0.9830158948898315, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 68540 + }, + { + "epoch": 4.922800718132855, + "grad_norm": 0.8674678802490234, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 68550 + }, + { + "epoch": 4.923518850987433, + "grad_norm": 0.7996176481246948, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 68560 + }, + { + "epoch": 4.924236983842011, + "grad_norm": 1.1284033060073853, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 68570 + }, + { + "epoch": 4.924955116696589, + "grad_norm": 0.894339919090271, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 68580 + }, + { + "epoch": 4.925673249551167, + "grad_norm": 1.1140280961990356, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68590 + }, + { + "epoch": 4.926391382405745, + "grad_norm": 0.9048344492912292, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 68600 + }, + { + "epoch": 4.927109515260323, + "grad_norm": 0.9380471706390381, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 68610 + }, + { + "epoch": 4.927827648114901, + "grad_norm": 0.8598429560661316, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 68620 + }, + { + "epoch": 4.928545780969479, + "grad_norm": 1.0813355445861816, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 68630 + }, + { + "epoch": 4.929263913824057, + "grad_norm": 0.979053795337677, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 68640 + }, + { + "epoch": 4.929982046678636, + "grad_norm": 0.8194574117660522, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 68650 + }, + { + "epoch": 4.930700179533214, + "grad_norm": 0.8593540787696838, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 68660 + }, + { + "epoch": 4.931418312387792, + "grad_norm": 1.0134016275405884, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 68670 + }, + { + "epoch": 4.93213644524237, + "grad_norm": 1.060586929321289, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 68680 + }, + { + "epoch": 4.932854578096948, + "grad_norm": 0.84132319688797, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 68690 + }, + { + "epoch": 4.933572710951526, + "grad_norm": 1.0767526626586914, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 68700 + }, + { + "epoch": 4.934290843806104, + "grad_norm": 0.8858519792556763, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 68710 + }, + { + "epoch": 4.935008976660682, + "grad_norm": 1.194031000137329, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 68720 + }, + { + "epoch": 4.93572710951526, + "grad_norm": 0.8270226120948792, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68730 + }, + { + "epoch": 4.936445242369839, + "grad_norm": 1.0385973453521729, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 68740 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 0.9062243700027466, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 68750 + }, + { + "epoch": 4.937881508078995, + "grad_norm": 1.0526955127716064, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 68760 + }, + { + "epoch": 4.938599640933573, + "grad_norm": 0.930604100227356, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 68770 + }, + { + "epoch": 4.939317773788151, + "grad_norm": 0.9635265469551086, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 68780 + }, + { + "epoch": 4.940035906642729, + "grad_norm": 0.9825171232223511, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 68790 + }, + { + "epoch": 4.940754039497307, + "grad_norm": 0.9621182680130005, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 68800 + }, + { + "epoch": 4.941472172351885, + "grad_norm": 0.9655307531356812, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 68810 + }, + { + "epoch": 4.942190305206463, + "grad_norm": 1.2948180437088013, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 68820 + }, + { + "epoch": 4.942908438061041, + "grad_norm": 0.9206728339195251, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 68830 + }, + { + "epoch": 4.94362657091562, + "grad_norm": 1.0235631465911865, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 68840 + }, + { + "epoch": 4.944344703770198, + "grad_norm": 1.0542538166046143, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 68850 + }, + { + "epoch": 4.945062836624776, + "grad_norm": 0.9787087440490723, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 68860 + }, + { + "epoch": 4.945780969479354, + "grad_norm": 0.9527219533920288, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 68870 + }, + { + "epoch": 4.946499102333932, + "grad_norm": 1.1525826454162598, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 68880 + }, + { + "epoch": 4.94721723518851, + "grad_norm": 0.8610072731971741, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 68890 + }, + { + "epoch": 4.947935368043088, + "grad_norm": 1.1403616666793823, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 68900 + }, + { + "epoch": 4.948653500897666, + "grad_norm": 1.10334312915802, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 68910 + }, + { + "epoch": 4.949371633752245, + "grad_norm": 0.8633760809898376, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 68920 + }, + { + "epoch": 4.950089766606823, + "grad_norm": 1.1291080713272095, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 68930 + }, + { + "epoch": 4.950807899461401, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 68940 + }, + { + "epoch": 4.951526032315979, + "grad_norm": 0.9207960963249207, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 68950 + }, + { + "epoch": 4.952244165170557, + "grad_norm": 0.9815934300422668, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 68960 + }, + { + "epoch": 4.952962298025135, + "grad_norm": 0.9725701808929443, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 68970 + }, + { + "epoch": 4.953680430879713, + "grad_norm": 0.844926655292511, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 68980 + }, + { + "epoch": 4.954398563734291, + "grad_norm": 0.9898511171340942, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 68990 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 1.1311410665512085, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 69000 + }, + { + "epoch": 4.955834829443447, + "grad_norm": 1.218610405921936, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 69010 + }, + { + "epoch": 4.956552962298025, + "grad_norm": 1.1536420583724976, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 69020 + }, + { + "epoch": 4.957271095152604, + "grad_norm": 1.1857786178588867, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 69030 + }, + { + "epoch": 4.957989228007182, + "grad_norm": 0.9969246983528137, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 69040 + }, + { + "epoch": 4.95870736086176, + "grad_norm": 1.138635277748108, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 69050 + }, + { + "epoch": 4.959425493716338, + "grad_norm": 1.110474705696106, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 69060 + }, + { + "epoch": 4.960143626570916, + "grad_norm": 1.0366318225860596, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 69070 + }, + { + "epoch": 4.960861759425494, + "grad_norm": 0.6927996277809143, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 69080 + }, + { + "epoch": 4.961579892280072, + "grad_norm": 1.0368026494979858, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 69090 + }, + { + "epoch": 4.96229802513465, + "grad_norm": 1.0638312101364136, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 69100 + }, + { + "epoch": 4.9630161579892285, + "grad_norm": 1.0372415781021118, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 69110 + }, + { + "epoch": 4.9637342908438065, + "grad_norm": 0.8257387280464172, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69120 + }, + { + "epoch": 4.9644524236983845, + "grad_norm": 1.0046974420547485, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 69130 + }, + { + "epoch": 4.9651705565529625, + "grad_norm": 1.0139652490615845, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69140 + }, + { + "epoch": 4.9658886894075405, + "grad_norm": 1.0214691162109375, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 69150 + }, + { + "epoch": 4.9666068222621185, + "grad_norm": 1.1042424440383911, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 69160 + }, + { + "epoch": 4.9673249551166965, + "grad_norm": 0.8749067783355713, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 69170 + }, + { + "epoch": 4.9680430879712745, + "grad_norm": 0.9894024133682251, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 69180 + }, + { + "epoch": 4.9687612208258525, + "grad_norm": 1.0218034982681274, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 69190 + }, + { + "epoch": 4.9694793536804305, + "grad_norm": 0.9782929420471191, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 69200 + }, + { + "epoch": 4.9701974865350085, + "grad_norm": 0.9373409748077393, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 69210 + }, + { + "epoch": 4.970915619389587, + "grad_norm": 1.0329546928405762, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 69220 + }, + { + "epoch": 4.971633752244165, + "grad_norm": 0.9746108055114746, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 69230 + }, + { + "epoch": 4.972351885098743, + "grad_norm": 0.9202073216438293, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 69240 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 1.078032374382019, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 69250 + }, + { + "epoch": 4.973788150807899, + "grad_norm": 0.8860024809837341, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 69260 + }, + { + "epoch": 4.974506283662477, + "grad_norm": 0.915212094783783, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 69270 + }, + { + "epoch": 4.975224416517055, + "grad_norm": 1.1192166805267334, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69280 + }, + { + "epoch": 4.975942549371633, + "grad_norm": 0.8387445211410522, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69290 + }, + { + "epoch": 4.976660682226212, + "grad_norm": 1.1210044622421265, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 69300 + }, + { + "epoch": 4.97737881508079, + "grad_norm": 1.0051207542419434, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 69310 + }, + { + "epoch": 4.978096947935368, + "grad_norm": 0.9248682856559753, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 69320 + }, + { + "epoch": 4.978815080789946, + "grad_norm": 0.8265128135681152, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 69330 + }, + { + "epoch": 4.979533213644524, + "grad_norm": 0.9432681798934937, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 69340 + }, + { + "epoch": 4.980251346499102, + "grad_norm": 1.0135977268218994, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 69350 + }, + { + "epoch": 4.98096947935368, + "grad_norm": 0.9857245683670044, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 69360 + }, + { + "epoch": 4.981687612208258, + "grad_norm": 0.9215952157974243, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 69370 + }, + { + "epoch": 4.982405745062836, + "grad_norm": 1.1518077850341797, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 69380 + }, + { + "epoch": 4.983123877917414, + "grad_norm": 0.8836095929145813, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 69390 + }, + { + "epoch": 4.983842010771993, + "grad_norm": 0.8082528710365295, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 69400 + }, + { + "epoch": 4.984560143626571, + "grad_norm": 0.9295604825019836, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 69410 + }, + { + "epoch": 4.985278276481149, + "grad_norm": 1.002057433128357, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 69420 + }, + { + "epoch": 4.985996409335727, + "grad_norm": 0.8127216100692749, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 69430 + }, + { + "epoch": 4.986714542190305, + "grad_norm": 1.058138370513916, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 69440 + }, + { + "epoch": 4.987432675044883, + "grad_norm": 0.8451166749000549, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 69450 + }, + { + "epoch": 4.988150807899461, + "grad_norm": 0.9687268137931824, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 69460 + }, + { + "epoch": 4.988868940754039, + "grad_norm": 1.0342036485671997, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 69470 + }, + { + "epoch": 4.989587073608618, + "grad_norm": 0.9042398929595947, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 69480 + }, + { + "epoch": 4.990305206463196, + "grad_norm": 1.0575438737869263, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 69490 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 0.9364935159683228, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 69500 + }, + { + "epoch": 4.991741472172352, + "grad_norm": 1.0327378511428833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 69510 + }, + { + "epoch": 4.99245960502693, + "grad_norm": 0.815592885017395, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 69520 + }, + { + "epoch": 4.993177737881508, + "grad_norm": 1.0813369750976562, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 69530 + }, + { + "epoch": 4.993895870736086, + "grad_norm": 1.0277023315429688, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 69540 + }, + { + "epoch": 4.994614003590664, + "grad_norm": 1.0291162729263306, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 69550 + }, + { + "epoch": 4.995332136445242, + "grad_norm": 0.8435685634613037, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 69560 + }, + { + "epoch": 4.99605026929982, + "grad_norm": 1.1972291469573975, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 69570 + }, + { + "epoch": 4.996768402154398, + "grad_norm": 0.8114907741546631, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 69580 + }, + { + "epoch": 4.997486535008977, + "grad_norm": 0.8296133875846863, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 69590 + }, + { + "epoch": 4.998204667863555, + "grad_norm": 1.1728706359863281, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 69600 + }, + { + "epoch": 4.998922800718133, + "grad_norm": 0.9586578607559204, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 69610 + }, + { + "epoch": 4.999640933572711, + "grad_norm": 0.9725151062011719, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 69620 + }, + { + "epoch": 5.0, + "eval_loss": 1.133581519126892, + "eval_runtime": 55.2151, + "eval_samples_per_second": 13.275, + "eval_steps_per_second": 1.666, + "step": 69625 + }, + { + "epoch": 5.000359066427289, + "grad_norm": 0.9312055706977844, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 69630 + }, + { + "epoch": 5.001077199281867, + "grad_norm": 1.0534896850585938, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 69640 + }, + { + "epoch": 5.001795332136445, + "grad_norm": 0.8891698718070984, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 69650 + }, + { + "epoch": 5.002513464991023, + "grad_norm": 0.7791097164154053, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 69660 + }, + { + "epoch": 5.003231597845601, + "grad_norm": 1.2891173362731934, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 69670 + }, + { + "epoch": 5.00394973070018, + "grad_norm": 0.7909513711929321, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 69680 + }, + { + "epoch": 5.004667863554758, + "grad_norm": 0.988648533821106, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 69690 + }, + { + "epoch": 5.005385996409336, + "grad_norm": 0.9669296741485596, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 69700 + }, + { + "epoch": 5.006104129263914, + "grad_norm": 1.2393349409103394, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 69710 + }, + { + "epoch": 5.006822262118492, + "grad_norm": 1.2420750856399536, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 69720 + }, + { + "epoch": 5.00754039497307, + "grad_norm": 1.1698096990585327, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 69730 + }, + { + "epoch": 5.008258527827648, + "grad_norm": 1.2228301763534546, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 69740 + }, + { + "epoch": 5.008976660682226, + "grad_norm": 0.9350621104240417, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 69750 + }, + { + "epoch": 5.009694793536804, + "grad_norm": 0.9828507304191589, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 69760 + }, + { + "epoch": 5.010412926391383, + "grad_norm": 0.9372149109840393, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 69770 + }, + { + "epoch": 5.011131059245961, + "grad_norm": 0.8098477125167847, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 69780 + }, + { + "epoch": 5.011849192100539, + "grad_norm": 1.0418338775634766, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69790 + }, + { + "epoch": 5.012567324955117, + "grad_norm": 1.0175801515579224, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 69800 + }, + { + "epoch": 5.013285457809695, + "grad_norm": 1.2128081321716309, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 69810 + }, + { + "epoch": 5.014003590664273, + "grad_norm": 1.001805067062378, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69820 + }, + { + "epoch": 5.014721723518851, + "grad_norm": 0.8957470059394836, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69830 + }, + { + "epoch": 5.015439856373429, + "grad_norm": 0.9344548583030701, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 69840 + }, + { + "epoch": 5.016157989228007, + "grad_norm": 0.8545927405357361, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 69850 + }, + { + "epoch": 5.016876122082586, + "grad_norm": 1.3907777070999146, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 69860 + }, + { + "epoch": 5.017594254937164, + "grad_norm": 0.8112093806266785, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 69870 + }, + { + "epoch": 5.018312387791742, + "grad_norm": 1.0151532888412476, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 69880 + }, + { + "epoch": 5.01903052064632, + "grad_norm": 1.249021053314209, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 69890 + }, + { + "epoch": 5.019748653500898, + "grad_norm": 0.9310314059257507, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 69900 + }, + { + "epoch": 5.020466786355476, + "grad_norm": 0.9444572925567627, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 69910 + }, + { + "epoch": 5.021184919210054, + "grad_norm": 1.0952081680297852, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 69920 + }, + { + "epoch": 5.021903052064632, + "grad_norm": 1.2106375694274902, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 69930 + }, + { + "epoch": 5.02262118491921, + "grad_norm": 1.0179580450057983, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69940 + }, + { + "epoch": 5.023339317773788, + "grad_norm": 1.0865367650985718, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 69950 + }, + { + "epoch": 5.024057450628367, + "grad_norm": 1.0965075492858887, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 69960 + }, + { + "epoch": 5.024775583482945, + "grad_norm": 0.8879445791244507, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 69970 + }, + { + "epoch": 5.025493716337523, + "grad_norm": 1.2588363885879517, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 69980 + }, + { + "epoch": 5.026211849192101, + "grad_norm": 0.935705304145813, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 69990 + }, + { + "epoch": 5.026929982046679, + "grad_norm": 1.072012186050415, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 70000 + }, + { + "epoch": 5.027648114901257, + "grad_norm": 1.286438226699829, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 70010 + }, + { + "epoch": 5.028366247755835, + "grad_norm": 1.1165392398834229, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 70020 + }, + { + "epoch": 5.029084380610413, + "grad_norm": 0.7998424172401428, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 70030 + }, + { + "epoch": 5.029802513464991, + "grad_norm": 1.5669852495193481, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 70040 + }, + { + "epoch": 5.0305206463195695, + "grad_norm": 0.9780290722846985, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70050 + }, + { + "epoch": 5.0312387791741475, + "grad_norm": 0.9837628602981567, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 70060 + }, + { + "epoch": 5.0319569120287255, + "grad_norm": 0.9558916091918945, + "learning_rate": 0.0002, + "loss": 0.5369, + "step": 70070 + }, + { + "epoch": 5.0326750448833035, + "grad_norm": 0.8893155455589294, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 70080 + }, + { + "epoch": 5.0333931777378815, + "grad_norm": 1.1403675079345703, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 70090 + }, + { + "epoch": 5.0341113105924595, + "grad_norm": 1.0453649759292603, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 70100 + }, + { + "epoch": 5.0348294434470375, + "grad_norm": 0.8127498030662537, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 70110 + }, + { + "epoch": 5.0355475763016155, + "grad_norm": 0.9344680309295654, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70120 + }, + { + "epoch": 5.0362657091561935, + "grad_norm": 1.0302079916000366, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 70130 + }, + { + "epoch": 5.036983842010772, + "grad_norm": 1.0549713373184204, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 70140 + }, + { + "epoch": 5.03770197486535, + "grad_norm": 0.8916767835617065, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 70150 + }, + { + "epoch": 5.038420107719928, + "grad_norm": 0.9799798130989075, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 70160 + }, + { + "epoch": 5.039138240574506, + "grad_norm": 1.15560781955719, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 70170 + }, + { + "epoch": 5.039856373429084, + "grad_norm": 1.0577017068862915, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 70180 + }, + { + "epoch": 5.040574506283662, + "grad_norm": 1.027990698814392, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 70190 + }, + { + "epoch": 5.04129263913824, + "grad_norm": 1.0818232297897339, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 70200 + }, + { + "epoch": 5.042010771992818, + "grad_norm": 1.0287196636199951, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 70210 + }, + { + "epoch": 5.042728904847396, + "grad_norm": 1.1569273471832275, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 70220 + }, + { + "epoch": 5.0434470377019744, + "grad_norm": 1.0485484600067139, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70230 + }, + { + "epoch": 5.044165170556553, + "grad_norm": 0.9244540333747864, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 70240 + }, + { + "epoch": 5.044883303411131, + "grad_norm": 0.9576422572135925, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 70250 + }, + { + "epoch": 5.045601436265709, + "grad_norm": 0.8719421625137329, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 70260 + }, + { + "epoch": 5.046319569120287, + "grad_norm": 0.8685409426689148, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 70270 + }, + { + "epoch": 5.047037701974865, + "grad_norm": 1.2735247611999512, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 70280 + }, + { + "epoch": 5.047755834829443, + "grad_norm": 0.9082128405570984, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 70290 + }, + { + "epoch": 5.048473967684021, + "grad_norm": 1.0626471042633057, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 70300 + }, + { + "epoch": 5.049192100538599, + "grad_norm": 1.1463991403579712, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 70310 + }, + { + "epoch": 5.049910233393177, + "grad_norm": 0.8825355172157288, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 70320 + }, + { + "epoch": 5.050628366247756, + "grad_norm": 1.0549408197402954, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 70330 + }, + { + "epoch": 5.051346499102334, + "grad_norm": 1.3740944862365723, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 70340 + }, + { + "epoch": 5.052064631956912, + "grad_norm": 1.4197895526885986, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 70350 + }, + { + "epoch": 5.05278276481149, + "grad_norm": 1.1764925718307495, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 70360 + }, + { + "epoch": 5.053500897666068, + "grad_norm": 1.0443403720855713, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 70370 + }, + { + "epoch": 5.054219030520646, + "grad_norm": 1.1807527542114258, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 70380 + }, + { + "epoch": 5.054937163375224, + "grad_norm": 1.4032433032989502, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 70390 + }, + { + "epoch": 5.055655296229802, + "grad_norm": 0.9815662503242493, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 70400 + }, + { + "epoch": 5.05637342908438, + "grad_norm": 0.9368446469306946, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 70410 + }, + { + "epoch": 5.057091561938959, + "grad_norm": 1.1156736612319946, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 70420 + }, + { + "epoch": 5.057809694793537, + "grad_norm": 1.01651132106781, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 70430 + }, + { + "epoch": 5.058527827648115, + "grad_norm": 0.9906342029571533, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 70440 + }, + { + "epoch": 5.059245960502693, + "grad_norm": 0.8666667938232422, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 70450 + }, + { + "epoch": 5.059964093357271, + "grad_norm": 1.0508924722671509, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 70460 + }, + { + "epoch": 5.060682226211849, + "grad_norm": 1.2472858428955078, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 70470 + }, + { + "epoch": 5.061400359066427, + "grad_norm": 1.019073724746704, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 70480 + }, + { + "epoch": 5.062118491921005, + "grad_norm": 0.9745403528213501, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 70490 + }, + { + "epoch": 5.062836624775583, + "grad_norm": 1.121208906173706, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 70500 + }, + { + "epoch": 5.063554757630161, + "grad_norm": 1.0535147190093994, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 70510 + }, + { + "epoch": 5.06427289048474, + "grad_norm": 1.0368950366973877, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 70520 + }, + { + "epoch": 5.064991023339318, + "grad_norm": 0.948964536190033, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 70530 + }, + { + "epoch": 5.065709156193896, + "grad_norm": 1.0289826393127441, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70540 + }, + { + "epoch": 5.066427289048474, + "grad_norm": 1.118374228477478, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 70550 + }, + { + "epoch": 5.067145421903052, + "grad_norm": 0.8712816834449768, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 70560 + }, + { + "epoch": 5.06786355475763, + "grad_norm": 0.9057969450950623, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 70570 + }, + { + "epoch": 5.068581687612208, + "grad_norm": 0.9292685985565186, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 70580 + }, + { + "epoch": 5.069299820466786, + "grad_norm": 0.9159911274909973, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70590 + }, + { + "epoch": 5.070017953321364, + "grad_norm": 0.973848819732666, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 70600 + }, + { + "epoch": 5.070736086175943, + "grad_norm": 0.7892279028892517, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 70610 + }, + { + "epoch": 5.071454219030521, + "grad_norm": 0.9943311214447021, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 70620 + }, + { + "epoch": 5.072172351885099, + "grad_norm": 1.1457926034927368, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 70630 + }, + { + "epoch": 5.072890484739677, + "grad_norm": 0.9307738542556763, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 70640 + }, + { + "epoch": 5.073608617594255, + "grad_norm": 1.0899816751480103, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 70650 + }, + { + "epoch": 5.074326750448833, + "grad_norm": 0.8357672691345215, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70660 + }, + { + "epoch": 5.075044883303411, + "grad_norm": 0.8889468312263489, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 70670 + }, + { + "epoch": 5.075763016157989, + "grad_norm": 0.9152118563652039, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70680 + }, + { + "epoch": 5.076481149012567, + "grad_norm": 1.106160044670105, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 70690 + }, + { + "epoch": 5.077199281867145, + "grad_norm": 0.8519207835197449, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 70700 + }, + { + "epoch": 5.077917414721724, + "grad_norm": 0.9754986763000488, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 70710 + }, + { + "epoch": 5.078635547576302, + "grad_norm": 1.167883276939392, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 70720 + }, + { + "epoch": 5.07935368043088, + "grad_norm": 0.987622082233429, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70730 + }, + { + "epoch": 5.080071813285458, + "grad_norm": 1.0008184909820557, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 70740 + }, + { + "epoch": 5.080789946140036, + "grad_norm": 0.6318819522857666, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 70750 + }, + { + "epoch": 5.081508078994614, + "grad_norm": 0.984886884689331, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 70760 + }, + { + "epoch": 5.082226211849192, + "grad_norm": 1.0583622455596924, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 70770 + }, + { + "epoch": 5.08294434470377, + "grad_norm": 0.9730119705200195, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 70780 + }, + { + "epoch": 5.083662477558348, + "grad_norm": 1.0201330184936523, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 70790 + }, + { + "epoch": 5.084380610412927, + "grad_norm": 1.0479248762130737, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 70800 + }, + { + "epoch": 5.085098743267505, + "grad_norm": 0.9185113906860352, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 70810 + }, + { + "epoch": 5.085816876122083, + "grad_norm": 0.9326799511909485, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70820 + }, + { + "epoch": 5.086535008976661, + "grad_norm": 0.958739697933197, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 70830 + }, + { + "epoch": 5.087253141831239, + "grad_norm": 0.9643770456314087, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 70840 + }, + { + "epoch": 5.087971274685817, + "grad_norm": 0.8650234341621399, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70850 + }, + { + "epoch": 5.088689407540395, + "grad_norm": 0.9354105591773987, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 70860 + }, + { + "epoch": 5.089407540394973, + "grad_norm": 0.8736345171928406, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 70870 + }, + { + "epoch": 5.090125673249551, + "grad_norm": 0.9172632098197937, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 70880 + }, + { + "epoch": 5.09084380610413, + "grad_norm": 0.9495565295219421, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 70890 + }, + { + "epoch": 5.091561938958708, + "grad_norm": 1.0328829288482666, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 70900 + }, + { + "epoch": 5.092280071813286, + "grad_norm": 0.9335703253746033, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 70910 + }, + { + "epoch": 5.092998204667864, + "grad_norm": 1.0919437408447266, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 70920 + }, + { + "epoch": 5.093716337522442, + "grad_norm": 1.03340744972229, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 70930 + }, + { + "epoch": 5.09443447037702, + "grad_norm": 1.0501604080200195, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 70940 + }, + { + "epoch": 5.095152603231598, + "grad_norm": 0.9442012310028076, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 70950 + }, + { + "epoch": 5.095870736086176, + "grad_norm": 1.2592464685440063, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 70960 + }, + { + "epoch": 5.096588868940754, + "grad_norm": 1.0961427688598633, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 70970 + }, + { + "epoch": 5.097307001795333, + "grad_norm": 1.0472424030303955, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 70980 + }, + { + "epoch": 5.098025134649911, + "grad_norm": 0.9489352107048035, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 70990 + }, + { + "epoch": 5.098743267504489, + "grad_norm": 1.0499446392059326, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 71000 + }, + { + "epoch": 5.099461400359067, + "grad_norm": 1.013005018234253, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 71010 + }, + { + "epoch": 5.100179533213645, + "grad_norm": 0.9594261050224304, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 71020 + }, + { + "epoch": 5.100897666068223, + "grad_norm": 1.2016123533248901, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 71030 + }, + { + "epoch": 5.101615798922801, + "grad_norm": 1.0389765501022339, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 71040 + }, + { + "epoch": 5.102333931777379, + "grad_norm": 1.053534746170044, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 71050 + }, + { + "epoch": 5.103052064631957, + "grad_norm": 1.1379448175430298, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 71060 + }, + { + "epoch": 5.103770197486535, + "grad_norm": 0.8796491622924805, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 71070 + }, + { + "epoch": 5.1044883303411135, + "grad_norm": 1.0591254234313965, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 71080 + }, + { + "epoch": 5.1052064631956915, + "grad_norm": 0.9622171521186829, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71090 + }, + { + "epoch": 5.1059245960502695, + "grad_norm": 0.9173060059547424, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 71100 + }, + { + "epoch": 5.1066427289048475, + "grad_norm": 0.8363444805145264, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 71110 + }, + { + "epoch": 5.1073608617594255, + "grad_norm": 1.1006172895431519, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 71120 + }, + { + "epoch": 5.1080789946140035, + "grad_norm": 1.0720574855804443, + "learning_rate": 0.0002, + "loss": 0.5753, + "step": 71130 + }, + { + "epoch": 5.1087971274685815, + "grad_norm": 1.0560680627822876, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 71140 + }, + { + "epoch": 5.1095152603231595, + "grad_norm": 0.8485415577888489, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 71150 + }, + { + "epoch": 5.1102333931777375, + "grad_norm": 1.109383225440979, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 71160 + }, + { + "epoch": 5.110951526032316, + "grad_norm": 0.9296035766601562, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 71170 + }, + { + "epoch": 5.111669658886894, + "grad_norm": 1.2855182886123657, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 71180 + }, + { + "epoch": 5.112387791741472, + "grad_norm": 1.0313524007797241, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 71190 + }, + { + "epoch": 5.11310592459605, + "grad_norm": 1.0436697006225586, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 71200 + }, + { + "epoch": 5.113824057450628, + "grad_norm": 0.901333212852478, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 71210 + }, + { + "epoch": 5.114542190305206, + "grad_norm": 1.2170051336288452, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 71220 + }, + { + "epoch": 5.115260323159784, + "grad_norm": 0.8850961327552795, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71230 + }, + { + "epoch": 5.115978456014362, + "grad_norm": 1.0147113800048828, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 71240 + }, + { + "epoch": 5.11669658886894, + "grad_norm": 1.0043506622314453, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 71250 + }, + { + "epoch": 5.117414721723518, + "grad_norm": 0.9887113571166992, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 71260 + }, + { + "epoch": 5.118132854578097, + "grad_norm": 1.1013392210006714, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 71270 + }, + { + "epoch": 5.118850987432675, + "grad_norm": 0.9213799238204956, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71280 + }, + { + "epoch": 5.119569120287253, + "grad_norm": 1.047400712966919, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 71290 + }, + { + "epoch": 5.120287253141831, + "grad_norm": 1.030534029006958, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71300 + }, + { + "epoch": 5.121005385996409, + "grad_norm": 0.9464976191520691, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 71310 + }, + { + "epoch": 5.121723518850987, + "grad_norm": 0.8610315918922424, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 71320 + }, + { + "epoch": 5.122441651705565, + "grad_norm": 1.0824426412582397, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71330 + }, + { + "epoch": 5.123159784560143, + "grad_norm": 0.9382733106613159, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 71340 + }, + { + "epoch": 5.123877917414721, + "grad_norm": 0.9364684224128723, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 71350 + }, + { + "epoch": 5.1245960502693, + "grad_norm": 0.9583013653755188, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 71360 + }, + { + "epoch": 5.125314183123878, + "grad_norm": 1.287533164024353, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 71370 + }, + { + "epoch": 5.126032315978456, + "grad_norm": 1.5031169652938843, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 71380 + }, + { + "epoch": 5.126750448833034, + "grad_norm": 0.9891406297683716, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 71390 + }, + { + "epoch": 5.127468581687612, + "grad_norm": 1.1851537227630615, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 71400 + }, + { + "epoch": 5.12818671454219, + "grad_norm": 0.9869971871376038, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 71410 + }, + { + "epoch": 5.128904847396768, + "grad_norm": 0.961662769317627, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 71420 + }, + { + "epoch": 5.129622980251346, + "grad_norm": 1.1036419868469238, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 71430 + }, + { + "epoch": 5.130341113105924, + "grad_norm": 1.175361156463623, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 71440 + }, + { + "epoch": 5.131059245960503, + "grad_norm": 0.9801875948905945, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 71450 + }, + { + "epoch": 5.131777378815081, + "grad_norm": 0.9424611330032349, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 71460 + }, + { + "epoch": 5.132495511669659, + "grad_norm": 1.11662757396698, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 71470 + }, + { + "epoch": 5.133213644524237, + "grad_norm": 0.9969366192817688, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71480 + }, + { + "epoch": 5.133931777378815, + "grad_norm": 1.278640866279602, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 71490 + }, + { + "epoch": 5.134649910233393, + "grad_norm": 1.1090457439422607, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 71500 + }, + { + "epoch": 5.135368043087971, + "grad_norm": 1.01808500289917, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 71510 + }, + { + "epoch": 5.136086175942549, + "grad_norm": 1.029135823249817, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 71520 + }, + { + "epoch": 5.136804308797127, + "grad_norm": 1.1207175254821777, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 71530 + }, + { + "epoch": 5.137522441651706, + "grad_norm": 1.0327218770980835, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 71540 + }, + { + "epoch": 5.138240574506284, + "grad_norm": 1.042490839958191, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 71550 + }, + { + "epoch": 5.138958707360862, + "grad_norm": 1.1800413131713867, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 71560 + }, + { + "epoch": 5.13967684021544, + "grad_norm": 1.0748766660690308, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 71570 + }, + { + "epoch": 5.140394973070018, + "grad_norm": 0.9983090758323669, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 71580 + }, + { + "epoch": 5.141113105924596, + "grad_norm": 1.30636727809906, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 71590 + }, + { + "epoch": 5.141831238779174, + "grad_norm": 0.9960222840309143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 71600 + }, + { + "epoch": 5.142549371633752, + "grad_norm": 1.237027645111084, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 71610 + }, + { + "epoch": 5.14326750448833, + "grad_norm": 1.0913307666778564, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 71620 + }, + { + "epoch": 5.143985637342908, + "grad_norm": 0.940657913684845, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 71630 + }, + { + "epoch": 5.144703770197487, + "grad_norm": 1.093796730041504, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 71640 + }, + { + "epoch": 5.145421903052065, + "grad_norm": 0.9703856110572815, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 71650 + }, + { + "epoch": 5.146140035906643, + "grad_norm": 0.9874776005744934, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71660 + }, + { + "epoch": 5.146858168761221, + "grad_norm": 0.9723859429359436, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 71670 + }, + { + "epoch": 5.147576301615799, + "grad_norm": 0.997107207775116, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71680 + }, + { + "epoch": 5.148294434470377, + "grad_norm": 1.0261175632476807, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 71690 + }, + { + "epoch": 5.149012567324955, + "grad_norm": 0.9093905687332153, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 71700 + }, + { + "epoch": 5.149730700179533, + "grad_norm": 0.9909888505935669, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 71710 + }, + { + "epoch": 5.150448833034111, + "grad_norm": 0.9111971259117126, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 71720 + }, + { + "epoch": 5.15116696588869, + "grad_norm": 0.9319643974304199, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 71730 + }, + { + "epoch": 5.151885098743268, + "grad_norm": 1.0744104385375977, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 71740 + }, + { + "epoch": 5.152603231597846, + "grad_norm": 1.1555477380752563, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 71750 + }, + { + "epoch": 5.153321364452424, + "grad_norm": 0.9809171557426453, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71760 + }, + { + "epoch": 5.154039497307002, + "grad_norm": 0.7937686443328857, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71770 + }, + { + "epoch": 5.15475763016158, + "grad_norm": 1.1925430297851562, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 71780 + }, + { + "epoch": 5.155475763016158, + "grad_norm": 1.077412486076355, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 71790 + }, + { + "epoch": 5.156193895870736, + "grad_norm": 0.7992808222770691, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 71800 + }, + { + "epoch": 5.156912028725314, + "grad_norm": 1.0938535928726196, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71810 + }, + { + "epoch": 5.157630161579892, + "grad_norm": 0.9458112120628357, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 71820 + }, + { + "epoch": 5.158348294434471, + "grad_norm": 0.984940230846405, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 71830 + }, + { + "epoch": 5.159066427289049, + "grad_norm": 0.9242565035820007, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 71840 + }, + { + "epoch": 5.159784560143627, + "grad_norm": 0.8386720418930054, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71850 + }, + { + "epoch": 5.160502692998205, + "grad_norm": 0.9627357721328735, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 71860 + }, + { + "epoch": 5.161220825852783, + "grad_norm": 1.0118762254714966, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 71870 + }, + { + "epoch": 5.161938958707361, + "grad_norm": 1.1552608013153076, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 71880 + }, + { + "epoch": 5.162657091561939, + "grad_norm": 1.0910389423370361, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 71890 + }, + { + "epoch": 5.163375224416517, + "grad_norm": 1.046639084815979, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 71900 + }, + { + "epoch": 5.164093357271095, + "grad_norm": 1.0087649822235107, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 71910 + }, + { + "epoch": 5.164811490125674, + "grad_norm": 0.9418644309043884, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71920 + }, + { + "epoch": 5.165529622980252, + "grad_norm": 1.1213915348052979, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 71930 + }, + { + "epoch": 5.16624775583483, + "grad_norm": 1.043786644935608, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 71940 + }, + { + "epoch": 5.166965888689408, + "grad_norm": 1.2150449752807617, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 71950 + }, + { + "epoch": 5.167684021543986, + "grad_norm": 1.1214520931243896, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 71960 + }, + { + "epoch": 5.168402154398564, + "grad_norm": 0.9235218167304993, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 71970 + }, + { + "epoch": 5.169120287253142, + "grad_norm": 0.8736480474472046, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 71980 + }, + { + "epoch": 5.16983842010772, + "grad_norm": 0.8723195195198059, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71990 + }, + { + "epoch": 5.170556552962298, + "grad_norm": 1.0873022079467773, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 72000 + }, + { + "epoch": 5.1712746858168765, + "grad_norm": 0.9196295142173767, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72010 + }, + { + "epoch": 5.1719928186714546, + "grad_norm": 0.9244471192359924, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 72020 + }, + { + "epoch": 5.1727109515260326, + "grad_norm": 1.0555505752563477, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 72030 + }, + { + "epoch": 5.1734290843806106, + "grad_norm": 1.1527929306030273, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 72040 + }, + { + "epoch": 5.174147217235189, + "grad_norm": 0.9069058895111084, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 72050 + }, + { + "epoch": 5.174865350089767, + "grad_norm": 1.1047141551971436, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 72060 + }, + { + "epoch": 5.175583482944345, + "grad_norm": 0.9805511832237244, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 72070 + }, + { + "epoch": 5.176301615798923, + "grad_norm": 1.1636970043182373, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 72080 + }, + { + "epoch": 5.177019748653501, + "grad_norm": 1.0193538665771484, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 72090 + }, + { + "epoch": 5.177737881508079, + "grad_norm": 0.8850618600845337, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 72100 + }, + { + "epoch": 5.1784560143626575, + "grad_norm": 1.042271614074707, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 72110 + }, + { + "epoch": 5.1791741472172355, + "grad_norm": 1.1405227184295654, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72120 + }, + { + "epoch": 5.1798922800718135, + "grad_norm": 1.0013195276260376, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 72130 + }, + { + "epoch": 5.1806104129263915, + "grad_norm": 1.0474903583526611, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 72140 + }, + { + "epoch": 5.1813285457809695, + "grad_norm": 1.0384612083435059, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 72150 + }, + { + "epoch": 5.1820466786355475, + "grad_norm": 1.145086646080017, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 72160 + }, + { + "epoch": 5.1827648114901255, + "grad_norm": 1.0845173597335815, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 72170 + }, + { + "epoch": 5.1834829443447035, + "grad_norm": 0.9870346188545227, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72180 + }, + { + "epoch": 5.1842010771992815, + "grad_norm": 1.1098768711090088, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 72190 + }, + { + "epoch": 5.18491921005386, + "grad_norm": 0.9397785067558289, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 72200 + }, + { + "epoch": 5.185637342908438, + "grad_norm": 1.0817532539367676, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 72210 + }, + { + "epoch": 5.186355475763016, + "grad_norm": 1.0027309656143188, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 72220 + }, + { + "epoch": 5.187073608617594, + "grad_norm": 0.8262016773223877, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 72230 + }, + { + "epoch": 5.187791741472172, + "grad_norm": 0.9968137741088867, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 72240 + }, + { + "epoch": 5.18850987432675, + "grad_norm": 0.9072695970535278, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 72250 + }, + { + "epoch": 5.189228007181328, + "grad_norm": 1.0388357639312744, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 72260 + }, + { + "epoch": 5.189946140035906, + "grad_norm": 0.8883537650108337, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72270 + }, + { + "epoch": 5.190664272890484, + "grad_norm": 1.0161921977996826, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 72280 + }, + { + "epoch": 5.191382405745063, + "grad_norm": 0.964936375617981, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72290 + }, + { + "epoch": 5.192100538599641, + "grad_norm": 0.9728496670722961, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 72300 + }, + { + "epoch": 5.192818671454219, + "grad_norm": 1.2411649227142334, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 72310 + }, + { + "epoch": 5.193536804308797, + "grad_norm": 0.9430946111679077, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 72320 + }, + { + "epoch": 5.194254937163375, + "grad_norm": 1.1522886753082275, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 72330 + }, + { + "epoch": 5.194973070017953, + "grad_norm": 1.0727189779281616, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 72340 + }, + { + "epoch": 5.195691202872531, + "grad_norm": 1.2506077289581299, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 72350 + }, + { + "epoch": 5.196409335727109, + "grad_norm": 1.0949938297271729, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 72360 + }, + { + "epoch": 5.197127468581687, + "grad_norm": 1.191125750541687, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 72370 + }, + { + "epoch": 5.197845601436265, + "grad_norm": 1.1154223680496216, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 72380 + }, + { + "epoch": 5.198563734290844, + "grad_norm": 0.9623886942863464, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 72390 + }, + { + "epoch": 5.199281867145422, + "grad_norm": 0.9432680010795593, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 72400 + }, + { + "epoch": 5.2, + "grad_norm": 1.035905122756958, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 72410 + }, + { + "epoch": 5.200718132854578, + "grad_norm": 0.9044913053512573, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 72420 + }, + { + "epoch": 5.201436265709156, + "grad_norm": 1.082187533378601, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 72430 + }, + { + "epoch": 5.202154398563734, + "grad_norm": 0.9368400573730469, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 72440 + }, + { + "epoch": 5.202872531418312, + "grad_norm": 1.1515194177627563, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 72450 + }, + { + "epoch": 5.20359066427289, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 72460 + }, + { + "epoch": 5.204308797127468, + "grad_norm": 1.0885688066482544, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 72470 + }, + { + "epoch": 5.205026929982047, + "grad_norm": 0.8189428448677063, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 72480 + }, + { + "epoch": 5.205745062836625, + "grad_norm": 1.0145429372787476, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 72490 + }, + { + "epoch": 5.206463195691203, + "grad_norm": 1.132490634918213, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 72500 + }, + { + "epoch": 5.207181328545781, + "grad_norm": 0.8866808414459229, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 72510 + }, + { + "epoch": 5.207899461400359, + "grad_norm": 0.9681518077850342, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 72520 + }, + { + "epoch": 5.208617594254937, + "grad_norm": 0.9992330074310303, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 72530 + }, + { + "epoch": 5.209335727109515, + "grad_norm": 1.0767436027526855, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 72540 + }, + { + "epoch": 5.210053859964093, + "grad_norm": 1.1362388134002686, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 72550 + }, + { + "epoch": 5.210771992818671, + "grad_norm": 0.9741758704185486, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 72560 + }, + { + "epoch": 5.211490125673249, + "grad_norm": 0.8216298818588257, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 72570 + }, + { + "epoch": 5.212208258527828, + "grad_norm": 0.7500724792480469, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 72580 + }, + { + "epoch": 5.212926391382406, + "grad_norm": 0.9152594804763794, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 72590 + }, + { + "epoch": 5.213644524236984, + "grad_norm": 1.014940857887268, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 72600 + }, + { + "epoch": 5.214362657091562, + "grad_norm": 0.9333099722862244, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 72610 + }, + { + "epoch": 5.21508078994614, + "grad_norm": 0.7940610647201538, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 72620 + }, + { + "epoch": 5.215798922800718, + "grad_norm": 1.0365521907806396, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 72630 + }, + { + "epoch": 5.216517055655296, + "grad_norm": 1.37727952003479, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 72640 + }, + { + "epoch": 5.217235188509874, + "grad_norm": 1.2019168138504028, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 72650 + }, + { + "epoch": 5.217953321364452, + "grad_norm": 1.1696226596832275, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 72660 + }, + { + "epoch": 5.218671454219031, + "grad_norm": 0.9608798623085022, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72670 + }, + { + "epoch": 5.219389587073609, + "grad_norm": 0.9139777421951294, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 72680 + }, + { + "epoch": 5.220107719928187, + "grad_norm": 0.9937016367912292, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 72690 + }, + { + "epoch": 5.220825852782765, + "grad_norm": 1.2787059545516968, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 72700 + }, + { + "epoch": 5.221543985637343, + "grad_norm": 1.0757197141647339, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 72710 + }, + { + "epoch": 5.222262118491921, + "grad_norm": 0.8053579926490784, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 72720 + }, + { + "epoch": 5.222980251346499, + "grad_norm": 1.0239759683609009, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 72730 + }, + { + "epoch": 5.223698384201077, + "grad_norm": 0.9972975850105286, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 72740 + }, + { + "epoch": 5.224416517055655, + "grad_norm": 1.0504519939422607, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72750 + }, + { + "epoch": 5.225134649910234, + "grad_norm": 1.1793010234832764, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 72760 + }, + { + "epoch": 5.225852782764812, + "grad_norm": 1.1098815202713013, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 72770 + }, + { + "epoch": 5.22657091561939, + "grad_norm": 1.1078516244888306, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 72780 + }, + { + "epoch": 5.227289048473968, + "grad_norm": 0.8684433698654175, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 72790 + }, + { + "epoch": 5.228007181328546, + "grad_norm": 1.159390926361084, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 72800 + }, + { + "epoch": 5.228725314183124, + "grad_norm": 1.0468506813049316, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 72810 + }, + { + "epoch": 5.229443447037702, + "grad_norm": 0.8684625029563904, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 72820 + }, + { + "epoch": 5.23016157989228, + "grad_norm": 1.0117321014404297, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 72830 + }, + { + "epoch": 5.230879712746858, + "grad_norm": 1.0513219833374023, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 72840 + }, + { + "epoch": 5.231597845601437, + "grad_norm": 1.0659555196762085, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72850 + }, + { + "epoch": 5.232315978456015, + "grad_norm": 0.7726831436157227, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 72860 + }, + { + "epoch": 5.233034111310593, + "grad_norm": 1.0346935987472534, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 72870 + }, + { + "epoch": 5.233752244165171, + "grad_norm": 0.9112410545349121, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 72880 + }, + { + "epoch": 5.234470377019749, + "grad_norm": 1.2933332920074463, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 72890 + }, + { + "epoch": 5.235188509874327, + "grad_norm": 0.9740806221961975, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 72900 + }, + { + "epoch": 5.235906642728905, + "grad_norm": 0.8041712641716003, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 72910 + }, + { + "epoch": 5.236624775583483, + "grad_norm": 0.9510180950164795, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 72920 + }, + { + "epoch": 5.237342908438061, + "grad_norm": 0.9103419780731201, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 72930 + }, + { + "epoch": 5.238061041292639, + "grad_norm": 0.8317763805389404, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 72940 + }, + { + "epoch": 5.238779174147218, + "grad_norm": 1.0269867181777954, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 72950 + }, + { + "epoch": 5.239497307001796, + "grad_norm": 1.0599713325500488, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 72960 + }, + { + "epoch": 5.240215439856374, + "grad_norm": 0.9341228008270264, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 72970 + }, + { + "epoch": 5.240933572710952, + "grad_norm": 1.1216323375701904, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 72980 + }, + { + "epoch": 5.24165170556553, + "grad_norm": 0.9396152496337891, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 72990 + }, + { + "epoch": 5.242369838420108, + "grad_norm": 1.1474549770355225, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 73000 + }, + { + "epoch": 5.243087971274686, + "grad_norm": 1.2160102128982544, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 73010 + }, + { + "epoch": 5.243806104129264, + "grad_norm": 1.0755409002304077, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 73020 + }, + { + "epoch": 5.244524236983842, + "grad_norm": 1.0645225048065186, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73030 + }, + { + "epoch": 5.2452423698384205, + "grad_norm": 1.1155469417572021, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 73040 + }, + { + "epoch": 5.2459605026929985, + "grad_norm": 1.1631708145141602, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 73050 + }, + { + "epoch": 5.2466786355475765, + "grad_norm": 0.8747480511665344, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 73060 + }, + { + "epoch": 5.2473967684021545, + "grad_norm": 0.9174497723579407, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 73070 + }, + { + "epoch": 5.2481149012567325, + "grad_norm": 1.334018349647522, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 73080 + }, + { + "epoch": 5.2488330341113105, + "grad_norm": 1.0842393636703491, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73090 + }, + { + "epoch": 5.2495511669658885, + "grad_norm": 1.0531692504882812, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 73100 + }, + { + "epoch": 5.2502692998204665, + "grad_norm": 0.9069980978965759, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 73110 + }, + { + "epoch": 5.2509874326750445, + "grad_norm": 1.1319832801818848, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 73120 + }, + { + "epoch": 5.2517055655296225, + "grad_norm": 1.0468456745147705, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 73130 + }, + { + "epoch": 5.252423698384201, + "grad_norm": 1.1752768754959106, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 73140 + }, + { + "epoch": 5.253141831238779, + "grad_norm": 1.0697909593582153, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 73150 + }, + { + "epoch": 5.253859964093357, + "grad_norm": 1.1179429292678833, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 73160 + }, + { + "epoch": 5.254578096947935, + "grad_norm": 0.9088113903999329, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 73170 + }, + { + "epoch": 5.255296229802513, + "grad_norm": 0.8814208507537842, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 73180 + }, + { + "epoch": 5.256014362657091, + "grad_norm": 1.026688814163208, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 73190 + }, + { + "epoch": 5.256732495511669, + "grad_norm": 0.9974902868270874, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 73200 + }, + { + "epoch": 5.257450628366247, + "grad_norm": 0.948743999004364, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 73210 + }, + { + "epoch": 5.258168761220825, + "grad_norm": 0.9069591164588928, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 73220 + }, + { + "epoch": 5.258886894075404, + "grad_norm": 1.0574030876159668, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 73230 + }, + { + "epoch": 5.259605026929982, + "grad_norm": 0.9299649596214294, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 73240 + }, + { + "epoch": 5.26032315978456, + "grad_norm": 0.9888820648193359, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 73250 + }, + { + "epoch": 5.261041292639138, + "grad_norm": 1.0164920091629028, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 73260 + }, + { + "epoch": 5.261759425493716, + "grad_norm": 0.933210551738739, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 73270 + }, + { + "epoch": 5.262477558348294, + "grad_norm": 1.1754034757614136, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 73280 + }, + { + "epoch": 5.263195691202872, + "grad_norm": 1.1599570512771606, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 73290 + }, + { + "epoch": 5.26391382405745, + "grad_norm": 1.0497905015945435, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 73300 + }, + { + "epoch": 5.264631956912028, + "grad_norm": 1.3603366613388062, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 73310 + }, + { + "epoch": 5.265350089766607, + "grad_norm": 1.0283215045928955, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 73320 + }, + { + "epoch": 5.266068222621185, + "grad_norm": 1.1043906211853027, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 73330 + }, + { + "epoch": 5.266786355475763, + "grad_norm": 0.9386111497879028, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 73340 + }, + { + "epoch": 5.267504488330341, + "grad_norm": 1.3586112260818481, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73350 + }, + { + "epoch": 5.268222621184919, + "grad_norm": 1.034179449081421, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 73360 + }, + { + "epoch": 5.268940754039497, + "grad_norm": 0.9645284414291382, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 73370 + }, + { + "epoch": 5.269658886894075, + "grad_norm": 1.1078046560287476, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 73380 + }, + { + "epoch": 5.270377019748653, + "grad_norm": 0.9737151265144348, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 73390 + }, + { + "epoch": 5.271095152603231, + "grad_norm": 1.1911388635635376, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 73400 + }, + { + "epoch": 5.27181328545781, + "grad_norm": 0.9089180827140808, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73410 + }, + { + "epoch": 5.272531418312388, + "grad_norm": 1.094515085220337, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 73420 + }, + { + "epoch": 5.273249551166966, + "grad_norm": 1.2531700134277344, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 73430 + }, + { + "epoch": 5.273967684021544, + "grad_norm": 0.9279667139053345, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 73440 + }, + { + "epoch": 5.274685816876122, + "grad_norm": 0.9872317314147949, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 73450 + }, + { + "epoch": 5.2754039497307, + "grad_norm": 1.0645262002944946, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 73460 + }, + { + "epoch": 5.276122082585278, + "grad_norm": 0.9505489468574524, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 73470 + }, + { + "epoch": 5.276840215439856, + "grad_norm": 1.0444035530090332, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73480 + }, + { + "epoch": 5.277558348294434, + "grad_norm": 1.1813455820083618, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 73490 + }, + { + "epoch": 5.278276481149012, + "grad_norm": 0.782117486000061, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 73500 + }, + { + "epoch": 5.278994614003591, + "grad_norm": 0.8837172389030457, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 73510 + }, + { + "epoch": 5.279712746858169, + "grad_norm": 0.8320443630218506, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 73520 + }, + { + "epoch": 5.280430879712747, + "grad_norm": 1.111466407775879, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 73530 + }, + { + "epoch": 5.281149012567325, + "grad_norm": 1.0448017120361328, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 73540 + }, + { + "epoch": 5.281867145421903, + "grad_norm": 1.2046639919281006, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 73550 + }, + { + "epoch": 5.282585278276481, + "grad_norm": 1.084886074066162, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73560 + }, + { + "epoch": 5.283303411131059, + "grad_norm": 0.8321937918663025, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 73570 + }, + { + "epoch": 5.284021543985637, + "grad_norm": 1.172440767288208, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 73580 + }, + { + "epoch": 5.284739676840215, + "grad_norm": 0.937133252620697, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73590 + }, + { + "epoch": 5.285457809694794, + "grad_norm": 1.0996583700180054, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 73600 + }, + { + "epoch": 5.286175942549372, + "grad_norm": 1.2459958791732788, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 73610 + }, + { + "epoch": 5.28689407540395, + "grad_norm": 0.8362332582473755, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73620 + }, + { + "epoch": 5.287612208258528, + "grad_norm": 0.9784061312675476, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 73630 + }, + { + "epoch": 5.288330341113106, + "grad_norm": 1.087041974067688, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73640 + }, + { + "epoch": 5.289048473967684, + "grad_norm": 0.8641281723976135, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 73650 + }, + { + "epoch": 5.289766606822262, + "grad_norm": 1.030386209487915, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 73660 + }, + { + "epoch": 5.29048473967684, + "grad_norm": 1.0551509857177734, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 73670 + }, + { + "epoch": 5.291202872531418, + "grad_norm": 0.9969013333320618, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 73680 + }, + { + "epoch": 5.291921005385996, + "grad_norm": 0.9566490054130554, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 73690 + }, + { + "epoch": 5.292639138240575, + "grad_norm": 1.1376742124557495, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 73700 + }, + { + "epoch": 5.293357271095153, + "grad_norm": 1.0127843618392944, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73710 + }, + { + "epoch": 5.294075403949731, + "grad_norm": 0.9500759243965149, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 73720 + }, + { + "epoch": 5.294793536804309, + "grad_norm": 0.9597342610359192, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 73730 + }, + { + "epoch": 5.295511669658887, + "grad_norm": 1.0982595682144165, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 73740 + }, + { + "epoch": 5.296229802513465, + "grad_norm": 0.9007689952850342, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 73750 + }, + { + "epoch": 5.296947935368043, + "grad_norm": 0.9329614639282227, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 73760 + }, + { + "epoch": 5.297666068222621, + "grad_norm": 1.235142469406128, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73770 + }, + { + "epoch": 5.298384201077199, + "grad_norm": 1.0875943899154663, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73780 + }, + { + "epoch": 5.299102333931778, + "grad_norm": 1.0499054193496704, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73790 + }, + { + "epoch": 5.299820466786356, + "grad_norm": 1.117954969406128, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 73800 + }, + { + "epoch": 5.300538599640934, + "grad_norm": 0.800291121006012, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 73810 + }, + { + "epoch": 5.301256732495512, + "grad_norm": 1.1461842060089111, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 73820 + }, + { + "epoch": 5.30197486535009, + "grad_norm": 1.0084760189056396, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 73830 + }, + { + "epoch": 5.302692998204668, + "grad_norm": 1.1249386072158813, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 73840 + }, + { + "epoch": 5.303411131059246, + "grad_norm": 1.0846004486083984, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 73850 + }, + { + "epoch": 5.304129263913824, + "grad_norm": 1.1557925939559937, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 73860 + }, + { + "epoch": 5.304847396768402, + "grad_norm": 1.2287988662719727, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 73870 + }, + { + "epoch": 5.30556552962298, + "grad_norm": 0.9618542194366455, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 73880 + }, + { + "epoch": 5.306283662477559, + "grad_norm": 0.9429472088813782, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 73890 + }, + { + "epoch": 5.307001795332137, + "grad_norm": 0.9032631516456604, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 73900 + }, + { + "epoch": 5.307719928186715, + "grad_norm": 1.0008580684661865, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 73910 + }, + { + "epoch": 5.308438061041293, + "grad_norm": 0.9795624017715454, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 73920 + }, + { + "epoch": 5.309156193895871, + "grad_norm": 1.1194090843200684, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 73930 + }, + { + "epoch": 5.309874326750449, + "grad_norm": 1.1057528257369995, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73940 + }, + { + "epoch": 5.310592459605027, + "grad_norm": 0.7807615995407104, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 73950 + }, + { + "epoch": 5.311310592459605, + "grad_norm": 0.9465593099594116, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 73960 + }, + { + "epoch": 5.312028725314184, + "grad_norm": 1.104210615158081, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 73970 + }, + { + "epoch": 5.312746858168762, + "grad_norm": 1.0452964305877686, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 73980 + }, + { + "epoch": 5.31346499102334, + "grad_norm": 1.0314992666244507, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 73990 + }, + { + "epoch": 5.314183123877918, + "grad_norm": 0.9187130928039551, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 74000 + }, + { + "epoch": 5.314901256732496, + "grad_norm": 0.8660678267478943, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 74010 + }, + { + "epoch": 5.315619389587074, + "grad_norm": 0.9470953345298767, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 74020 + }, + { + "epoch": 5.316337522441652, + "grad_norm": 1.0028631687164307, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 74030 + }, + { + "epoch": 5.31705565529623, + "grad_norm": 1.0237356424331665, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 74040 + }, + { + "epoch": 5.317773788150808, + "grad_norm": 1.0299798250198364, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 74050 + }, + { + "epoch": 5.318491921005386, + "grad_norm": 1.0326799154281616, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 74060 + }, + { + "epoch": 5.3192100538599645, + "grad_norm": 1.156346082687378, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 74070 + }, + { + "epoch": 5.3199281867145425, + "grad_norm": 1.1542664766311646, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 74080 + }, + { + "epoch": 5.3206463195691205, + "grad_norm": 1.0503013134002686, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 74090 + }, + { + "epoch": 5.3213644524236985, + "grad_norm": 1.1088979244232178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 74100 + }, + { + "epoch": 5.3220825852782765, + "grad_norm": 0.9314014911651611, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 74110 + }, + { + "epoch": 5.3228007181328545, + "grad_norm": 1.0813525915145874, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 74120 + }, + { + "epoch": 5.3235188509874325, + "grad_norm": 0.7824062705039978, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 74130 + }, + { + "epoch": 5.3242369838420105, + "grad_norm": 1.0552699565887451, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 74140 + }, + { + "epoch": 5.3249551166965885, + "grad_norm": 1.0916554927825928, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74150 + }, + { + "epoch": 5.325673249551167, + "grad_norm": 1.205618143081665, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 74160 + }, + { + "epoch": 5.326391382405745, + "grad_norm": 1.2551230192184448, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 74170 + }, + { + "epoch": 5.327109515260323, + "grad_norm": 0.7715005278587341, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 74180 + }, + { + "epoch": 5.327827648114901, + "grad_norm": 1.1059352159500122, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 74190 + }, + { + "epoch": 5.328545780969479, + "grad_norm": 0.9441812634468079, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 74200 + }, + { + "epoch": 5.329263913824057, + "grad_norm": 1.0012084245681763, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 74210 + }, + { + "epoch": 5.329982046678635, + "grad_norm": 0.8594073057174683, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 74220 + }, + { + "epoch": 5.330700179533213, + "grad_norm": 0.8931775093078613, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 74230 + }, + { + "epoch": 5.331418312387791, + "grad_norm": 0.967250406742096, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 74240 + }, + { + "epoch": 5.332136445242369, + "grad_norm": 0.9776269793510437, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74250 + }, + { + "epoch": 5.332854578096948, + "grad_norm": 0.9393186569213867, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 74260 + }, + { + "epoch": 5.333572710951526, + "grad_norm": 1.0081093311309814, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 74270 + }, + { + "epoch": 5.334290843806104, + "grad_norm": 0.9002147316932678, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 74280 + }, + { + "epoch": 5.335008976660682, + "grad_norm": 0.9237701296806335, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 74290 + }, + { + "epoch": 5.33572710951526, + "grad_norm": 1.070694923400879, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 74300 + }, + { + "epoch": 5.336445242369838, + "grad_norm": 1.0134668350219727, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 74310 + }, + { + "epoch": 5.337163375224416, + "grad_norm": 1.0903294086456299, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 74320 + }, + { + "epoch": 5.337881508078994, + "grad_norm": 0.9000239372253418, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 74330 + }, + { + "epoch": 5.338599640933572, + "grad_norm": 1.0584321022033691, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 74340 + }, + { + "epoch": 5.339317773788151, + "grad_norm": 1.046420931816101, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 74350 + }, + { + "epoch": 5.340035906642729, + "grad_norm": 0.8862320184707642, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 74360 + }, + { + "epoch": 5.340754039497307, + "grad_norm": 0.8197309970855713, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 74370 + }, + { + "epoch": 5.341472172351885, + "grad_norm": 0.9539661407470703, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 74380 + }, + { + "epoch": 5.342190305206463, + "grad_norm": 1.481026530265808, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 74390 + }, + { + "epoch": 5.342908438061041, + "grad_norm": 1.0685169696807861, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 74400 + }, + { + "epoch": 5.343626570915619, + "grad_norm": 1.1468359231948853, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 74410 + }, + { + "epoch": 5.344344703770197, + "grad_norm": 0.9982373714447021, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 74420 + }, + { + "epoch": 5.345062836624775, + "grad_norm": 0.9273471236228943, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 74430 + }, + { + "epoch": 5.345780969479353, + "grad_norm": 1.058828592300415, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 74440 + }, + { + "epoch": 5.346499102333932, + "grad_norm": 1.0442006587982178, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 74450 + }, + { + "epoch": 5.34721723518851, + "grad_norm": 1.0955053567886353, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 74460 + }, + { + "epoch": 5.347935368043088, + "grad_norm": 0.9326002597808838, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74470 + }, + { + "epoch": 5.348653500897666, + "grad_norm": 0.9496979117393494, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 74480 + }, + { + "epoch": 5.349371633752244, + "grad_norm": 1.1995937824249268, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74490 + }, + { + "epoch": 5.350089766606822, + "grad_norm": 0.8761899471282959, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 74500 + }, + { + "epoch": 5.3508078994614, + "grad_norm": 1.2390170097351074, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 74510 + }, + { + "epoch": 5.351526032315978, + "grad_norm": 0.9101138114929199, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 74520 + }, + { + "epoch": 5.352244165170557, + "grad_norm": 0.925466001033783, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 74530 + }, + { + "epoch": 5.352962298025135, + "grad_norm": 0.9483969807624817, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74540 + }, + { + "epoch": 5.353680430879713, + "grad_norm": 1.0530859231948853, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 74550 + }, + { + "epoch": 5.354398563734291, + "grad_norm": 1.209647536277771, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 74560 + }, + { + "epoch": 5.355116696588869, + "grad_norm": 0.9849331378936768, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 74570 + }, + { + "epoch": 5.355834829443447, + "grad_norm": 1.0822848081588745, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 74580 + }, + { + "epoch": 5.356552962298025, + "grad_norm": 1.1460528373718262, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 74590 + }, + { + "epoch": 5.357271095152603, + "grad_norm": 0.9509134292602539, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 74600 + }, + { + "epoch": 5.357989228007181, + "grad_norm": 0.9884999394416809, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 74610 + }, + { + "epoch": 5.358707360861759, + "grad_norm": 0.9619579911231995, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 74620 + }, + { + "epoch": 5.359425493716338, + "grad_norm": 0.8596125245094299, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 74630 + }, + { + "epoch": 5.360143626570916, + "grad_norm": 1.16913640499115, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 74640 + }, + { + "epoch": 5.360861759425494, + "grad_norm": 0.99276202917099, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 74650 + }, + { + "epoch": 5.361579892280072, + "grad_norm": 1.1293696165084839, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74660 + }, + { + "epoch": 5.36229802513465, + "grad_norm": 1.187947154045105, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 74670 + }, + { + "epoch": 5.363016157989228, + "grad_norm": 0.8637247681617737, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 74680 + }, + { + "epoch": 5.363734290843806, + "grad_norm": 1.1049476861953735, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 74690 + }, + { + "epoch": 5.364452423698384, + "grad_norm": 1.1736515760421753, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 74700 + }, + { + "epoch": 5.365170556552962, + "grad_norm": 1.0203301906585693, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 74710 + }, + { + "epoch": 5.365888689407541, + "grad_norm": 1.15559720993042, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 74720 + }, + { + "epoch": 5.366606822262119, + "grad_norm": 1.2008144855499268, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74730 + }, + { + "epoch": 5.367324955116697, + "grad_norm": 1.0385756492614746, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 74740 + }, + { + "epoch": 5.368043087971275, + "grad_norm": 0.8964240550994873, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 74750 + }, + { + "epoch": 5.368761220825853, + "grad_norm": 0.9824761748313904, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 74760 + }, + { + "epoch": 5.369479353680431, + "grad_norm": 0.8815994262695312, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74770 + }, + { + "epoch": 5.370197486535009, + "grad_norm": 0.9729493856430054, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 74780 + }, + { + "epoch": 5.370915619389587, + "grad_norm": 1.1032123565673828, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 74790 + }, + { + "epoch": 5.371633752244165, + "grad_norm": 1.039591908454895, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 74800 + }, + { + "epoch": 5.372351885098743, + "grad_norm": 0.9741610884666443, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 74810 + }, + { + "epoch": 5.373070017953322, + "grad_norm": 0.9789814949035645, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 74820 + }, + { + "epoch": 5.3737881508079, + "grad_norm": 1.0777033567428589, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 74830 + }, + { + "epoch": 5.374506283662478, + "grad_norm": 0.9058641195297241, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 74840 + }, + { + "epoch": 5.375224416517056, + "grad_norm": 1.2161815166473389, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 74850 + }, + { + "epoch": 5.375942549371634, + "grad_norm": 1.1079481840133667, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 74860 + }, + { + "epoch": 5.376660682226212, + "grad_norm": 0.9494470357894897, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 74870 + }, + { + "epoch": 5.37737881508079, + "grad_norm": 1.0116358995437622, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 74880 + }, + { + "epoch": 5.378096947935368, + "grad_norm": 0.9382423162460327, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 74890 + }, + { + "epoch": 5.378815080789946, + "grad_norm": 1.036151647567749, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74900 + }, + { + "epoch": 5.379533213644525, + "grad_norm": 0.9436623454093933, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74910 + }, + { + "epoch": 5.380251346499103, + "grad_norm": 1.0149152278900146, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 74920 + }, + { + "epoch": 5.380969479353681, + "grad_norm": 1.1645641326904297, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 74930 + }, + { + "epoch": 5.381687612208259, + "grad_norm": 1.002287745475769, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 74940 + }, + { + "epoch": 5.382405745062837, + "grad_norm": 1.1176437139511108, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 74950 + }, + { + "epoch": 5.383123877917415, + "grad_norm": 0.9210802912712097, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 74960 + }, + { + "epoch": 5.383842010771993, + "grad_norm": 1.1873447895050049, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 74970 + }, + { + "epoch": 5.384560143626571, + "grad_norm": 0.8372976779937744, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 74980 + }, + { + "epoch": 5.385278276481149, + "grad_norm": 0.9220532178878784, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 74990 + }, + { + "epoch": 5.385996409335727, + "grad_norm": 0.9196901917457581, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 75000 + }, + { + "epoch": 5.3867145421903055, + "grad_norm": 0.9325235486030579, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 75010 + }, + { + "epoch": 5.3874326750448835, + "grad_norm": 1.0902531147003174, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 75020 + }, + { + "epoch": 5.3881508078994615, + "grad_norm": 1.049468755722046, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 75030 + }, + { + "epoch": 5.3888689407540395, + "grad_norm": 0.9372574687004089, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 75040 + }, + { + "epoch": 5.3895870736086176, + "grad_norm": 0.9013437628746033, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 75050 + }, + { + "epoch": 5.3903052064631956, + "grad_norm": 1.2111071348190308, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 75060 + }, + { + "epoch": 5.3910233393177736, + "grad_norm": 1.0006011724472046, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 75070 + }, + { + "epoch": 5.391741472172352, + "grad_norm": 0.9180546402931213, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 75080 + }, + { + "epoch": 5.3924596050269304, + "grad_norm": 1.096113920211792, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 75090 + }, + { + "epoch": 5.3931777378815084, + "grad_norm": 0.9041603207588196, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 75100 + }, + { + "epoch": 5.3938958707360865, + "grad_norm": 0.9675783514976501, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 75110 + }, + { + "epoch": 5.3946140035906645, + "grad_norm": 1.0952513217926025, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 75120 + }, + { + "epoch": 5.3953321364452425, + "grad_norm": 1.0166294574737549, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75130 + }, + { + "epoch": 5.3960502692998205, + "grad_norm": 1.0892874002456665, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 75140 + }, + { + "epoch": 5.3967684021543985, + "grad_norm": 0.9894046187400818, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75150 + }, + { + "epoch": 5.3974865350089765, + "grad_norm": 0.9991754293441772, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 75160 + }, + { + "epoch": 5.3982046678635545, + "grad_norm": 1.1027519702911377, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 75170 + }, + { + "epoch": 5.3989228007181325, + "grad_norm": 1.0579880475997925, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 75180 + }, + { + "epoch": 5.399640933572711, + "grad_norm": 1.1149101257324219, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 75190 + }, + { + "epoch": 5.400359066427289, + "grad_norm": 0.8802945017814636, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 75200 + }, + { + "epoch": 5.401077199281867, + "grad_norm": 0.9168137907981873, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 75210 + }, + { + "epoch": 5.401795332136445, + "grad_norm": 1.232630968093872, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 75220 + }, + { + "epoch": 5.402513464991023, + "grad_norm": 1.1038591861724854, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 75230 + }, + { + "epoch": 5.403231597845601, + "grad_norm": 0.8985993266105652, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 75240 + }, + { + "epoch": 5.403949730700179, + "grad_norm": 1.1096316576004028, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 75250 + }, + { + "epoch": 5.404667863554757, + "grad_norm": 0.8516051173210144, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 75260 + }, + { + "epoch": 5.405385996409335, + "grad_norm": 0.9967356324195862, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 75270 + }, + { + "epoch": 5.406104129263914, + "grad_norm": 1.0092874765396118, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 75280 + }, + { + "epoch": 5.406822262118492, + "grad_norm": 1.049838662147522, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 75290 + }, + { + "epoch": 5.40754039497307, + "grad_norm": 1.1491070985794067, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 75300 + }, + { + "epoch": 5.408258527827648, + "grad_norm": 0.9348118901252747, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 75310 + }, + { + "epoch": 5.408976660682226, + "grad_norm": 1.1226147413253784, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 75320 + }, + { + "epoch": 5.409694793536804, + "grad_norm": 0.9042587876319885, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 75330 + }, + { + "epoch": 5.410412926391382, + "grad_norm": 1.1212877035140991, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 75340 + }, + { + "epoch": 5.41113105924596, + "grad_norm": 0.9805570840835571, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 75350 + }, + { + "epoch": 5.411849192100538, + "grad_norm": 0.9803917407989502, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 75360 + }, + { + "epoch": 5.412567324955116, + "grad_norm": 1.2139064073562622, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75370 + }, + { + "epoch": 5.413285457809695, + "grad_norm": 0.9510865211486816, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 75380 + }, + { + "epoch": 5.414003590664273, + "grad_norm": 1.0752202272415161, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 75390 + }, + { + "epoch": 5.414721723518851, + "grad_norm": 1.1144053936004639, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 75400 + }, + { + "epoch": 5.415439856373429, + "grad_norm": 1.128998875617981, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 75410 + }, + { + "epoch": 5.416157989228007, + "grad_norm": 1.2901849746704102, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 75420 + }, + { + "epoch": 5.416876122082585, + "grad_norm": 1.2822786569595337, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 75430 + }, + { + "epoch": 5.417594254937163, + "grad_norm": 0.8724783658981323, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 75440 + }, + { + "epoch": 5.418312387791741, + "grad_norm": 1.1321152448654175, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 75450 + }, + { + "epoch": 5.419030520646319, + "grad_norm": 1.1211779117584229, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 75460 + }, + { + "epoch": 5.419748653500898, + "grad_norm": 1.0542290210723877, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 75470 + }, + { + "epoch": 5.420466786355476, + "grad_norm": 0.9432206153869629, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 75480 + }, + { + "epoch": 5.421184919210054, + "grad_norm": 1.2051608562469482, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 75490 + }, + { + "epoch": 5.421903052064632, + "grad_norm": 1.188256859779358, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 75500 + }, + { + "epoch": 5.42262118491921, + "grad_norm": 1.2768784761428833, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 75510 + }, + { + "epoch": 5.423339317773788, + "grad_norm": 0.8228567242622375, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75520 + }, + { + "epoch": 5.424057450628366, + "grad_norm": 1.235684871673584, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 75530 + }, + { + "epoch": 5.424775583482944, + "grad_norm": 0.8361109495162964, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75540 + }, + { + "epoch": 5.425493716337522, + "grad_norm": 1.0450727939605713, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 75550 + }, + { + "epoch": 5.4262118491921, + "grad_norm": 0.9942979216575623, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 75560 + }, + { + "epoch": 5.426929982046679, + "grad_norm": 0.8162592053413391, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 75570 + }, + { + "epoch": 5.427648114901257, + "grad_norm": 0.9193033576011658, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 75580 + }, + { + "epoch": 5.428366247755835, + "grad_norm": 1.095130443572998, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75590 + }, + { + "epoch": 5.429084380610413, + "grad_norm": 1.1752824783325195, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75600 + }, + { + "epoch": 5.429802513464991, + "grad_norm": 1.2007960081100464, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75610 + }, + { + "epoch": 5.430520646319569, + "grad_norm": 0.997347354888916, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 75620 + }, + { + "epoch": 5.431238779174147, + "grad_norm": 1.3878827095031738, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 75630 + }, + { + "epoch": 5.431956912028725, + "grad_norm": 1.1839812994003296, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 75640 + }, + { + "epoch": 5.432675044883303, + "grad_norm": 0.9912546873092651, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 75650 + }, + { + "epoch": 5.433393177737882, + "grad_norm": 0.9305517673492432, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 75660 + }, + { + "epoch": 5.43411131059246, + "grad_norm": 1.0036604404449463, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 75670 + }, + { + "epoch": 5.434829443447038, + "grad_norm": 1.2500226497650146, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 75680 + }, + { + "epoch": 5.435547576301616, + "grad_norm": 0.9476167559623718, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75690 + }, + { + "epoch": 5.436265709156194, + "grad_norm": 0.9769760370254517, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 75700 + }, + { + "epoch": 5.436983842010772, + "grad_norm": 1.1001025438308716, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 75710 + }, + { + "epoch": 5.43770197486535, + "grad_norm": 1.1783069372177124, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 75720 + }, + { + "epoch": 5.438420107719928, + "grad_norm": 0.887438952922821, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75730 + }, + { + "epoch": 5.439138240574506, + "grad_norm": 0.9631154537200928, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 75740 + }, + { + "epoch": 5.439856373429085, + "grad_norm": 1.0824158191680908, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75750 + }, + { + "epoch": 5.440574506283663, + "grad_norm": 1.0108296871185303, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 75760 + }, + { + "epoch": 5.441292639138241, + "grad_norm": 1.1728253364562988, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75770 + }, + { + "epoch": 5.442010771992819, + "grad_norm": 1.0904773473739624, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 75780 + }, + { + "epoch": 5.442728904847397, + "grad_norm": 0.8982957601547241, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 75790 + }, + { + "epoch": 5.443447037701975, + "grad_norm": 1.0233404636383057, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 75800 + }, + { + "epoch": 5.444165170556553, + "grad_norm": 1.0092064142227173, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 75810 + }, + { + "epoch": 5.444883303411131, + "grad_norm": 1.2747842073440552, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 75820 + }, + { + "epoch": 5.445601436265709, + "grad_norm": 1.0365403890609741, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 75830 + }, + { + "epoch": 5.446319569120288, + "grad_norm": 1.0413976907730103, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 75840 + }, + { + "epoch": 5.447037701974866, + "grad_norm": 0.8858456015586853, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 75850 + }, + { + "epoch": 5.447755834829444, + "grad_norm": 0.9823445677757263, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 75860 + }, + { + "epoch": 5.448473967684022, + "grad_norm": 0.8515284061431885, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 75870 + }, + { + "epoch": 5.4491921005386, + "grad_norm": 1.130850911140442, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 75880 + }, + { + "epoch": 5.449910233393178, + "grad_norm": 0.984725832939148, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 75890 + }, + { + "epoch": 5.450628366247756, + "grad_norm": 1.1701595783233643, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 75900 + }, + { + "epoch": 5.451346499102334, + "grad_norm": 0.8988107442855835, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 75910 + }, + { + "epoch": 5.452064631956912, + "grad_norm": 0.9909947514533997, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 75920 + }, + { + "epoch": 5.45278276481149, + "grad_norm": 0.8861672282218933, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 75930 + }, + { + "epoch": 5.453500897666069, + "grad_norm": 0.9513981938362122, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 75940 + }, + { + "epoch": 5.454219030520647, + "grad_norm": 1.0320760011672974, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75950 + }, + { + "epoch": 5.454937163375225, + "grad_norm": 0.9830206632614136, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 75960 + }, + { + "epoch": 5.455655296229803, + "grad_norm": 0.9816349148750305, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 75970 + }, + { + "epoch": 5.456373429084381, + "grad_norm": 0.9741218090057373, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 75980 + }, + { + "epoch": 5.457091561938959, + "grad_norm": 1.1291148662567139, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 75990 + }, + { + "epoch": 5.457809694793537, + "grad_norm": 0.9770109057426453, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 76000 + }, + { + "epoch": 5.458527827648115, + "grad_norm": 1.0204377174377441, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76010 + }, + { + "epoch": 5.459245960502693, + "grad_norm": 1.0453336238861084, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76020 + }, + { + "epoch": 5.4599640933572715, + "grad_norm": 1.1595505475997925, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 76030 + }, + { + "epoch": 5.4606822262118495, + "grad_norm": 1.1686701774597168, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 76040 + }, + { + "epoch": 5.4614003590664275, + "grad_norm": 1.14364755153656, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 76050 + }, + { + "epoch": 5.4621184919210055, + "grad_norm": 0.9742125868797302, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 76060 + }, + { + "epoch": 5.4628366247755835, + "grad_norm": 0.8235608339309692, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 76070 + }, + { + "epoch": 5.4635547576301615, + "grad_norm": 0.9801425337791443, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 76080 + }, + { + "epoch": 5.4642728904847395, + "grad_norm": 0.9001221060752869, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 76090 + }, + { + "epoch": 5.4649910233393175, + "grad_norm": 0.9292157888412476, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 76100 + }, + { + "epoch": 5.4657091561938955, + "grad_norm": 1.0024322271347046, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76110 + }, + { + "epoch": 5.4664272890484735, + "grad_norm": 0.8057159781455994, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 76120 + }, + { + "epoch": 5.467145421903052, + "grad_norm": 1.0617927312850952, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76130 + }, + { + "epoch": 5.46786355475763, + "grad_norm": 1.003967046737671, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 76140 + }, + { + "epoch": 5.468581687612208, + "grad_norm": 0.903408944606781, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76150 + }, + { + "epoch": 5.469299820466786, + "grad_norm": 0.8173895478248596, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 76160 + }, + { + "epoch": 5.470017953321364, + "grad_norm": 1.0187482833862305, + "learning_rate": 0.0002, + "loss": 0.5526, + "step": 76170 + }, + { + "epoch": 5.470736086175942, + "grad_norm": 1.0418041944503784, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 76180 + }, + { + "epoch": 5.47145421903052, + "grad_norm": 0.9768357872962952, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 76190 + }, + { + "epoch": 5.472172351885098, + "grad_norm": 1.0834382772445679, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 76200 + }, + { + "epoch": 5.472890484739676, + "grad_norm": 0.8447439670562744, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76210 + }, + { + "epoch": 5.473608617594255, + "grad_norm": 0.9379050135612488, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76220 + }, + { + "epoch": 5.474326750448833, + "grad_norm": 1.0395485162734985, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 76230 + }, + { + "epoch": 5.475044883303411, + "grad_norm": 1.2082624435424805, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 76240 + }, + { + "epoch": 5.475763016157989, + "grad_norm": 1.0714443922042847, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 76250 + }, + { + "epoch": 5.476481149012567, + "grad_norm": 0.945319414138794, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 76260 + }, + { + "epoch": 5.477199281867145, + "grad_norm": 1.1415241956710815, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 76270 + }, + { + "epoch": 5.477917414721723, + "grad_norm": 0.9221673011779785, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 76280 + }, + { + "epoch": 5.478635547576301, + "grad_norm": 1.0118398666381836, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 76290 + }, + { + "epoch": 5.479353680430879, + "grad_norm": 1.396807312965393, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 76300 + }, + { + "epoch": 5.480071813285457, + "grad_norm": 1.0437991619110107, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 76310 + }, + { + "epoch": 5.480789946140036, + "grad_norm": 1.5910401344299316, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 76320 + }, + { + "epoch": 5.481508078994614, + "grad_norm": 0.9262010455131531, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 76330 + }, + { + "epoch": 5.482226211849192, + "grad_norm": 1.2534247636795044, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 76340 + }, + { + "epoch": 5.48294434470377, + "grad_norm": 1.186294674873352, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 76350 + }, + { + "epoch": 5.483662477558348, + "grad_norm": 0.9822857975959778, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 76360 + }, + { + "epoch": 5.484380610412926, + "grad_norm": 1.0006381273269653, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76370 + }, + { + "epoch": 5.485098743267504, + "grad_norm": 0.8960304260253906, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 76380 + }, + { + "epoch": 5.485816876122082, + "grad_norm": 0.7309539914131165, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 76390 + }, + { + "epoch": 5.486535008976661, + "grad_norm": 0.9747139811515808, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 76400 + }, + { + "epoch": 5.487253141831239, + "grad_norm": 0.9586864113807678, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76410 + }, + { + "epoch": 5.487971274685817, + "grad_norm": 1.0815327167510986, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 76420 + }, + { + "epoch": 5.488689407540395, + "grad_norm": 1.1324117183685303, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 76430 + }, + { + "epoch": 5.489407540394973, + "grad_norm": 0.8575648069381714, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 76440 + }, + { + "epoch": 5.490125673249551, + "grad_norm": 0.9821682572364807, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 76450 + }, + { + "epoch": 5.490843806104129, + "grad_norm": 1.1611464023590088, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 76460 + }, + { + "epoch": 5.491561938958707, + "grad_norm": 1.0340297222137451, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 76470 + }, + { + "epoch": 5.492280071813285, + "grad_norm": 1.0116628408432007, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 76480 + }, + { + "epoch": 5.492998204667863, + "grad_norm": 0.9619752764701843, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 76490 + }, + { + "epoch": 5.493716337522442, + "grad_norm": 0.9924456477165222, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76500 + }, + { + "epoch": 5.49443447037702, + "grad_norm": 0.9449224472045898, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 76510 + }, + { + "epoch": 5.495152603231598, + "grad_norm": 0.9075009822845459, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 76520 + }, + { + "epoch": 5.495870736086176, + "grad_norm": 1.3078763484954834, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 76530 + }, + { + "epoch": 5.496588868940754, + "grad_norm": 1.3162729740142822, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 76540 + }, + { + "epoch": 5.497307001795332, + "grad_norm": 1.144333839416504, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 76550 + }, + { + "epoch": 5.49802513464991, + "grad_norm": 0.9332208633422852, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 76560 + }, + { + "epoch": 5.498743267504488, + "grad_norm": 0.9660165309906006, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76570 + }, + { + "epoch": 5.499461400359066, + "grad_norm": 1.0954749584197998, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 76580 + }, + { + "epoch": 5.500179533213645, + "grad_norm": 1.0537810325622559, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 76590 + }, + { + "epoch": 5.500897666068223, + "grad_norm": 0.9944321513175964, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 76600 + }, + { + "epoch": 5.501615798922801, + "grad_norm": 1.094462513923645, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 76610 + }, + { + "epoch": 5.502333931777379, + "grad_norm": 1.0246481895446777, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 76620 + }, + { + "epoch": 5.503052064631957, + "grad_norm": 0.9705453515052795, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 76630 + }, + { + "epoch": 5.503770197486535, + "grad_norm": 1.5252249240875244, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 76640 + }, + { + "epoch": 5.504488330341113, + "grad_norm": 0.8469606637954712, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 76650 + }, + { + "epoch": 5.505206463195691, + "grad_norm": 1.1882504224777222, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 76660 + }, + { + "epoch": 5.505924596050269, + "grad_norm": 0.8447994589805603, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 76670 + }, + { + "epoch": 5.506642728904847, + "grad_norm": 0.9340696930885315, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 76680 + }, + { + "epoch": 5.507360861759426, + "grad_norm": 0.9622383713722229, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 76690 + }, + { + "epoch": 5.508078994614004, + "grad_norm": 1.1516523361206055, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 76700 + }, + { + "epoch": 5.508797127468582, + "grad_norm": 1.207190990447998, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 76710 + }, + { + "epoch": 5.50951526032316, + "grad_norm": 1.1244179010391235, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 76720 + }, + { + "epoch": 5.510233393177738, + "grad_norm": 1.052288293838501, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 76730 + }, + { + "epoch": 5.510951526032316, + "grad_norm": 0.9571291208267212, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 76740 + }, + { + "epoch": 5.511669658886894, + "grad_norm": 0.9449458122253418, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 76750 + }, + { + "epoch": 5.512387791741472, + "grad_norm": 1.0140511989593506, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 76760 + }, + { + "epoch": 5.513105924596051, + "grad_norm": 1.057715654373169, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 76770 + }, + { + "epoch": 5.513824057450629, + "grad_norm": 0.930642306804657, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 76780 + }, + { + "epoch": 5.514542190305207, + "grad_norm": 1.1213828325271606, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76790 + }, + { + "epoch": 5.515260323159785, + "grad_norm": 0.9147387742996216, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 76800 + }, + { + "epoch": 5.515978456014363, + "grad_norm": 1.1786983013153076, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 76810 + }, + { + "epoch": 5.516696588868941, + "grad_norm": 1.1022626161575317, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 76820 + }, + { + "epoch": 5.517414721723519, + "grad_norm": 1.0389000177383423, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76830 + }, + { + "epoch": 5.518132854578097, + "grad_norm": 1.0750621557235718, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 76840 + }, + { + "epoch": 5.518850987432675, + "grad_norm": 1.0372626781463623, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 76850 + }, + { + "epoch": 5.519569120287253, + "grad_norm": 1.0989108085632324, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 76860 + }, + { + "epoch": 5.520287253141831, + "grad_norm": 1.030346155166626, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 76870 + }, + { + "epoch": 5.52100538599641, + "grad_norm": 1.1362419128417969, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 76880 + }, + { + "epoch": 5.521723518850988, + "grad_norm": 0.9110873937606812, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 76890 + }, + { + "epoch": 5.522441651705566, + "grad_norm": 1.0214358568191528, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 76900 + }, + { + "epoch": 5.523159784560144, + "grad_norm": 1.3764830827713013, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 76910 + }, + { + "epoch": 5.523877917414722, + "grad_norm": 1.0396335124969482, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 76920 + }, + { + "epoch": 5.5245960502693, + "grad_norm": 1.1942898035049438, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 76930 + }, + { + "epoch": 5.525314183123878, + "grad_norm": 0.8795760869979858, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 76940 + }, + { + "epoch": 5.526032315978456, + "grad_norm": 1.1081048250198364, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 76950 + }, + { + "epoch": 5.526750448833035, + "grad_norm": 0.9652274250984192, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 76960 + }, + { + "epoch": 5.527468581687613, + "grad_norm": 0.96559739112854, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 76970 + }, + { + "epoch": 5.528186714542191, + "grad_norm": 1.0416076183319092, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76980 + }, + { + "epoch": 5.528904847396769, + "grad_norm": 0.9854229092597961, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 76990 + }, + { + "epoch": 5.529622980251347, + "grad_norm": 1.0515462160110474, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 77000 + }, + { + "epoch": 5.530341113105925, + "grad_norm": 1.0287327766418457, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 77010 + }, + { + "epoch": 5.531059245960503, + "grad_norm": 0.9579883217811584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 77020 + }, + { + "epoch": 5.531777378815081, + "grad_norm": 1.0365805625915527, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 77030 + }, + { + "epoch": 5.532495511669659, + "grad_norm": 1.1600725650787354, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 77040 + }, + { + "epoch": 5.533213644524237, + "grad_norm": 0.8598031401634216, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 77050 + }, + { + "epoch": 5.533931777378815, + "grad_norm": 0.8884791731834412, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 77060 + }, + { + "epoch": 5.5346499102333935, + "grad_norm": 0.900223433971405, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 77070 + }, + { + "epoch": 5.5353680430879715, + "grad_norm": 1.0212652683258057, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 77080 + }, + { + "epoch": 5.5360861759425495, + "grad_norm": 1.0924701690673828, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 77090 + }, + { + "epoch": 5.5368043087971275, + "grad_norm": 1.1955485343933105, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 77100 + }, + { + "epoch": 5.5375224416517055, + "grad_norm": 1.2157706022262573, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 77110 + }, + { + "epoch": 5.5382405745062835, + "grad_norm": 1.1118255853652954, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 77120 + }, + { + "epoch": 5.5389587073608615, + "grad_norm": 1.0146820545196533, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 77130 + }, + { + "epoch": 5.5396768402154395, + "grad_norm": 1.0876632928848267, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 77140 + }, + { + "epoch": 5.540394973070018, + "grad_norm": 0.7914495468139648, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77150 + }, + { + "epoch": 5.541113105924596, + "grad_norm": 1.0584027767181396, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 77160 + }, + { + "epoch": 5.541831238779174, + "grad_norm": 0.9816845059394836, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 77170 + }, + { + "epoch": 5.542549371633752, + "grad_norm": 1.219076156616211, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 77180 + }, + { + "epoch": 5.54326750448833, + "grad_norm": 0.9526635408401489, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 77190 + }, + { + "epoch": 5.543985637342908, + "grad_norm": 0.8437230587005615, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 77200 + }, + { + "epoch": 5.544703770197486, + "grad_norm": 0.9670451283454895, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 77210 + }, + { + "epoch": 5.545421903052064, + "grad_norm": 1.015687346458435, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77220 + }, + { + "epoch": 5.546140035906642, + "grad_norm": 0.8280553817749023, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 77230 + }, + { + "epoch": 5.54685816876122, + "grad_norm": 1.1320816278457642, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 77240 + }, + { + "epoch": 5.547576301615799, + "grad_norm": 1.3338711261749268, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 77250 + }, + { + "epoch": 5.548294434470377, + "grad_norm": 0.9553194642066956, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 77260 + }, + { + "epoch": 5.549012567324955, + "grad_norm": 1.0604912042617798, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 77270 + }, + { + "epoch": 5.549730700179533, + "grad_norm": 1.1037590503692627, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 77280 + }, + { + "epoch": 5.550448833034111, + "grad_norm": 1.166212558746338, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 77290 + }, + { + "epoch": 5.551166965888689, + "grad_norm": 1.0189802646636963, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 77300 + }, + { + "epoch": 5.551885098743267, + "grad_norm": 0.9592387080192566, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 77310 + }, + { + "epoch": 5.552603231597845, + "grad_norm": 0.9533785581588745, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 77320 + }, + { + "epoch": 5.553321364452424, + "grad_norm": 0.9666807055473328, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 77330 + }, + { + "epoch": 5.554039497307002, + "grad_norm": 0.8827478289604187, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 77340 + }, + { + "epoch": 5.55475763016158, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 77350 + }, + { + "epoch": 5.555475763016158, + "grad_norm": 1.14597487449646, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 77360 + }, + { + "epoch": 5.556193895870736, + "grad_norm": 1.009392499923706, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77370 + }, + { + "epoch": 5.556912028725314, + "grad_norm": 1.115757942199707, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 77380 + }, + { + "epoch": 5.557630161579892, + "grad_norm": 0.9907452464103699, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 77390 + }, + { + "epoch": 5.55834829443447, + "grad_norm": 1.0667012929916382, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 77400 + }, + { + "epoch": 5.559066427289048, + "grad_norm": 0.9301251173019409, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 77410 + }, + { + "epoch": 5.559784560143626, + "grad_norm": 1.090384602546692, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 77420 + }, + { + "epoch": 5.560502692998204, + "grad_norm": 0.8073469996452332, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 77430 + }, + { + "epoch": 5.561220825852783, + "grad_norm": 1.1003652811050415, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 77440 + }, + { + "epoch": 5.561938958707361, + "grad_norm": 0.9493791460990906, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 77450 + }, + { + "epoch": 5.562657091561939, + "grad_norm": 0.925388514995575, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 77460 + }, + { + "epoch": 5.563375224416517, + "grad_norm": 1.0946427583694458, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 77470 + }, + { + "epoch": 5.564093357271095, + "grad_norm": 0.9791404008865356, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 77480 + }, + { + "epoch": 5.564811490125673, + "grad_norm": 1.0534733533859253, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 77490 + }, + { + "epoch": 5.565529622980251, + "grad_norm": 0.9351776242256165, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 77500 + }, + { + "epoch": 5.566247755834829, + "grad_norm": 1.004448413848877, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 77510 + }, + { + "epoch": 5.566965888689408, + "grad_norm": 1.0199403762817383, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 77520 + }, + { + "epoch": 5.567684021543986, + "grad_norm": 1.0693204402923584, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 77530 + }, + { + "epoch": 5.568402154398564, + "grad_norm": 1.0635178089141846, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 77540 + }, + { + "epoch": 5.569120287253142, + "grad_norm": 1.1154648065567017, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 77550 + }, + { + "epoch": 5.56983842010772, + "grad_norm": 0.999116837978363, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 77560 + }, + { + "epoch": 5.570556552962298, + "grad_norm": 0.9967397451400757, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 77570 + }, + { + "epoch": 5.571274685816876, + "grad_norm": 0.9684699773788452, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 77580 + }, + { + "epoch": 5.571992818671454, + "grad_norm": 1.027213454246521, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 77590 + }, + { + "epoch": 5.572710951526032, + "grad_norm": 1.0571194887161255, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 77600 + }, + { + "epoch": 5.57342908438061, + "grad_norm": 1.2010499238967896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 77610 + }, + { + "epoch": 5.574147217235188, + "grad_norm": 1.1033680438995361, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 77620 + }, + { + "epoch": 5.574865350089767, + "grad_norm": 0.9394578337669373, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 77630 + }, + { + "epoch": 5.575583482944345, + "grad_norm": 1.379382610321045, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 77640 + }, + { + "epoch": 5.576301615798923, + "grad_norm": 0.9787197709083557, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 77650 + }, + { + "epoch": 5.577019748653501, + "grad_norm": 0.9680284261703491, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 77660 + }, + { + "epoch": 5.577737881508079, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 77670 + }, + { + "epoch": 5.578456014362657, + "grad_norm": 1.1243085861206055, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 77680 + }, + { + "epoch": 5.579174147217235, + "grad_norm": 0.9228966236114502, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 77690 + }, + { + "epoch": 5.579892280071813, + "grad_norm": 1.1349890232086182, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 77700 + }, + { + "epoch": 5.580610412926392, + "grad_norm": 1.2248499393463135, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 77710 + }, + { + "epoch": 5.58132854578097, + "grad_norm": 1.0066324472427368, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 77720 + }, + { + "epoch": 5.582046678635548, + "grad_norm": 1.2642878293991089, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 77730 + }, + { + "epoch": 5.582764811490126, + "grad_norm": 1.031591534614563, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 77740 + }, + { + "epoch": 5.583482944344704, + "grad_norm": 1.0925929546356201, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 77750 + }, + { + "epoch": 5.584201077199282, + "grad_norm": 1.0567110776901245, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 77760 + }, + { + "epoch": 5.58491921005386, + "grad_norm": 1.246246099472046, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 77770 + }, + { + "epoch": 5.585637342908438, + "grad_norm": 1.2467739582061768, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77780 + }, + { + "epoch": 5.586355475763016, + "grad_norm": 1.2695211172103882, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 77790 + }, + { + "epoch": 5.587073608617594, + "grad_norm": 1.0498571395874023, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 77800 + }, + { + "epoch": 5.587791741472173, + "grad_norm": 1.0078339576721191, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 77810 + }, + { + "epoch": 5.588509874326751, + "grad_norm": 1.108199954032898, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 77820 + }, + { + "epoch": 5.589228007181329, + "grad_norm": 1.0577641725540161, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 77830 + }, + { + "epoch": 5.589946140035907, + "grad_norm": 1.2169439792633057, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 77840 + }, + { + "epoch": 5.590664272890485, + "grad_norm": 0.8310868740081787, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 77850 + }, + { + "epoch": 5.591382405745063, + "grad_norm": 0.9794082045555115, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 77860 + }, + { + "epoch": 5.592100538599641, + "grad_norm": 0.8867404460906982, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77870 + }, + { + "epoch": 5.592818671454219, + "grad_norm": 0.9204208254814148, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 77880 + }, + { + "epoch": 5.593536804308797, + "grad_norm": 0.9801714420318604, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 77890 + }, + { + "epoch": 5.594254937163376, + "grad_norm": 0.9383925199508667, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 77900 + }, + { + "epoch": 5.594973070017954, + "grad_norm": 0.9124664068222046, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 77910 + }, + { + "epoch": 5.595691202872532, + "grad_norm": 0.9618783593177795, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77920 + }, + { + "epoch": 5.59640933572711, + "grad_norm": 0.9575216770172119, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 77930 + }, + { + "epoch": 5.597127468581688, + "grad_norm": 1.1223464012145996, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 77940 + }, + { + "epoch": 5.597845601436266, + "grad_norm": 0.9947475790977478, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 77950 + }, + { + "epoch": 5.598563734290844, + "grad_norm": 1.141959309577942, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 77960 + }, + { + "epoch": 5.599281867145422, + "grad_norm": 1.095525860786438, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 77970 + }, + { + "epoch": 5.6, + "grad_norm": 0.9396624565124512, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 77980 + }, + { + "epoch": 5.600718132854578, + "grad_norm": 0.8162274956703186, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 77990 + }, + { + "epoch": 5.6014362657091565, + "grad_norm": 1.0130535364151, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 78000 + }, + { + "epoch": 5.6021543985637345, + "grad_norm": 1.0016634464263916, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 78010 + }, + { + "epoch": 5.6028725314183125, + "grad_norm": 0.8936169743537903, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 78020 + }, + { + "epoch": 5.6035906642728905, + "grad_norm": 1.169625163078308, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78030 + }, + { + "epoch": 5.6043087971274685, + "grad_norm": 0.8896323442459106, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 78040 + }, + { + "epoch": 5.6050269299820465, + "grad_norm": 1.0939475297927856, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 78050 + }, + { + "epoch": 5.6057450628366245, + "grad_norm": 1.0880711078643799, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78060 + }, + { + "epoch": 5.6064631956912026, + "grad_norm": 1.1426655054092407, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 78070 + }, + { + "epoch": 5.607181328545781, + "grad_norm": 1.118586540222168, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 78080 + }, + { + "epoch": 5.607899461400359, + "grad_norm": 0.8784464597702026, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 78090 + }, + { + "epoch": 5.608617594254937, + "grad_norm": 1.137229561805725, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 78100 + }, + { + "epoch": 5.6093357271095154, + "grad_norm": 1.1041932106018066, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 78110 + }, + { + "epoch": 5.6100538599640934, + "grad_norm": 1.0170503854751587, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 78120 + }, + { + "epoch": 5.6107719928186714, + "grad_norm": 1.298754334449768, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 78130 + }, + { + "epoch": 5.6114901256732495, + "grad_norm": 0.9344905018806458, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78140 + }, + { + "epoch": 5.6122082585278275, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 78150 + }, + { + "epoch": 5.6129263913824055, + "grad_norm": 1.0617443323135376, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 78160 + }, + { + "epoch": 5.6136445242369835, + "grad_norm": 0.9017760753631592, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 78170 + }, + { + "epoch": 5.6143626570915615, + "grad_norm": 1.152601957321167, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 78180 + }, + { + "epoch": 5.61508078994614, + "grad_norm": 0.9889463186264038, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78190 + }, + { + "epoch": 5.615798922800718, + "grad_norm": 1.0367393493652344, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 78200 + }, + { + "epoch": 5.616517055655296, + "grad_norm": 0.8466457724571228, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 78210 + }, + { + "epoch": 5.617235188509874, + "grad_norm": 0.936083197593689, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 78220 + }, + { + "epoch": 5.617953321364452, + "grad_norm": 1.018784999847412, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 78230 + }, + { + "epoch": 5.61867145421903, + "grad_norm": 0.8527804017066956, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 78240 + }, + { + "epoch": 5.619389587073608, + "grad_norm": 1.1873106956481934, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78250 + }, + { + "epoch": 5.620107719928186, + "grad_norm": 0.9401728510856628, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 78260 + }, + { + "epoch": 5.620825852782765, + "grad_norm": 1.0801159143447876, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 78270 + }, + { + "epoch": 5.621543985637343, + "grad_norm": 1.0053739547729492, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 78280 + }, + { + "epoch": 5.622262118491921, + "grad_norm": 0.8599331378936768, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 78290 + }, + { + "epoch": 5.622980251346499, + "grad_norm": 2.3157296180725098, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 78300 + }, + { + "epoch": 5.623698384201077, + "grad_norm": 1.0027490854263306, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 78310 + }, + { + "epoch": 5.624416517055655, + "grad_norm": 0.996688961982727, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 78320 + }, + { + "epoch": 5.625134649910233, + "grad_norm": 1.0462113618850708, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 78330 + }, + { + "epoch": 5.625852782764811, + "grad_norm": 0.8750988245010376, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 78340 + }, + { + "epoch": 5.626570915619389, + "grad_norm": 0.8078145384788513, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 78350 + }, + { + "epoch": 5.627289048473967, + "grad_norm": 0.9047532081604004, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 78360 + }, + { + "epoch": 5.628007181328546, + "grad_norm": 0.9784479737281799, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 78370 + }, + { + "epoch": 5.628725314183124, + "grad_norm": 0.9529541730880737, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 78380 + }, + { + "epoch": 5.629443447037702, + "grad_norm": 0.8264740109443665, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 78390 + }, + { + "epoch": 5.63016157989228, + "grad_norm": 1.049724817276001, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 78400 + }, + { + "epoch": 5.630879712746858, + "grad_norm": 0.9866746068000793, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 78410 + }, + { + "epoch": 5.631597845601436, + "grad_norm": 0.897155225276947, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 78420 + }, + { + "epoch": 5.632315978456014, + "grad_norm": 1.225464940071106, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 78430 + }, + { + "epoch": 5.633034111310592, + "grad_norm": 0.8793753981590271, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 78440 + }, + { + "epoch": 5.63375224416517, + "grad_norm": 1.082482099533081, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78450 + }, + { + "epoch": 5.634470377019749, + "grad_norm": 1.054064393043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 78460 + }, + { + "epoch": 5.635188509874327, + "grad_norm": 1.0032247304916382, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 78470 + }, + { + "epoch": 5.635906642728905, + "grad_norm": 0.8544651865959167, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 78480 + }, + { + "epoch": 5.636624775583483, + "grad_norm": 0.9475075602531433, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 78490 + }, + { + "epoch": 5.637342908438061, + "grad_norm": 1.0814138650894165, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 78500 + }, + { + "epoch": 5.638061041292639, + "grad_norm": 1.0813153982162476, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 78510 + }, + { + "epoch": 5.638779174147217, + "grad_norm": 1.0225616693496704, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 78520 + }, + { + "epoch": 5.639497307001795, + "grad_norm": 1.0777465105056763, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 78530 + }, + { + "epoch": 5.640215439856373, + "grad_norm": 1.156148910522461, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 78540 + }, + { + "epoch": 5.640933572710951, + "grad_norm": 1.0147465467453003, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 78550 + }, + { + "epoch": 5.64165170556553, + "grad_norm": 0.9606683850288391, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 78560 + }, + { + "epoch": 5.642369838420108, + "grad_norm": 0.9478723406791687, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 78570 + }, + { + "epoch": 5.643087971274686, + "grad_norm": 1.0653880834579468, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 78580 + }, + { + "epoch": 5.643806104129264, + "grad_norm": 1.7519923448562622, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 78590 + }, + { + "epoch": 5.644524236983842, + "grad_norm": 1.0567299127578735, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 78600 + }, + { + "epoch": 5.64524236983842, + "grad_norm": 0.8980287909507751, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 78610 + }, + { + "epoch": 5.645960502692998, + "grad_norm": 0.8792264461517334, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78620 + }, + { + "epoch": 5.646678635547576, + "grad_norm": 1.2306275367736816, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 78630 + }, + { + "epoch": 5.647396768402155, + "grad_norm": 0.8259932398796082, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 78640 + }, + { + "epoch": 5.648114901256733, + "grad_norm": 0.9605076313018799, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 78650 + }, + { + "epoch": 5.648833034111311, + "grad_norm": 0.9967419505119324, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 78660 + }, + { + "epoch": 5.649551166965889, + "grad_norm": 0.9774024486541748, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 78670 + }, + { + "epoch": 5.650269299820467, + "grad_norm": 0.9838066697120667, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 78680 + }, + { + "epoch": 5.650987432675045, + "grad_norm": 1.1617798805236816, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 78690 + }, + { + "epoch": 5.651705565529623, + "grad_norm": 1.075006365776062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 78700 + }, + { + "epoch": 5.652423698384201, + "grad_norm": 0.8859893679618835, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 78710 + }, + { + "epoch": 5.653141831238779, + "grad_norm": 1.0774717330932617, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 78720 + }, + { + "epoch": 5.653859964093357, + "grad_norm": 1.147273302078247, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 78730 + }, + { + "epoch": 5.654578096947935, + "grad_norm": 1.1403213739395142, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 78740 + }, + { + "epoch": 5.655296229802514, + "grad_norm": 0.9115353226661682, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78750 + }, + { + "epoch": 5.656014362657092, + "grad_norm": 0.9303002953529358, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 78760 + }, + { + "epoch": 5.65673249551167, + "grad_norm": 0.9324957728385925, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 78770 + }, + { + "epoch": 5.657450628366248, + "grad_norm": 0.9688063859939575, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 78780 + }, + { + "epoch": 5.658168761220826, + "grad_norm": 0.9019638299942017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 78790 + }, + { + "epoch": 5.658886894075404, + "grad_norm": 0.8236798048019409, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 78800 + }, + { + "epoch": 5.659605026929982, + "grad_norm": 1.2702386379241943, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78810 + }, + { + "epoch": 5.66032315978456, + "grad_norm": 1.041077971458435, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 78820 + }, + { + "epoch": 5.661041292639139, + "grad_norm": 0.9028838276863098, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 78830 + }, + { + "epoch": 5.661759425493717, + "grad_norm": 0.9874144196510315, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 78840 + }, + { + "epoch": 5.662477558348295, + "grad_norm": 0.9633761048316956, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 78850 + }, + { + "epoch": 5.663195691202873, + "grad_norm": 0.9069564342498779, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 78860 + }, + { + "epoch": 5.663913824057451, + "grad_norm": 0.9560621976852417, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 78870 + }, + { + "epoch": 5.664631956912029, + "grad_norm": 0.9941161870956421, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 78880 + }, + { + "epoch": 5.665350089766607, + "grad_norm": 0.920407235622406, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 78890 + }, + { + "epoch": 5.666068222621185, + "grad_norm": 0.9909250140190125, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 78900 + }, + { + "epoch": 5.666786355475763, + "grad_norm": 0.9528568983078003, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 78910 + }, + { + "epoch": 5.667504488330341, + "grad_norm": 1.041440725326538, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 78920 + }, + { + "epoch": 5.66822262118492, + "grad_norm": 1.0072191953659058, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 78930 + }, + { + "epoch": 5.668940754039498, + "grad_norm": 1.0740574598312378, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 78940 + }, + { + "epoch": 5.669658886894076, + "grad_norm": 0.9168822169303894, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 78950 + }, + { + "epoch": 5.670377019748654, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 78960 + }, + { + "epoch": 5.671095152603232, + "grad_norm": 1.1925201416015625, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 78970 + }, + { + "epoch": 5.67181328545781, + "grad_norm": 0.879940390586853, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78980 + }, + { + "epoch": 5.672531418312388, + "grad_norm": 1.0998331308364868, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 78990 + }, + { + "epoch": 5.673249551166966, + "grad_norm": 1.076637625694275, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 79000 + }, + { + "epoch": 5.673967684021544, + "grad_norm": 1.076864242553711, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 79010 + }, + { + "epoch": 5.6746858168761225, + "grad_norm": 1.0206586122512817, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 79020 + }, + { + "epoch": 5.6754039497307005, + "grad_norm": 0.8242515325546265, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79030 + }, + { + "epoch": 5.6761220825852785, + "grad_norm": 1.1180634498596191, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 79040 + }, + { + "epoch": 5.6768402154398565, + "grad_norm": 1.0155152082443237, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 79050 + }, + { + "epoch": 5.6775583482944345, + "grad_norm": 1.0445241928100586, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 79060 + }, + { + "epoch": 5.6782764811490125, + "grad_norm": 0.9851725697517395, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 79070 + }, + { + "epoch": 5.6789946140035905, + "grad_norm": 0.9979640245437622, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79080 + }, + { + "epoch": 5.6797127468581685, + "grad_norm": 1.0398952960968018, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 79090 + }, + { + "epoch": 5.6804308797127465, + "grad_norm": 1.094164252281189, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 79100 + }, + { + "epoch": 5.6811490125673245, + "grad_norm": 0.9546816945075989, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 79110 + }, + { + "epoch": 5.681867145421903, + "grad_norm": 1.1635938882827759, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79120 + }, + { + "epoch": 5.682585278276481, + "grad_norm": 1.0260306596755981, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 79130 + }, + { + "epoch": 5.683303411131059, + "grad_norm": 0.9900122284889221, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 79140 + }, + { + "epoch": 5.684021543985637, + "grad_norm": 1.049688458442688, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 79150 + }, + { + "epoch": 5.684739676840215, + "grad_norm": 1.124272108078003, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 79160 + }, + { + "epoch": 5.685457809694793, + "grad_norm": 1.1109849214553833, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 79170 + }, + { + "epoch": 5.686175942549371, + "grad_norm": 0.739007830619812, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 79180 + }, + { + "epoch": 5.686894075403949, + "grad_norm": 1.2063007354736328, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79190 + }, + { + "epoch": 5.687612208258528, + "grad_norm": 1.223317265510559, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 79200 + }, + { + "epoch": 5.688330341113106, + "grad_norm": 0.8042855858802795, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 79210 + }, + { + "epoch": 5.689048473967684, + "grad_norm": 0.9294175505638123, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 79220 + }, + { + "epoch": 5.689766606822262, + "grad_norm": 0.978084146976471, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 79230 + }, + { + "epoch": 5.69048473967684, + "grad_norm": 0.9271620512008667, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79240 + }, + { + "epoch": 5.691202872531418, + "grad_norm": 1.158677339553833, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 79250 + }, + { + "epoch": 5.691921005385996, + "grad_norm": 0.9468576312065125, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 79260 + }, + { + "epoch": 5.692639138240574, + "grad_norm": 1.2025824785232544, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79270 + }, + { + "epoch": 5.693357271095152, + "grad_norm": 1.0167860984802246, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79280 + }, + { + "epoch": 5.69407540394973, + "grad_norm": 0.971199631690979, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 79290 + }, + { + "epoch": 5.694793536804308, + "grad_norm": 1.1757864952087402, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 79300 + }, + { + "epoch": 5.695511669658887, + "grad_norm": 1.0199662446975708, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 79310 + }, + { + "epoch": 5.696229802513465, + "grad_norm": 0.9662485122680664, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 79320 + }, + { + "epoch": 5.696947935368043, + "grad_norm": 0.9324414134025574, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 79330 + }, + { + "epoch": 5.697666068222621, + "grad_norm": 0.855752170085907, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 79340 + }, + { + "epoch": 5.698384201077199, + "grad_norm": 1.2723703384399414, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 79350 + }, + { + "epoch": 5.699102333931777, + "grad_norm": 1.0254011154174805, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 79360 + }, + { + "epoch": 5.699820466786355, + "grad_norm": 1.0958263874053955, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 79370 + }, + { + "epoch": 5.700538599640933, + "grad_norm": 1.0214145183563232, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 79380 + }, + { + "epoch": 5.701256732495512, + "grad_norm": 1.1087455749511719, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 79390 + }, + { + "epoch": 5.70197486535009, + "grad_norm": 0.8885074853897095, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 79400 + }, + { + "epoch": 5.702692998204668, + "grad_norm": 0.9854450821876526, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 79410 + }, + { + "epoch": 5.703411131059246, + "grad_norm": 0.858744204044342, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 79420 + }, + { + "epoch": 5.704129263913824, + "grad_norm": 0.9434788823127747, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 79430 + }, + { + "epoch": 5.704847396768402, + "grad_norm": 1.1388801336288452, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 79440 + }, + { + "epoch": 5.70556552962298, + "grad_norm": 1.0701899528503418, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 79450 + }, + { + "epoch": 5.706283662477558, + "grad_norm": 0.9147594571113586, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 79460 + }, + { + "epoch": 5.707001795332136, + "grad_norm": 1.055008053779602, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 79470 + }, + { + "epoch": 5.707719928186714, + "grad_norm": 0.7841609716415405, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 79480 + }, + { + "epoch": 5.708438061041292, + "grad_norm": 1.0334571599960327, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 79490 + }, + { + "epoch": 5.709156193895871, + "grad_norm": 1.2841367721557617, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 79500 + }, + { + "epoch": 5.709874326750449, + "grad_norm": 1.0296638011932373, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 79510 + }, + { + "epoch": 5.710592459605027, + "grad_norm": 0.9161922931671143, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 79520 + }, + { + "epoch": 5.711310592459605, + "grad_norm": 1.056856632232666, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 79530 + }, + { + "epoch": 5.712028725314183, + "grad_norm": 0.9919893145561218, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 79540 + }, + { + "epoch": 5.712746858168761, + "grad_norm": 1.1128891706466675, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 79550 + }, + { + "epoch": 5.713464991023339, + "grad_norm": 1.1171997785568237, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 79560 + }, + { + "epoch": 5.714183123877917, + "grad_norm": 0.9389346837997437, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 79570 + }, + { + "epoch": 5.714901256732496, + "grad_norm": 0.9869245886802673, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 79580 + }, + { + "epoch": 5.715619389587074, + "grad_norm": 0.9019966721534729, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 79590 + }, + { + "epoch": 5.716337522441652, + "grad_norm": 0.9791252017021179, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 79600 + }, + { + "epoch": 5.71705565529623, + "grad_norm": 1.0269849300384521, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 79610 + }, + { + "epoch": 5.717773788150808, + "grad_norm": 1.0340129137039185, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 79620 + }, + { + "epoch": 5.718491921005386, + "grad_norm": 0.9742604494094849, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 79630 + }, + { + "epoch": 5.719210053859964, + "grad_norm": 1.126868724822998, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 79640 + }, + { + "epoch": 5.719928186714542, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 79650 + }, + { + "epoch": 5.72064631956912, + "grad_norm": 0.8300277590751648, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 79660 + }, + { + "epoch": 5.721364452423698, + "grad_norm": 0.8482570052146912, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 79670 + }, + { + "epoch": 5.722082585278277, + "grad_norm": 1.0777807235717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 79680 + }, + { + "epoch": 5.722800718132855, + "grad_norm": 1.2682723999023438, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 79690 + }, + { + "epoch": 5.723518850987433, + "grad_norm": 0.8742772340774536, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 79700 + }, + { + "epoch": 5.724236983842011, + "grad_norm": 0.9218387603759766, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 79710 + }, + { + "epoch": 5.724955116696589, + "grad_norm": 0.8977975845336914, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 79720 + }, + { + "epoch": 5.725673249551167, + "grad_norm": 1.0873085260391235, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 79730 + }, + { + "epoch": 5.726391382405745, + "grad_norm": 0.9811807870864868, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 79740 + }, + { + "epoch": 5.727109515260323, + "grad_norm": 0.926764965057373, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 79750 + }, + { + "epoch": 5.727827648114902, + "grad_norm": 1.0103713274002075, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 79760 + }, + { + "epoch": 5.72854578096948, + "grad_norm": 1.1389189958572388, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79770 + }, + { + "epoch": 5.729263913824058, + "grad_norm": 1.1654961109161377, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 79780 + }, + { + "epoch": 5.729982046678636, + "grad_norm": 0.7925996780395508, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 79790 + }, + { + "epoch": 5.730700179533214, + "grad_norm": 1.3329131603240967, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 79800 + }, + { + "epoch": 5.731418312387792, + "grad_norm": 1.158328890800476, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 79810 + }, + { + "epoch": 5.73213644524237, + "grad_norm": 0.9904412031173706, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 79820 + }, + { + "epoch": 5.732854578096948, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 79830 + }, + { + "epoch": 5.733572710951526, + "grad_norm": 1.0224473476409912, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 79840 + }, + { + "epoch": 5.734290843806104, + "grad_norm": 1.0482215881347656, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 79850 + }, + { + "epoch": 5.735008976660682, + "grad_norm": 0.9790018200874329, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 79860 + }, + { + "epoch": 5.735727109515261, + "grad_norm": 1.034548044204712, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 79870 + }, + { + "epoch": 5.736445242369839, + "grad_norm": 0.799286961555481, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 79880 + }, + { + "epoch": 5.737163375224417, + "grad_norm": 1.0119048357009888, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 79890 + }, + { + "epoch": 5.737881508078995, + "grad_norm": 0.9742264151573181, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 79900 + }, + { + "epoch": 5.738599640933573, + "grad_norm": 1.0408239364624023, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 79910 + }, + { + "epoch": 5.739317773788151, + "grad_norm": 0.9165748953819275, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 79920 + }, + { + "epoch": 5.740035906642729, + "grad_norm": 1.1859451532363892, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 79930 + }, + { + "epoch": 5.740754039497307, + "grad_norm": 0.8772084712982178, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 79940 + }, + { + "epoch": 5.741472172351886, + "grad_norm": 1.0123273134231567, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 79950 + }, + { + "epoch": 5.742190305206464, + "grad_norm": 1.1873936653137207, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 79960 + }, + { + "epoch": 5.742908438061042, + "grad_norm": 0.9065699577331543, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 79970 + }, + { + "epoch": 5.74362657091562, + "grad_norm": 1.1626464128494263, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 79980 + }, + { + "epoch": 5.744344703770198, + "grad_norm": 1.0311716794967651, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 79990 + }, + { + "epoch": 5.745062836624776, + "grad_norm": 1.0865558385849, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 80000 + }, + { + "epoch": 5.745780969479354, + "grad_norm": 1.0257176160812378, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 80010 + }, + { + "epoch": 5.746499102333932, + "grad_norm": 0.9805439710617065, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 80020 + }, + { + "epoch": 5.74721723518851, + "grad_norm": 0.9744977355003357, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 80030 + }, + { + "epoch": 5.747935368043088, + "grad_norm": 1.302816390991211, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 80040 + }, + { + "epoch": 5.748653500897666, + "grad_norm": 0.8866990208625793, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 80050 + }, + { + "epoch": 5.7493716337522445, + "grad_norm": 1.0133726596832275, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 80060 + }, + { + "epoch": 5.7500897666068225, + "grad_norm": 1.0043569803237915, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 80070 + }, + { + "epoch": 5.7508078994614005, + "grad_norm": 0.9100040197372437, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 80080 + }, + { + "epoch": 5.7515260323159785, + "grad_norm": 0.7994180917739868, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 80090 + }, + { + "epoch": 5.7522441651705565, + "grad_norm": 1.120188593864441, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 80100 + }, + { + "epoch": 5.7529622980251345, + "grad_norm": 0.9555420279502869, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 80110 + }, + { + "epoch": 5.7536804308797125, + "grad_norm": 1.0305951833724976, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 80120 + }, + { + "epoch": 5.7543985637342905, + "grad_norm": 0.9632731676101685, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 80130 + }, + { + "epoch": 5.755116696588869, + "grad_norm": 1.2654297351837158, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 80140 + }, + { + "epoch": 5.755834829443447, + "grad_norm": 1.027190089225769, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 80150 + }, + { + "epoch": 5.756552962298025, + "grad_norm": 0.9829175472259521, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80160 + }, + { + "epoch": 5.757271095152603, + "grad_norm": 1.083803653717041, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 80170 + }, + { + "epoch": 5.757989228007181, + "grad_norm": 0.9353913068771362, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 80180 + }, + { + "epoch": 5.758707360861759, + "grad_norm": 1.1824370622634888, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 80190 + }, + { + "epoch": 5.759425493716337, + "grad_norm": 1.0901048183441162, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 80200 + }, + { + "epoch": 5.760143626570915, + "grad_norm": 1.0389254093170166, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 80210 + }, + { + "epoch": 5.760861759425493, + "grad_norm": 0.9746400117874146, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 80220 + }, + { + "epoch": 5.761579892280071, + "grad_norm": 0.9319248795509338, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 80230 + }, + { + "epoch": 5.76229802513465, + "grad_norm": 1.152784824371338, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 80240 + }, + { + "epoch": 5.763016157989228, + "grad_norm": 0.9462733864784241, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 80250 + }, + { + "epoch": 5.763734290843806, + "grad_norm": 0.8884182572364807, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 80260 + }, + { + "epoch": 5.764452423698384, + "grad_norm": 0.8755964636802673, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80270 + }, + { + "epoch": 5.765170556552962, + "grad_norm": 0.8983452320098877, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 80280 + }, + { + "epoch": 5.76588868940754, + "grad_norm": 0.8565991520881653, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 80290 + }, + { + "epoch": 5.766606822262118, + "grad_norm": 1.0557159185409546, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 80300 + }, + { + "epoch": 5.767324955116696, + "grad_norm": 1.057214379310608, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 80310 + }, + { + "epoch": 5.768043087971275, + "grad_norm": 0.9852516055107117, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 80320 + }, + { + "epoch": 5.768761220825853, + "grad_norm": 1.0339698791503906, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 80330 + }, + { + "epoch": 5.769479353680431, + "grad_norm": 1.0056889057159424, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 80340 + }, + { + "epoch": 5.770197486535009, + "grad_norm": 1.0941663980484009, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 80350 + }, + { + "epoch": 5.770915619389587, + "grad_norm": 1.2145589590072632, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80360 + }, + { + "epoch": 5.771633752244165, + "grad_norm": 0.9609606862068176, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 80370 + }, + { + "epoch": 5.772351885098743, + "grad_norm": 0.8815773129463196, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 80380 + }, + { + "epoch": 5.773070017953321, + "grad_norm": 1.2630987167358398, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 80390 + }, + { + "epoch": 5.773788150807899, + "grad_norm": 1.0605450868606567, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 80400 + }, + { + "epoch": 5.774506283662477, + "grad_norm": 1.165069341659546, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 80410 + }, + { + "epoch": 5.775224416517055, + "grad_norm": 0.9038028717041016, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 80420 + }, + { + "epoch": 5.775942549371634, + "grad_norm": 1.0571858882904053, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 80430 + }, + { + "epoch": 5.776660682226212, + "grad_norm": 1.0388168096542358, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 80440 + }, + { + "epoch": 5.77737881508079, + "grad_norm": 1.0552119016647339, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 80450 + }, + { + "epoch": 5.778096947935368, + "grad_norm": 1.0610109567642212, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 80460 + }, + { + "epoch": 5.778815080789946, + "grad_norm": 0.9906430244445801, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 80470 + }, + { + "epoch": 5.779533213644524, + "grad_norm": 1.1511857509613037, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 80480 + }, + { + "epoch": 5.780251346499102, + "grad_norm": 1.2738412618637085, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 80490 + }, + { + "epoch": 5.78096947935368, + "grad_norm": 0.8945937752723694, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 80500 + }, + { + "epoch": 5.781687612208259, + "grad_norm": 1.1105149984359741, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 80510 + }, + { + "epoch": 5.782405745062837, + "grad_norm": 0.8432297110557556, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 80520 + }, + { + "epoch": 5.783123877917415, + "grad_norm": 0.9257984757423401, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 80530 + }, + { + "epoch": 5.783842010771993, + "grad_norm": 1.1708799600601196, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 80540 + }, + { + "epoch": 5.784560143626571, + "grad_norm": 0.9969521164894104, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 80550 + }, + { + "epoch": 5.785278276481149, + "grad_norm": 1.0361413955688477, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 80560 + }, + { + "epoch": 5.785996409335727, + "grad_norm": 0.9876393675804138, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80570 + }, + { + "epoch": 5.786714542190305, + "grad_norm": 1.0356241464614868, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 80580 + }, + { + "epoch": 5.787432675044883, + "grad_norm": 1.178865671157837, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 80590 + }, + { + "epoch": 5.788150807899461, + "grad_norm": 0.8614338636398315, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 80600 + }, + { + "epoch": 5.788868940754039, + "grad_norm": 1.020734429359436, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 80610 + }, + { + "epoch": 5.789587073608618, + "grad_norm": 1.035951852798462, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 80620 + }, + { + "epoch": 5.790305206463196, + "grad_norm": 0.898637592792511, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 80630 + }, + { + "epoch": 5.791023339317774, + "grad_norm": 0.9803016781806946, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 80640 + }, + { + "epoch": 5.791741472172352, + "grad_norm": 1.2902555465698242, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 80650 + }, + { + "epoch": 5.79245960502693, + "grad_norm": 1.3364112377166748, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 80660 + }, + { + "epoch": 5.793177737881508, + "grad_norm": 0.8553985953330994, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 80670 + }, + { + "epoch": 5.793895870736086, + "grad_norm": 0.8211889863014221, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 80680 + }, + { + "epoch": 5.794614003590664, + "grad_norm": 0.9288306832313538, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 80690 + }, + { + "epoch": 5.795332136445243, + "grad_norm": 1.0716029405593872, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 80700 + }, + { + "epoch": 5.796050269299821, + "grad_norm": 0.9957329034805298, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 80710 + }, + { + "epoch": 5.796768402154399, + "grad_norm": 0.9691376090049744, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 80720 + }, + { + "epoch": 5.797486535008977, + "grad_norm": 1.0590804815292358, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 80730 + }, + { + "epoch": 5.798204667863555, + "grad_norm": 1.0408968925476074, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 80740 + }, + { + "epoch": 5.798922800718133, + "grad_norm": 1.0249526500701904, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 80750 + }, + { + "epoch": 5.799640933572711, + "grad_norm": 1.3658806085586548, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 80760 + }, + { + "epoch": 5.800359066427289, + "grad_norm": 0.9562603831291199, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 80770 + }, + { + "epoch": 5.801077199281867, + "grad_norm": 0.8790915012359619, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 80780 + }, + { + "epoch": 5.801795332136445, + "grad_norm": 0.8351004123687744, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80790 + }, + { + "epoch": 5.802513464991024, + "grad_norm": 0.964562714099884, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 80800 + }, + { + "epoch": 5.803231597845602, + "grad_norm": 1.0873116254806519, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 80810 + }, + { + "epoch": 5.80394973070018, + "grad_norm": 0.9821216464042664, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 80820 + }, + { + "epoch": 5.804667863554758, + "grad_norm": 1.1158807277679443, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 80830 + }, + { + "epoch": 5.805385996409336, + "grad_norm": 1.0098856687545776, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 80840 + }, + { + "epoch": 5.806104129263914, + "grad_norm": 0.9628035426139832, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 80850 + }, + { + "epoch": 5.806822262118492, + "grad_norm": 1.133800983428955, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 80860 + }, + { + "epoch": 5.80754039497307, + "grad_norm": 0.9423992037773132, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 80870 + }, + { + "epoch": 5.808258527827648, + "grad_norm": 1.0758612155914307, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80880 + }, + { + "epoch": 5.808976660682227, + "grad_norm": 1.232029914855957, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 80890 + }, + { + "epoch": 5.809694793536805, + "grad_norm": 1.1063108444213867, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 80900 + }, + { + "epoch": 5.810412926391383, + "grad_norm": 0.9759877920150757, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 80910 + }, + { + "epoch": 5.811131059245961, + "grad_norm": 0.9180193543434143, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 80920 + }, + { + "epoch": 5.811849192100539, + "grad_norm": 1.0818052291870117, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 80930 + }, + { + "epoch": 5.812567324955117, + "grad_norm": 0.998986542224884, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 80940 + }, + { + "epoch": 5.813285457809695, + "grad_norm": 1.1549060344696045, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 80950 + }, + { + "epoch": 5.814003590664273, + "grad_norm": 1.1900213956832886, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 80960 + }, + { + "epoch": 5.814721723518851, + "grad_norm": 0.8114368915557861, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 80970 + }, + { + "epoch": 5.815439856373429, + "grad_norm": 1.0296406745910645, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 80980 + }, + { + "epoch": 5.8161579892280075, + "grad_norm": 1.0466746091842651, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 80990 + }, + { + "epoch": 5.8168761220825855, + "grad_norm": 1.0524508953094482, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 81000 + }, + { + "epoch": 5.8175942549371635, + "grad_norm": 1.1588358879089355, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 81010 + }, + { + "epoch": 5.8183123877917415, + "grad_norm": 0.9378601908683777, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 81020 + }, + { + "epoch": 5.8190305206463195, + "grad_norm": 0.9486441612243652, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 81030 + }, + { + "epoch": 5.8197486535008975, + "grad_norm": 0.9805227518081665, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 81040 + }, + { + "epoch": 5.8204667863554755, + "grad_norm": 1.1627717018127441, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81050 + }, + { + "epoch": 5.8211849192100535, + "grad_norm": 1.0716841220855713, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 81060 + }, + { + "epoch": 5.821903052064632, + "grad_norm": 1.2398899793624878, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 81070 + }, + { + "epoch": 5.82262118491921, + "grad_norm": 1.0934730768203735, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 81080 + }, + { + "epoch": 5.823339317773788, + "grad_norm": 0.9701796174049377, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 81090 + }, + { + "epoch": 5.824057450628366, + "grad_norm": 1.0218969583511353, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 81100 + }, + { + "epoch": 5.824775583482944, + "grad_norm": 1.3066465854644775, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 81110 + }, + { + "epoch": 5.825493716337522, + "grad_norm": 1.1067441701889038, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 81120 + }, + { + "epoch": 5.8262118491921004, + "grad_norm": 0.9750344753265381, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81130 + }, + { + "epoch": 5.8269299820466784, + "grad_norm": 1.129191279411316, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 81140 + }, + { + "epoch": 5.8276481149012564, + "grad_norm": 1.05964195728302, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 81150 + }, + { + "epoch": 5.8283662477558345, + "grad_norm": 1.1094872951507568, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 81160 + }, + { + "epoch": 5.8290843806104125, + "grad_norm": 0.9163196086883545, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 81170 + }, + { + "epoch": 5.829802513464991, + "grad_norm": 1.0035687685012817, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 81180 + }, + { + "epoch": 5.830520646319569, + "grad_norm": 1.0353461503982544, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 81190 + }, + { + "epoch": 5.831238779174147, + "grad_norm": 1.0566555261611938, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81200 + }, + { + "epoch": 5.831956912028725, + "grad_norm": 1.2373290061950684, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 81210 + }, + { + "epoch": 5.832675044883303, + "grad_norm": 0.8818837404251099, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 81220 + }, + { + "epoch": 5.833393177737881, + "grad_norm": 1.1024713516235352, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 81230 + }, + { + "epoch": 5.834111310592459, + "grad_norm": 1.2478809356689453, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 81240 + }, + { + "epoch": 5.834829443447037, + "grad_norm": 0.8647364377975464, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 81250 + }, + { + "epoch": 5.835547576301616, + "grad_norm": 1.1106358766555786, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 81260 + }, + { + "epoch": 5.836265709156194, + "grad_norm": 0.9432938694953918, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 81270 + }, + { + "epoch": 5.836983842010772, + "grad_norm": 1.0283797979354858, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 81280 + }, + { + "epoch": 5.83770197486535, + "grad_norm": 1.158918857574463, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 81290 + }, + { + "epoch": 5.838420107719928, + "grad_norm": 0.9700069427490234, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 81300 + }, + { + "epoch": 5.839138240574506, + "grad_norm": 1.08310866355896, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 81310 + }, + { + "epoch": 5.839856373429084, + "grad_norm": 1.05460524559021, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 81320 + }, + { + "epoch": 5.840574506283662, + "grad_norm": 0.9849268794059753, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 81330 + }, + { + "epoch": 5.84129263913824, + "grad_norm": 0.888306736946106, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 81340 + }, + { + "epoch": 5.842010771992818, + "grad_norm": 1.0337001085281372, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81350 + }, + { + "epoch": 5.842728904847397, + "grad_norm": 1.0778567790985107, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 81360 + }, + { + "epoch": 5.843447037701975, + "grad_norm": 1.1484156847000122, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 81370 + }, + { + "epoch": 5.844165170556553, + "grad_norm": 1.0948245525360107, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 81380 + }, + { + "epoch": 5.844883303411131, + "grad_norm": 0.9363969564437866, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 81390 + }, + { + "epoch": 5.845601436265709, + "grad_norm": 1.0151013135910034, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 81400 + }, + { + "epoch": 5.846319569120287, + "grad_norm": 0.9925733804702759, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 81410 + }, + { + "epoch": 5.847037701974865, + "grad_norm": 1.0356744527816772, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 81420 + }, + { + "epoch": 5.847755834829443, + "grad_norm": 1.0633001327514648, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 81430 + }, + { + "epoch": 5.848473967684021, + "grad_norm": 0.9900460839271545, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 81440 + }, + { + "epoch": 5.8491921005386, + "grad_norm": 1.2677979469299316, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 81450 + }, + { + "epoch": 5.849910233393178, + "grad_norm": 0.8174138069152832, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 81460 + }, + { + "epoch": 5.850628366247756, + "grad_norm": 1.1986393928527832, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 81470 + }, + { + "epoch": 5.851346499102334, + "grad_norm": 1.1009358167648315, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 81480 + }, + { + "epoch": 5.852064631956912, + "grad_norm": 0.966446578502655, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81490 + }, + { + "epoch": 5.85278276481149, + "grad_norm": 0.9657767415046692, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 81500 + }, + { + "epoch": 5.853500897666068, + "grad_norm": 1.0480058193206787, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 81510 + }, + { + "epoch": 5.854219030520646, + "grad_norm": 1.2003830671310425, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 81520 + }, + { + "epoch": 5.854937163375224, + "grad_norm": 0.8683754205703735, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81530 + }, + { + "epoch": 5.855655296229802, + "grad_norm": 1.0860967636108398, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 81540 + }, + { + "epoch": 5.856373429084381, + "grad_norm": 1.0415282249450684, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81550 + }, + { + "epoch": 5.857091561938959, + "grad_norm": 0.9897454380989075, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 81560 + }, + { + "epoch": 5.857809694793537, + "grad_norm": 1.173884630203247, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 81570 + }, + { + "epoch": 5.858527827648115, + "grad_norm": 1.2426209449768066, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 81580 + }, + { + "epoch": 5.859245960502693, + "grad_norm": 0.9390465021133423, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 81590 + }, + { + "epoch": 5.859964093357271, + "grad_norm": 1.1387195587158203, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 81600 + }, + { + "epoch": 5.860682226211849, + "grad_norm": 0.9902143478393555, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81610 + }, + { + "epoch": 5.861400359066427, + "grad_norm": 0.8328776359558105, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 81620 + }, + { + "epoch": 5.862118491921006, + "grad_norm": 0.9837837815284729, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 81630 + }, + { + "epoch": 5.862836624775584, + "grad_norm": 1.0013370513916016, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 81640 + }, + { + "epoch": 5.863554757630162, + "grad_norm": 0.9408028721809387, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 81650 + }, + { + "epoch": 5.86427289048474, + "grad_norm": 1.093140959739685, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 81660 + }, + { + "epoch": 5.864991023339318, + "grad_norm": 0.9554300904273987, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 81670 + }, + { + "epoch": 5.865709156193896, + "grad_norm": 1.1276485919952393, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 81680 + }, + { + "epoch": 5.866427289048474, + "grad_norm": 0.9628785252571106, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 81690 + }, + { + "epoch": 5.867145421903052, + "grad_norm": 0.9844689965248108, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 81700 + }, + { + "epoch": 5.86786355475763, + "grad_norm": 0.9679856896400452, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 81710 + }, + { + "epoch": 5.868581687612208, + "grad_norm": 1.0225571393966675, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81720 + }, + { + "epoch": 5.869299820466786, + "grad_norm": 0.9330390691757202, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81730 + }, + { + "epoch": 5.870017953321365, + "grad_norm": 1.0584566593170166, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 81740 + }, + { + "epoch": 5.870736086175943, + "grad_norm": 0.781548023223877, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 81750 + }, + { + "epoch": 5.871454219030521, + "grad_norm": 0.8906106352806091, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 81760 + }, + { + "epoch": 5.872172351885099, + "grad_norm": 1.1402281522750854, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 81770 + }, + { + "epoch": 5.872890484739677, + "grad_norm": 0.9991076588630676, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 81780 + }, + { + "epoch": 5.873608617594255, + "grad_norm": 1.0120140314102173, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 81790 + }, + { + "epoch": 5.874326750448833, + "grad_norm": 0.8857715725898743, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 81800 + }, + { + "epoch": 5.875044883303411, + "grad_norm": 0.8531954288482666, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 81810 + }, + { + "epoch": 5.87576301615799, + "grad_norm": 1.1601015329360962, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 81820 + }, + { + "epoch": 5.876481149012568, + "grad_norm": 1.1435350179672241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 81830 + }, + { + "epoch": 5.877199281867146, + "grad_norm": 0.9526153802871704, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 81840 + }, + { + "epoch": 5.877917414721724, + "grad_norm": 1.06845223903656, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 81850 + }, + { + "epoch": 5.878635547576302, + "grad_norm": 0.9239344596862793, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 81860 + }, + { + "epoch": 5.87935368043088, + "grad_norm": 0.8632398247718811, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 81870 + }, + { + "epoch": 5.880071813285458, + "grad_norm": 0.9148443341255188, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 81880 + }, + { + "epoch": 5.880789946140036, + "grad_norm": 0.9910652041435242, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 81890 + }, + { + "epoch": 5.881508078994614, + "grad_norm": 0.8335179090499878, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81900 + }, + { + "epoch": 5.882226211849192, + "grad_norm": 0.9921387434005737, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81910 + }, + { + "epoch": 5.88294434470377, + "grad_norm": 1.0532517433166504, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 81920 + }, + { + "epoch": 5.883662477558349, + "grad_norm": 1.026400089263916, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 81930 + }, + { + "epoch": 5.884380610412927, + "grad_norm": 1.019195318222046, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 81940 + }, + { + "epoch": 5.885098743267505, + "grad_norm": 0.987238347530365, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 81950 + }, + { + "epoch": 5.885816876122083, + "grad_norm": 1.1714487075805664, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 81960 + }, + { + "epoch": 5.886535008976661, + "grad_norm": 1.0854483842849731, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 81970 + }, + { + "epoch": 5.887253141831239, + "grad_norm": 1.0678396224975586, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 81980 + }, + { + "epoch": 5.887971274685817, + "grad_norm": 1.1009471416473389, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 81990 + }, + { + "epoch": 5.888689407540395, + "grad_norm": 1.2056844234466553, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 82000 + }, + { + "epoch": 5.8894075403949735, + "grad_norm": 1.131302833557129, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 82010 + }, + { + "epoch": 5.8901256732495515, + "grad_norm": 1.4466036558151245, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 82020 + }, + { + "epoch": 5.8908438061041295, + "grad_norm": 1.051228404045105, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 82030 + }, + { + "epoch": 5.8915619389587075, + "grad_norm": 1.0010617971420288, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 82040 + }, + { + "epoch": 5.8922800718132855, + "grad_norm": 0.9095138311386108, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 82050 + }, + { + "epoch": 5.8929982046678635, + "grad_norm": 1.0237005949020386, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 82060 + }, + { + "epoch": 5.8937163375224415, + "grad_norm": 1.035122036933899, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 82070 + }, + { + "epoch": 5.8944344703770195, + "grad_norm": 1.0271964073181152, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82080 + }, + { + "epoch": 5.8951526032315975, + "grad_norm": 1.2044503688812256, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 82090 + }, + { + "epoch": 5.8958707360861755, + "grad_norm": 1.0275284051895142, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 82100 + }, + { + "epoch": 5.896588868940754, + "grad_norm": 0.9974840879440308, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 82110 + }, + { + "epoch": 5.897307001795332, + "grad_norm": 1.009968638420105, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 82120 + }, + { + "epoch": 5.89802513464991, + "grad_norm": 0.8396142721176147, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 82130 + }, + { + "epoch": 5.898743267504488, + "grad_norm": 1.002354621887207, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 82140 + }, + { + "epoch": 5.899461400359066, + "grad_norm": 0.9998893737792969, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82150 + }, + { + "epoch": 5.900179533213644, + "grad_norm": 1.1027010679244995, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 82160 + }, + { + "epoch": 5.900897666068222, + "grad_norm": 1.2028530836105347, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 82170 + }, + { + "epoch": 5.9016157989228, + "grad_norm": 1.0018759965896606, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 82180 + }, + { + "epoch": 5.902333931777379, + "grad_norm": 0.8911277055740356, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82190 + }, + { + "epoch": 5.903052064631957, + "grad_norm": 1.0172009468078613, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 82200 + }, + { + "epoch": 5.903770197486535, + "grad_norm": 1.1664029359817505, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 82210 + }, + { + "epoch": 5.904488330341113, + "grad_norm": 1.0620089769363403, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 82220 + }, + { + "epoch": 5.905206463195691, + "grad_norm": 1.0756114721298218, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 82230 + }, + { + "epoch": 5.905924596050269, + "grad_norm": 1.1727497577667236, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 82240 + }, + { + "epoch": 5.906642728904847, + "grad_norm": 0.9833515882492065, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 82250 + }, + { + "epoch": 5.907360861759425, + "grad_norm": 0.9236368536949158, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 82260 + }, + { + "epoch": 5.908078994614003, + "grad_norm": 0.9773947596549988, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 82270 + }, + { + "epoch": 5.908797127468581, + "grad_norm": 1.1427783966064453, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 82280 + }, + { + "epoch": 5.909515260323159, + "grad_norm": 1.0215164422988892, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 82290 + }, + { + "epoch": 5.910233393177738, + "grad_norm": 1.1157845258712769, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 82300 + }, + { + "epoch": 5.910951526032316, + "grad_norm": 1.1490662097930908, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 82310 + }, + { + "epoch": 5.911669658886894, + "grad_norm": 0.7233976125717163, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 82320 + }, + { + "epoch": 5.912387791741472, + "grad_norm": 1.0053865909576416, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 82330 + }, + { + "epoch": 5.91310592459605, + "grad_norm": 0.9764766097068787, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 82340 + }, + { + "epoch": 5.913824057450628, + "grad_norm": 0.9492928385734558, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 82350 + }, + { + "epoch": 5.914542190305206, + "grad_norm": 0.9538891315460205, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82360 + }, + { + "epoch": 5.915260323159784, + "grad_norm": 1.2620314359664917, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 82370 + }, + { + "epoch": 5.915978456014363, + "grad_norm": 0.9913349151611328, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 82380 + }, + { + "epoch": 5.916696588868941, + "grad_norm": 0.9712074995040894, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 82390 + }, + { + "epoch": 5.917414721723519, + "grad_norm": 1.1554654836654663, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 82400 + }, + { + "epoch": 5.918132854578097, + "grad_norm": 1.1418904066085815, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 82410 + }, + { + "epoch": 5.918850987432675, + "grad_norm": 0.9405845999717712, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 82420 + }, + { + "epoch": 5.919569120287253, + "grad_norm": 1.0801819562911987, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 82430 + }, + { + "epoch": 5.920287253141831, + "grad_norm": 0.8643896579742432, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 82440 + }, + { + "epoch": 5.921005385996409, + "grad_norm": 1.106025218963623, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 82450 + }, + { + "epoch": 5.921723518850987, + "grad_norm": 1.0338234901428223, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 82460 + }, + { + "epoch": 5.922441651705565, + "grad_norm": 1.0648493766784668, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 82470 + }, + { + "epoch": 5.923159784560143, + "grad_norm": 1.1950433254241943, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 82480 + }, + { + "epoch": 5.923877917414722, + "grad_norm": 0.8730897903442383, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 82490 + }, + { + "epoch": 5.9245960502693, + "grad_norm": 1.2262312173843384, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 82500 + }, + { + "epoch": 5.925314183123878, + "grad_norm": 0.9526116251945496, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 82510 + }, + { + "epoch": 5.926032315978456, + "grad_norm": 1.0540224313735962, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 82520 + }, + { + "epoch": 5.926750448833034, + "grad_norm": 1.0537306070327759, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 82530 + }, + { + "epoch": 5.927468581687612, + "grad_norm": 1.134207844734192, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 82540 + }, + { + "epoch": 5.92818671454219, + "grad_norm": 0.9042250514030457, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 82550 + }, + { + "epoch": 5.928904847396768, + "grad_norm": 1.0424834489822388, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82560 + }, + { + "epoch": 5.929622980251347, + "grad_norm": 1.1571602821350098, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 82570 + }, + { + "epoch": 5.930341113105925, + "grad_norm": 1.1033377647399902, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 82580 + }, + { + "epoch": 5.931059245960503, + "grad_norm": 0.9211772680282593, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 82590 + }, + { + "epoch": 5.931777378815081, + "grad_norm": 1.0566459894180298, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 82600 + }, + { + "epoch": 5.932495511669659, + "grad_norm": 1.1773834228515625, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 82610 + }, + { + "epoch": 5.933213644524237, + "grad_norm": 1.193396806716919, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82620 + }, + { + "epoch": 5.933931777378815, + "grad_norm": 1.1101785898208618, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 82630 + }, + { + "epoch": 5.934649910233393, + "grad_norm": 0.6988118886947632, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 82640 + }, + { + "epoch": 5.935368043087971, + "grad_norm": 0.9590985774993896, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 82650 + }, + { + "epoch": 5.936086175942549, + "grad_norm": 0.8512062430381775, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 82660 + }, + { + "epoch": 5.936804308797128, + "grad_norm": 1.0381710529327393, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 82670 + }, + { + "epoch": 5.937522441651706, + "grad_norm": 1.0816296339035034, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 82680 + }, + { + "epoch": 5.938240574506284, + "grad_norm": 1.0592364072799683, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 82690 + }, + { + "epoch": 5.938958707360862, + "grad_norm": 0.737452507019043, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 82700 + }, + { + "epoch": 5.93967684021544, + "grad_norm": 0.9019039869308472, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82710 + }, + { + "epoch": 5.940394973070018, + "grad_norm": 1.0049666166305542, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 82720 + }, + { + "epoch": 5.941113105924596, + "grad_norm": 1.0016309022903442, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 82730 + }, + { + "epoch": 5.941831238779174, + "grad_norm": 0.7967594861984253, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 82740 + }, + { + "epoch": 5.942549371633753, + "grad_norm": 0.8978520631790161, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 82750 + }, + { + "epoch": 5.943267504488331, + "grad_norm": 1.0101654529571533, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82760 + }, + { + "epoch": 5.943985637342909, + "grad_norm": 1.1515586376190186, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 82770 + }, + { + "epoch": 5.944703770197487, + "grad_norm": 0.8666134476661682, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82780 + }, + { + "epoch": 5.945421903052065, + "grad_norm": 1.1365231275558472, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82790 + }, + { + "epoch": 5.946140035906643, + "grad_norm": 1.211229920387268, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 82800 + }, + { + "epoch": 5.946858168761221, + "grad_norm": 0.9900869727134705, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 82810 + }, + { + "epoch": 5.947576301615799, + "grad_norm": 0.9555928111076355, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 82820 + }, + { + "epoch": 5.948294434470377, + "grad_norm": 0.8468470573425293, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 82830 + }, + { + "epoch": 5.949012567324955, + "grad_norm": 1.0280319452285767, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 82840 + }, + { + "epoch": 5.949730700179533, + "grad_norm": 0.930145800113678, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 82850 + }, + { + "epoch": 5.950448833034112, + "grad_norm": 1.0677028894424438, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 82860 + }, + { + "epoch": 5.95116696588869, + "grad_norm": 1.2035255432128906, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 82870 + }, + { + "epoch": 5.951885098743268, + "grad_norm": 0.897537887096405, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82880 + }, + { + "epoch": 5.952603231597846, + "grad_norm": 1.2858690023422241, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 82890 + }, + { + "epoch": 5.953321364452424, + "grad_norm": 1.0300413370132446, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 82900 + }, + { + "epoch": 5.954039497307002, + "grad_norm": 0.9873301982879639, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 82910 + }, + { + "epoch": 5.95475763016158, + "grad_norm": 1.0315600633621216, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 82920 + }, + { + "epoch": 5.955475763016158, + "grad_norm": 1.0631790161132812, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 82930 + }, + { + "epoch": 5.9561938958707366, + "grad_norm": 1.035544514656067, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82940 + }, + { + "epoch": 5.956912028725315, + "grad_norm": 1.0162041187286377, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 82950 + }, + { + "epoch": 5.957630161579893, + "grad_norm": 0.7858892679214478, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 82960 + }, + { + "epoch": 5.958348294434471, + "grad_norm": 1.0359784364700317, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 82970 + }, + { + "epoch": 5.959066427289049, + "grad_norm": 1.057173252105713, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 82980 + }, + { + "epoch": 5.959784560143627, + "grad_norm": 1.1017464399337769, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 82990 + }, + { + "epoch": 5.960502692998205, + "grad_norm": 1.0688945055007935, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 83000 + }, + { + "epoch": 5.961220825852783, + "grad_norm": 1.048864483833313, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 83010 + }, + { + "epoch": 5.961938958707361, + "grad_norm": 1.057308316230774, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83020 + }, + { + "epoch": 5.962657091561939, + "grad_norm": 0.9014604687690735, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 83030 + }, + { + "epoch": 5.963375224416517, + "grad_norm": 0.9899709224700928, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 83040 + }, + { + "epoch": 5.9640933572710955, + "grad_norm": 1.0675519704818726, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 83050 + }, + { + "epoch": 5.9648114901256735, + "grad_norm": 0.9497889876365662, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 83060 + }, + { + "epoch": 5.9655296229802515, + "grad_norm": 0.9149549603462219, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 83070 + }, + { + "epoch": 5.9662477558348295, + "grad_norm": 1.329373836517334, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 83080 + }, + { + "epoch": 5.9669658886894075, + "grad_norm": 1.0731712579727173, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 83090 + }, + { + "epoch": 5.9676840215439855, + "grad_norm": 0.9498835802078247, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 83100 + }, + { + "epoch": 5.9684021543985635, + "grad_norm": 1.1222829818725586, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 83110 + }, + { + "epoch": 5.9691202872531415, + "grad_norm": 0.9923429489135742, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 83120 + }, + { + "epoch": 5.96983842010772, + "grad_norm": 0.9046645164489746, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 83130 + }, + { + "epoch": 5.970556552962298, + "grad_norm": 0.9259500503540039, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 83140 + }, + { + "epoch": 5.971274685816876, + "grad_norm": 1.0604174137115479, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 83150 + }, + { + "epoch": 5.971992818671454, + "grad_norm": 1.0391676425933838, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 83160 + }, + { + "epoch": 5.972710951526032, + "grad_norm": 0.8825796246528625, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 83170 + }, + { + "epoch": 5.97342908438061, + "grad_norm": 0.9687952399253845, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 83180 + }, + { + "epoch": 5.974147217235188, + "grad_norm": 0.9401392340660095, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 83190 + }, + { + "epoch": 5.974865350089766, + "grad_norm": 1.0526834726333618, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 83200 + }, + { + "epoch": 5.975583482944344, + "grad_norm": 1.1882060766220093, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 83210 + }, + { + "epoch": 5.976301615798922, + "grad_norm": 0.9182824492454529, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 83220 + }, + { + "epoch": 5.977019748653501, + "grad_norm": 1.344875454902649, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 83230 + }, + { + "epoch": 5.977737881508079, + "grad_norm": 1.3868434429168701, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 83240 + }, + { + "epoch": 5.978456014362657, + "grad_norm": 1.2702280282974243, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 83250 + }, + { + "epoch": 5.979174147217235, + "grad_norm": 0.9808234572410583, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 83260 + }, + { + "epoch": 5.979892280071813, + "grad_norm": 0.9225142598152161, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 83270 + }, + { + "epoch": 5.980610412926391, + "grad_norm": 1.1095874309539795, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 83280 + }, + { + "epoch": 5.981328545780969, + "grad_norm": 1.2650344371795654, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 83290 + }, + { + "epoch": 5.982046678635547, + "grad_norm": 0.8230084180831909, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 83300 + }, + { + "epoch": 5.982764811490125, + "grad_norm": 1.171427607536316, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 83310 + }, + { + "epoch": 5.983482944344704, + "grad_norm": 0.7458868026733398, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 83320 + }, + { + "epoch": 5.984201077199282, + "grad_norm": 0.9238616228103638, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 83330 + }, + { + "epoch": 5.98491921005386, + "grad_norm": 1.027495265007019, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 83340 + }, + { + "epoch": 5.985637342908438, + "grad_norm": 1.0694037675857544, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 83350 + }, + { + "epoch": 5.986355475763016, + "grad_norm": 0.9498767256736755, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 83360 + }, + { + "epoch": 5.987073608617594, + "grad_norm": 1.0524284839630127, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 83370 + }, + { + "epoch": 5.987791741472172, + "grad_norm": 1.07961905002594, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 83380 + }, + { + "epoch": 5.98850987432675, + "grad_norm": 1.1436965465545654, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83390 + }, + { + "epoch": 5.989228007181328, + "grad_norm": 1.2610782384872437, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83400 + }, + { + "epoch": 5.989946140035906, + "grad_norm": 1.1105682849884033, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 83410 + }, + { + "epoch": 5.990664272890485, + "grad_norm": 0.9900349378585815, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 83420 + }, + { + "epoch": 5.991382405745063, + "grad_norm": 0.8766723275184631, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 83430 + }, + { + "epoch": 5.992100538599641, + "grad_norm": 0.9532597661018372, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 83440 + }, + { + "epoch": 5.992818671454219, + "grad_norm": 1.016831398010254, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 83450 + }, + { + "epoch": 5.993536804308797, + "grad_norm": 0.9884716272354126, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 83460 + }, + { + "epoch": 5.994254937163375, + "grad_norm": 0.9415417909622192, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83470 + }, + { + "epoch": 5.994973070017953, + "grad_norm": 0.8629752397537231, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 83480 + }, + { + "epoch": 5.995691202872531, + "grad_norm": 1.061378002166748, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 83490 + }, + { + "epoch": 5.99640933572711, + "grad_norm": 0.907195508480072, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 83500 + }, + { + "epoch": 5.997127468581688, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 83510 + }, + { + "epoch": 5.997845601436266, + "grad_norm": 0.9893278479576111, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 83520 + }, + { + "epoch": 5.998563734290844, + "grad_norm": 1.1909127235412598, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 83530 + }, + { + "epoch": 5.999281867145422, + "grad_norm": 1.1800892353057861, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 83540 + }, + { + "epoch": 6.0, + "grad_norm": 1.0822563171386719, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 83550 + }, + { + "epoch": 6.0, + "eval_loss": 1.1494214534759521, + "eval_runtime": 55.1809, + "eval_samples_per_second": 13.284, + "eval_steps_per_second": 1.667, + "step": 83550 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.8665069239730176e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-83550/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..45b40ffd41a8c10573f25c918ce2396c93257b9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:356f1dcd6cb46b62d1ed3019882afd20ccbdc013485aa0a39f64710180b3c0c7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9970b5dbc527eb04f5b22f1481ff68e356da182c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e40ba5168d6c7a8b447dc4fc3a0bd746c10b63b0b3af8ee655548e4e907d6f5b +size 55532922 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e5975a45b10398ee3c78e1c9e841ceba824a03e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d12c1150b684d10c5c43d580dbd05e328e739bd8bf52f5e011c35591648ff7 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48379ee5063c8ebbb605bf852f2c4b88738fc51e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14725cd94f79ec53b2649a0cc28f6130f6a515d7df9304bfc8a6c16a5e8801c2 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..71fb226df7276dc70760da6297e0f2a77b49d908 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/trainer_state.json @@ -0,0 +1,68318 @@ +{ + "best_metric": 1.0868422985076904, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 97475, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000718132854578097, + "grad_norm": 1.0291756391525269, + "learning_rate": 0.0002, + "loss": 1.5816, + "step": 10 + }, + { + "epoch": 0.001436265709156194, + "grad_norm": 0.6570823192596436, + "learning_rate": 0.0002, + "loss": 1.1527, + "step": 20 + }, + { + "epoch": 0.0021543985637342907, + "grad_norm": 0.693844199180603, + "learning_rate": 0.0002, + "loss": 1.0014, + "step": 30 + }, + { + "epoch": 0.002872531418312388, + "grad_norm": 0.5608532428741455, + "learning_rate": 0.0002, + "loss": 0.9377, + "step": 40 + }, + { + "epoch": 0.003590664272890485, + "grad_norm": 0.549075722694397, + "learning_rate": 0.0002, + "loss": 0.9533, + "step": 50 + }, + { + "epoch": 0.004308797127468581, + "grad_norm": 0.47189879417419434, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 60 + }, + { + "epoch": 0.005026929982046679, + "grad_norm": 0.5799676775932312, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 70 + }, + { + "epoch": 0.005745062836624776, + "grad_norm": 0.45907193422317505, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 80 + }, + { + "epoch": 0.006463195691202872, + "grad_norm": 0.4373045861721039, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 90 + }, + { + "epoch": 0.00718132854578097, + "grad_norm": 0.5636304020881653, + "learning_rate": 0.0002, + "loss": 0.8879, + "step": 100 + }, + { + "epoch": 0.007899461400359067, + "grad_norm": 0.5248253345489502, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 110 + }, + { + "epoch": 0.008617594254937163, + "grad_norm": 0.5082874298095703, + "learning_rate": 0.0002, + "loss": 0.9021, + "step": 120 + }, + { + "epoch": 0.00933572710951526, + "grad_norm": 0.42670881748199463, + "learning_rate": 0.0002, + "loss": 0.8678, + "step": 130 + }, + { + "epoch": 0.010053859964093357, + "grad_norm": 0.43311649560928345, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 140 + }, + { + "epoch": 0.010771992818671455, + "grad_norm": 0.43456509709358215, + "learning_rate": 0.0002, + "loss": 0.9252, + "step": 150 + }, + { + "epoch": 0.011490125673249552, + "grad_norm": 0.9222815632820129, + "learning_rate": 0.0002, + "loss": 0.8812, + "step": 160 + }, + { + "epoch": 0.012208258527827648, + "grad_norm": 0.42752256989479065, + "learning_rate": 0.0002, + "loss": 0.8651, + "step": 170 + }, + { + "epoch": 0.012926391382405745, + "grad_norm": 0.4175542891025543, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 180 + }, + { + "epoch": 0.013644524236983842, + "grad_norm": 0.4377831518650055, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 190 + }, + { + "epoch": 0.01436265709156194, + "grad_norm": 0.47263655066490173, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 200 + }, + { + "epoch": 0.015080789946140035, + "grad_norm": 0.3870520293712616, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 210 + }, + { + "epoch": 0.015798922800718134, + "grad_norm": 0.4950464963912964, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 220 + }, + { + "epoch": 0.01651705565529623, + "grad_norm": 0.4643295407295227, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 230 + }, + { + "epoch": 0.017235188509874325, + "grad_norm": 0.5152903199195862, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 240 + }, + { + "epoch": 0.017953321364452424, + "grad_norm": 0.3800727427005768, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 250 + }, + { + "epoch": 0.01867145421903052, + "grad_norm": 0.43700528144836426, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 260 + }, + { + "epoch": 0.01938958707360862, + "grad_norm": 0.3712887763977051, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 270 + }, + { + "epoch": 0.020107719928186715, + "grad_norm": 0.4202553629875183, + "learning_rate": 0.0002, + "loss": 0.8329, + "step": 280 + }, + { + "epoch": 0.02082585278276481, + "grad_norm": 0.40585094690322876, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 290 + }, + { + "epoch": 0.02154398563734291, + "grad_norm": 0.4685470759868622, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 300 + }, + { + "epoch": 0.022262118491921005, + "grad_norm": 0.373169481754303, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 310 + }, + { + "epoch": 0.022980251346499104, + "grad_norm": 0.39681482315063477, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 320 + }, + { + "epoch": 0.0236983842010772, + "grad_norm": 0.3919322192668915, + "learning_rate": 0.0002, + "loss": 0.8667, + "step": 330 + }, + { + "epoch": 0.024416517055655295, + "grad_norm": 0.4728981554508209, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 340 + }, + { + "epoch": 0.025134649910233394, + "grad_norm": 0.42439374327659607, + "learning_rate": 0.0002, + "loss": 0.8662, + "step": 350 + }, + { + "epoch": 0.02585278276481149, + "grad_norm": 0.425650030374527, + "learning_rate": 0.0002, + "loss": 0.8618, + "step": 360 + }, + { + "epoch": 0.02657091561938959, + "grad_norm": 0.4076762795448303, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 370 + }, + { + "epoch": 0.027289048473967684, + "grad_norm": 0.44335922598838806, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 380 + }, + { + "epoch": 0.02800718132854578, + "grad_norm": 0.5313619375228882, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 390 + }, + { + "epoch": 0.02872531418312388, + "grad_norm": 0.37089797854423523, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 400 + }, + { + "epoch": 0.029443447037701975, + "grad_norm": 0.5193604826927185, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 410 + }, + { + "epoch": 0.03016157989228007, + "grad_norm": 0.4428552985191345, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 420 + }, + { + "epoch": 0.03087971274685817, + "grad_norm": 0.384171724319458, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 430 + }, + { + "epoch": 0.03159784560143627, + "grad_norm": 0.3906913101673126, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 440 + }, + { + "epoch": 0.03231597845601436, + "grad_norm": 0.5365669131278992, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 450 + }, + { + "epoch": 0.03303411131059246, + "grad_norm": 0.4785287380218506, + "learning_rate": 0.0002, + "loss": 0.8376, + "step": 460 + }, + { + "epoch": 0.03375224416517056, + "grad_norm": 0.40048182010650635, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 470 + }, + { + "epoch": 0.03447037701974865, + "grad_norm": 0.49529239535331726, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 480 + }, + { + "epoch": 0.03518850987432675, + "grad_norm": 0.5853474140167236, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 490 + }, + { + "epoch": 0.03590664272890485, + "grad_norm": 0.3802863359451294, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 500 + }, + { + "epoch": 0.03662477558348295, + "grad_norm": 0.40374308824539185, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 510 + }, + { + "epoch": 0.03734290843806104, + "grad_norm": 0.4320009648799896, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 520 + }, + { + "epoch": 0.03806104129263914, + "grad_norm": 0.5198846459388733, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 530 + }, + { + "epoch": 0.03877917414721724, + "grad_norm": 0.4136947989463806, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 540 + }, + { + "epoch": 0.03949730700179533, + "grad_norm": 0.39344364404678345, + "learning_rate": 0.0002, + "loss": 0.8752, + "step": 550 + }, + { + "epoch": 0.04021543985637343, + "grad_norm": 0.4659644067287445, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 560 + }, + { + "epoch": 0.04093357271095153, + "grad_norm": 0.3898842930793762, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 570 + }, + { + "epoch": 0.04165170556552962, + "grad_norm": 0.3964841961860657, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 580 + }, + { + "epoch": 0.04236983842010772, + "grad_norm": 0.5172179341316223, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 590 + }, + { + "epoch": 0.04308797127468582, + "grad_norm": 0.5362544059753418, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 600 + }, + { + "epoch": 0.04380610412926391, + "grad_norm": 0.3975909948348999, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 610 + }, + { + "epoch": 0.04452423698384201, + "grad_norm": 0.3905031085014343, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 620 + }, + { + "epoch": 0.04524236983842011, + "grad_norm": 0.5148088932037354, + "learning_rate": 0.0002, + "loss": 0.7723, + "step": 630 + }, + { + "epoch": 0.04596050269299821, + "grad_norm": 0.38826194405555725, + "learning_rate": 0.0002, + "loss": 0.8309, + "step": 640 + }, + { + "epoch": 0.0466786355475763, + "grad_norm": 0.5432049036026001, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 650 + }, + { + "epoch": 0.0473967684021544, + "grad_norm": 0.42048221826553345, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 660 + }, + { + "epoch": 0.0481149012567325, + "grad_norm": 0.4683088958263397, + "learning_rate": 0.0002, + "loss": 0.8337, + "step": 670 + }, + { + "epoch": 0.04883303411131059, + "grad_norm": 0.4623735249042511, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 680 + }, + { + "epoch": 0.04955116696588869, + "grad_norm": 0.509128212928772, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 690 + }, + { + "epoch": 0.05026929982046679, + "grad_norm": 0.45767295360565186, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 700 + }, + { + "epoch": 0.05098743267504488, + "grad_norm": 0.4023726284503937, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 710 + }, + { + "epoch": 0.05170556552962298, + "grad_norm": 0.4407201409339905, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 720 + }, + { + "epoch": 0.05242369838420108, + "grad_norm": 0.41862091422080994, + "learning_rate": 0.0002, + "loss": 0.8369, + "step": 730 + }, + { + "epoch": 0.05314183123877918, + "grad_norm": 0.37473055720329285, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 740 + }, + { + "epoch": 0.05385996409335727, + "grad_norm": 0.4882921576499939, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 750 + }, + { + "epoch": 0.05457809694793537, + "grad_norm": 0.47890132665634155, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 760 + }, + { + "epoch": 0.05529622980251347, + "grad_norm": 0.5811166167259216, + "learning_rate": 0.0002, + "loss": 0.9222, + "step": 770 + }, + { + "epoch": 0.05601436265709156, + "grad_norm": 0.41113588213920593, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 780 + }, + { + "epoch": 0.05673249551166966, + "grad_norm": 0.4120602607727051, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 790 + }, + { + "epoch": 0.05745062836624776, + "grad_norm": 0.39287394285202026, + "learning_rate": 0.0002, + "loss": 0.9038, + "step": 800 + }, + { + "epoch": 0.05816876122082585, + "grad_norm": 0.3986941874027252, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 810 + }, + { + "epoch": 0.05888689407540395, + "grad_norm": 0.4264012575149536, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 820 + }, + { + "epoch": 0.05960502692998205, + "grad_norm": 0.481139600276947, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 830 + }, + { + "epoch": 0.06032315978456014, + "grad_norm": 0.5561784505844116, + "learning_rate": 0.0002, + "loss": 0.8477, + "step": 840 + }, + { + "epoch": 0.06104129263913824, + "grad_norm": 0.4787197411060333, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 850 + }, + { + "epoch": 0.06175942549371634, + "grad_norm": 0.46454647183418274, + "learning_rate": 0.0002, + "loss": 0.8567, + "step": 860 + }, + { + "epoch": 0.06247755834829444, + "grad_norm": 0.5929669141769409, + "learning_rate": 0.0002, + "loss": 0.8429, + "step": 870 + }, + { + "epoch": 0.06319569120287254, + "grad_norm": 0.4561384618282318, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 880 + }, + { + "epoch": 0.06391382405745062, + "grad_norm": 0.45767998695373535, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 890 + }, + { + "epoch": 0.06463195691202872, + "grad_norm": 0.42475444078445435, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 900 + }, + { + "epoch": 0.06535008976660682, + "grad_norm": 0.4911022484302521, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 910 + }, + { + "epoch": 0.06606822262118492, + "grad_norm": 0.5229166746139526, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 920 + }, + { + "epoch": 0.06678635547576302, + "grad_norm": 0.38134580850601196, + "learning_rate": 0.0002, + "loss": 0.8563, + "step": 930 + }, + { + "epoch": 0.06750448833034112, + "grad_norm": 0.4171486496925354, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 940 + }, + { + "epoch": 0.06822262118491922, + "grad_norm": 0.45171529054641724, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 950 + }, + { + "epoch": 0.0689407540394973, + "grad_norm": 0.44889307022094727, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 960 + }, + { + "epoch": 0.0696588868940754, + "grad_norm": 0.44902464747428894, + "learning_rate": 0.0002, + "loss": 0.8149, + "step": 970 + }, + { + "epoch": 0.0703770197486535, + "grad_norm": 0.4671969413757324, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 980 + }, + { + "epoch": 0.0710951526032316, + "grad_norm": 0.4686984717845917, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 990 + }, + { + "epoch": 0.0718132854578097, + "grad_norm": 0.4513658583164215, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1000 + }, + { + "epoch": 0.0725314183123878, + "grad_norm": 0.48861828446388245, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1010 + }, + { + "epoch": 0.0732495511669659, + "grad_norm": 0.7603165507316589, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 1020 + }, + { + "epoch": 0.07396768402154398, + "grad_norm": 0.501654863357544, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 1030 + }, + { + "epoch": 0.07468581687612208, + "grad_norm": 0.45291560888290405, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 1040 + }, + { + "epoch": 0.07540394973070018, + "grad_norm": 0.42454713582992554, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 1050 + }, + { + "epoch": 0.07612208258527828, + "grad_norm": 0.4655592441558838, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1060 + }, + { + "epoch": 0.07684021543985638, + "grad_norm": 0.5011071562767029, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 1070 + }, + { + "epoch": 0.07755834829443448, + "grad_norm": 0.37221577763557434, + "learning_rate": 0.0002, + "loss": 0.8502, + "step": 1080 + }, + { + "epoch": 0.07827648114901256, + "grad_norm": 0.5123572945594788, + "learning_rate": 0.0002, + "loss": 0.8623, + "step": 1090 + }, + { + "epoch": 0.07899461400359066, + "grad_norm": 0.44138720631599426, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1100 + }, + { + "epoch": 0.07971274685816876, + "grad_norm": 0.38932886719703674, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 1110 + }, + { + "epoch": 0.08043087971274686, + "grad_norm": 0.435820072889328, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 1120 + }, + { + "epoch": 0.08114901256732496, + "grad_norm": 0.3820142149925232, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 1130 + }, + { + "epoch": 0.08186714542190306, + "grad_norm": 0.39680808782577515, + "learning_rate": 0.0002, + "loss": 0.8617, + "step": 1140 + }, + { + "epoch": 0.08258527827648116, + "grad_norm": 0.4833722412586212, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1150 + }, + { + "epoch": 0.08330341113105924, + "grad_norm": 0.5045956969261169, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 1160 + }, + { + "epoch": 0.08402154398563734, + "grad_norm": 0.3652207553386688, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 1170 + }, + { + "epoch": 0.08473967684021544, + "grad_norm": 0.44447052478790283, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 1180 + }, + { + "epoch": 0.08545780969479354, + "grad_norm": 0.44942694902420044, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 1190 + }, + { + "epoch": 0.08617594254937164, + "grad_norm": 0.48789075016975403, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1200 + }, + { + "epoch": 0.08689407540394974, + "grad_norm": 0.3981451094150543, + "learning_rate": 0.0002, + "loss": 0.8008, + "step": 1210 + }, + { + "epoch": 0.08761220825852782, + "grad_norm": 0.45545220375061035, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 1220 + }, + { + "epoch": 0.08833034111310592, + "grad_norm": 0.562138557434082, + "learning_rate": 0.0002, + "loss": 0.8406, + "step": 1230 + }, + { + "epoch": 0.08904847396768402, + "grad_norm": 0.48523494601249695, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 1240 + }, + { + "epoch": 0.08976660682226212, + "grad_norm": 0.35054388642311096, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 1250 + }, + { + "epoch": 0.09048473967684022, + "grad_norm": 0.4148605167865753, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 1260 + }, + { + "epoch": 0.09120287253141832, + "grad_norm": 0.50171959400177, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 1270 + }, + { + "epoch": 0.09192100538599642, + "grad_norm": 0.41747573018074036, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 1280 + }, + { + "epoch": 0.0926391382405745, + "grad_norm": 0.43028751015663147, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1290 + }, + { + "epoch": 0.0933572710951526, + "grad_norm": 0.41274991631507874, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 1300 + }, + { + "epoch": 0.0940754039497307, + "grad_norm": 0.5399569272994995, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 1310 + }, + { + "epoch": 0.0947935368043088, + "grad_norm": 0.44284379482269287, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 1320 + }, + { + "epoch": 0.0955116696588869, + "grad_norm": 0.42511969804763794, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1330 + }, + { + "epoch": 0.096229802513465, + "grad_norm": 0.5717929005622864, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1340 + }, + { + "epoch": 0.09694793536804308, + "grad_norm": 0.4104631245136261, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1350 + }, + { + "epoch": 0.09766606822262118, + "grad_norm": 0.4144339859485626, + "learning_rate": 0.0002, + "loss": 0.8697, + "step": 1360 + }, + { + "epoch": 0.09838420107719928, + "grad_norm": 0.43676936626434326, + "learning_rate": 0.0002, + "loss": 0.7909, + "step": 1370 + }, + { + "epoch": 0.09910233393177738, + "grad_norm": 0.5297161340713501, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 1380 + }, + { + "epoch": 0.09982046678635548, + "grad_norm": 0.5319193601608276, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1390 + }, + { + "epoch": 0.10053859964093358, + "grad_norm": 0.4083728492259979, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1400 + }, + { + "epoch": 0.10125673249551168, + "grad_norm": 0.4193868339061737, + "learning_rate": 0.0002, + "loss": 0.8436, + "step": 1410 + }, + { + "epoch": 0.10197486535008976, + "grad_norm": 0.4062198996543884, + "learning_rate": 0.0002, + "loss": 0.8634, + "step": 1420 + }, + { + "epoch": 0.10269299820466786, + "grad_norm": 0.43972232937812805, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1430 + }, + { + "epoch": 0.10341113105924596, + "grad_norm": 0.4598410725593567, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1440 + }, + { + "epoch": 0.10412926391382406, + "grad_norm": 0.571662187576294, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 1450 + }, + { + "epoch": 0.10484739676840216, + "grad_norm": 0.5437791347503662, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1460 + }, + { + "epoch": 0.10556552962298026, + "grad_norm": 0.4241923391819, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1470 + }, + { + "epoch": 0.10628366247755835, + "grad_norm": 0.5185145735740662, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1480 + }, + { + "epoch": 0.10700179533213644, + "grad_norm": 0.537626326084137, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 1490 + }, + { + "epoch": 0.10771992818671454, + "grad_norm": 0.4573661983013153, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 1500 + }, + { + "epoch": 0.10843806104129264, + "grad_norm": 0.4521017074584961, + "learning_rate": 0.0002, + "loss": 0.8318, + "step": 1510 + }, + { + "epoch": 0.10915619389587074, + "grad_norm": 0.6835159063339233, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1520 + }, + { + "epoch": 0.10987432675044884, + "grad_norm": 0.43522894382476807, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 1530 + }, + { + "epoch": 0.11059245960502694, + "grad_norm": 0.685547411441803, + "learning_rate": 0.0002, + "loss": 0.8211, + "step": 1540 + }, + { + "epoch": 0.11131059245960502, + "grad_norm": 0.5283669233322144, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 1550 + }, + { + "epoch": 0.11202872531418312, + "grad_norm": 0.4869283437728882, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 1560 + }, + { + "epoch": 0.11274685816876122, + "grad_norm": 0.43024054169654846, + "learning_rate": 0.0002, + "loss": 0.8614, + "step": 1570 + }, + { + "epoch": 0.11346499102333932, + "grad_norm": 0.46726059913635254, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1580 + }, + { + "epoch": 0.11418312387791742, + "grad_norm": 0.5046039819717407, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 1590 + }, + { + "epoch": 0.11490125673249552, + "grad_norm": 0.48972827196121216, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 1600 + }, + { + "epoch": 0.11561938958707361, + "grad_norm": 0.5221049189567566, + "learning_rate": 0.0002, + "loss": 0.8114, + "step": 1610 + }, + { + "epoch": 0.1163375224416517, + "grad_norm": 0.49169477820396423, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 1620 + }, + { + "epoch": 0.1170556552962298, + "grad_norm": 0.48462188243865967, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 1630 + }, + { + "epoch": 0.1177737881508079, + "grad_norm": 0.9001021981239319, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 1640 + }, + { + "epoch": 0.118491921005386, + "grad_norm": 0.47555917501449585, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 1650 + }, + { + "epoch": 0.1192100538599641, + "grad_norm": 0.4523521959781647, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1660 + }, + { + "epoch": 0.1199281867145422, + "grad_norm": 0.510956346988678, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 1670 + }, + { + "epoch": 0.12064631956912028, + "grad_norm": 0.48063746094703674, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 0.12136445242369838, + "grad_norm": 0.5209490060806274, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 1690 + }, + { + "epoch": 0.12208258527827648, + "grad_norm": 0.5488983988761902, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1700 + }, + { + "epoch": 0.12280071813285458, + "grad_norm": 0.5263523459434509, + "learning_rate": 0.0002, + "loss": 0.829, + "step": 1710 + }, + { + "epoch": 0.12351885098743268, + "grad_norm": 0.45365768671035767, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 1720 + }, + { + "epoch": 0.12423698384201078, + "grad_norm": 0.4366922378540039, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 1730 + }, + { + "epoch": 0.12495511669658887, + "grad_norm": 0.4841083884239197, + "learning_rate": 0.0002, + "loss": 0.8261, + "step": 1740 + }, + { + "epoch": 0.12567324955116696, + "grad_norm": 0.46546968817710876, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 1750 + }, + { + "epoch": 0.12639138240574507, + "grad_norm": 0.39987099170684814, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 1760 + }, + { + "epoch": 0.12710951526032316, + "grad_norm": 0.4661678969860077, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 1770 + }, + { + "epoch": 0.12782764811490124, + "grad_norm": 0.46716657280921936, + "learning_rate": 0.0002, + "loss": 0.8516, + "step": 1780 + }, + { + "epoch": 0.12854578096947936, + "grad_norm": 0.46164995431900024, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 1790 + }, + { + "epoch": 0.12926391382405744, + "grad_norm": 0.4910370111465454, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 1800 + }, + { + "epoch": 0.12998204667863555, + "grad_norm": 0.5615737438201904, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 1810 + }, + { + "epoch": 0.13070017953321364, + "grad_norm": 0.5739728808403015, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1820 + }, + { + "epoch": 0.13141831238779175, + "grad_norm": 0.44104722142219543, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 1830 + }, + { + "epoch": 0.13213644524236984, + "grad_norm": 0.46373724937438965, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 1840 + }, + { + "epoch": 0.13285457809694792, + "grad_norm": 0.4481196403503418, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 1850 + }, + { + "epoch": 0.13357271095152604, + "grad_norm": 0.5689327716827393, + "learning_rate": 0.0002, + "loss": 0.8508, + "step": 1860 + }, + { + "epoch": 0.13429084380610412, + "grad_norm": 0.5334849953651428, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 1870 + }, + { + "epoch": 0.13500897666068223, + "grad_norm": 0.5177253484725952, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 1880 + }, + { + "epoch": 0.13572710951526032, + "grad_norm": 0.4919368326663971, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 1890 + }, + { + "epoch": 0.13644524236983843, + "grad_norm": 0.5987576842308044, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 1900 + }, + { + "epoch": 0.13716337522441652, + "grad_norm": 0.49790486693382263, + "learning_rate": 0.0002, + "loss": 0.8546, + "step": 1910 + }, + { + "epoch": 0.1378815080789946, + "grad_norm": 0.5337542295455933, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 1920 + }, + { + "epoch": 0.13859964093357272, + "grad_norm": 0.5171598792076111, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 1930 + }, + { + "epoch": 0.1393177737881508, + "grad_norm": 0.5003953576087952, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1940 + }, + { + "epoch": 0.1400359066427289, + "grad_norm": 0.5147887468338013, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 1950 + }, + { + "epoch": 0.140754039497307, + "grad_norm": 0.6365984678268433, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 1960 + }, + { + "epoch": 0.1414721723518851, + "grad_norm": 0.5449512004852295, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 1970 + }, + { + "epoch": 0.1421903052064632, + "grad_norm": 0.4062703847885132, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1980 + }, + { + "epoch": 0.14290843806104128, + "grad_norm": 0.4446912705898285, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 1990 + }, + { + "epoch": 0.1436265709156194, + "grad_norm": 0.49001234769821167, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 2000 + }, + { + "epoch": 0.14434470377019748, + "grad_norm": 0.5591765642166138, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 2010 + }, + { + "epoch": 0.1450628366247756, + "grad_norm": 0.6476696133613586, + "learning_rate": 0.0002, + "loss": 0.7808, + "step": 2020 + }, + { + "epoch": 0.14578096947935368, + "grad_norm": 0.44688376784324646, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 2030 + }, + { + "epoch": 0.1464991023339318, + "grad_norm": 0.4437490701675415, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 2040 + }, + { + "epoch": 0.14721723518850988, + "grad_norm": 0.59927898645401, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 2050 + }, + { + "epoch": 0.14793536804308796, + "grad_norm": 0.4356591999530792, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 2060 + }, + { + "epoch": 0.14865350089766607, + "grad_norm": 0.5560822486877441, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2070 + }, + { + "epoch": 0.14937163375224416, + "grad_norm": 0.43027108907699585, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 2080 + }, + { + "epoch": 0.15008976660682227, + "grad_norm": 0.41215455532073975, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 2090 + }, + { + "epoch": 0.15080789946140036, + "grad_norm": 0.4607839584350586, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 2100 + }, + { + "epoch": 0.15152603231597844, + "grad_norm": 0.4699854254722595, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 2110 + }, + { + "epoch": 0.15224416517055656, + "grad_norm": 0.5111975073814392, + "learning_rate": 0.0002, + "loss": 0.8464, + "step": 2120 + }, + { + "epoch": 0.15296229802513464, + "grad_norm": 0.4713742733001709, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 2130 + }, + { + "epoch": 0.15368043087971275, + "grad_norm": 0.3816622793674469, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 2140 + }, + { + "epoch": 0.15439856373429084, + "grad_norm": 0.4637526273727417, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 2150 + }, + { + "epoch": 0.15511669658886895, + "grad_norm": 0.3691818118095398, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 2160 + }, + { + "epoch": 0.15583482944344704, + "grad_norm": 0.4435218274593353, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 2170 + }, + { + "epoch": 0.15655296229802512, + "grad_norm": 0.5282211899757385, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 2180 + }, + { + "epoch": 0.15727109515260324, + "grad_norm": 0.7611056566238403, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 2190 + }, + { + "epoch": 0.15798922800718132, + "grad_norm": 0.5951169729232788, + "learning_rate": 0.0002, + "loss": 0.8039, + "step": 2200 + }, + { + "epoch": 0.15870736086175943, + "grad_norm": 0.5243265628814697, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 2210 + }, + { + "epoch": 0.15942549371633752, + "grad_norm": 0.518944501876831, + "learning_rate": 0.0002, + "loss": 0.7817, + "step": 2220 + }, + { + "epoch": 0.16014362657091563, + "grad_norm": 0.4264616072177887, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2230 + }, + { + "epoch": 0.16086175942549372, + "grad_norm": 0.4619045853614807, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 2240 + }, + { + "epoch": 0.1615798922800718, + "grad_norm": 0.4047030508518219, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 2250 + }, + { + "epoch": 0.16229802513464991, + "grad_norm": 0.47133687138557434, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 2260 + }, + { + "epoch": 0.163016157989228, + "grad_norm": 0.4990246593952179, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 2270 + }, + { + "epoch": 0.1637342908438061, + "grad_norm": 0.5145298838615417, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 2280 + }, + { + "epoch": 0.1644524236983842, + "grad_norm": 0.5354352593421936, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 2290 + }, + { + "epoch": 0.1651705565529623, + "grad_norm": 0.47621065378189087, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 2300 + }, + { + "epoch": 0.1658886894075404, + "grad_norm": 0.45333582162857056, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 2310 + }, + { + "epoch": 0.16660682226211848, + "grad_norm": 0.4832790493965149, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 2320 + }, + { + "epoch": 0.1673249551166966, + "grad_norm": 0.4922761619091034, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 2330 + }, + { + "epoch": 0.16804308797127468, + "grad_norm": 0.5701655149459839, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 2340 + }, + { + "epoch": 0.1687612208258528, + "grad_norm": 0.5170459151268005, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 2350 + }, + { + "epoch": 0.16947935368043088, + "grad_norm": 0.6562373638153076, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 2360 + }, + { + "epoch": 0.170197486535009, + "grad_norm": 0.5350262522697449, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 2370 + }, + { + "epoch": 0.17091561938958708, + "grad_norm": 0.5163491368293762, + "learning_rate": 0.0002, + "loss": 0.8501, + "step": 2380 + }, + { + "epoch": 0.17163375224416516, + "grad_norm": 0.48841530084609985, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2390 + }, + { + "epoch": 0.17235188509874327, + "grad_norm": 0.44912993907928467, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 2400 + }, + { + "epoch": 0.17307001795332136, + "grad_norm": 0.5770647525787354, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 2410 + }, + { + "epoch": 0.17378815080789947, + "grad_norm": 0.4716179072856903, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 2420 + }, + { + "epoch": 0.17450628366247756, + "grad_norm": 0.5465078949928284, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 2430 + }, + { + "epoch": 0.17522441651705564, + "grad_norm": 0.40810713171958923, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 2440 + }, + { + "epoch": 0.17594254937163376, + "grad_norm": 0.3789578080177307, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 2450 + }, + { + "epoch": 0.17666068222621184, + "grad_norm": 0.4615110158920288, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 2460 + }, + { + "epoch": 0.17737881508078995, + "grad_norm": 0.4400235712528229, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2470 + }, + { + "epoch": 0.17809694793536804, + "grad_norm": 0.5935020446777344, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 2480 + }, + { + "epoch": 0.17881508078994615, + "grad_norm": 0.5672990679740906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 2490 + }, + { + "epoch": 0.17953321364452424, + "grad_norm": 0.4132838845252991, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 2500 + }, + { + "epoch": 0.18025134649910232, + "grad_norm": 0.5373716950416565, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 2510 + }, + { + "epoch": 0.18096947935368043, + "grad_norm": 0.5335832834243774, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 2520 + }, + { + "epoch": 0.18168761220825852, + "grad_norm": 0.5705642700195312, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.18240574506283663, + "grad_norm": 0.4807959496974945, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 2540 + }, + { + "epoch": 0.18312387791741472, + "grad_norm": 0.4430573880672455, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 2550 + }, + { + "epoch": 0.18384201077199283, + "grad_norm": 0.5294728875160217, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 2560 + }, + { + "epoch": 0.18456014362657092, + "grad_norm": 0.661173403263092, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 2570 + }, + { + "epoch": 0.185278276481149, + "grad_norm": 0.5044304728507996, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 2580 + }, + { + "epoch": 0.18599640933572711, + "grad_norm": 0.48929551243782043, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 2590 + }, + { + "epoch": 0.1867145421903052, + "grad_norm": 0.5054438710212708, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 2600 + }, + { + "epoch": 0.1874326750448833, + "grad_norm": 0.5613677501678467, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 2610 + }, + { + "epoch": 0.1881508078994614, + "grad_norm": 0.5762478709220886, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 2620 + }, + { + "epoch": 0.1888689407540395, + "grad_norm": 0.4523695409297943, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 2630 + }, + { + "epoch": 0.1895870736086176, + "grad_norm": 0.5235317945480347, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 2640 + }, + { + "epoch": 0.19030520646319568, + "grad_norm": 0.4894576370716095, + "learning_rate": 0.0002, + "loss": 0.8281, + "step": 2650 + }, + { + "epoch": 0.1910233393177738, + "grad_norm": 0.45731106400489807, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 2660 + }, + { + "epoch": 0.19174147217235188, + "grad_norm": 0.4726541042327881, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 2670 + }, + { + "epoch": 0.19245960502693, + "grad_norm": 0.4281631410121918, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 2680 + }, + { + "epoch": 0.19317773788150808, + "grad_norm": 0.48011314868927, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 2690 + }, + { + "epoch": 0.19389587073608616, + "grad_norm": 0.45785006880760193, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2700 + }, + { + "epoch": 0.19461400359066428, + "grad_norm": 0.5244625210762024, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 2710 + }, + { + "epoch": 0.19533213644524236, + "grad_norm": 0.4674883186817169, + "learning_rate": 0.0002, + "loss": 0.8674, + "step": 2720 + }, + { + "epoch": 0.19605026929982047, + "grad_norm": 0.5969558358192444, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 2730 + }, + { + "epoch": 0.19676840215439856, + "grad_norm": 0.44413265585899353, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 2740 + }, + { + "epoch": 0.19748653500897667, + "grad_norm": 0.5094553828239441, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2750 + }, + { + "epoch": 0.19820466786355476, + "grad_norm": 0.4931736886501312, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2760 + }, + { + "epoch": 0.19892280071813284, + "grad_norm": 0.4766625463962555, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 2770 + }, + { + "epoch": 0.19964093357271095, + "grad_norm": 0.4196971654891968, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 2780 + }, + { + "epoch": 0.20035906642728904, + "grad_norm": 0.4693375825881958, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 2790 + }, + { + "epoch": 0.20107719928186715, + "grad_norm": 0.5407108664512634, + "learning_rate": 0.0002, + "loss": 0.8336, + "step": 2800 + }, + { + "epoch": 0.20179533213644524, + "grad_norm": 0.42864227294921875, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 2810 + }, + { + "epoch": 0.20251346499102335, + "grad_norm": 0.4928833246231079, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 2820 + }, + { + "epoch": 0.20323159784560144, + "grad_norm": 0.5575131773948669, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2830 + }, + { + "epoch": 0.20394973070017952, + "grad_norm": 0.505114734172821, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 2840 + }, + { + "epoch": 0.20466786355475763, + "grad_norm": 0.4727420210838318, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 2850 + }, + { + "epoch": 0.20538599640933572, + "grad_norm": 0.48218145966529846, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 2860 + }, + { + "epoch": 0.20610412926391383, + "grad_norm": 0.5196906328201294, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2870 + }, + { + "epoch": 0.20682226211849192, + "grad_norm": 0.4927639067173004, + "learning_rate": 0.0002, + "loss": 0.8401, + "step": 2880 + }, + { + "epoch": 0.20754039497307003, + "grad_norm": 0.5076990127563477, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 2890 + }, + { + "epoch": 0.20825852782764812, + "grad_norm": 0.4606800079345703, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 2900 + }, + { + "epoch": 0.2089766606822262, + "grad_norm": 0.6184319257736206, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2910 + }, + { + "epoch": 0.2096947935368043, + "grad_norm": 0.5237935781478882, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 2920 + }, + { + "epoch": 0.2104129263913824, + "grad_norm": 0.43966251611709595, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 2930 + }, + { + "epoch": 0.2111310592459605, + "grad_norm": 0.48786666989326477, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2940 + }, + { + "epoch": 0.2118491921005386, + "grad_norm": 0.4397817552089691, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 2950 + }, + { + "epoch": 0.2125673249551167, + "grad_norm": 0.5155336260795593, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 2960 + }, + { + "epoch": 0.2132854578096948, + "grad_norm": 0.48058274388313293, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 2970 + }, + { + "epoch": 0.21400359066427288, + "grad_norm": 0.5022647976875305, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2980 + }, + { + "epoch": 0.214721723518851, + "grad_norm": 0.5417225360870361, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 2990 + }, + { + "epoch": 0.21543985637342908, + "grad_norm": 0.46300315856933594, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 3000 + }, + { + "epoch": 0.2161579892280072, + "grad_norm": 0.5375089049339294, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 3010 + }, + { + "epoch": 0.21687612208258528, + "grad_norm": 0.5050022602081299, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 3020 + }, + { + "epoch": 0.21759425493716336, + "grad_norm": 0.46347716450691223, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 3030 + }, + { + "epoch": 0.21831238779174147, + "grad_norm": 0.544874370098114, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 3040 + }, + { + "epoch": 0.21903052064631956, + "grad_norm": 0.4268142580986023, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 3050 + }, + { + "epoch": 0.21974865350089767, + "grad_norm": 0.5527157187461853, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 3060 + }, + { + "epoch": 0.22046678635547576, + "grad_norm": 0.5565235018730164, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 3070 + }, + { + "epoch": 0.22118491921005387, + "grad_norm": 0.4900645613670349, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 3080 + }, + { + "epoch": 0.22190305206463196, + "grad_norm": 0.4951242208480835, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 3090 + }, + { + "epoch": 0.22262118491921004, + "grad_norm": 0.5831719636917114, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 3100 + }, + { + "epoch": 0.22333931777378815, + "grad_norm": 0.417576402425766, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 3110 + }, + { + "epoch": 0.22405745062836624, + "grad_norm": 0.4715117812156677, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 3120 + }, + { + "epoch": 0.22477558348294435, + "grad_norm": 0.5956445336341858, + "learning_rate": 0.0002, + "loss": 0.778, + "step": 3130 + }, + { + "epoch": 0.22549371633752244, + "grad_norm": 0.408184289932251, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 3140 + }, + { + "epoch": 0.22621184919210055, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 3150 + }, + { + "epoch": 0.22692998204667864, + "grad_norm": 0.5631294846534729, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3160 + }, + { + "epoch": 0.22764811490125672, + "grad_norm": 0.5054665803909302, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3170 + }, + { + "epoch": 0.22836624775583483, + "grad_norm": 0.47388020157814026, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 3180 + }, + { + "epoch": 0.22908438061041292, + "grad_norm": 0.45871609449386597, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 3190 + }, + { + "epoch": 0.22980251346499103, + "grad_norm": 0.42431211471557617, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 3200 + }, + { + "epoch": 0.23052064631956912, + "grad_norm": 0.584872305393219, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3210 + }, + { + "epoch": 0.23123877917414723, + "grad_norm": 0.5489653944969177, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 3220 + }, + { + "epoch": 0.23195691202872532, + "grad_norm": 0.5803213119506836, + "learning_rate": 0.0002, + "loss": 0.8552, + "step": 3230 + }, + { + "epoch": 0.2326750448833034, + "grad_norm": 0.906505823135376, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3240 + }, + { + "epoch": 0.2333931777378815, + "grad_norm": 0.4569525718688965, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 3250 + }, + { + "epoch": 0.2341113105924596, + "grad_norm": 0.5566741228103638, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3260 + }, + { + "epoch": 0.2348294434470377, + "grad_norm": 0.5059959888458252, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 3270 + }, + { + "epoch": 0.2355475763016158, + "grad_norm": 0.530828058719635, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 3280 + }, + { + "epoch": 0.2362657091561939, + "grad_norm": 0.5149409174919128, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 3290 + }, + { + "epoch": 0.236983842010772, + "grad_norm": 0.7323763966560364, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3300 + }, + { + "epoch": 0.23770197486535008, + "grad_norm": 0.6794836521148682, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 3310 + }, + { + "epoch": 0.2384201077199282, + "grad_norm": 0.5176534056663513, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 3320 + }, + { + "epoch": 0.23913824057450628, + "grad_norm": 0.42245906591415405, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 3330 + }, + { + "epoch": 0.2398563734290844, + "grad_norm": 0.43535107374191284, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 3340 + }, + { + "epoch": 0.24057450628366248, + "grad_norm": 0.7038307785987854, + "learning_rate": 0.0002, + "loss": 0.825, + "step": 3350 + }, + { + "epoch": 0.24129263913824056, + "grad_norm": 0.5689977407455444, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 3360 + }, + { + "epoch": 0.24201077199281867, + "grad_norm": 0.538136899471283, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 3370 + }, + { + "epoch": 0.24272890484739676, + "grad_norm": 0.7433661222457886, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 3380 + }, + { + "epoch": 0.24344703770197487, + "grad_norm": 0.6996734738349915, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3390 + }, + { + "epoch": 0.24416517055655296, + "grad_norm": 0.5055703520774841, + "learning_rate": 0.0002, + "loss": 0.8288, + "step": 3400 + }, + { + "epoch": 0.24488330341113107, + "grad_norm": 0.5218513607978821, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 3410 + }, + { + "epoch": 0.24560143626570916, + "grad_norm": 0.42782822251319885, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3420 + }, + { + "epoch": 0.24631956912028724, + "grad_norm": 0.4991157650947571, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 3430 + }, + { + "epoch": 0.24703770197486535, + "grad_norm": 0.5063165426254272, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3440 + }, + { + "epoch": 0.24775583482944344, + "grad_norm": 0.45863136649131775, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 3450 + }, + { + "epoch": 0.24847396768402155, + "grad_norm": 0.474728524684906, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3460 + }, + { + "epoch": 0.24919210053859964, + "grad_norm": 0.522570013999939, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 3470 + }, + { + "epoch": 0.24991023339317775, + "grad_norm": 0.5474396347999573, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 3480 + }, + { + "epoch": 0.2506283662477558, + "grad_norm": 0.49094662070274353, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3490 + }, + { + "epoch": 0.2513464991023339, + "grad_norm": 0.6399132609367371, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 3500 + }, + { + "epoch": 0.25206463195691203, + "grad_norm": 0.5910066366195679, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 3510 + }, + { + "epoch": 0.25278276481149015, + "grad_norm": 0.4761259853839874, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3520 + }, + { + "epoch": 0.2535008976660682, + "grad_norm": 0.5124502182006836, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 3530 + }, + { + "epoch": 0.2542190305206463, + "grad_norm": 0.4329150915145874, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3540 + }, + { + "epoch": 0.25493716337522443, + "grad_norm": 0.4839608371257782, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 3550 + }, + { + "epoch": 0.2556552962298025, + "grad_norm": 0.5413459539413452, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3560 + }, + { + "epoch": 0.2563734290843806, + "grad_norm": 0.5761468410491943, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 3570 + }, + { + "epoch": 0.2570915619389587, + "grad_norm": 0.49266132712364197, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3580 + }, + { + "epoch": 0.2578096947935368, + "grad_norm": 0.7377930879592896, + "learning_rate": 0.0002, + "loss": 0.7946, + "step": 3590 + }, + { + "epoch": 0.2585278276481149, + "grad_norm": 0.543541431427002, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3600 + }, + { + "epoch": 0.259245960502693, + "grad_norm": 0.48385897278785706, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3610 + }, + { + "epoch": 0.2599640933572711, + "grad_norm": 0.5152639746665955, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3620 + }, + { + "epoch": 0.26068222621184917, + "grad_norm": 0.5601988434791565, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 3630 + }, + { + "epoch": 0.2614003590664273, + "grad_norm": 0.4349626302719116, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 3640 + }, + { + "epoch": 0.2621184919210054, + "grad_norm": 0.5487161874771118, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3650 + }, + { + "epoch": 0.2628366247755835, + "grad_norm": 0.45603805780410767, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 3660 + }, + { + "epoch": 0.26355475763016156, + "grad_norm": 0.5012730956077576, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 3670 + }, + { + "epoch": 0.2642728904847397, + "grad_norm": 0.4523845314979553, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 3680 + }, + { + "epoch": 0.2649910233393178, + "grad_norm": 0.5756664872169495, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 3690 + }, + { + "epoch": 0.26570915619389585, + "grad_norm": 0.48467493057250977, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 3700 + }, + { + "epoch": 0.26642728904847396, + "grad_norm": 0.4860585927963257, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 3710 + }, + { + "epoch": 0.26714542190305207, + "grad_norm": 0.5067077875137329, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3720 + }, + { + "epoch": 0.2678635547576302, + "grad_norm": 0.5490895509719849, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3730 + }, + { + "epoch": 0.26858168761220824, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 3740 + }, + { + "epoch": 0.26929982046678635, + "grad_norm": 0.5026951432228088, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 3750 + }, + { + "epoch": 0.27001795332136447, + "grad_norm": 0.49474090337753296, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3760 + }, + { + "epoch": 0.2707360861759425, + "grad_norm": 0.6381985545158386, + "learning_rate": 0.0002, + "loss": 0.8235, + "step": 3770 + }, + { + "epoch": 0.27145421903052064, + "grad_norm": 0.4784011244773865, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 3780 + }, + { + "epoch": 0.27217235188509875, + "grad_norm": 0.5126543045043945, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 3790 + }, + { + "epoch": 0.27289048473967686, + "grad_norm": 0.5428652763366699, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 3800 + }, + { + "epoch": 0.2736086175942549, + "grad_norm": 0.5427033305168152, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 3810 + }, + { + "epoch": 0.27432675044883303, + "grad_norm": 0.46467480063438416, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 3820 + }, + { + "epoch": 0.27504488330341115, + "grad_norm": 0.494367390871048, + "learning_rate": 0.0002, + "loss": 0.8414, + "step": 3830 + }, + { + "epoch": 0.2757630161579892, + "grad_norm": 0.59856778383255, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3840 + }, + { + "epoch": 0.2764811490125673, + "grad_norm": 0.422128826379776, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 3850 + }, + { + "epoch": 0.27719928186714543, + "grad_norm": 0.5757306814193726, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 3860 + }, + { + "epoch": 0.27791741472172354, + "grad_norm": 0.5850930213928223, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.2786355475763016, + "grad_norm": 0.5633023977279663, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3880 + }, + { + "epoch": 0.2793536804308797, + "grad_norm": 0.5037940144538879, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 3890 + }, + { + "epoch": 0.2800718132854578, + "grad_norm": 0.5255506038665771, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 3900 + }, + { + "epoch": 0.2807899461400359, + "grad_norm": 0.44584617018699646, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 3910 + }, + { + "epoch": 0.281508078994614, + "grad_norm": 0.4803239405155182, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 3920 + }, + { + "epoch": 0.2822262118491921, + "grad_norm": 0.5206008553504944, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 3930 + }, + { + "epoch": 0.2829443447037702, + "grad_norm": 0.5596373081207275, + "learning_rate": 0.0002, + "loss": 0.8988, + "step": 3940 + }, + { + "epoch": 0.2836624775583483, + "grad_norm": 0.4487258493900299, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 3950 + }, + { + "epoch": 0.2843806104129264, + "grad_norm": 0.4774281978607178, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 3960 + }, + { + "epoch": 0.2850987432675045, + "grad_norm": 0.571829617023468, + "learning_rate": 0.0002, + "loss": 0.8994, + "step": 3970 + }, + { + "epoch": 0.28581687612208256, + "grad_norm": 0.45251455903053284, + "learning_rate": 0.0002, + "loss": 0.7971, + "step": 3980 + }, + { + "epoch": 0.2865350089766607, + "grad_norm": 0.5119943618774414, + "learning_rate": 0.0002, + "loss": 0.8007, + "step": 3990 + }, + { + "epoch": 0.2872531418312388, + "grad_norm": 0.42333969473838806, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 4000 + }, + { + "epoch": 0.2879712746858169, + "grad_norm": 0.5694096684455872, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 4010 + }, + { + "epoch": 0.28868940754039496, + "grad_norm": 0.44457492232322693, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 4020 + }, + { + "epoch": 0.2894075403949731, + "grad_norm": 0.496545672416687, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 4030 + }, + { + "epoch": 0.2901256732495512, + "grad_norm": 0.5092352032661438, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 4040 + }, + { + "epoch": 0.29084380610412924, + "grad_norm": 0.5124567151069641, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 4050 + }, + { + "epoch": 0.29156193895870736, + "grad_norm": 0.5148161053657532, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4060 + }, + { + "epoch": 0.29228007181328547, + "grad_norm": 0.48183947801589966, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 4070 + }, + { + "epoch": 0.2929982046678636, + "grad_norm": 0.47728800773620605, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4080 + }, + { + "epoch": 0.29371633752244164, + "grad_norm": 0.5073143243789673, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.29443447037701975, + "grad_norm": 0.5343585014343262, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 4100 + }, + { + "epoch": 0.29515260323159787, + "grad_norm": 0.5760312676429749, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 4110 + }, + { + "epoch": 0.2958707360861759, + "grad_norm": 0.5894787907600403, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 4120 + }, + { + "epoch": 0.29658886894075404, + "grad_norm": 0.4528578817844391, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 4130 + }, + { + "epoch": 0.29730700179533215, + "grad_norm": 0.6027235388755798, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 4140 + }, + { + "epoch": 0.2980251346499102, + "grad_norm": 0.5060310959815979, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 4150 + }, + { + "epoch": 0.2987432675044883, + "grad_norm": 0.475252628326416, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4160 + }, + { + "epoch": 0.29946140035906643, + "grad_norm": 0.4855351448059082, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 4170 + }, + { + "epoch": 0.30017953321364454, + "grad_norm": 0.6720767021179199, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4180 + }, + { + "epoch": 0.3008976660682226, + "grad_norm": 0.6409553289413452, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 4190 + }, + { + "epoch": 0.3016157989228007, + "grad_norm": 0.5508167147636414, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 4200 + }, + { + "epoch": 0.30233393177737883, + "grad_norm": 0.45958149433135986, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 4210 + }, + { + "epoch": 0.3030520646319569, + "grad_norm": 0.5201641321182251, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 4220 + }, + { + "epoch": 0.303770197486535, + "grad_norm": 0.5440032482147217, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4230 + }, + { + "epoch": 0.3044883303411131, + "grad_norm": 0.43566814064979553, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4240 + }, + { + "epoch": 0.3052064631956912, + "grad_norm": 0.4479893445968628, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 4250 + }, + { + "epoch": 0.3059245960502693, + "grad_norm": 0.40390217304229736, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 4260 + }, + { + "epoch": 0.3066427289048474, + "grad_norm": 0.5143486261367798, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 4270 + }, + { + "epoch": 0.3073608617594255, + "grad_norm": 0.5289962887763977, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 4280 + }, + { + "epoch": 0.30807899461400357, + "grad_norm": 0.609561026096344, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 4290 + }, + { + "epoch": 0.3087971274685817, + "grad_norm": 0.5967493653297424, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 4300 + }, + { + "epoch": 0.3095152603231598, + "grad_norm": 0.5323672890663147, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4310 + }, + { + "epoch": 0.3102333931777379, + "grad_norm": 0.4996737241744995, + "learning_rate": 0.0002, + "loss": 0.8463, + "step": 4320 + }, + { + "epoch": 0.31095152603231596, + "grad_norm": 0.5528829097747803, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 4330 + }, + { + "epoch": 0.3116696588868941, + "grad_norm": 0.5394268035888672, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4340 + }, + { + "epoch": 0.3123877917414722, + "grad_norm": 0.4654628038406372, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 4350 + }, + { + "epoch": 0.31310592459605024, + "grad_norm": 0.4933706521987915, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 4360 + }, + { + "epoch": 0.31382405745062836, + "grad_norm": 0.5310598611831665, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 4370 + }, + { + "epoch": 0.31454219030520647, + "grad_norm": 0.5558765530586243, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4380 + }, + { + "epoch": 0.3152603231597846, + "grad_norm": 0.5281313061714172, + "learning_rate": 0.0002, + "loss": 0.8013, + "step": 4390 + }, + { + "epoch": 0.31597845601436264, + "grad_norm": 0.5100293755531311, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 4400 + }, + { + "epoch": 0.31669658886894075, + "grad_norm": 0.48762813210487366, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 4410 + }, + { + "epoch": 0.31741472172351887, + "grad_norm": 0.5211702585220337, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 4420 + }, + { + "epoch": 0.3181328545780969, + "grad_norm": 0.696747899055481, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 4430 + }, + { + "epoch": 0.31885098743267504, + "grad_norm": 0.6334946751594543, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4440 + }, + { + "epoch": 0.31956912028725315, + "grad_norm": 0.5333067178726196, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4450 + }, + { + "epoch": 0.32028725314183126, + "grad_norm": 0.500091552734375, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 4460 + }, + { + "epoch": 0.3210053859964093, + "grad_norm": 0.5190957188606262, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4470 + }, + { + "epoch": 0.32172351885098743, + "grad_norm": 0.6702370047569275, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 4480 + }, + { + "epoch": 0.32244165170556555, + "grad_norm": 0.4393869638442993, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 4490 + }, + { + "epoch": 0.3231597845601436, + "grad_norm": 0.4766499400138855, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 4500 + }, + { + "epoch": 0.3238779174147217, + "grad_norm": 0.561836838722229, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 4510 + }, + { + "epoch": 0.32459605026929983, + "grad_norm": 0.44366541504859924, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 4520 + }, + { + "epoch": 0.32531418312387794, + "grad_norm": 0.46504274010658264, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 4530 + }, + { + "epoch": 0.326032315978456, + "grad_norm": 0.5498034954071045, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 4540 + }, + { + "epoch": 0.3267504488330341, + "grad_norm": 0.5901338458061218, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 4550 + }, + { + "epoch": 0.3274685816876122, + "grad_norm": 0.5485442876815796, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 4560 + }, + { + "epoch": 0.3281867145421903, + "grad_norm": 0.512584924697876, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4570 + }, + { + "epoch": 0.3289048473967684, + "grad_norm": 0.5208188891410828, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 4580 + }, + { + "epoch": 0.3296229802513465, + "grad_norm": 0.4923836886882782, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 4590 + }, + { + "epoch": 0.3303411131059246, + "grad_norm": 0.49258530139923096, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 4600 + }, + { + "epoch": 0.3310592459605027, + "grad_norm": 0.4788922667503357, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 4610 + }, + { + "epoch": 0.3317773788150808, + "grad_norm": 0.48276954889297485, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4620 + }, + { + "epoch": 0.3324955116696589, + "grad_norm": 0.6300732493400574, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 4630 + }, + { + "epoch": 0.33321364452423696, + "grad_norm": 0.47594770789146423, + "learning_rate": 0.0002, + "loss": 0.8434, + "step": 4640 + }, + { + "epoch": 0.3339317773788151, + "grad_norm": 0.4728924632072449, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 4650 + }, + { + "epoch": 0.3346499102333932, + "grad_norm": 0.5586788654327393, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 4660 + }, + { + "epoch": 0.3353680430879713, + "grad_norm": 0.4573180377483368, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 4670 + }, + { + "epoch": 0.33608617594254936, + "grad_norm": 0.6391524076461792, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 4680 + }, + { + "epoch": 0.33680430879712747, + "grad_norm": 0.6570921540260315, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 4690 + }, + { + "epoch": 0.3375224416517056, + "grad_norm": 0.4601454734802246, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 4700 + }, + { + "epoch": 0.33824057450628364, + "grad_norm": 0.5640755295753479, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 4710 + }, + { + "epoch": 0.33895870736086176, + "grad_norm": 0.43475520610809326, + "learning_rate": 0.0002, + "loss": 0.8326, + "step": 4720 + }, + { + "epoch": 0.33967684021543987, + "grad_norm": 0.4785807132720947, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 4730 + }, + { + "epoch": 0.340394973070018, + "grad_norm": 0.4934665262699127, + "learning_rate": 0.0002, + "loss": 0.8257, + "step": 4740 + }, + { + "epoch": 0.34111310592459604, + "grad_norm": 0.45327693223953247, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 4750 + }, + { + "epoch": 0.34183123877917415, + "grad_norm": 0.4710456430912018, + "learning_rate": 0.0002, + "loss": 0.7944, + "step": 4760 + }, + { + "epoch": 0.34254937163375226, + "grad_norm": 0.5591559410095215, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 4770 + }, + { + "epoch": 0.3432675044883303, + "grad_norm": 0.48958835005760193, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 4780 + }, + { + "epoch": 0.34398563734290843, + "grad_norm": 0.4613766670227051, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 4790 + }, + { + "epoch": 0.34470377019748655, + "grad_norm": 0.5425335764884949, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 4800 + }, + { + "epoch": 0.3454219030520646, + "grad_norm": 0.4964924156665802, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 4810 + }, + { + "epoch": 0.3461400359066427, + "grad_norm": 0.613449215888977, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 4820 + }, + { + "epoch": 0.34685816876122083, + "grad_norm": 0.6553348898887634, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 4830 + }, + { + "epoch": 0.34757630161579894, + "grad_norm": 0.5863470435142517, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 4840 + }, + { + "epoch": 0.348294434470377, + "grad_norm": 0.5338097810745239, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 4850 + }, + { + "epoch": 0.3490125673249551, + "grad_norm": 0.6129760146141052, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 4860 + }, + { + "epoch": 0.3497307001795332, + "grad_norm": 0.6100956797599792, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 4870 + }, + { + "epoch": 0.3504488330341113, + "grad_norm": 0.5478541254997253, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 4880 + }, + { + "epoch": 0.3511669658886894, + "grad_norm": 0.5725359916687012, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 4890 + }, + { + "epoch": 0.3518850987432675, + "grad_norm": 0.6141043901443481, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 4900 + }, + { + "epoch": 0.3526032315978456, + "grad_norm": 0.597191572189331, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 4910 + }, + { + "epoch": 0.3533213644524237, + "grad_norm": 0.5988389253616333, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 4920 + }, + { + "epoch": 0.3540394973070018, + "grad_norm": 0.5503361821174622, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 4930 + }, + { + "epoch": 0.3547576301615799, + "grad_norm": 0.5932779312133789, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 4940 + }, + { + "epoch": 0.35547576301615796, + "grad_norm": 0.48911359906196594, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 4950 + }, + { + "epoch": 0.3561938958707361, + "grad_norm": 0.5435750484466553, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 4960 + }, + { + "epoch": 0.3569120287253142, + "grad_norm": 0.4786977767944336, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 4970 + }, + { + "epoch": 0.3576301615798923, + "grad_norm": 0.4022316336631775, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 4980 + }, + { + "epoch": 0.35834829443447036, + "grad_norm": 0.4848504364490509, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 4990 + }, + { + "epoch": 0.3590664272890485, + "grad_norm": 0.5093459486961365, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 5000 + }, + { + "epoch": 0.3597845601436266, + "grad_norm": 0.47368478775024414, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 5010 + }, + { + "epoch": 0.36050269299820464, + "grad_norm": 0.6041097044944763, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 5020 + }, + { + "epoch": 0.36122082585278276, + "grad_norm": 0.5384424924850464, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 5030 + }, + { + "epoch": 0.36193895870736087, + "grad_norm": 0.4668518602848053, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 5040 + }, + { + "epoch": 0.362657091561939, + "grad_norm": 0.5471060276031494, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 5050 + }, + { + "epoch": 0.36337522441651704, + "grad_norm": 0.731369137763977, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 5060 + }, + { + "epoch": 0.36409335727109515, + "grad_norm": 0.5119590759277344, + "learning_rate": 0.0002, + "loss": 0.8646, + "step": 5070 + }, + { + "epoch": 0.36481149012567327, + "grad_norm": 0.567428469657898, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 5080 + }, + { + "epoch": 0.3655296229802513, + "grad_norm": 0.5139971375465393, + "learning_rate": 0.0002, + "loss": 0.7616, + "step": 5090 + }, + { + "epoch": 0.36624775583482944, + "grad_norm": 0.5701581835746765, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 5100 + }, + { + "epoch": 0.36696588868940755, + "grad_norm": 0.5022063851356506, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 5110 + }, + { + "epoch": 0.36768402154398566, + "grad_norm": 0.4684354364871979, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 5120 + }, + { + "epoch": 0.3684021543985637, + "grad_norm": 0.5423495769500732, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 5130 + }, + { + "epoch": 0.36912028725314183, + "grad_norm": 0.46262967586517334, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 5140 + }, + { + "epoch": 0.36983842010771995, + "grad_norm": 0.4720141589641571, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 5150 + }, + { + "epoch": 0.370556552962298, + "grad_norm": 0.5113096833229065, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 5160 + }, + { + "epoch": 0.3712746858168761, + "grad_norm": 0.5253350138664246, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 5170 + }, + { + "epoch": 0.37199281867145423, + "grad_norm": 0.5799776315689087, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 5180 + }, + { + "epoch": 0.37271095152603234, + "grad_norm": 0.5166001319885254, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5190 + }, + { + "epoch": 0.3734290843806104, + "grad_norm": 0.5658290386199951, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 5200 + }, + { + "epoch": 0.3741472172351885, + "grad_norm": 0.45811113715171814, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 5210 + }, + { + "epoch": 0.3748653500897666, + "grad_norm": 0.5509489178657532, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 5220 + }, + { + "epoch": 0.3755834829443447, + "grad_norm": 0.47473257780075073, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5230 + }, + { + "epoch": 0.3763016157989228, + "grad_norm": 0.3858596086502075, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 5240 + }, + { + "epoch": 0.3770197486535009, + "grad_norm": 0.6941536068916321, + "learning_rate": 0.0002, + "loss": 0.8592, + "step": 5250 + }, + { + "epoch": 0.377737881508079, + "grad_norm": 0.46940872073173523, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 5260 + }, + { + "epoch": 0.3784560143626571, + "grad_norm": 0.5413833260536194, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 5270 + }, + { + "epoch": 0.3791741472172352, + "grad_norm": 0.5165658593177795, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 5280 + }, + { + "epoch": 0.3798922800718133, + "grad_norm": 0.6567398309707642, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 5290 + }, + { + "epoch": 0.38061041292639136, + "grad_norm": 0.5466915965080261, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 5300 + }, + { + "epoch": 0.3813285457809695, + "grad_norm": 0.4800598621368408, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 5310 + }, + { + "epoch": 0.3820466786355476, + "grad_norm": 0.4551742970943451, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 5320 + }, + { + "epoch": 0.3827648114901257, + "grad_norm": 0.5561164617538452, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 5330 + }, + { + "epoch": 0.38348294434470376, + "grad_norm": 0.6170380115509033, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 5340 + }, + { + "epoch": 0.38420107719928187, + "grad_norm": 0.465762197971344, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 5350 + }, + { + "epoch": 0.38491921005386, + "grad_norm": 0.6176838874816895, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 5360 + }, + { + "epoch": 0.38563734290843804, + "grad_norm": 0.657926082611084, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 5370 + }, + { + "epoch": 0.38635547576301615, + "grad_norm": 0.5063281655311584, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 5380 + }, + { + "epoch": 0.38707360861759427, + "grad_norm": 0.6960828304290771, + "learning_rate": 0.0002, + "loss": 0.8259, + "step": 5390 + }, + { + "epoch": 0.3877917414721723, + "grad_norm": 0.46712034940719604, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 5400 + }, + { + "epoch": 0.38850987432675044, + "grad_norm": 0.598114013671875, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 5410 + }, + { + "epoch": 0.38922800718132855, + "grad_norm": 0.6798132061958313, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 5420 + }, + { + "epoch": 0.38994614003590666, + "grad_norm": 0.5194289088249207, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 5430 + }, + { + "epoch": 0.3906642728904847, + "grad_norm": 0.48175323009490967, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 5440 + }, + { + "epoch": 0.39138240574506283, + "grad_norm": 0.4979408085346222, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 5450 + }, + { + "epoch": 0.39210053859964095, + "grad_norm": 0.6440972685813904, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5460 + }, + { + "epoch": 0.392818671454219, + "grad_norm": 0.5977227091789246, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 5470 + }, + { + "epoch": 0.3935368043087971, + "grad_norm": 0.4735909104347229, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 5480 + }, + { + "epoch": 0.39425493716337523, + "grad_norm": 0.48181721568107605, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 5490 + }, + { + "epoch": 0.39497307001795334, + "grad_norm": 0.6339454650878906, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 5500 + }, + { + "epoch": 0.3956912028725314, + "grad_norm": 0.5364336371421814, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 5510 + }, + { + "epoch": 0.3964093357271095, + "grad_norm": 0.5499233603477478, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 5520 + }, + { + "epoch": 0.3971274685816876, + "grad_norm": 0.47249847650527954, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 5530 + }, + { + "epoch": 0.3978456014362657, + "grad_norm": 0.5692135095596313, + "learning_rate": 0.0002, + "loss": 0.8207, + "step": 5540 + }, + { + "epoch": 0.3985637342908438, + "grad_norm": 0.6009272933006287, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 5550 + }, + { + "epoch": 0.3992818671454219, + "grad_norm": 0.5198255181312561, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5560 + }, + { + "epoch": 0.4, + "grad_norm": 0.5474766492843628, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 5570 + }, + { + "epoch": 0.4007181328545781, + "grad_norm": 0.5577479600906372, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 5580 + }, + { + "epoch": 0.4014362657091562, + "grad_norm": 0.5350302457809448, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5590 + }, + { + "epoch": 0.4021543985637343, + "grad_norm": 0.6310991048812866, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 5600 + }, + { + "epoch": 0.40287253141831236, + "grad_norm": 0.5695762038230896, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5610 + }, + { + "epoch": 0.4035906642728905, + "grad_norm": 0.5431827306747437, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 5620 + }, + { + "epoch": 0.4043087971274686, + "grad_norm": 0.4923325777053833, + "learning_rate": 0.0002, + "loss": 0.8743, + "step": 5630 + }, + { + "epoch": 0.4050269299820467, + "grad_norm": 0.531399667263031, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 5640 + }, + { + "epoch": 0.40574506283662476, + "grad_norm": 0.5854769349098206, + "learning_rate": 0.0002, + "loss": 0.7982, + "step": 5650 + }, + { + "epoch": 0.40646319569120287, + "grad_norm": 0.6684802174568176, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 5660 + }, + { + "epoch": 0.407181328545781, + "grad_norm": 0.6618620753288269, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 5670 + }, + { + "epoch": 0.40789946140035904, + "grad_norm": 0.4930776059627533, + "learning_rate": 0.0002, + "loss": 0.7707, + "step": 5680 + }, + { + "epoch": 0.40861759425493716, + "grad_norm": 0.506628155708313, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 5690 + }, + { + "epoch": 0.40933572710951527, + "grad_norm": 0.5250783562660217, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5700 + }, + { + "epoch": 0.4100538599640934, + "grad_norm": 0.6773046851158142, + "learning_rate": 0.0002, + "loss": 0.8386, + "step": 5710 + }, + { + "epoch": 0.41077199281867144, + "grad_norm": 0.6750592589378357, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 5720 + }, + { + "epoch": 0.41149012567324955, + "grad_norm": 0.5277232527732849, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5730 + }, + { + "epoch": 0.41220825852782766, + "grad_norm": 0.5155990719795227, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 5740 + }, + { + "epoch": 0.4129263913824057, + "grad_norm": 0.5236294865608215, + "learning_rate": 0.0002, + "loss": 0.871, + "step": 5750 + }, + { + "epoch": 0.41364452423698383, + "grad_norm": 0.5073592066764832, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 5760 + }, + { + "epoch": 0.41436265709156195, + "grad_norm": 0.6997184753417969, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 5770 + }, + { + "epoch": 0.41508078994614006, + "grad_norm": 0.5282439589500427, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 5780 + }, + { + "epoch": 0.4157989228007181, + "grad_norm": 0.4997355341911316, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5790 + }, + { + "epoch": 0.41651705565529623, + "grad_norm": 0.6081610321998596, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5800 + }, + { + "epoch": 0.41723518850987434, + "grad_norm": 0.5640295147895813, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 5810 + }, + { + "epoch": 0.4179533213644524, + "grad_norm": 0.6443586349487305, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 5820 + }, + { + "epoch": 0.4186714542190305, + "grad_norm": 0.6456229090690613, + "learning_rate": 0.0002, + "loss": 0.8132, + "step": 5830 + }, + { + "epoch": 0.4193895870736086, + "grad_norm": 0.5422267317771912, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5840 + }, + { + "epoch": 0.42010771992818674, + "grad_norm": 0.45251885056495667, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5850 + }, + { + "epoch": 0.4208258527827648, + "grad_norm": 0.781165599822998, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5860 + }, + { + "epoch": 0.4215439856373429, + "grad_norm": 0.5359160900115967, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5870 + }, + { + "epoch": 0.422262118491921, + "grad_norm": 0.6201958656311035, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5880 + }, + { + "epoch": 0.4229802513464991, + "grad_norm": 0.5985850691795349, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 5890 + }, + { + "epoch": 0.4236983842010772, + "grad_norm": 0.5550961494445801, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 5900 + }, + { + "epoch": 0.4244165170556553, + "grad_norm": 0.6284893155097961, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 5910 + }, + { + "epoch": 0.4251346499102334, + "grad_norm": 0.6143685579299927, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 5920 + }, + { + "epoch": 0.4258527827648115, + "grad_norm": 0.5065329670906067, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 5930 + }, + { + "epoch": 0.4265709156193896, + "grad_norm": 0.7274345755577087, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 5940 + }, + { + "epoch": 0.4272890484739677, + "grad_norm": 0.606531023979187, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 5950 + }, + { + "epoch": 0.42800718132854576, + "grad_norm": 0.5983648300170898, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5960 + }, + { + "epoch": 0.4287253141831239, + "grad_norm": 0.5546031594276428, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5970 + }, + { + "epoch": 0.429443447037702, + "grad_norm": 0.666868269443512, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 5980 + }, + { + "epoch": 0.4301615798922801, + "grad_norm": 0.41438576579093933, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 5990 + }, + { + "epoch": 0.43087971274685816, + "grad_norm": 0.5012526512145996, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 6000 + }, + { + "epoch": 0.43159784560143627, + "grad_norm": 0.6071694493293762, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 6010 + }, + { + "epoch": 0.4323159784560144, + "grad_norm": 0.5538384914398193, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 6020 + }, + { + "epoch": 0.43303411131059244, + "grad_norm": 0.5798718929290771, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 6030 + }, + { + "epoch": 0.43375224416517055, + "grad_norm": 0.5442442893981934, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 6040 + }, + { + "epoch": 0.43447037701974867, + "grad_norm": 0.6895565390586853, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 6050 + }, + { + "epoch": 0.4351885098743267, + "grad_norm": 0.6498045325279236, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 6060 + }, + { + "epoch": 0.43590664272890484, + "grad_norm": 0.5225510001182556, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 6070 + }, + { + "epoch": 0.43662477558348295, + "grad_norm": 0.6366992592811584, + "learning_rate": 0.0002, + "loss": 0.7809, + "step": 6080 + }, + { + "epoch": 0.43734290843806106, + "grad_norm": 0.47929027676582336, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 6090 + }, + { + "epoch": 0.4380610412926391, + "grad_norm": 0.5722405910491943, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 6100 + }, + { + "epoch": 0.43877917414721723, + "grad_norm": 0.6008004546165466, + "learning_rate": 0.0002, + "loss": 0.765, + "step": 6110 + }, + { + "epoch": 0.43949730700179535, + "grad_norm": 0.5922580361366272, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 6120 + }, + { + "epoch": 0.4402154398563734, + "grad_norm": 0.7051905393600464, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 6130 + }, + { + "epoch": 0.4409335727109515, + "grad_norm": 0.5146450400352478, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 6140 + }, + { + "epoch": 0.44165170556552963, + "grad_norm": 0.5605781674385071, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 6150 + }, + { + "epoch": 0.44236983842010774, + "grad_norm": 0.8008661866188049, + "learning_rate": 0.0002, + "loss": 0.8409, + "step": 6160 + }, + { + "epoch": 0.4430879712746858, + "grad_norm": 0.47406497597694397, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 6170 + }, + { + "epoch": 0.4438061041292639, + "grad_norm": 0.612287700176239, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 6180 + }, + { + "epoch": 0.444524236983842, + "grad_norm": 0.561188280582428, + "learning_rate": 0.0002, + "loss": 0.835, + "step": 6190 + }, + { + "epoch": 0.4452423698384201, + "grad_norm": 0.6233669519424438, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 6200 + }, + { + "epoch": 0.4459605026929982, + "grad_norm": 0.45546263456344604, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6210 + }, + { + "epoch": 0.4466786355475763, + "grad_norm": 0.5947871208190918, + "learning_rate": 0.0002, + "loss": 0.8183, + "step": 6220 + }, + { + "epoch": 0.4473967684021544, + "grad_norm": 0.6109753847122192, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 6230 + }, + { + "epoch": 0.4481149012567325, + "grad_norm": 0.6380727887153625, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 6240 + }, + { + "epoch": 0.4488330341113106, + "grad_norm": 0.5225699543952942, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 6250 + }, + { + "epoch": 0.4495511669658887, + "grad_norm": 0.521503210067749, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 6260 + }, + { + "epoch": 0.45026929982046676, + "grad_norm": 0.5523216128349304, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 6270 + }, + { + "epoch": 0.4509874326750449, + "grad_norm": 0.5954921841621399, + "learning_rate": 0.0002, + "loss": 0.8228, + "step": 6280 + }, + { + "epoch": 0.451705565529623, + "grad_norm": 0.702751100063324, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 6290 + }, + { + "epoch": 0.4524236983842011, + "grad_norm": 0.5756356120109558, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 6300 + }, + { + "epoch": 0.45314183123877916, + "grad_norm": 0.45365944504737854, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 6310 + }, + { + "epoch": 0.45385996409335727, + "grad_norm": 0.5027855038642883, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 6320 + }, + { + "epoch": 0.4545780969479354, + "grad_norm": 0.6551687121391296, + "learning_rate": 0.0002, + "loss": 0.8052, + "step": 6330 + }, + { + "epoch": 0.45529622980251344, + "grad_norm": 0.5296684503555298, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6340 + }, + { + "epoch": 0.45601436265709155, + "grad_norm": 0.5762032866477966, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6350 + }, + { + "epoch": 0.45673249551166967, + "grad_norm": 0.5234073996543884, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 6360 + }, + { + "epoch": 0.4574506283662478, + "grad_norm": 0.5090946555137634, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 6370 + }, + { + "epoch": 0.45816876122082584, + "grad_norm": 0.6515111327171326, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 6380 + }, + { + "epoch": 0.45888689407540395, + "grad_norm": 0.7904898524284363, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 6390 + }, + { + "epoch": 0.45960502692998206, + "grad_norm": 0.6379680037498474, + "learning_rate": 0.0002, + "loss": 0.841, + "step": 6400 + }, + { + "epoch": 0.4603231597845601, + "grad_norm": 0.641759991645813, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 6410 + }, + { + "epoch": 0.46104129263913823, + "grad_norm": 0.5273829698562622, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 6420 + }, + { + "epoch": 0.46175942549371635, + "grad_norm": 0.5668497681617737, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6430 + }, + { + "epoch": 0.46247755834829446, + "grad_norm": 0.5862061381340027, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 6440 + }, + { + "epoch": 0.4631956912028725, + "grad_norm": 0.5239592790603638, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 6450 + }, + { + "epoch": 0.46391382405745063, + "grad_norm": 0.5078722834587097, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 6460 + }, + { + "epoch": 0.46463195691202874, + "grad_norm": 0.566509485244751, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 6470 + }, + { + "epoch": 0.4653500897666068, + "grad_norm": 0.5952697396278381, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 6480 + }, + { + "epoch": 0.4660682226211849, + "grad_norm": 0.6548156142234802, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 6490 + }, + { + "epoch": 0.466786355475763, + "grad_norm": 0.4768427908420563, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 6500 + }, + { + "epoch": 0.46750448833034114, + "grad_norm": 0.5588273406028748, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 6510 + }, + { + "epoch": 0.4682226211849192, + "grad_norm": 0.5348677039146423, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 6520 + }, + { + "epoch": 0.4689407540394973, + "grad_norm": 0.4784318804740906, + "learning_rate": 0.0002, + "loss": 0.7969, + "step": 6530 + }, + { + "epoch": 0.4696588868940754, + "grad_norm": 0.5112265944480896, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 6540 + }, + { + "epoch": 0.4703770197486535, + "grad_norm": 0.7250495553016663, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 6550 + }, + { + "epoch": 0.4710951526032316, + "grad_norm": 0.538608968257904, + "learning_rate": 0.0002, + "loss": 0.808, + "step": 6560 + }, + { + "epoch": 0.4718132854578097, + "grad_norm": 0.5981247425079346, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 6570 + }, + { + "epoch": 0.4725314183123878, + "grad_norm": 0.5466762781143188, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 6580 + }, + { + "epoch": 0.4732495511669659, + "grad_norm": 0.5609987378120422, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 6590 + }, + { + "epoch": 0.473967684021544, + "grad_norm": 0.6091027855873108, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 6600 + }, + { + "epoch": 0.4746858168761221, + "grad_norm": 0.5542886853218079, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 6610 + }, + { + "epoch": 0.47540394973070016, + "grad_norm": 0.5656579732894897, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 6620 + }, + { + "epoch": 0.4761220825852783, + "grad_norm": 0.47507357597351074, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 6630 + }, + { + "epoch": 0.4768402154398564, + "grad_norm": 0.6039174199104309, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6640 + }, + { + "epoch": 0.47755834829443444, + "grad_norm": 0.7129740715026855, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 6650 + }, + { + "epoch": 0.47827648114901256, + "grad_norm": 0.5189188718795776, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 6660 + }, + { + "epoch": 0.47899461400359067, + "grad_norm": 0.7548696398735046, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 6670 + }, + { + "epoch": 0.4797127468581688, + "grad_norm": 0.4729466438293457, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 6680 + }, + { + "epoch": 0.48043087971274684, + "grad_norm": 0.6190000772476196, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 6690 + }, + { + "epoch": 0.48114901256732495, + "grad_norm": 0.6276983022689819, + "learning_rate": 0.0002, + "loss": 0.7691, + "step": 6700 + }, + { + "epoch": 0.48186714542190306, + "grad_norm": 0.6097590923309326, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 6710 + }, + { + "epoch": 0.4825852782764811, + "grad_norm": 0.6507330536842346, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 6720 + }, + { + "epoch": 0.48330341113105924, + "grad_norm": 0.5501991510391235, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 6730 + }, + { + "epoch": 0.48402154398563735, + "grad_norm": 0.5928015112876892, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 6740 + }, + { + "epoch": 0.48473967684021546, + "grad_norm": 0.5523008704185486, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 6750 + }, + { + "epoch": 0.4854578096947935, + "grad_norm": 0.5997263789176941, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 6760 + }, + { + "epoch": 0.48617594254937163, + "grad_norm": 0.6201002597808838, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 6770 + }, + { + "epoch": 0.48689407540394974, + "grad_norm": 0.6338862776756287, + "learning_rate": 0.0002, + "loss": 0.8018, + "step": 6780 + }, + { + "epoch": 0.4876122082585278, + "grad_norm": 0.5542550086975098, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6790 + }, + { + "epoch": 0.4883303411131059, + "grad_norm": 0.5587872862815857, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 6800 + }, + { + "epoch": 0.489048473967684, + "grad_norm": 0.5895681977272034, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 6810 + }, + { + "epoch": 0.48976660682226214, + "grad_norm": 0.4948221743106842, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 6820 + }, + { + "epoch": 0.4904847396768402, + "grad_norm": 0.44546931982040405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 6830 + }, + { + "epoch": 0.4912028725314183, + "grad_norm": 0.632046103477478, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 6840 + }, + { + "epoch": 0.4919210053859964, + "grad_norm": 0.49396243691444397, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 6850 + }, + { + "epoch": 0.4926391382405745, + "grad_norm": 0.497745156288147, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6860 + }, + { + "epoch": 0.4933572710951526, + "grad_norm": 0.7336170077323914, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 6870 + }, + { + "epoch": 0.4940754039497307, + "grad_norm": 0.6723181009292603, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 6880 + }, + { + "epoch": 0.4947935368043088, + "grad_norm": 0.5887754559516907, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 6890 + }, + { + "epoch": 0.4955116696588869, + "grad_norm": 0.6580226421356201, + "learning_rate": 0.0002, + "loss": 0.7904, + "step": 6900 + }, + { + "epoch": 0.496229802513465, + "grad_norm": 0.7385056614875793, + "learning_rate": 0.0002, + "loss": 0.8203, + "step": 6910 + }, + { + "epoch": 0.4969479353680431, + "grad_norm": 0.48736000061035156, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 6920 + }, + { + "epoch": 0.49766606822262116, + "grad_norm": 0.6304559111595154, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 6930 + }, + { + "epoch": 0.4983842010771993, + "grad_norm": 0.607148289680481, + "learning_rate": 0.0002, + "loss": 0.8323, + "step": 6940 + }, + { + "epoch": 0.4991023339317774, + "grad_norm": 0.5467981696128845, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 6950 + }, + { + "epoch": 0.4998204667863555, + "grad_norm": 0.7046723961830139, + "learning_rate": 0.0002, + "loss": 0.804, + "step": 6960 + }, + { + "epoch": 0.5005385996409336, + "grad_norm": 0.5487921833992004, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 6970 + }, + { + "epoch": 0.5012567324955116, + "grad_norm": 0.5706006288528442, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 6980 + }, + { + "epoch": 0.5019748653500897, + "grad_norm": 0.539536714553833, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 6990 + }, + { + "epoch": 0.5026929982046678, + "grad_norm": 0.5527397394180298, + "learning_rate": 0.0002, + "loss": 0.7829, + "step": 7000 + }, + { + "epoch": 0.503411131059246, + "grad_norm": 0.5498567223548889, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 7010 + }, + { + "epoch": 0.5041292639138241, + "grad_norm": 0.5878575444221497, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 7020 + }, + { + "epoch": 0.5048473967684022, + "grad_norm": 0.646153450012207, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 7030 + }, + { + "epoch": 0.5055655296229803, + "grad_norm": 0.5603899359703064, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 7040 + }, + { + "epoch": 0.5062836624775583, + "grad_norm": 0.5849952697753906, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 7050 + }, + { + "epoch": 0.5070017953321364, + "grad_norm": 0.6082724928855896, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 7060 + }, + { + "epoch": 0.5077199281867145, + "grad_norm": 0.5900670289993286, + "learning_rate": 0.0002, + "loss": 0.8046, + "step": 7070 + }, + { + "epoch": 0.5084380610412926, + "grad_norm": 0.5856624841690063, + "learning_rate": 0.0002, + "loss": 0.8612, + "step": 7080 + }, + { + "epoch": 0.5091561938958707, + "grad_norm": 0.6177338361740112, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7090 + }, + { + "epoch": 0.5098743267504489, + "grad_norm": 0.5559300184249878, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 7100 + }, + { + "epoch": 0.510592459605027, + "grad_norm": 0.62027907371521, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 7110 + }, + { + "epoch": 0.511310592459605, + "grad_norm": 0.6334301829338074, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7120 + }, + { + "epoch": 0.5120287253141831, + "grad_norm": 0.513795018196106, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 7130 + }, + { + "epoch": 0.5127468581687612, + "grad_norm": 0.7004675269126892, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 7140 + }, + { + "epoch": 0.5134649910233393, + "grad_norm": 0.5614308714866638, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7150 + }, + { + "epoch": 0.5141831238779174, + "grad_norm": 0.5037539601325989, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 7160 + }, + { + "epoch": 0.5149012567324955, + "grad_norm": 0.5568661093711853, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 7170 + }, + { + "epoch": 0.5156193895870737, + "grad_norm": 0.7513397336006165, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7180 + }, + { + "epoch": 0.5163375224416517, + "grad_norm": 0.7264583706855774, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 7190 + }, + { + "epoch": 0.5170556552962298, + "grad_norm": 0.6355819702148438, + "learning_rate": 0.0002, + "loss": 0.8671, + "step": 7200 + }, + { + "epoch": 0.5177737881508079, + "grad_norm": 0.6063222289085388, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 7210 + }, + { + "epoch": 0.518491921005386, + "grad_norm": 0.6484307646751404, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 7220 + }, + { + "epoch": 0.5192100538599641, + "grad_norm": 0.5260455012321472, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 7230 + }, + { + "epoch": 0.5199281867145422, + "grad_norm": 0.6718002557754517, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7240 + }, + { + "epoch": 0.5206463195691203, + "grad_norm": 0.5997617244720459, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 7250 + }, + { + "epoch": 0.5213644524236983, + "grad_norm": 0.5838589668273926, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 7260 + }, + { + "epoch": 0.5220825852782764, + "grad_norm": 0.5755977630615234, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 7270 + }, + { + "epoch": 0.5228007181328546, + "grad_norm": 0.6442093253135681, + "learning_rate": 0.0002, + "loss": 0.8233, + "step": 7280 + }, + { + "epoch": 0.5235188509874327, + "grad_norm": 0.6128416657447815, + "learning_rate": 0.0002, + "loss": 0.822, + "step": 7290 + }, + { + "epoch": 0.5242369838420108, + "grad_norm": 0.509742796421051, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 7300 + }, + { + "epoch": 0.5249551166965889, + "grad_norm": 0.5450230836868286, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 7310 + }, + { + "epoch": 0.525673249551167, + "grad_norm": 0.5437141060829163, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 7320 + }, + { + "epoch": 0.526391382405745, + "grad_norm": 0.5291738510131836, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 7330 + }, + { + "epoch": 0.5271095152603231, + "grad_norm": 0.5101743936538696, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 7340 + }, + { + "epoch": 0.5278276481149012, + "grad_norm": 0.5678408145904541, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 7350 + }, + { + "epoch": 0.5285457809694794, + "grad_norm": 0.6332360506057739, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 7360 + }, + { + "epoch": 0.5292639138240575, + "grad_norm": 0.4935058653354645, + "learning_rate": 0.0002, + "loss": 0.8521, + "step": 7370 + }, + { + "epoch": 0.5299820466786356, + "grad_norm": 0.6399656534194946, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7380 + }, + { + "epoch": 0.5307001795332137, + "grad_norm": 0.5986794233322144, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 7390 + }, + { + "epoch": 0.5314183123877917, + "grad_norm": 0.6948414444923401, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 7400 + }, + { + "epoch": 0.5321364452423698, + "grad_norm": 0.5337842106819153, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 7410 + }, + { + "epoch": 0.5328545780969479, + "grad_norm": 0.6897268295288086, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 7420 + }, + { + "epoch": 0.533572710951526, + "grad_norm": 0.6361175179481506, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 7430 + }, + { + "epoch": 0.5342908438061041, + "grad_norm": 0.5242252945899963, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 7440 + }, + { + "epoch": 0.5350089766606823, + "grad_norm": 0.5731322765350342, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 7450 + }, + { + "epoch": 0.5357271095152604, + "grad_norm": 0.5790955424308777, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 7460 + }, + { + "epoch": 0.5364452423698384, + "grad_norm": 0.4979061782360077, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 7470 + }, + { + "epoch": 0.5371633752244165, + "grad_norm": 0.7335101962089539, + "learning_rate": 0.0002, + "loss": 0.794, + "step": 7480 + }, + { + "epoch": 0.5378815080789946, + "grad_norm": 0.592521071434021, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 7490 + }, + { + "epoch": 0.5385996409335727, + "grad_norm": 0.5784769654273987, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 7500 + }, + { + "epoch": 0.5393177737881508, + "grad_norm": 0.8148589730262756, + "learning_rate": 0.0002, + "loss": 0.789, + "step": 7510 + }, + { + "epoch": 0.5400359066427289, + "grad_norm": 0.5727689862251282, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 7520 + }, + { + "epoch": 0.540754039497307, + "grad_norm": 0.6958279609680176, + "learning_rate": 0.0002, + "loss": 0.8321, + "step": 7530 + }, + { + "epoch": 0.541472172351885, + "grad_norm": 0.6302788257598877, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 7540 + }, + { + "epoch": 0.5421903052064632, + "grad_norm": 0.5950970649719238, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 7550 + }, + { + "epoch": 0.5429084380610413, + "grad_norm": 0.4275270104408264, + "learning_rate": 0.0002, + "loss": 0.8076, + "step": 7560 + }, + { + "epoch": 0.5436265709156194, + "grad_norm": 0.7579900622367859, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 7570 + }, + { + "epoch": 0.5443447037701975, + "grad_norm": 0.5835317969322205, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 7580 + }, + { + "epoch": 0.5450628366247756, + "grad_norm": 0.5305142998695374, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 7590 + }, + { + "epoch": 0.5457809694793537, + "grad_norm": 0.6076129674911499, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7600 + }, + { + "epoch": 0.5464991023339317, + "grad_norm": 0.5341935753822327, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 7610 + }, + { + "epoch": 0.5472172351885098, + "grad_norm": 0.6070826053619385, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 7620 + }, + { + "epoch": 0.547935368043088, + "grad_norm": 0.6193035840988159, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 7630 + }, + { + "epoch": 0.5486535008976661, + "grad_norm": 0.6171614527702332, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 7640 + }, + { + "epoch": 0.5493716337522442, + "grad_norm": 0.5700938105583191, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 7650 + }, + { + "epoch": 0.5500897666068223, + "grad_norm": 0.5742418169975281, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7660 + }, + { + "epoch": 0.5508078994614004, + "grad_norm": 0.6450320482254028, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 7670 + }, + { + "epoch": 0.5515260323159784, + "grad_norm": 0.542860209941864, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 7680 + }, + { + "epoch": 0.5522441651705565, + "grad_norm": 0.538007915019989, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 7690 + }, + { + "epoch": 0.5529622980251346, + "grad_norm": 0.5846288204193115, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 7700 + }, + { + "epoch": 0.5536804308797127, + "grad_norm": 0.623315155506134, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 7710 + }, + { + "epoch": 0.5543985637342909, + "grad_norm": 0.6607962250709534, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 7720 + }, + { + "epoch": 0.555116696588869, + "grad_norm": 0.5258557200431824, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 7730 + }, + { + "epoch": 0.5558348294434471, + "grad_norm": 0.6464316844940186, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 7740 + }, + { + "epoch": 0.5565529622980251, + "grad_norm": 0.6390621662139893, + "learning_rate": 0.0002, + "loss": 0.7683, + "step": 7750 + }, + { + "epoch": 0.5572710951526032, + "grad_norm": 0.5327560305595398, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 7760 + }, + { + "epoch": 0.5579892280071813, + "grad_norm": 0.8202064633369446, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 7770 + }, + { + "epoch": 0.5587073608617594, + "grad_norm": 0.45350968837738037, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 7780 + }, + { + "epoch": 0.5594254937163375, + "grad_norm": 0.5031413435935974, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 7790 + }, + { + "epoch": 0.5601436265709157, + "grad_norm": 0.5047417879104614, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 7800 + }, + { + "epoch": 0.5608617594254938, + "grad_norm": 0.668912410736084, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 7810 + }, + { + "epoch": 0.5615798922800718, + "grad_norm": 0.6106061339378357, + "learning_rate": 0.0002, + "loss": 0.8226, + "step": 7820 + }, + { + "epoch": 0.5622980251346499, + "grad_norm": 0.5558443665504456, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 7830 + }, + { + "epoch": 0.563016157989228, + "grad_norm": 0.5937177538871765, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 7840 + }, + { + "epoch": 0.5637342908438061, + "grad_norm": 0.67307448387146, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 7850 + }, + { + "epoch": 0.5644524236983842, + "grad_norm": 0.4615475833415985, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 7860 + }, + { + "epoch": 0.5651705565529623, + "grad_norm": 0.5462577939033508, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 7870 + }, + { + "epoch": 0.5658886894075404, + "grad_norm": 0.6422402858734131, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7880 + }, + { + "epoch": 0.5666068222621184, + "grad_norm": 0.5313532948493958, + "learning_rate": 0.0002, + "loss": 0.8327, + "step": 7890 + }, + { + "epoch": 0.5673249551166966, + "grad_norm": 0.5647847056388855, + "learning_rate": 0.0002, + "loss": 0.7771, + "step": 7900 + }, + { + "epoch": 0.5680430879712747, + "grad_norm": 0.6581610441207886, + "learning_rate": 0.0002, + "loss": 0.8126, + "step": 7910 + }, + { + "epoch": 0.5687612208258528, + "grad_norm": 0.46947669982910156, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 7920 + }, + { + "epoch": 0.5694793536804309, + "grad_norm": 0.6420038342475891, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 7930 + }, + { + "epoch": 0.570197486535009, + "grad_norm": 0.6730441451072693, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 7940 + }, + { + "epoch": 0.5709156193895871, + "grad_norm": 0.3849070966243744, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 7950 + }, + { + "epoch": 0.5716337522441651, + "grad_norm": 0.6076335906982422, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 7960 + }, + { + "epoch": 0.5723518850987432, + "grad_norm": 0.6446982026100159, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 7970 + }, + { + "epoch": 0.5730700179533214, + "grad_norm": 0.6019234657287598, + "learning_rate": 0.0002, + "loss": 0.7988, + "step": 7980 + }, + { + "epoch": 0.5737881508078995, + "grad_norm": 0.620880663394928, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 7990 + }, + { + "epoch": 0.5745062836624776, + "grad_norm": 0.4927573502063751, + "learning_rate": 0.0002, + "loss": 0.7712, + "step": 8000 + }, + { + "epoch": 0.5752244165170557, + "grad_norm": 0.6276804804801941, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8010 + }, + { + "epoch": 0.5759425493716338, + "grad_norm": 0.484518826007843, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 8020 + }, + { + "epoch": 0.5766606822262118, + "grad_norm": 0.5019962787628174, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.5773788150807899, + "grad_norm": 0.6685234308242798, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 8040 + }, + { + "epoch": 0.578096947935368, + "grad_norm": 0.5762107372283936, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 8050 + }, + { + "epoch": 0.5788150807899461, + "grad_norm": 0.6402477025985718, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 8060 + }, + { + "epoch": 0.5795332136445243, + "grad_norm": 0.5919345617294312, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8070 + }, + { + "epoch": 0.5802513464991024, + "grad_norm": 0.47100913524627686, + "learning_rate": 0.0002, + "loss": 0.8179, + "step": 8080 + }, + { + "epoch": 0.5809694793536805, + "grad_norm": 0.6029118895530701, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 8090 + }, + { + "epoch": 0.5816876122082585, + "grad_norm": 0.5896338820457458, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 8100 + }, + { + "epoch": 0.5824057450628366, + "grad_norm": 0.49017754197120667, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 8110 + }, + { + "epoch": 0.5831238779174147, + "grad_norm": 0.5049256086349487, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 8120 + }, + { + "epoch": 0.5838420107719928, + "grad_norm": 0.6874517798423767, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 8130 + }, + { + "epoch": 0.5845601436265709, + "grad_norm": 0.5429391264915466, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 8140 + }, + { + "epoch": 0.585278276481149, + "grad_norm": 0.5533722639083862, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 8150 + }, + { + "epoch": 0.5859964093357272, + "grad_norm": 0.5827956199645996, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 8160 + }, + { + "epoch": 0.5867145421903052, + "grad_norm": 0.6670212149620056, + "learning_rate": 0.0002, + "loss": 0.7758, + "step": 8170 + }, + { + "epoch": 0.5874326750448833, + "grad_norm": 0.5231172442436218, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 8180 + }, + { + "epoch": 0.5881508078994614, + "grad_norm": 0.567447304725647, + "learning_rate": 0.0002, + "loss": 0.7975, + "step": 8190 + }, + { + "epoch": 0.5888689407540395, + "grad_norm": 0.5318575501441956, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8200 + }, + { + "epoch": 0.5895870736086176, + "grad_norm": 0.6959463357925415, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 8210 + }, + { + "epoch": 0.5903052064631957, + "grad_norm": 0.6964931488037109, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 8220 + }, + { + "epoch": 0.5910233393177737, + "grad_norm": 0.5164617896080017, + "learning_rate": 0.0002, + "loss": 0.8382, + "step": 8230 + }, + { + "epoch": 0.5917414721723518, + "grad_norm": 0.5456110239028931, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 8240 + }, + { + "epoch": 0.59245960502693, + "grad_norm": 0.6553666591644287, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 8250 + }, + { + "epoch": 0.5931777378815081, + "grad_norm": 0.6185845732688904, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 8260 + }, + { + "epoch": 0.5938958707360862, + "grad_norm": 0.6110545992851257, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8270 + }, + { + "epoch": 0.5946140035906643, + "grad_norm": 0.5186824202537537, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 8280 + }, + { + "epoch": 0.5953321364452424, + "grad_norm": 0.7003735303878784, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 8290 + }, + { + "epoch": 0.5960502692998204, + "grad_norm": 0.4606216549873352, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 8300 + }, + { + "epoch": 0.5967684021543985, + "grad_norm": 0.5903441309928894, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 8310 + }, + { + "epoch": 0.5974865350089766, + "grad_norm": 0.7916744947433472, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 8320 + }, + { + "epoch": 0.5982046678635548, + "grad_norm": 0.5506401062011719, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 8330 + }, + { + "epoch": 0.5989228007181329, + "grad_norm": 0.5749204158782959, + "learning_rate": 0.0002, + "loss": 0.8137, + "step": 8340 + }, + { + "epoch": 0.599640933572711, + "grad_norm": 0.6807544827461243, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 8350 + }, + { + "epoch": 0.6003590664272891, + "grad_norm": 0.5782986283302307, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 8360 + }, + { + "epoch": 0.6010771992818671, + "grad_norm": 0.7336342334747314, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 8370 + }, + { + "epoch": 0.6017953321364452, + "grad_norm": 0.5762712955474854, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.6025134649910233, + "grad_norm": 0.5726776719093323, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 8390 + }, + { + "epoch": 0.6032315978456014, + "grad_norm": 0.5355535745620728, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 8400 + }, + { + "epoch": 0.6039497307001795, + "grad_norm": 0.6762161254882812, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 8410 + }, + { + "epoch": 0.6046678635547577, + "grad_norm": 0.8200717568397522, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 8420 + }, + { + "epoch": 0.6053859964093358, + "grad_norm": 0.5600009560585022, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 8430 + }, + { + "epoch": 0.6061041292639138, + "grad_norm": 0.6465966105461121, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 8440 + }, + { + "epoch": 0.6068222621184919, + "grad_norm": 0.5176072120666504, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 8450 + }, + { + "epoch": 0.60754039497307, + "grad_norm": 0.5777280926704407, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 8460 + }, + { + "epoch": 0.6082585278276481, + "grad_norm": 0.5989252924919128, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 8470 + }, + { + "epoch": 0.6089766606822262, + "grad_norm": 0.5207306742668152, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 8480 + }, + { + "epoch": 0.6096947935368043, + "grad_norm": 0.5242675542831421, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 8490 + }, + { + "epoch": 0.6104129263913824, + "grad_norm": 0.5631455183029175, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 8500 + }, + { + "epoch": 0.6111310592459605, + "grad_norm": 0.65207439661026, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 8510 + }, + { + "epoch": 0.6118491921005386, + "grad_norm": 0.5808899998664856, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8520 + }, + { + "epoch": 0.6125673249551167, + "grad_norm": 0.558127760887146, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 8530 + }, + { + "epoch": 0.6132854578096948, + "grad_norm": 0.6063143014907837, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 8540 + }, + { + "epoch": 0.6140035906642729, + "grad_norm": 0.5491744875907898, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 8550 + }, + { + "epoch": 0.614721723518851, + "grad_norm": 0.5105780959129333, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 8560 + }, + { + "epoch": 0.6154398563734291, + "grad_norm": 0.6892395615577698, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 8570 + }, + { + "epoch": 0.6161579892280071, + "grad_norm": 0.7411758899688721, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8580 + }, + { + "epoch": 0.6168761220825852, + "grad_norm": 0.6745429635047913, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 8590 + }, + { + "epoch": 0.6175942549371634, + "grad_norm": 0.596007227897644, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 8600 + }, + { + "epoch": 0.6183123877917415, + "grad_norm": 0.6751060485839844, + "learning_rate": 0.0002, + "loss": 0.7963, + "step": 8610 + }, + { + "epoch": 0.6190305206463196, + "grad_norm": 0.711124837398529, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 8620 + }, + { + "epoch": 0.6197486535008977, + "grad_norm": 0.6110914945602417, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 8630 + }, + { + "epoch": 0.6204667863554758, + "grad_norm": 0.5687659978866577, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 8640 + }, + { + "epoch": 0.6211849192100538, + "grad_norm": 0.7025772929191589, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8650 + }, + { + "epoch": 0.6219030520646319, + "grad_norm": 0.6456184983253479, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 8660 + }, + { + "epoch": 0.62262118491921, + "grad_norm": 0.5317023992538452, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 8670 + }, + { + "epoch": 0.6233393177737881, + "grad_norm": 0.5531691908836365, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 8680 + }, + { + "epoch": 0.6240574506283663, + "grad_norm": 0.6063531637191772, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 8690 + }, + { + "epoch": 0.6247755834829444, + "grad_norm": 1.094390630722046, + "learning_rate": 0.0002, + "loss": 0.7943, + "step": 8700 + }, + { + "epoch": 0.6254937163375225, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 8710 + }, + { + "epoch": 0.6262118491921005, + "grad_norm": 0.5470370054244995, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 8720 + }, + { + "epoch": 0.6269299820466786, + "grad_norm": 0.5852634310722351, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 8730 + }, + { + "epoch": 0.6276481149012567, + "grad_norm": 0.6120240092277527, + "learning_rate": 0.0002, + "loss": 0.8712, + "step": 8740 + }, + { + "epoch": 0.6283662477558348, + "grad_norm": 0.5608004927635193, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 8750 + }, + { + "epoch": 0.6290843806104129, + "grad_norm": 0.5980432033538818, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 8760 + }, + { + "epoch": 0.629802513464991, + "grad_norm": 0.5670580863952637, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 8770 + }, + { + "epoch": 0.6305206463195692, + "grad_norm": 0.5931687951087952, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 8780 + }, + { + "epoch": 0.6312387791741472, + "grad_norm": 0.7872577905654907, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 8790 + }, + { + "epoch": 0.6319569120287253, + "grad_norm": 0.6355181336402893, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 8800 + }, + { + "epoch": 0.6326750448833034, + "grad_norm": 0.501913845539093, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 8810 + }, + { + "epoch": 0.6333931777378815, + "grad_norm": 0.5956716537475586, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 8820 + }, + { + "epoch": 0.6341113105924596, + "grad_norm": 0.6448253393173218, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 8830 + }, + { + "epoch": 0.6348294434470377, + "grad_norm": 0.6139631271362305, + "learning_rate": 0.0002, + "loss": 0.7878, + "step": 8840 + }, + { + "epoch": 0.6355475763016158, + "grad_norm": 0.5894306302070618, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 8850 + }, + { + "epoch": 0.6362657091561938, + "grad_norm": 0.8724799752235413, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 8860 + }, + { + "epoch": 0.636983842010772, + "grad_norm": 0.5413858890533447, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 8870 + }, + { + "epoch": 0.6377019748653501, + "grad_norm": 0.5993430614471436, + "learning_rate": 0.0002, + "loss": 0.8175, + "step": 8880 + }, + { + "epoch": 0.6384201077199282, + "grad_norm": 0.539415717124939, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 8890 + }, + { + "epoch": 0.6391382405745063, + "grad_norm": 0.600125789642334, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 0.6398563734290844, + "grad_norm": 0.5597978234291077, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 8910 + }, + { + "epoch": 0.6405745062836625, + "grad_norm": 0.6262031197547913, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 8920 + }, + { + "epoch": 0.6412926391382405, + "grad_norm": 0.72662752866745, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 8930 + }, + { + "epoch": 0.6420107719928186, + "grad_norm": 0.613002598285675, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 8940 + }, + { + "epoch": 0.6427289048473968, + "grad_norm": 0.6511827707290649, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 8950 + }, + { + "epoch": 0.6434470377019749, + "grad_norm": 0.5383973717689514, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 8960 + }, + { + "epoch": 0.644165170556553, + "grad_norm": 0.5236184597015381, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 8970 + }, + { + "epoch": 0.6448833034111311, + "grad_norm": 0.5938544273376465, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 8980 + }, + { + "epoch": 0.6456014362657092, + "grad_norm": 0.4594680964946747, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 8990 + }, + { + "epoch": 0.6463195691202872, + "grad_norm": 0.6314211487770081, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 9000 + }, + { + "epoch": 0.6470377019748653, + "grad_norm": 0.6291103363037109, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 9010 + }, + { + "epoch": 0.6477558348294434, + "grad_norm": 0.5888266563415527, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 9020 + }, + { + "epoch": 0.6484739676840215, + "grad_norm": 0.5613022446632385, + "learning_rate": 0.0002, + "loss": 0.7685, + "step": 9030 + }, + { + "epoch": 0.6491921005385997, + "grad_norm": 0.7219604253768921, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 9040 + }, + { + "epoch": 0.6499102333931778, + "grad_norm": 0.5846529006958008, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 9050 + }, + { + "epoch": 0.6506283662477559, + "grad_norm": 0.7264063954353333, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 9060 + }, + { + "epoch": 0.6513464991023339, + "grad_norm": 0.5797538757324219, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9070 + }, + { + "epoch": 0.652064631956912, + "grad_norm": 0.4857395887374878, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9080 + }, + { + "epoch": 0.6527827648114901, + "grad_norm": 0.5044030547142029, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 9090 + }, + { + "epoch": 0.6535008976660682, + "grad_norm": 0.6105342507362366, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 9100 + }, + { + "epoch": 0.6542190305206463, + "grad_norm": 0.6408740282058716, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 9110 + }, + { + "epoch": 0.6549371633752245, + "grad_norm": 0.7474880814552307, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 9120 + }, + { + "epoch": 0.6556552962298026, + "grad_norm": 0.584768533706665, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 9130 + }, + { + "epoch": 0.6563734290843806, + "grad_norm": 0.6368113160133362, + "learning_rate": 0.0002, + "loss": 0.8273, + "step": 9140 + }, + { + "epoch": 0.6570915619389587, + "grad_norm": 0.693631649017334, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 9150 + }, + { + "epoch": 0.6578096947935368, + "grad_norm": 0.6094512343406677, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 9160 + }, + { + "epoch": 0.6585278276481149, + "grad_norm": 0.7154942750930786, + "learning_rate": 0.0002, + "loss": 0.8269, + "step": 9170 + }, + { + "epoch": 0.659245960502693, + "grad_norm": 0.5749237537384033, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9180 + }, + { + "epoch": 0.6599640933572711, + "grad_norm": 0.6214450001716614, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 9190 + }, + { + "epoch": 0.6606822262118492, + "grad_norm": 0.6357814073562622, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9200 + }, + { + "epoch": 0.6614003590664272, + "grad_norm": 0.5677326917648315, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 9210 + }, + { + "epoch": 0.6621184919210054, + "grad_norm": 0.5432633757591248, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 9220 + }, + { + "epoch": 0.6628366247755835, + "grad_norm": 0.43935060501098633, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 9230 + }, + { + "epoch": 0.6635547576301616, + "grad_norm": 0.5350922346115112, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 9240 + }, + { + "epoch": 0.6642728904847397, + "grad_norm": 0.7745687365531921, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 9250 + }, + { + "epoch": 0.6649910233393178, + "grad_norm": 0.5767113566398621, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9260 + }, + { + "epoch": 0.6657091561938959, + "grad_norm": 0.49304983019828796, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 9270 + }, + { + "epoch": 0.6664272890484739, + "grad_norm": 0.6355269551277161, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 9280 + }, + { + "epoch": 0.667145421903052, + "grad_norm": 0.5539451241493225, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 9290 + }, + { + "epoch": 0.6678635547576302, + "grad_norm": 0.5225138068199158, + "learning_rate": 0.0002, + "loss": 0.7888, + "step": 9300 + }, + { + "epoch": 0.6685816876122083, + "grad_norm": 0.5435736179351807, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 9310 + }, + { + "epoch": 0.6692998204667864, + "grad_norm": 0.611266553401947, + "learning_rate": 0.0002, + "loss": 0.8284, + "step": 9320 + }, + { + "epoch": 0.6700179533213645, + "grad_norm": 0.5880926251411438, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 9330 + }, + { + "epoch": 0.6707360861759426, + "grad_norm": 0.5301468372344971, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9340 + }, + { + "epoch": 0.6714542190305206, + "grad_norm": 0.5614377856254578, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 9350 + }, + { + "epoch": 0.6721723518850987, + "grad_norm": 0.7177342176437378, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 9360 + }, + { + "epoch": 0.6728904847396768, + "grad_norm": 0.5187423825263977, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9370 + }, + { + "epoch": 0.6736086175942549, + "grad_norm": 0.49305087327957153, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 9380 + }, + { + "epoch": 0.6743267504488331, + "grad_norm": 0.555867612361908, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 9390 + }, + { + "epoch": 0.6750448833034112, + "grad_norm": 0.8308040499687195, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 9400 + }, + { + "epoch": 0.6757630161579893, + "grad_norm": 0.6522438526153564, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 9410 + }, + { + "epoch": 0.6764811490125673, + "grad_norm": 0.5768371224403381, + "learning_rate": 0.0002, + "loss": 0.8283, + "step": 9420 + }, + { + "epoch": 0.6771992818671454, + "grad_norm": 0.783802330493927, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 9430 + }, + { + "epoch": 0.6779174147217235, + "grad_norm": 0.5246656537055969, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 9440 + }, + { + "epoch": 0.6786355475763016, + "grad_norm": 0.6630974411964417, + "learning_rate": 0.0002, + "loss": 0.7866, + "step": 9450 + }, + { + "epoch": 0.6793536804308797, + "grad_norm": 0.5012770295143127, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 9460 + }, + { + "epoch": 0.6800718132854578, + "grad_norm": 0.6208643317222595, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 9470 + }, + { + "epoch": 0.680789946140036, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9480 + }, + { + "epoch": 0.681508078994614, + "grad_norm": 0.6613174080848694, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 9490 + }, + { + "epoch": 0.6822262118491921, + "grad_norm": 0.6417899131774902, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9500 + }, + { + "epoch": 0.6829443447037702, + "grad_norm": 0.5060321092605591, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 9510 + }, + { + "epoch": 0.6836624775583483, + "grad_norm": 0.586670458316803, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 9520 + }, + { + "epoch": 0.6843806104129264, + "grad_norm": 0.6607828736305237, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 9530 + }, + { + "epoch": 0.6850987432675045, + "grad_norm": 0.5142775177955627, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9540 + }, + { + "epoch": 0.6858168761220825, + "grad_norm": 0.741000771522522, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 9550 + }, + { + "epoch": 0.6865350089766606, + "grad_norm": 0.4687826335430145, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 9560 + }, + { + "epoch": 0.6872531418312388, + "grad_norm": 0.6452056169509888, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 9570 + }, + { + "epoch": 0.6879712746858169, + "grad_norm": 0.6393555402755737, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 9580 + }, + { + "epoch": 0.688689407540395, + "grad_norm": 0.4907757043838501, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 9590 + }, + { + "epoch": 0.6894075403949731, + "grad_norm": 0.5380825996398926, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 9600 + }, + { + "epoch": 0.6901256732495512, + "grad_norm": 0.5657393932342529, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 9610 + }, + { + "epoch": 0.6908438061041292, + "grad_norm": 0.8505447506904602, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 9620 + }, + { + "epoch": 0.6915619389587073, + "grad_norm": 0.5389836430549622, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 9630 + }, + { + "epoch": 0.6922800718132854, + "grad_norm": 0.4977441728115082, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 9640 + }, + { + "epoch": 0.6929982046678635, + "grad_norm": 0.5855389833450317, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 9650 + }, + { + "epoch": 0.6937163375224417, + "grad_norm": 0.633994996547699, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 9660 + }, + { + "epoch": 0.6944344703770198, + "grad_norm": 0.5592191815376282, + "learning_rate": 0.0002, + "loss": 0.7918, + "step": 9670 + }, + { + "epoch": 0.6951526032315979, + "grad_norm": 0.6030594706535339, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9680 + }, + { + "epoch": 0.6958707360861759, + "grad_norm": 0.6782388687133789, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 9690 + }, + { + "epoch": 0.696588868940754, + "grad_norm": 0.6777627468109131, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 9700 + }, + { + "epoch": 0.6973070017953321, + "grad_norm": 0.5674123764038086, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 9710 + }, + { + "epoch": 0.6980251346499102, + "grad_norm": 0.5280387997627258, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 9720 + }, + { + "epoch": 0.6987432675044883, + "grad_norm": 0.5471981763839722, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 9730 + }, + { + "epoch": 0.6994614003590665, + "grad_norm": 0.6751061677932739, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9740 + }, + { + "epoch": 0.7001795332136446, + "grad_norm": 0.5942487716674805, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 9750 + }, + { + "epoch": 0.7008976660682226, + "grad_norm": 0.6165713667869568, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 9760 + }, + { + "epoch": 0.7016157989228007, + "grad_norm": 0.5745091438293457, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 9770 + }, + { + "epoch": 0.7023339317773788, + "grad_norm": 0.600308358669281, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 9780 + }, + { + "epoch": 0.7030520646319569, + "grad_norm": 0.6448577046394348, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 9790 + }, + { + "epoch": 0.703770197486535, + "grad_norm": 0.5662767291069031, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9800 + }, + { + "epoch": 0.7044883303411131, + "grad_norm": 0.6490433812141418, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 9810 + }, + { + "epoch": 0.7052064631956912, + "grad_norm": 0.6126134991645813, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 9820 + }, + { + "epoch": 0.7059245960502692, + "grad_norm": 0.7181116938591003, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 9830 + }, + { + "epoch": 0.7066427289048474, + "grad_norm": 0.7805212140083313, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 9840 + }, + { + "epoch": 0.7073608617594255, + "grad_norm": 0.7521958947181702, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 9850 + }, + { + "epoch": 0.7080789946140036, + "grad_norm": 0.5610787868499756, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 9860 + }, + { + "epoch": 0.7087971274685817, + "grad_norm": 0.7026229500770569, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 9870 + }, + { + "epoch": 0.7095152603231598, + "grad_norm": 0.551691472530365, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 9880 + }, + { + "epoch": 0.7102333931777379, + "grad_norm": 0.5841995477676392, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 9890 + }, + { + "epoch": 0.7109515260323159, + "grad_norm": 0.7170061469078064, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 9900 + }, + { + "epoch": 0.711669658886894, + "grad_norm": 0.49836990237236023, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 9910 + }, + { + "epoch": 0.7123877917414722, + "grad_norm": 0.5234556794166565, + "learning_rate": 0.0002, + "loss": 0.7667, + "step": 9920 + }, + { + "epoch": 0.7131059245960503, + "grad_norm": 0.7590384483337402, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 9930 + }, + { + "epoch": 0.7138240574506284, + "grad_norm": 0.5657515525817871, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 9940 + }, + { + "epoch": 0.7145421903052065, + "grad_norm": 0.5969128012657166, + "learning_rate": 0.0002, + "loss": 0.8184, + "step": 9950 + }, + { + "epoch": 0.7152603231597846, + "grad_norm": 0.7136867046356201, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 9960 + }, + { + "epoch": 0.7159784560143626, + "grad_norm": 0.6774699091911316, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 9970 + }, + { + "epoch": 0.7166965888689407, + "grad_norm": 0.6066371202468872, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 9980 + }, + { + "epoch": 0.7174147217235188, + "grad_norm": 0.7355279922485352, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 9990 + }, + { + "epoch": 0.718132854578097, + "grad_norm": 0.7996646761894226, + "learning_rate": 0.0002, + "loss": 0.7643, + "step": 10000 + }, + { + "epoch": 0.7188509874326751, + "grad_norm": 0.628839910030365, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 10010 + }, + { + "epoch": 0.7195691202872532, + "grad_norm": 0.5472931265830994, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 10020 + }, + { + "epoch": 0.7202872531418313, + "grad_norm": 0.5776344537734985, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 10030 + }, + { + "epoch": 0.7210053859964093, + "grad_norm": 0.5041707158088684, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10040 + }, + { + "epoch": 0.7217235188509874, + "grad_norm": 0.5965308547019958, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 10050 + }, + { + "epoch": 0.7224416517055655, + "grad_norm": 0.5892689228057861, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 10060 + }, + { + "epoch": 0.7231597845601436, + "grad_norm": 0.5695884227752686, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 10070 + }, + { + "epoch": 0.7238779174147217, + "grad_norm": 0.6547690629959106, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 10080 + }, + { + "epoch": 0.7245960502692999, + "grad_norm": 0.6759928464889526, + "learning_rate": 0.0002, + "loss": 0.7978, + "step": 10090 + }, + { + "epoch": 0.725314183123878, + "grad_norm": 0.6829725503921509, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 10100 + }, + { + "epoch": 0.726032315978456, + "grad_norm": 0.5242751240730286, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 10110 + }, + { + "epoch": 0.7267504488330341, + "grad_norm": 0.6947014927864075, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 10120 + }, + { + "epoch": 0.7274685816876122, + "grad_norm": 0.6094982624053955, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 10130 + }, + { + "epoch": 0.7281867145421903, + "grad_norm": 0.628461480140686, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 10140 + }, + { + "epoch": 0.7289048473967684, + "grad_norm": 0.4952087104320526, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10150 + }, + { + "epoch": 0.7296229802513465, + "grad_norm": 0.6917221546173096, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 10160 + }, + { + "epoch": 0.7303411131059246, + "grad_norm": 0.6866413354873657, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10170 + }, + { + "epoch": 0.7310592459605026, + "grad_norm": 0.5505863428115845, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 10180 + }, + { + "epoch": 0.7317773788150808, + "grad_norm": 0.5903199911117554, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 10190 + }, + { + "epoch": 0.7324955116696589, + "grad_norm": 0.5001798272132874, + "learning_rate": 0.0002, + "loss": 0.8072, + "step": 10200 + }, + { + "epoch": 0.733213644524237, + "grad_norm": 0.5117581486701965, + "learning_rate": 0.0002, + "loss": 0.7934, + "step": 10210 + }, + { + "epoch": 0.7339317773788151, + "grad_norm": 0.7716088891029358, + "learning_rate": 0.0002, + "loss": 0.8364, + "step": 10220 + }, + { + "epoch": 0.7346499102333932, + "grad_norm": 0.5973874926567078, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 10230 + }, + { + "epoch": 0.7353680430879713, + "grad_norm": 0.6433483362197876, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 10240 + }, + { + "epoch": 0.7360861759425493, + "grad_norm": 0.6241081357002258, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10250 + }, + { + "epoch": 0.7368043087971274, + "grad_norm": 0.7198845744132996, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 10260 + }, + { + "epoch": 0.7375224416517056, + "grad_norm": 0.5879023671150208, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 10270 + }, + { + "epoch": 0.7382405745062837, + "grad_norm": 0.5810162425041199, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 10280 + }, + { + "epoch": 0.7389587073608618, + "grad_norm": 0.6336500644683838, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 10290 + }, + { + "epoch": 0.7396768402154399, + "grad_norm": 0.5627583861351013, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 10300 + }, + { + "epoch": 0.740394973070018, + "grad_norm": 0.5396066904067993, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 10310 + }, + { + "epoch": 0.741113105924596, + "grad_norm": 0.5519505143165588, + "learning_rate": 0.0002, + "loss": 0.7698, + "step": 10320 + }, + { + "epoch": 0.7418312387791741, + "grad_norm": 0.628710925579071, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 10330 + }, + { + "epoch": 0.7425493716337522, + "grad_norm": 0.6466957926750183, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10340 + }, + { + "epoch": 0.7432675044883303, + "grad_norm": 0.6269286274909973, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 10350 + }, + { + "epoch": 0.7439856373429085, + "grad_norm": 0.6985455751419067, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 10360 + }, + { + "epoch": 0.7447037701974866, + "grad_norm": 0.6203648447990417, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 10370 + }, + { + "epoch": 0.7454219030520647, + "grad_norm": 0.6524295210838318, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 10380 + }, + { + "epoch": 0.7461400359066427, + "grad_norm": 0.6108002662658691, + "learning_rate": 0.0002, + "loss": 0.8005, + "step": 10390 + }, + { + "epoch": 0.7468581687612208, + "grad_norm": 0.5196276903152466, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 10400 + }, + { + "epoch": 0.7475763016157989, + "grad_norm": 0.6207506656646729, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 10410 + }, + { + "epoch": 0.748294434470377, + "grad_norm": 0.6015686988830566, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 10420 + }, + { + "epoch": 0.7490125673249551, + "grad_norm": 0.6402649879455566, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 10430 + }, + { + "epoch": 0.7497307001795332, + "grad_norm": 0.7816081047058105, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 10440 + }, + { + "epoch": 0.7504488330341114, + "grad_norm": 0.6148143410682678, + "learning_rate": 0.0002, + "loss": 0.8021, + "step": 10450 + }, + { + "epoch": 0.7511669658886894, + "grad_norm": 0.6496613621711731, + "learning_rate": 0.0002, + "loss": 0.7986, + "step": 10460 + }, + { + "epoch": 0.7518850987432675, + "grad_norm": 0.49158045649528503, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 10470 + }, + { + "epoch": 0.7526032315978456, + "grad_norm": 0.8629217743873596, + "learning_rate": 0.0002, + "loss": 0.8098, + "step": 10480 + }, + { + "epoch": 0.7533213644524237, + "grad_norm": 0.6800066828727722, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 10490 + }, + { + "epoch": 0.7540394973070018, + "grad_norm": 0.6480063199996948, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 10500 + }, + { + "epoch": 0.7547576301615799, + "grad_norm": 0.5740751028060913, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 10510 + }, + { + "epoch": 0.755475763016158, + "grad_norm": 0.7182627320289612, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 10520 + }, + { + "epoch": 0.756193895870736, + "grad_norm": 0.6482816934585571, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 10530 + }, + { + "epoch": 0.7569120287253142, + "grad_norm": 0.4937674105167389, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 10540 + }, + { + "epoch": 0.7576301615798923, + "grad_norm": 0.6818482875823975, + "learning_rate": 0.0002, + "loss": 0.7783, + "step": 10550 + }, + { + "epoch": 0.7583482944344704, + "grad_norm": 0.6375173926353455, + "learning_rate": 0.0002, + "loss": 0.8303, + "step": 10560 + }, + { + "epoch": 0.7590664272890485, + "grad_norm": 0.528798520565033, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 10570 + }, + { + "epoch": 0.7597845601436266, + "grad_norm": 0.42099910974502563, + "learning_rate": 0.0002, + "loss": 0.8435, + "step": 10580 + }, + { + "epoch": 0.7605026929982047, + "grad_norm": 0.529604434967041, + "learning_rate": 0.0002, + "loss": 0.8218, + "step": 10590 + }, + { + "epoch": 0.7612208258527827, + "grad_norm": 0.6236841082572937, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 10600 + }, + { + "epoch": 0.7619389587073608, + "grad_norm": 0.6194891929626465, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10610 + }, + { + "epoch": 0.762657091561939, + "grad_norm": 0.5206209421157837, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 10620 + }, + { + "epoch": 0.7633752244165171, + "grad_norm": 0.7981295585632324, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 10630 + }, + { + "epoch": 0.7640933572710952, + "grad_norm": 0.6113479137420654, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 10640 + }, + { + "epoch": 0.7648114901256733, + "grad_norm": 0.7025435566902161, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10650 + }, + { + "epoch": 0.7655296229802514, + "grad_norm": 0.46914348006248474, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 10660 + }, + { + "epoch": 0.7662477558348294, + "grad_norm": 0.6134725213050842, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 10670 + }, + { + "epoch": 0.7669658886894075, + "grad_norm": 0.583859920501709, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10680 + }, + { + "epoch": 0.7676840215439856, + "grad_norm": 0.511349081993103, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 10690 + }, + { + "epoch": 0.7684021543985637, + "grad_norm": 0.6467110514640808, + "learning_rate": 0.0002, + "loss": 0.8355, + "step": 10700 + }, + { + "epoch": 0.7691202872531419, + "grad_norm": 0.7210163474082947, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 10710 + }, + { + "epoch": 0.76983842010772, + "grad_norm": 0.6034521460533142, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 10720 + }, + { + "epoch": 0.7705565529622981, + "grad_norm": 0.6237271428108215, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 10730 + }, + { + "epoch": 0.7712746858168761, + "grad_norm": 0.664328396320343, + "learning_rate": 0.0002, + "loss": 0.8185, + "step": 10740 + }, + { + "epoch": 0.7719928186714542, + "grad_norm": 0.6550520062446594, + "learning_rate": 0.0002, + "loss": 0.8096, + "step": 10750 + }, + { + "epoch": 0.7727109515260323, + "grad_norm": 0.5103325843811035, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 10760 + }, + { + "epoch": 0.7734290843806104, + "grad_norm": 0.7171200513839722, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 10770 + }, + { + "epoch": 0.7741472172351885, + "grad_norm": 0.5947384834289551, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 10780 + }, + { + "epoch": 0.7748653500897666, + "grad_norm": 0.5293096899986267, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10790 + }, + { + "epoch": 0.7755834829443446, + "grad_norm": 0.6372577548027039, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10800 + }, + { + "epoch": 0.7763016157989228, + "grad_norm": 0.5738261938095093, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 10810 + }, + { + "epoch": 0.7770197486535009, + "grad_norm": 0.7309247255325317, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 10820 + }, + { + "epoch": 0.777737881508079, + "grad_norm": 0.8867193460464478, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 10830 + }, + { + "epoch": 0.7784560143626571, + "grad_norm": 0.6151437759399414, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 10840 + }, + { + "epoch": 0.7791741472172352, + "grad_norm": 0.5645464658737183, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 0.7798922800718133, + "grad_norm": 0.5118698477745056, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 10860 + }, + { + "epoch": 0.7806104129263913, + "grad_norm": 0.618181049823761, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 10870 + }, + { + "epoch": 0.7813285457809694, + "grad_norm": 0.7206462025642395, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 10880 + }, + { + "epoch": 0.7820466786355476, + "grad_norm": 0.7993820905685425, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 10890 + }, + { + "epoch": 0.7827648114901257, + "grad_norm": 0.5072754621505737, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 10900 + }, + { + "epoch": 0.7834829443447038, + "grad_norm": 0.5829088687896729, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 10910 + }, + { + "epoch": 0.7842010771992819, + "grad_norm": 0.5778957605361938, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 10920 + }, + { + "epoch": 0.78491921005386, + "grad_norm": 0.7237067222595215, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 10930 + }, + { + "epoch": 0.785637342908438, + "grad_norm": 0.5778013467788696, + "learning_rate": 0.0002, + "loss": 0.8357, + "step": 10940 + }, + { + "epoch": 0.7863554757630161, + "grad_norm": 0.6129629611968994, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10950 + }, + { + "epoch": 0.7870736086175942, + "grad_norm": 0.5637320876121521, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10960 + }, + { + "epoch": 0.7877917414721723, + "grad_norm": 0.6253715753555298, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 10970 + }, + { + "epoch": 0.7885098743267505, + "grad_norm": 0.6209888458251953, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 10980 + }, + { + "epoch": 0.7892280071813286, + "grad_norm": 1.0841948986053467, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 10990 + }, + { + "epoch": 0.7899461400359067, + "grad_norm": 0.6570560336112976, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 11000 + }, + { + "epoch": 0.7906642728904847, + "grad_norm": 0.4830388128757477, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11010 + }, + { + "epoch": 0.7913824057450628, + "grad_norm": 0.7607520222663879, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11020 + }, + { + "epoch": 0.7921005385996409, + "grad_norm": 0.8202590346336365, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 11030 + }, + { + "epoch": 0.792818671454219, + "grad_norm": 0.5640848278999329, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11040 + }, + { + "epoch": 0.7935368043087971, + "grad_norm": 0.7773675322532654, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 11050 + }, + { + "epoch": 0.7942549371633753, + "grad_norm": 0.664139986038208, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11060 + }, + { + "epoch": 0.7949730700179534, + "grad_norm": 0.6097795367240906, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 11070 + }, + { + "epoch": 0.7956912028725314, + "grad_norm": 0.9208881258964539, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 11080 + }, + { + "epoch": 0.7964093357271095, + "grad_norm": 0.6210731863975525, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 11090 + }, + { + "epoch": 0.7971274685816876, + "grad_norm": 0.7060235738754272, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 11100 + }, + { + "epoch": 0.7978456014362657, + "grad_norm": 0.48695266246795654, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 11110 + }, + { + "epoch": 0.7985637342908438, + "grad_norm": 0.6458830833435059, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 11120 + }, + { + "epoch": 0.7992818671454219, + "grad_norm": 0.572545051574707, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 11130 + }, + { + "epoch": 0.8, + "grad_norm": 0.5925027132034302, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 11140 + }, + { + "epoch": 0.800718132854578, + "grad_norm": 0.569622278213501, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 11150 + }, + { + "epoch": 0.8014362657091562, + "grad_norm": 0.537146806716919, + "learning_rate": 0.0002, + "loss": 0.7765, + "step": 11160 + }, + { + "epoch": 0.8021543985637343, + "grad_norm": 0.7118613719940186, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 11170 + }, + { + "epoch": 0.8028725314183124, + "grad_norm": 0.6183688044548035, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 11180 + }, + { + "epoch": 0.8035906642728905, + "grad_norm": 0.5187385082244873, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 11190 + }, + { + "epoch": 0.8043087971274686, + "grad_norm": 0.5422571301460266, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 11200 + }, + { + "epoch": 0.8050269299820467, + "grad_norm": 0.635050892829895, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 11210 + }, + { + "epoch": 0.8057450628366247, + "grad_norm": 0.6584872007369995, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 11220 + }, + { + "epoch": 0.8064631956912028, + "grad_norm": 0.624921977519989, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 11230 + }, + { + "epoch": 0.807181328545781, + "grad_norm": 0.6837546229362488, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 11240 + }, + { + "epoch": 0.8078994614003591, + "grad_norm": 0.5861160755157471, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11250 + }, + { + "epoch": 0.8086175942549372, + "grad_norm": 0.5751383900642395, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 11260 + }, + { + "epoch": 0.8093357271095153, + "grad_norm": 0.7181510329246521, + "learning_rate": 0.0002, + "loss": 0.8103, + "step": 11270 + }, + { + "epoch": 0.8100538599640934, + "grad_norm": 0.5862139463424683, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11280 + }, + { + "epoch": 0.8107719928186714, + "grad_norm": 0.4880113899707794, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 11290 + }, + { + "epoch": 0.8114901256732495, + "grad_norm": 0.565590500831604, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 11300 + }, + { + "epoch": 0.8122082585278276, + "grad_norm": 0.6171264052391052, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11310 + }, + { + "epoch": 0.8129263913824057, + "grad_norm": 0.5815969109535217, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 11320 + }, + { + "epoch": 0.8136445242369839, + "grad_norm": 0.5407653450965881, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 11330 + }, + { + "epoch": 0.814362657091562, + "grad_norm": 0.6990084648132324, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 11340 + }, + { + "epoch": 0.8150807899461401, + "grad_norm": 0.5845068097114563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 11350 + }, + { + "epoch": 0.8157989228007181, + "grad_norm": 0.5978701114654541, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 11360 + }, + { + "epoch": 0.8165170556552962, + "grad_norm": 0.6873053312301636, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 11370 + }, + { + "epoch": 0.8172351885098743, + "grad_norm": 0.7048654556274414, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 11380 + }, + { + "epoch": 0.8179533213644524, + "grad_norm": 0.7631531953811646, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 11390 + }, + { + "epoch": 0.8186714542190305, + "grad_norm": 0.704922080039978, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 11400 + }, + { + "epoch": 0.8193895870736086, + "grad_norm": 0.595460832118988, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 11410 + }, + { + "epoch": 0.8201077199281868, + "grad_norm": 0.5882242918014526, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 11420 + }, + { + "epoch": 0.8208258527827648, + "grad_norm": 0.6433175206184387, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 11430 + }, + { + "epoch": 0.8215439856373429, + "grad_norm": 0.6047986149787903, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 11440 + }, + { + "epoch": 0.822262118491921, + "grad_norm": 0.6462088823318481, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 11450 + }, + { + "epoch": 0.8229802513464991, + "grad_norm": 0.5558379888534546, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 11460 + }, + { + "epoch": 0.8236983842010772, + "grad_norm": 0.6745542287826538, + "learning_rate": 0.0002, + "loss": 0.7916, + "step": 11470 + }, + { + "epoch": 0.8244165170556553, + "grad_norm": 0.7082334756851196, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 11480 + }, + { + "epoch": 0.8251346499102334, + "grad_norm": 0.703889787197113, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11490 + }, + { + "epoch": 0.8258527827648114, + "grad_norm": 0.5261096358299255, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 11500 + }, + { + "epoch": 0.8265709156193896, + "grad_norm": 0.6009393930435181, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 11510 + }, + { + "epoch": 0.8272890484739677, + "grad_norm": 0.584274172782898, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 11520 + }, + { + "epoch": 0.8280071813285458, + "grad_norm": 0.6803238987922668, + "learning_rate": 0.0002, + "loss": 0.7926, + "step": 11530 + }, + { + "epoch": 0.8287253141831239, + "grad_norm": 0.6230084896087646, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 11540 + }, + { + "epoch": 0.829443447037702, + "grad_norm": 0.6090595722198486, + "learning_rate": 0.0002, + "loss": 0.7902, + "step": 11550 + }, + { + "epoch": 0.8301615798922801, + "grad_norm": 0.5292693376541138, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 11560 + }, + { + "epoch": 0.8308797127468581, + "grad_norm": 0.5675389766693115, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 11570 + }, + { + "epoch": 0.8315978456014362, + "grad_norm": 0.554874062538147, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 11580 + }, + { + "epoch": 0.8323159784560143, + "grad_norm": 0.8582373261451721, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 11590 + }, + { + "epoch": 0.8330341113105925, + "grad_norm": 0.5743035674095154, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 11600 + }, + { + "epoch": 0.8337522441651706, + "grad_norm": 0.5749582648277283, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11610 + }, + { + "epoch": 0.8344703770197487, + "grad_norm": 0.5207278728485107, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11620 + }, + { + "epoch": 0.8351885098743268, + "grad_norm": 0.6262611150741577, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 11630 + }, + { + "epoch": 0.8359066427289048, + "grad_norm": 0.5490066409111023, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 11640 + }, + { + "epoch": 0.8366247755834829, + "grad_norm": 0.6283167600631714, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 11650 + }, + { + "epoch": 0.837342908438061, + "grad_norm": 0.7701452374458313, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 11660 + }, + { + "epoch": 0.8380610412926391, + "grad_norm": 0.5825072526931763, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 11670 + }, + { + "epoch": 0.8387791741472173, + "grad_norm": 0.6119720935821533, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 11680 + }, + { + "epoch": 0.8394973070017954, + "grad_norm": 0.689383327960968, + "learning_rate": 0.0002, + "loss": 0.7995, + "step": 11690 + }, + { + "epoch": 0.8402154398563735, + "grad_norm": 0.5396560430526733, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 11700 + }, + { + "epoch": 0.8409335727109515, + "grad_norm": 0.577178955078125, + "learning_rate": 0.0002, + "loss": 0.8073, + "step": 11710 + }, + { + "epoch": 0.8416517055655296, + "grad_norm": 0.6652564406394958, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 11720 + }, + { + "epoch": 0.8423698384201077, + "grad_norm": 0.588377058506012, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 11730 + }, + { + "epoch": 0.8430879712746858, + "grad_norm": 0.6180438995361328, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 11740 + }, + { + "epoch": 0.8438061041292639, + "grad_norm": 0.6897811889648438, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11750 + }, + { + "epoch": 0.844524236983842, + "grad_norm": 0.5826608538627625, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 11760 + }, + { + "epoch": 0.8452423698384202, + "grad_norm": 0.6511976718902588, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 11770 + }, + { + "epoch": 0.8459605026929982, + "grad_norm": 0.4738382399082184, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 11780 + }, + { + "epoch": 0.8466786355475763, + "grad_norm": 0.541780948638916, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 11790 + }, + { + "epoch": 0.8473967684021544, + "grad_norm": 0.6115241050720215, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 11800 + }, + { + "epoch": 0.8481149012567325, + "grad_norm": 0.7067801356315613, + "learning_rate": 0.0002, + "loss": 0.834, + "step": 11810 + }, + { + "epoch": 0.8488330341113106, + "grad_norm": 0.5602791905403137, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 11820 + }, + { + "epoch": 0.8495511669658887, + "grad_norm": 0.6968005299568176, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 11830 + }, + { + "epoch": 0.8502692998204668, + "grad_norm": 0.621132493019104, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11840 + }, + { + "epoch": 0.8509874326750448, + "grad_norm": 0.5777568817138672, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 11850 + }, + { + "epoch": 0.851705565529623, + "grad_norm": 0.6468178629875183, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 11860 + }, + { + "epoch": 0.8524236983842011, + "grad_norm": 0.6216070652008057, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 11870 + }, + { + "epoch": 0.8531418312387792, + "grad_norm": 0.7402005791664124, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 11880 + }, + { + "epoch": 0.8538599640933573, + "grad_norm": 0.5192958116531372, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 11890 + }, + { + "epoch": 0.8545780969479354, + "grad_norm": 0.6050501465797424, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 11900 + }, + { + "epoch": 0.8552962298025135, + "grad_norm": 0.5363124012947083, + "learning_rate": 0.0002, + "loss": 0.8131, + "step": 11910 + }, + { + "epoch": 0.8560143626570915, + "grad_norm": 0.525288462638855, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 11920 + }, + { + "epoch": 0.8567324955116696, + "grad_norm": 0.6129848957061768, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 11930 + }, + { + "epoch": 0.8574506283662477, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 11940 + }, + { + "epoch": 0.8581687612208259, + "grad_norm": 0.5862830281257629, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 11950 + }, + { + "epoch": 0.858886894075404, + "grad_norm": 0.7078025341033936, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 11960 + }, + { + "epoch": 0.8596050269299821, + "grad_norm": 0.6600908637046814, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 11970 + }, + { + "epoch": 0.8603231597845602, + "grad_norm": 0.5914377570152283, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 11980 + }, + { + "epoch": 0.8610412926391382, + "grad_norm": 0.7844575047492981, + "learning_rate": 0.0002, + "loss": 0.8222, + "step": 11990 + }, + { + "epoch": 0.8617594254937163, + "grad_norm": 0.6605148315429688, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 12000 + }, + { + "epoch": 0.8624775583482944, + "grad_norm": 0.6320111155509949, + "learning_rate": 0.0002, + "loss": 0.8066, + "step": 12010 + }, + { + "epoch": 0.8631956912028725, + "grad_norm": 0.5833557844161987, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 12020 + }, + { + "epoch": 0.8639138240574507, + "grad_norm": 0.5322666764259338, + "learning_rate": 0.0002, + "loss": 0.8016, + "step": 12030 + }, + { + "epoch": 0.8646319569120288, + "grad_norm": 0.568696141242981, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 12040 + }, + { + "epoch": 0.8653500897666068, + "grad_norm": 0.5739135146141052, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 12050 + }, + { + "epoch": 0.8660682226211849, + "grad_norm": 0.6667993068695068, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 12060 + }, + { + "epoch": 0.866786355475763, + "grad_norm": 0.5393701195716858, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 12070 + }, + { + "epoch": 0.8675044883303411, + "grad_norm": 0.7036312818527222, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 12080 + }, + { + "epoch": 0.8682226211849192, + "grad_norm": 0.5851739048957825, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 12090 + }, + { + "epoch": 0.8689407540394973, + "grad_norm": 0.6554462909698486, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 12100 + }, + { + "epoch": 0.8696588868940754, + "grad_norm": 0.8224838376045227, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 12110 + }, + { + "epoch": 0.8703770197486534, + "grad_norm": 0.513981819152832, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 12120 + }, + { + "epoch": 0.8710951526032316, + "grad_norm": 0.6913988590240479, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 12130 + }, + { + "epoch": 0.8718132854578097, + "grad_norm": 0.5539003610610962, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 12140 + }, + { + "epoch": 0.8725314183123878, + "grad_norm": 0.6216937303543091, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 12150 + }, + { + "epoch": 0.8732495511669659, + "grad_norm": 0.5594495534896851, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 12160 + }, + { + "epoch": 0.873967684021544, + "grad_norm": 0.6025309562683105, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 12170 + }, + { + "epoch": 0.8746858168761221, + "grad_norm": 0.5285239815711975, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 12180 + }, + { + "epoch": 0.8754039497307001, + "grad_norm": 1.0394607782363892, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 12190 + }, + { + "epoch": 0.8761220825852782, + "grad_norm": 0.5128031373023987, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 12200 + }, + { + "epoch": 0.8768402154398564, + "grad_norm": 0.5883685946464539, + "learning_rate": 0.0002, + "loss": 0.8113, + "step": 12210 + }, + { + "epoch": 0.8775583482944345, + "grad_norm": 0.593204915523529, + "learning_rate": 0.0002, + "loss": 0.7493, + "step": 12220 + }, + { + "epoch": 0.8782764811490126, + "grad_norm": 0.7141679525375366, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 12230 + }, + { + "epoch": 0.8789946140035907, + "grad_norm": 0.6381585597991943, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 12240 + }, + { + "epoch": 0.8797127468581688, + "grad_norm": 0.7076981067657471, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12250 + }, + { + "epoch": 0.8804308797127468, + "grad_norm": 0.8046461939811707, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 12260 + }, + { + "epoch": 0.8811490125673249, + "grad_norm": 0.635160505771637, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 12270 + }, + { + "epoch": 0.881867145421903, + "grad_norm": 0.6388354301452637, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 12280 + }, + { + "epoch": 0.8825852782764811, + "grad_norm": 0.5612906217575073, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 12290 + }, + { + "epoch": 0.8833034111310593, + "grad_norm": 0.6716228723526001, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 12300 + }, + { + "epoch": 0.8840215439856374, + "grad_norm": 0.6488762497901917, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 12310 + }, + { + "epoch": 0.8847396768402155, + "grad_norm": 0.5770853757858276, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 12320 + }, + { + "epoch": 0.8854578096947935, + "grad_norm": 0.5006616711616516, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 12330 + }, + { + "epoch": 0.8861759425493716, + "grad_norm": 0.6428417563438416, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 12340 + }, + { + "epoch": 0.8868940754039497, + "grad_norm": 0.5721977949142456, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12350 + }, + { + "epoch": 0.8876122082585278, + "grad_norm": 0.7000266313552856, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 12360 + }, + { + "epoch": 0.8883303411131059, + "grad_norm": 0.5252631306648254, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 12370 + }, + { + "epoch": 0.889048473967684, + "grad_norm": 0.5788044929504395, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 12380 + }, + { + "epoch": 0.8897666068222622, + "grad_norm": 0.6730653643608093, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 12390 + }, + { + "epoch": 0.8904847396768402, + "grad_norm": 0.5556851029396057, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 12400 + }, + { + "epoch": 0.8912028725314183, + "grad_norm": 0.616189181804657, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 12410 + }, + { + "epoch": 0.8919210053859964, + "grad_norm": 0.6360940337181091, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 12420 + }, + { + "epoch": 0.8926391382405745, + "grad_norm": 0.5832887887954712, + "learning_rate": 0.0002, + "loss": 0.8088, + "step": 12430 + }, + { + "epoch": 0.8933572710951526, + "grad_norm": 0.8319168090820312, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 12440 + }, + { + "epoch": 0.8940754039497307, + "grad_norm": 0.5415005087852478, + "learning_rate": 0.0002, + "loss": 0.8597, + "step": 12450 + }, + { + "epoch": 0.8947935368043088, + "grad_norm": 0.4959808588027954, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 12460 + }, + { + "epoch": 0.8955116696588868, + "grad_norm": 0.5102260708808899, + "learning_rate": 0.0002, + "loss": 0.8493, + "step": 12470 + }, + { + "epoch": 0.896229802513465, + "grad_norm": 0.773972749710083, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12480 + }, + { + "epoch": 0.8969479353680431, + "grad_norm": 0.6314513087272644, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 12490 + }, + { + "epoch": 0.8976660682226212, + "grad_norm": 0.6503705382347107, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 12500 + }, + { + "epoch": 0.8983842010771993, + "grad_norm": 0.5702024102210999, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 12510 + }, + { + "epoch": 0.8991023339317774, + "grad_norm": 0.7222756743431091, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 12520 + }, + { + "epoch": 0.8998204667863555, + "grad_norm": 0.7242336869239807, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 12530 + }, + { + "epoch": 0.9005385996409335, + "grad_norm": 0.625769317150116, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 12540 + }, + { + "epoch": 0.9012567324955116, + "grad_norm": 0.6003357172012329, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 12550 + }, + { + "epoch": 0.9019748653500897, + "grad_norm": 0.6089374423027039, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 12560 + }, + { + "epoch": 0.9026929982046679, + "grad_norm": 0.6232544183731079, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 12570 + }, + { + "epoch": 0.903411131059246, + "grad_norm": 0.5426769256591797, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 12580 + }, + { + "epoch": 0.9041292639138241, + "grad_norm": 0.5711943507194519, + "learning_rate": 0.0002, + "loss": 0.8023, + "step": 12590 + }, + { + "epoch": 0.9048473967684022, + "grad_norm": 0.5287838578224182, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 12600 + }, + { + "epoch": 0.9055655296229802, + "grad_norm": 0.6192951798439026, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 12610 + }, + { + "epoch": 0.9062836624775583, + "grad_norm": 0.493082195520401, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 12620 + }, + { + "epoch": 0.9070017953321364, + "grad_norm": 0.7668463587760925, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 12630 + }, + { + "epoch": 0.9077199281867145, + "grad_norm": 0.6298037767410278, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 12640 + }, + { + "epoch": 0.9084380610412927, + "grad_norm": 0.5502580404281616, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 12650 + }, + { + "epoch": 0.9091561938958708, + "grad_norm": 0.5525170564651489, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 0.9098743267504489, + "grad_norm": 0.9753695726394653, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 12670 + }, + { + "epoch": 0.9105924596050269, + "grad_norm": 0.611427366733551, + "learning_rate": 0.0002, + "loss": 0.872, + "step": 12680 + }, + { + "epoch": 0.911310592459605, + "grad_norm": 0.5141594409942627, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 12690 + }, + { + "epoch": 0.9120287253141831, + "grad_norm": 0.6739137172698975, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 12700 + }, + { + "epoch": 0.9127468581687612, + "grad_norm": 0.5759707689285278, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 12710 + }, + { + "epoch": 0.9134649910233393, + "grad_norm": 0.5548733472824097, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12720 + }, + { + "epoch": 0.9141831238779174, + "grad_norm": 0.7014280557632446, + "learning_rate": 0.0002, + "loss": 0.8225, + "step": 12730 + }, + { + "epoch": 0.9149012567324956, + "grad_norm": 0.5939958691596985, + "learning_rate": 0.0002, + "loss": 0.7936, + "step": 12740 + }, + { + "epoch": 0.9156193895870736, + "grad_norm": 0.5995593667030334, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 12750 + }, + { + "epoch": 0.9163375224416517, + "grad_norm": 0.6686680316925049, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 12760 + }, + { + "epoch": 0.9170556552962298, + "grad_norm": 0.4742372930049896, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 12770 + }, + { + "epoch": 0.9177737881508079, + "grad_norm": 0.5493217706680298, + "learning_rate": 0.0002, + "loss": 0.7795, + "step": 12780 + }, + { + "epoch": 0.918491921005386, + "grad_norm": 0.5641885995864868, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 12790 + }, + { + "epoch": 0.9192100538599641, + "grad_norm": 0.5814061164855957, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 12800 + }, + { + "epoch": 0.9199281867145422, + "grad_norm": 0.6774331331253052, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 12810 + }, + { + "epoch": 0.9206463195691202, + "grad_norm": 0.5592127442359924, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 12820 + }, + { + "epoch": 0.9213644524236984, + "grad_norm": 0.5246456861495972, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 12830 + }, + { + "epoch": 0.9220825852782765, + "grad_norm": 0.6524264812469482, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 12840 + }, + { + "epoch": 0.9228007181328546, + "grad_norm": 0.6010791063308716, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 12850 + }, + { + "epoch": 0.9235188509874327, + "grad_norm": 0.5289866924285889, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 12860 + }, + { + "epoch": 0.9242369838420108, + "grad_norm": 0.6850762367248535, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 12870 + }, + { + "epoch": 0.9249551166965889, + "grad_norm": 0.5293797850608826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 12880 + }, + { + "epoch": 0.9256732495511669, + "grad_norm": 0.6045399308204651, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 12890 + }, + { + "epoch": 0.926391382405745, + "grad_norm": 0.7026739716529846, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 12900 + }, + { + "epoch": 0.9271095152603231, + "grad_norm": 0.6884756684303284, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 12910 + }, + { + "epoch": 0.9278276481149013, + "grad_norm": 0.637884795665741, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 12920 + }, + { + "epoch": 0.9285457809694794, + "grad_norm": 0.513913631439209, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 12930 + }, + { + "epoch": 0.9292639138240575, + "grad_norm": 0.6642340421676636, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 12940 + }, + { + "epoch": 0.9299820466786356, + "grad_norm": 0.5708861947059631, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 12950 + }, + { + "epoch": 0.9307001795332136, + "grad_norm": 0.5896512866020203, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 12960 + }, + { + "epoch": 0.9314183123877917, + "grad_norm": 0.5754874348640442, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 12970 + }, + { + "epoch": 0.9321364452423698, + "grad_norm": 0.6363751888275146, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 12980 + }, + { + "epoch": 0.9328545780969479, + "grad_norm": 0.7660197019577026, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 12990 + }, + { + "epoch": 0.933572710951526, + "grad_norm": 0.607728898525238, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 13000 + }, + { + "epoch": 0.9342908438061042, + "grad_norm": 0.5257042050361633, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 13010 + }, + { + "epoch": 0.9350089766606823, + "grad_norm": 0.7916908264160156, + "learning_rate": 0.0002, + "loss": 0.8129, + "step": 13020 + }, + { + "epoch": 0.9357271095152603, + "grad_norm": 0.8310123085975647, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 13030 + }, + { + "epoch": 0.9364452423698384, + "grad_norm": 0.6543728113174438, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 13040 + }, + { + "epoch": 0.9371633752244165, + "grad_norm": 0.7153878808021545, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 13050 + }, + { + "epoch": 0.9378815080789946, + "grad_norm": 0.7510694265365601, + "learning_rate": 0.0002, + "loss": 0.779, + "step": 13060 + }, + { + "epoch": 0.9385996409335727, + "grad_norm": 0.5524464249610901, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 13070 + }, + { + "epoch": 0.9393177737881508, + "grad_norm": 0.6657140254974365, + "learning_rate": 0.0002, + "loss": 0.8635, + "step": 13080 + }, + { + "epoch": 0.940035906642729, + "grad_norm": 0.5757394433021545, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 13090 + }, + { + "epoch": 0.940754039497307, + "grad_norm": 0.6171187162399292, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 13100 + }, + { + "epoch": 0.9414721723518851, + "grad_norm": 0.5946314334869385, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 13110 + }, + { + "epoch": 0.9421903052064632, + "grad_norm": 0.5727229714393616, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 13120 + }, + { + "epoch": 0.9429084380610413, + "grad_norm": 0.7805224061012268, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 13130 + }, + { + "epoch": 0.9436265709156194, + "grad_norm": 0.5763523578643799, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 13140 + }, + { + "epoch": 0.9443447037701975, + "grad_norm": 0.8310899138450623, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13150 + }, + { + "epoch": 0.9450628366247756, + "grad_norm": 0.7531784772872925, + "learning_rate": 0.0002, + "loss": 0.7818, + "step": 13160 + }, + { + "epoch": 0.9457809694793536, + "grad_norm": 0.678779661655426, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 13170 + }, + { + "epoch": 0.9464991023339318, + "grad_norm": 0.8096453547477722, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13180 + }, + { + "epoch": 0.9472172351885099, + "grad_norm": 0.6743921637535095, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 13190 + }, + { + "epoch": 0.947935368043088, + "grad_norm": 0.606852114200592, + "learning_rate": 0.0002, + "loss": 0.7949, + "step": 13200 + }, + { + "epoch": 0.9486535008976661, + "grad_norm": 0.6550270915031433, + "learning_rate": 0.0002, + "loss": 0.7908, + "step": 13210 + }, + { + "epoch": 0.9493716337522442, + "grad_norm": 0.6494552493095398, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 13220 + }, + { + "epoch": 0.9500897666068223, + "grad_norm": 0.5867666602134705, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 13230 + }, + { + "epoch": 0.9508078994614003, + "grad_norm": 0.6283786296844482, + "learning_rate": 0.0002, + "loss": 0.8117, + "step": 13240 + }, + { + "epoch": 0.9515260323159784, + "grad_norm": 0.6824573278427124, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 13250 + }, + { + "epoch": 0.9522441651705565, + "grad_norm": 0.6945744156837463, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 13260 + }, + { + "epoch": 0.9529622980251347, + "grad_norm": 0.6468575596809387, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 13270 + }, + { + "epoch": 0.9536804308797128, + "grad_norm": 0.6819407939910889, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 13280 + }, + { + "epoch": 0.9543985637342909, + "grad_norm": 0.6660491824150085, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 13290 + }, + { + "epoch": 0.9551166965888689, + "grad_norm": 0.6320462226867676, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 13300 + }, + { + "epoch": 0.955834829443447, + "grad_norm": 0.46753761172294617, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 13310 + }, + { + "epoch": 0.9565529622980251, + "grad_norm": 0.6608774065971375, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 13320 + }, + { + "epoch": 0.9572710951526032, + "grad_norm": 0.607448935508728, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 13330 + }, + { + "epoch": 0.9579892280071813, + "grad_norm": 0.6796701550483704, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 13340 + }, + { + "epoch": 0.9587073608617595, + "grad_norm": 0.7655861377716064, + "learning_rate": 0.0002, + "loss": 0.7979, + "step": 13350 + }, + { + "epoch": 0.9594254937163376, + "grad_norm": 0.5881335735321045, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 13360 + }, + { + "epoch": 0.9601436265709156, + "grad_norm": 0.6855270862579346, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 13370 + }, + { + "epoch": 0.9608617594254937, + "grad_norm": 0.6072475910186768, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 13380 + }, + { + "epoch": 0.9615798922800718, + "grad_norm": 0.5983994603157043, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 13390 + }, + { + "epoch": 0.9622980251346499, + "grad_norm": 0.6141189932823181, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 13400 + }, + { + "epoch": 0.963016157989228, + "grad_norm": 0.6539722084999084, + "learning_rate": 0.0002, + "loss": 0.8059, + "step": 13410 + }, + { + "epoch": 0.9637342908438061, + "grad_norm": 0.5425801277160645, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 13420 + }, + { + "epoch": 0.9644524236983842, + "grad_norm": 0.8038925528526306, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 13430 + }, + { + "epoch": 0.9651705565529622, + "grad_norm": 0.5729590058326721, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 13440 + }, + { + "epoch": 0.9658886894075404, + "grad_norm": 0.5695241689682007, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 13450 + }, + { + "epoch": 0.9666068222621185, + "grad_norm": 0.5913681387901306, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 13460 + }, + { + "epoch": 0.9673249551166966, + "grad_norm": 1.1798994541168213, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 13470 + }, + { + "epoch": 0.9680430879712747, + "grad_norm": 0.5931369066238403, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 13480 + }, + { + "epoch": 0.9687612208258528, + "grad_norm": 0.6269514560699463, + "learning_rate": 0.0002, + "loss": 0.8432, + "step": 13490 + }, + { + "epoch": 0.9694793536804309, + "grad_norm": 0.7380245327949524, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 13500 + }, + { + "epoch": 0.9701974865350089, + "grad_norm": 0.5668187141418457, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 13510 + }, + { + "epoch": 0.970915619389587, + "grad_norm": 0.547149121761322, + "learning_rate": 0.0002, + "loss": 0.7562, + "step": 13520 + }, + { + "epoch": 0.9716337522441651, + "grad_norm": 0.49131739139556885, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 13530 + }, + { + "epoch": 0.9723518850987433, + "grad_norm": 0.6385366320610046, + "learning_rate": 0.0002, + "loss": 0.8159, + "step": 13540 + }, + { + "epoch": 0.9730700179533214, + "grad_norm": 0.5962417125701904, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 13550 + }, + { + "epoch": 0.9737881508078995, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 13560 + }, + { + "epoch": 0.9745062836624776, + "grad_norm": 0.5757403373718262, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 13570 + }, + { + "epoch": 0.9752244165170556, + "grad_norm": 0.7214667201042175, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 13580 + }, + { + "epoch": 0.9759425493716337, + "grad_norm": 0.5902701020240784, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 13590 + }, + { + "epoch": 0.9766606822262118, + "grad_norm": 0.752805769443512, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 13600 + }, + { + "epoch": 0.9773788150807899, + "grad_norm": 0.5943595767021179, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 13610 + }, + { + "epoch": 0.978096947935368, + "grad_norm": 0.6752488613128662, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 13620 + }, + { + "epoch": 0.9788150807899462, + "grad_norm": 0.5295413732528687, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 13630 + }, + { + "epoch": 0.9795332136445243, + "grad_norm": 0.732549250125885, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 13640 + }, + { + "epoch": 0.9802513464991023, + "grad_norm": 0.5701823830604553, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 13650 + }, + { + "epoch": 0.9809694793536804, + "grad_norm": 0.576898455619812, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13660 + }, + { + "epoch": 0.9816876122082585, + "grad_norm": 0.5916832089424133, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 13670 + }, + { + "epoch": 0.9824057450628366, + "grad_norm": 0.5554524660110474, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 13680 + }, + { + "epoch": 0.9831238779174147, + "grad_norm": 0.6988440752029419, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 13690 + }, + { + "epoch": 0.9838420107719928, + "grad_norm": 0.6660445332527161, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 13700 + }, + { + "epoch": 0.984560143626571, + "grad_norm": 2.421210289001465, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 13710 + }, + { + "epoch": 0.985278276481149, + "grad_norm": 0.6307598948478699, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 13720 + }, + { + "epoch": 0.9859964093357271, + "grad_norm": 0.6832480430603027, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 13730 + }, + { + "epoch": 0.9867145421903052, + "grad_norm": 0.5974255204200745, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 13740 + }, + { + "epoch": 0.9874326750448833, + "grad_norm": 0.6540380716323853, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 13750 + }, + { + "epoch": 0.9881508078994614, + "grad_norm": 0.7532727122306824, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 13760 + }, + { + "epoch": 0.9888689407540395, + "grad_norm": 0.6776283383369446, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 13770 + }, + { + "epoch": 0.9895870736086176, + "grad_norm": 0.5776281356811523, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 13780 + }, + { + "epoch": 0.9903052064631956, + "grad_norm": 0.5473008751869202, + "learning_rate": 0.0002, + "loss": 0.8216, + "step": 13790 + }, + { + "epoch": 0.9910233393177738, + "grad_norm": 0.5428591370582581, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 13800 + }, + { + "epoch": 0.9917414721723519, + "grad_norm": 0.5173406004905701, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 13810 + }, + { + "epoch": 0.99245960502693, + "grad_norm": 0.6462617516517639, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 13820 + }, + { + "epoch": 0.9931777378815081, + "grad_norm": 0.5800426006317139, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 13830 + }, + { + "epoch": 0.9938958707360862, + "grad_norm": 0.5015466809272766, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 13840 + }, + { + "epoch": 0.9946140035906643, + "grad_norm": 0.59474778175354, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 13850 + }, + { + "epoch": 0.9953321364452423, + "grad_norm": 0.5609583258628845, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 13860 + }, + { + "epoch": 0.9960502692998204, + "grad_norm": 0.5762063264846802, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 13870 + }, + { + "epoch": 0.9967684021543985, + "grad_norm": 0.6419214010238647, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 13880 + }, + { + "epoch": 0.9974865350089767, + "grad_norm": 0.7821950316429138, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 13890 + }, + { + "epoch": 0.9982046678635548, + "grad_norm": 0.6216017007827759, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 13900 + }, + { + "epoch": 0.9989228007181329, + "grad_norm": 0.5446485877037048, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 13910 + }, + { + "epoch": 0.999640933572711, + "grad_norm": 0.5037565231323242, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 13920 + }, + { + "epoch": 1.0, + "eval_loss": 1.09147310256958, + "eval_runtime": 55.1915, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 13925 + }, + { + "epoch": 1.000359066427289, + "grad_norm": 0.5808277130126953, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 13930 + }, + { + "epoch": 1.0010771992818672, + "grad_norm": 0.47258496284484863, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 13940 + }, + { + "epoch": 1.0017953321364452, + "grad_norm": 0.8921670317649841, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 13950 + }, + { + "epoch": 1.0025134649910232, + "grad_norm": 0.746729850769043, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 13960 + }, + { + "epoch": 1.0032315978456015, + "grad_norm": 0.6243796944618225, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13970 + }, + { + "epoch": 1.0039497307001795, + "grad_norm": 0.6725090742111206, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 13980 + }, + { + "epoch": 1.0046678635547577, + "grad_norm": 0.8762497305870056, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 13990 + }, + { + "epoch": 1.0053859964093357, + "grad_norm": 0.7694411873817444, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 14000 + }, + { + "epoch": 1.006104129263914, + "grad_norm": 0.6208822727203369, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 14010 + }, + { + "epoch": 1.006822262118492, + "grad_norm": 0.8503357768058777, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 14020 + }, + { + "epoch": 1.00754039497307, + "grad_norm": 0.5813316106796265, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14030 + }, + { + "epoch": 1.0082585278276481, + "grad_norm": 0.8186036348342896, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 14040 + }, + { + "epoch": 1.0089766606822261, + "grad_norm": 0.759873628616333, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14050 + }, + { + "epoch": 1.0096947935368044, + "grad_norm": 0.8437777161598206, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 14060 + }, + { + "epoch": 1.0104129263913824, + "grad_norm": 0.5750975012779236, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14070 + }, + { + "epoch": 1.0111310592459606, + "grad_norm": 0.5873221158981323, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 14080 + }, + { + "epoch": 1.0118491921005386, + "grad_norm": 0.6381314396858215, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 14090 + }, + { + "epoch": 1.0125673249551166, + "grad_norm": 0.6510405540466309, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 14100 + }, + { + "epoch": 1.0132854578096948, + "grad_norm": 0.7698671221733093, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 14110 + }, + { + "epoch": 1.0140035906642728, + "grad_norm": 0.646180272102356, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 14120 + }, + { + "epoch": 1.014721723518851, + "grad_norm": 0.6183205246925354, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 14130 + }, + { + "epoch": 1.015439856373429, + "grad_norm": 0.5082563757896423, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 14140 + }, + { + "epoch": 1.0161579892280073, + "grad_norm": 0.7285500764846802, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 14150 + }, + { + "epoch": 1.0168761220825853, + "grad_norm": 0.6368175148963928, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 14160 + }, + { + "epoch": 1.0175942549371633, + "grad_norm": 0.44868743419647217, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 14170 + }, + { + "epoch": 1.0183123877917415, + "grad_norm": 0.6346513628959656, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 14180 + }, + { + "epoch": 1.0190305206463195, + "grad_norm": 0.7287803292274475, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 14190 + }, + { + "epoch": 1.0197486535008977, + "grad_norm": 0.6701363325119019, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 14200 + }, + { + "epoch": 1.0204667863554757, + "grad_norm": 0.6419289112091064, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 14210 + }, + { + "epoch": 1.021184919210054, + "grad_norm": 0.7703002095222473, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 14220 + }, + { + "epoch": 1.021903052064632, + "grad_norm": 0.6803670525550842, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14230 + }, + { + "epoch": 1.02262118491921, + "grad_norm": 0.5780976414680481, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 14240 + }, + { + "epoch": 1.0233393177737882, + "grad_norm": 0.5096051096916199, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 14250 + }, + { + "epoch": 1.0240574506283662, + "grad_norm": 0.6058611869812012, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 14260 + }, + { + "epoch": 1.0247755834829444, + "grad_norm": 0.6703311204910278, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 14270 + }, + { + "epoch": 1.0254937163375224, + "grad_norm": 0.7143640518188477, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 14280 + }, + { + "epoch": 1.0262118491921006, + "grad_norm": 0.6730744242668152, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 14290 + }, + { + "epoch": 1.0269299820466786, + "grad_norm": 0.8180603384971619, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14300 + }, + { + "epoch": 1.0276481149012566, + "grad_norm": 0.6752267479896545, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 14310 + }, + { + "epoch": 1.0283662477558349, + "grad_norm": 0.678428590297699, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 14320 + }, + { + "epoch": 1.0290843806104129, + "grad_norm": 0.5959973931312561, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 14330 + }, + { + "epoch": 1.029802513464991, + "grad_norm": 0.5797176957130432, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 14340 + }, + { + "epoch": 1.030520646319569, + "grad_norm": 0.6415652632713318, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 14350 + }, + { + "epoch": 1.0312387791741473, + "grad_norm": 0.6984175443649292, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 14360 + }, + { + "epoch": 1.0319569120287253, + "grad_norm": 0.7158452272415161, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 14370 + }, + { + "epoch": 1.0326750448833033, + "grad_norm": 0.6066089272499084, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 14380 + }, + { + "epoch": 1.0333931777378815, + "grad_norm": 0.7359582781791687, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 14390 + }, + { + "epoch": 1.0341113105924595, + "grad_norm": 0.7372373938560486, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 14400 + }, + { + "epoch": 1.0348294434470378, + "grad_norm": 0.7511868476867676, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 14410 + }, + { + "epoch": 1.0355475763016158, + "grad_norm": 0.5449917912483215, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 14420 + }, + { + "epoch": 1.036265709156194, + "grad_norm": 0.6700817346572876, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 14430 + }, + { + "epoch": 1.036983842010772, + "grad_norm": 0.7061316967010498, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14440 + }, + { + "epoch": 1.03770197486535, + "grad_norm": 0.7582663893699646, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 14450 + }, + { + "epoch": 1.0384201077199282, + "grad_norm": 0.6408873200416565, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 14460 + }, + { + "epoch": 1.0391382405745062, + "grad_norm": 0.7645436525344849, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 14470 + }, + { + "epoch": 1.0398563734290844, + "grad_norm": 0.6522644758224487, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 14480 + }, + { + "epoch": 1.0405745062836624, + "grad_norm": 0.784273624420166, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 14490 + }, + { + "epoch": 1.0412926391382407, + "grad_norm": 0.673891544342041, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 14500 + }, + { + "epoch": 1.0420107719928187, + "grad_norm": 0.6566316485404968, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 14510 + }, + { + "epoch": 1.0427289048473967, + "grad_norm": 0.6062059998512268, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 14520 + }, + { + "epoch": 1.0434470377019749, + "grad_norm": 0.6884504556655884, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14530 + }, + { + "epoch": 1.044165170556553, + "grad_norm": 0.6642231345176697, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14540 + }, + { + "epoch": 1.0448833034111311, + "grad_norm": 0.6989523768424988, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 14550 + }, + { + "epoch": 1.0456014362657091, + "grad_norm": 0.8179892301559448, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 14560 + }, + { + "epoch": 1.0463195691202873, + "grad_norm": 0.6426970362663269, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 14570 + }, + { + "epoch": 1.0470377019748653, + "grad_norm": 0.678445041179657, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 14580 + }, + { + "epoch": 1.0477558348294433, + "grad_norm": 0.7573820352554321, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 14590 + }, + { + "epoch": 1.0484739676840216, + "grad_norm": 0.734443724155426, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 14600 + }, + { + "epoch": 1.0491921005385996, + "grad_norm": 0.7333676218986511, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 14610 + }, + { + "epoch": 1.0499102333931778, + "grad_norm": 0.6122187972068787, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14620 + }, + { + "epoch": 1.0506283662477558, + "grad_norm": 0.6916412711143494, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 14630 + }, + { + "epoch": 1.051346499102334, + "grad_norm": 0.5898127555847168, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 14640 + }, + { + "epoch": 1.052064631956912, + "grad_norm": 0.6071873307228088, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 14650 + }, + { + "epoch": 1.05278276481149, + "grad_norm": 0.6530455946922302, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 14660 + }, + { + "epoch": 1.0535008976660682, + "grad_norm": 0.6919314861297607, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14670 + }, + { + "epoch": 1.0542190305206462, + "grad_norm": 0.7843509912490845, + "learning_rate": 0.0002, + "loss": 0.7481, + "step": 14680 + }, + { + "epoch": 1.0549371633752245, + "grad_norm": 0.6106747388839722, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 14690 + }, + { + "epoch": 1.0556552962298025, + "grad_norm": 0.7828368544578552, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 14700 + }, + { + "epoch": 1.0563734290843807, + "grad_norm": 0.6772044897079468, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 14710 + }, + { + "epoch": 1.0570915619389587, + "grad_norm": 0.5430962443351746, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 14720 + }, + { + "epoch": 1.0578096947935367, + "grad_norm": 0.7364194989204407, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 14730 + }, + { + "epoch": 1.058527827648115, + "grad_norm": 0.5607585310935974, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 14740 + }, + { + "epoch": 1.059245960502693, + "grad_norm": 0.7917081713676453, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 14750 + }, + { + "epoch": 1.0599640933572712, + "grad_norm": 0.7852025628089905, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 14760 + }, + { + "epoch": 1.0606822262118492, + "grad_norm": 0.6329161524772644, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 14770 + }, + { + "epoch": 1.0614003590664274, + "grad_norm": 0.7607306838035583, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14780 + }, + { + "epoch": 1.0621184919210054, + "grad_norm": 0.7236617207527161, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 14790 + }, + { + "epoch": 1.0628366247755834, + "grad_norm": 0.793542206287384, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 14800 + }, + { + "epoch": 1.0635547576301616, + "grad_norm": 0.53999263048172, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 14810 + }, + { + "epoch": 1.0642728904847396, + "grad_norm": 0.5821034908294678, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 14820 + }, + { + "epoch": 1.0649910233393178, + "grad_norm": 0.6593600511550903, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 14830 + }, + { + "epoch": 1.0657091561938958, + "grad_norm": 0.70230633020401, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 14840 + }, + { + "epoch": 1.066427289048474, + "grad_norm": 0.5715264081954956, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 14850 + }, + { + "epoch": 1.067145421903052, + "grad_norm": 0.6610119938850403, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 14860 + }, + { + "epoch": 1.06786355475763, + "grad_norm": 0.5470091700553894, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 14870 + }, + { + "epoch": 1.0685816876122083, + "grad_norm": 0.7529906630516052, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 14880 + }, + { + "epoch": 1.0692998204667863, + "grad_norm": 0.7532844543457031, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 14890 + }, + { + "epoch": 1.0700179533213645, + "grad_norm": 0.6439316868782043, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14900 + }, + { + "epoch": 1.0707360861759425, + "grad_norm": 0.5580114126205444, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 14910 + }, + { + "epoch": 1.0714542190305207, + "grad_norm": 0.6299236416816711, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 14920 + }, + { + "epoch": 1.0721723518850987, + "grad_norm": 0.6934021711349487, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 14930 + }, + { + "epoch": 1.0728904847396767, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6766, + "step": 14940 + }, + { + "epoch": 1.073608617594255, + "grad_norm": 0.8921014070510864, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 14950 + }, + { + "epoch": 1.074326750448833, + "grad_norm": 0.5934301614761353, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 14960 + }, + { + "epoch": 1.0750448833034112, + "grad_norm": 0.8379642367362976, + "learning_rate": 0.0002, + "loss": 0.7595, + "step": 14970 + }, + { + "epoch": 1.0757630161579892, + "grad_norm": 0.6842767596244812, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 14980 + }, + { + "epoch": 1.0764811490125674, + "grad_norm": 0.7296533584594727, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 14990 + }, + { + "epoch": 1.0771992818671454, + "grad_norm": 0.6821087002754211, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15000 + }, + { + "epoch": 1.0779174147217234, + "grad_norm": 0.6133626699447632, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 15010 + }, + { + "epoch": 1.0786355475763016, + "grad_norm": 0.6774773001670837, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 15020 + }, + { + "epoch": 1.0793536804308796, + "grad_norm": 0.6818786859512329, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 15030 + }, + { + "epoch": 1.0800718132854579, + "grad_norm": 0.7763522863388062, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15040 + }, + { + "epoch": 1.0807899461400359, + "grad_norm": 0.7259193658828735, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15050 + }, + { + "epoch": 1.081508078994614, + "grad_norm": 0.6797525882720947, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 15060 + }, + { + "epoch": 1.082226211849192, + "grad_norm": 0.5775881409645081, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 15070 + }, + { + "epoch": 1.08294434470377, + "grad_norm": 0.7055524587631226, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15080 + }, + { + "epoch": 1.0836624775583483, + "grad_norm": 0.8018748760223389, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 15090 + }, + { + "epoch": 1.0843806104129263, + "grad_norm": 0.6738115549087524, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 15100 + }, + { + "epoch": 1.0850987432675046, + "grad_norm": 0.6586359143257141, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 15110 + }, + { + "epoch": 1.0858168761220826, + "grad_norm": 0.7396895885467529, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 15120 + }, + { + "epoch": 1.0865350089766608, + "grad_norm": 0.7224817276000977, + "learning_rate": 0.0002, + "loss": 0.7473, + "step": 15130 + }, + { + "epoch": 1.0872531418312388, + "grad_norm": 0.798514187335968, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 15140 + }, + { + "epoch": 1.0879712746858168, + "grad_norm": 0.79301518201828, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 15150 + }, + { + "epoch": 1.088689407540395, + "grad_norm": 0.7106764316558838, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 15160 + }, + { + "epoch": 1.089407540394973, + "grad_norm": 0.6525473594665527, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 15170 + }, + { + "epoch": 1.0901256732495512, + "grad_norm": 0.6001671552658081, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 15180 + }, + { + "epoch": 1.0908438061041292, + "grad_norm": 0.6949557662010193, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 15190 + }, + { + "epoch": 1.0915619389587075, + "grad_norm": 0.5713186860084534, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 15200 + }, + { + "epoch": 1.0922800718132855, + "grad_norm": 0.8773220181465149, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 15210 + }, + { + "epoch": 1.0929982046678635, + "grad_norm": 0.5837785601615906, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 15220 + }, + { + "epoch": 1.0937163375224417, + "grad_norm": 0.7243856191635132, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 15230 + }, + { + "epoch": 1.0944344703770197, + "grad_norm": 0.7008263468742371, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 15240 + }, + { + "epoch": 1.095152603231598, + "grad_norm": 0.7061941623687744, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 15250 + }, + { + "epoch": 1.095870736086176, + "grad_norm": 0.575903594493866, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 15260 + }, + { + "epoch": 1.0965888689407541, + "grad_norm": 0.6794043183326721, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 15270 + }, + { + "epoch": 1.0973070017953321, + "grad_norm": 0.7194870710372925, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 15280 + }, + { + "epoch": 1.0980251346499101, + "grad_norm": 0.8063322305679321, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 15290 + }, + { + "epoch": 1.0987432675044884, + "grad_norm": 0.786101758480072, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 15300 + }, + { + "epoch": 1.0994614003590664, + "grad_norm": 0.827474057674408, + "learning_rate": 0.0002, + "loss": 0.7523, + "step": 15310 + }, + { + "epoch": 1.1001795332136446, + "grad_norm": 0.6514455080032349, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 15320 + }, + { + "epoch": 1.1008976660682226, + "grad_norm": 0.7534348368644714, + "learning_rate": 0.0002, + "loss": 0.745, + "step": 15330 + }, + { + "epoch": 1.1016157989228008, + "grad_norm": 0.6991367340087891, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 15340 + }, + { + "epoch": 1.1023339317773788, + "grad_norm": 0.6742196679115295, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15350 + }, + { + "epoch": 1.1030520646319568, + "grad_norm": 0.7373757362365723, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 15360 + }, + { + "epoch": 1.103770197486535, + "grad_norm": 0.6834485530853271, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 15370 + }, + { + "epoch": 1.104488330341113, + "grad_norm": 0.6454901099205017, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 15380 + }, + { + "epoch": 1.1052064631956913, + "grad_norm": 0.7764508128166199, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 15390 + }, + { + "epoch": 1.1059245960502693, + "grad_norm": 0.668560802936554, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 15400 + }, + { + "epoch": 1.1066427289048475, + "grad_norm": 0.579655110836029, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 15410 + }, + { + "epoch": 1.1073608617594255, + "grad_norm": 0.7196493148803711, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 15420 + }, + { + "epoch": 1.1080789946140035, + "grad_norm": 0.5530232191085815, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 15430 + }, + { + "epoch": 1.1087971274685817, + "grad_norm": 0.6542958617210388, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 15440 + }, + { + "epoch": 1.1095152603231597, + "grad_norm": 0.7468852400779724, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 15450 + }, + { + "epoch": 1.110233393177738, + "grad_norm": 0.8119780421257019, + "learning_rate": 0.0002, + "loss": 0.6898, + "step": 15460 + }, + { + "epoch": 1.110951526032316, + "grad_norm": 0.7807733416557312, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 15470 + }, + { + "epoch": 1.1116696588868942, + "grad_norm": 0.7352553009986877, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 15480 + }, + { + "epoch": 1.1123877917414722, + "grad_norm": 0.8455224633216858, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 15490 + }, + { + "epoch": 1.1131059245960502, + "grad_norm": 0.635308563709259, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 15500 + }, + { + "epoch": 1.1138240574506284, + "grad_norm": 0.6268794536590576, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15510 + }, + { + "epoch": 1.1145421903052064, + "grad_norm": 0.6829593181610107, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 15520 + }, + { + "epoch": 1.1152603231597846, + "grad_norm": 0.5997796058654785, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 15530 + }, + { + "epoch": 1.1159784560143626, + "grad_norm": 0.7500942349433899, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 15540 + }, + { + "epoch": 1.1166965888689409, + "grad_norm": 0.7052047848701477, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 15550 + }, + { + "epoch": 1.1174147217235189, + "grad_norm": 0.6698189377784729, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 15560 + }, + { + "epoch": 1.1181328545780969, + "grad_norm": 0.7890462875366211, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 15570 + }, + { + "epoch": 1.118850987432675, + "grad_norm": 0.7002465128898621, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 15580 + }, + { + "epoch": 1.119569120287253, + "grad_norm": 0.7456073760986328, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 15590 + }, + { + "epoch": 1.1202872531418313, + "grad_norm": 0.7997385263442993, + "learning_rate": 0.0002, + "loss": 0.7577, + "step": 15600 + }, + { + "epoch": 1.1210053859964093, + "grad_norm": 0.6640482544898987, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 15610 + }, + { + "epoch": 1.1217235188509875, + "grad_norm": 0.7765318155288696, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15620 + }, + { + "epoch": 1.1224416517055655, + "grad_norm": 0.7184962630271912, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 15630 + }, + { + "epoch": 1.1231597845601435, + "grad_norm": 0.7310904264450073, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 15640 + }, + { + "epoch": 1.1238779174147218, + "grad_norm": 0.7406452298164368, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 15650 + }, + { + "epoch": 1.1245960502692998, + "grad_norm": 0.7546738982200623, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 15660 + }, + { + "epoch": 1.125314183123878, + "grad_norm": 0.7069764733314514, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 15670 + }, + { + "epoch": 1.126032315978456, + "grad_norm": 0.6309521198272705, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 15680 + }, + { + "epoch": 1.1267504488330342, + "grad_norm": 0.8050156831741333, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 15690 + }, + { + "epoch": 1.1274685816876122, + "grad_norm": 0.726556122303009, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 15700 + }, + { + "epoch": 1.1281867145421902, + "grad_norm": 0.77745521068573, + "learning_rate": 0.0002, + "loss": 0.7763, + "step": 15710 + }, + { + "epoch": 1.1289048473967684, + "grad_norm": 0.7467634677886963, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 15720 + }, + { + "epoch": 1.1296229802513464, + "grad_norm": 0.8207895755767822, + "learning_rate": 0.0002, + "loss": 0.7676, + "step": 15730 + }, + { + "epoch": 1.1303411131059247, + "grad_norm": 0.8253937363624573, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 15740 + }, + { + "epoch": 1.1310592459605027, + "grad_norm": 0.6313983798027039, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 15750 + }, + { + "epoch": 1.1317773788150807, + "grad_norm": 0.8040992021560669, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 15760 + }, + { + "epoch": 1.132495511669659, + "grad_norm": 0.5937064290046692, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 15770 + }, + { + "epoch": 1.133213644524237, + "grad_norm": 0.6486281156539917, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 15780 + }, + { + "epoch": 1.1339317773788151, + "grad_norm": 0.6161853075027466, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 15790 + }, + { + "epoch": 1.1346499102333931, + "grad_norm": 0.6926610469818115, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 15800 + }, + { + "epoch": 1.1353680430879713, + "grad_norm": 0.6084047555923462, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 15810 + }, + { + "epoch": 1.1360861759425493, + "grad_norm": 0.6928383111953735, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 15820 + }, + { + "epoch": 1.1368043087971276, + "grad_norm": 0.7784243822097778, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 15830 + }, + { + "epoch": 1.1375224416517056, + "grad_norm": 0.7169384956359863, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 15840 + }, + { + "epoch": 1.1382405745062836, + "grad_norm": 0.6953616142272949, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 15850 + }, + { + "epoch": 1.1389587073608618, + "grad_norm": 0.7345215082168579, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15860 + }, + { + "epoch": 1.1396768402154398, + "grad_norm": 0.5469502806663513, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 15870 + }, + { + "epoch": 1.140394973070018, + "grad_norm": 0.687680721282959, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 15880 + }, + { + "epoch": 1.141113105924596, + "grad_norm": 0.6879996657371521, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 15890 + }, + { + "epoch": 1.141831238779174, + "grad_norm": 0.728886067867279, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 15900 + }, + { + "epoch": 1.1425493716337523, + "grad_norm": 0.929531455039978, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 15910 + }, + { + "epoch": 1.1432675044883303, + "grad_norm": 0.8122507333755493, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 15920 + }, + { + "epoch": 1.1439856373429085, + "grad_norm": 0.6494652628898621, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 15930 + }, + { + "epoch": 1.1447037701974865, + "grad_norm": 0.7307567596435547, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 15940 + }, + { + "epoch": 1.1454219030520647, + "grad_norm": 0.548678994178772, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 15950 + }, + { + "epoch": 1.1461400359066427, + "grad_norm": 0.8011603951454163, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 15960 + }, + { + "epoch": 1.146858168761221, + "grad_norm": 0.7026647329330444, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 15970 + }, + { + "epoch": 1.147576301615799, + "grad_norm": 0.7338995933532715, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 15980 + }, + { + "epoch": 1.148294434470377, + "grad_norm": 0.8453443646430969, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 15990 + }, + { + "epoch": 1.1490125673249552, + "grad_norm": 0.6787207126617432, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 16000 + }, + { + "epoch": 1.1497307001795332, + "grad_norm": 0.6314631104469299, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 16010 + }, + { + "epoch": 1.1504488330341114, + "grad_norm": 0.8812752962112427, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16020 + }, + { + "epoch": 1.1511669658886894, + "grad_norm": 0.6528969407081604, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 16030 + }, + { + "epoch": 1.1518850987432674, + "grad_norm": 0.7843571305274963, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 16040 + }, + { + "epoch": 1.1526032315978456, + "grad_norm": 0.7095080018043518, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 16050 + }, + { + "epoch": 1.1533213644524236, + "grad_norm": 0.7495582103729248, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 16060 + }, + { + "epoch": 1.1540394973070018, + "grad_norm": 0.6002049446105957, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 16070 + }, + { + "epoch": 1.1547576301615798, + "grad_norm": 0.565014123916626, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 16080 + }, + { + "epoch": 1.155475763016158, + "grad_norm": 0.8209971785545349, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 16090 + }, + { + "epoch": 1.156193895870736, + "grad_norm": 0.7137531042098999, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 16100 + }, + { + "epoch": 1.1569120287253143, + "grad_norm": 0.7307516932487488, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 16110 + }, + { + "epoch": 1.1576301615798923, + "grad_norm": 0.6686444878578186, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 16120 + }, + { + "epoch": 1.1583482944344703, + "grad_norm": 0.7977298498153687, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 16130 + }, + { + "epoch": 1.1590664272890485, + "grad_norm": 0.6980607509613037, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 16140 + }, + { + "epoch": 1.1597845601436265, + "grad_norm": 0.6622613668441772, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 16150 + }, + { + "epoch": 1.1605026929982047, + "grad_norm": 0.6598347425460815, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 16160 + }, + { + "epoch": 1.1612208258527827, + "grad_norm": 0.6686234474182129, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 16170 + }, + { + "epoch": 1.1619389587073607, + "grad_norm": 0.7308177947998047, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 16180 + }, + { + "epoch": 1.162657091561939, + "grad_norm": 0.939537525177002, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 16190 + }, + { + "epoch": 1.163375224416517, + "grad_norm": 0.5514758825302124, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 16200 + }, + { + "epoch": 1.1640933572710952, + "grad_norm": 0.589142918586731, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 16210 + }, + { + "epoch": 1.1648114901256732, + "grad_norm": 0.6888012290000916, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 16220 + }, + { + "epoch": 1.1655296229802514, + "grad_norm": 0.82566899061203, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 16230 + }, + { + "epoch": 1.1662477558348294, + "grad_norm": 0.6107817888259888, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 16240 + }, + { + "epoch": 1.1669658886894076, + "grad_norm": 0.7831398844718933, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 16250 + }, + { + "epoch": 1.1676840215439857, + "grad_norm": 0.6468397974967957, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 16260 + }, + { + "epoch": 1.1684021543985637, + "grad_norm": 0.7284161448478699, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 16270 + }, + { + "epoch": 1.1691202872531419, + "grad_norm": 0.6182818412780762, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 16280 + }, + { + "epoch": 1.1698384201077199, + "grad_norm": 0.7091781497001648, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 16290 + }, + { + "epoch": 1.170556552962298, + "grad_norm": 0.7327643632888794, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 16300 + }, + { + "epoch": 1.171274685816876, + "grad_norm": 0.5864694118499756, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 16310 + }, + { + "epoch": 1.171992818671454, + "grad_norm": 0.7049986720085144, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 16320 + }, + { + "epoch": 1.1727109515260323, + "grad_norm": 0.7563399076461792, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 16330 + }, + { + "epoch": 1.1734290843806103, + "grad_norm": 0.5888143181800842, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16340 + }, + { + "epoch": 1.1741472172351886, + "grad_norm": 0.8670049905776978, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 16350 + }, + { + "epoch": 1.1748653500897666, + "grad_norm": 0.8045654296875, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 16360 + }, + { + "epoch": 1.1755834829443448, + "grad_norm": 0.9115668535232544, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 16370 + }, + { + "epoch": 1.1763016157989228, + "grad_norm": 0.6943584084510803, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 16380 + }, + { + "epoch": 1.177019748653501, + "grad_norm": 0.7931740283966064, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 16390 + }, + { + "epoch": 1.177737881508079, + "grad_norm": 0.7967953085899353, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16400 + }, + { + "epoch": 1.178456014362657, + "grad_norm": 0.575165867805481, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 16410 + }, + { + "epoch": 1.1791741472172352, + "grad_norm": 0.6803409457206726, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 16420 + }, + { + "epoch": 1.1798922800718132, + "grad_norm": 0.7661909461021423, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 16430 + }, + { + "epoch": 1.1806104129263915, + "grad_norm": 0.7907630205154419, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 16440 + }, + { + "epoch": 1.1813285457809695, + "grad_norm": 0.7215338945388794, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 16450 + }, + { + "epoch": 1.1820466786355475, + "grad_norm": 0.6824054718017578, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 16460 + }, + { + "epoch": 1.1827648114901257, + "grad_norm": 0.8057665228843689, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 16470 + }, + { + "epoch": 1.1834829443447037, + "grad_norm": 0.7487542033195496, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 16480 + }, + { + "epoch": 1.184201077199282, + "grad_norm": 0.7254953384399414, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 16490 + }, + { + "epoch": 1.18491921005386, + "grad_norm": 0.6986604332923889, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 16500 + }, + { + "epoch": 1.1856373429084381, + "grad_norm": 0.7889591455459595, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 16510 + }, + { + "epoch": 1.1863554757630161, + "grad_norm": 0.6029604077339172, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 16520 + }, + { + "epoch": 1.1870736086175944, + "grad_norm": 0.680322527885437, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 16530 + }, + { + "epoch": 1.1877917414721724, + "grad_norm": 0.8588826060295105, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 16540 + }, + { + "epoch": 1.1885098743267504, + "grad_norm": 0.7614806890487671, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 16550 + }, + { + "epoch": 1.1892280071813286, + "grad_norm": 0.7523183226585388, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 16560 + }, + { + "epoch": 1.1899461400359066, + "grad_norm": 0.8299532532691956, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 16570 + }, + { + "epoch": 1.1906642728904848, + "grad_norm": 0.6709241271018982, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 16580 + }, + { + "epoch": 1.1913824057450628, + "grad_norm": 0.665414035320282, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16590 + }, + { + "epoch": 1.1921005385996408, + "grad_norm": 0.7582152485847473, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 16600 + }, + { + "epoch": 1.192818671454219, + "grad_norm": 0.5856947302818298, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 16610 + }, + { + "epoch": 1.193536804308797, + "grad_norm": 0.6972885727882385, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 16620 + }, + { + "epoch": 1.1942549371633753, + "grad_norm": 0.6884734630584717, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 16630 + }, + { + "epoch": 1.1949730700179533, + "grad_norm": 0.7380475401878357, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 16640 + }, + { + "epoch": 1.1956912028725315, + "grad_norm": 0.7976197600364685, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 16650 + }, + { + "epoch": 1.1964093357271095, + "grad_norm": 0.819256067276001, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 16660 + }, + { + "epoch": 1.1971274685816877, + "grad_norm": 0.587867796421051, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 16670 + }, + { + "epoch": 1.1978456014362657, + "grad_norm": 0.9162678122520447, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 16680 + }, + { + "epoch": 1.1985637342908437, + "grad_norm": 0.7452084422111511, + "learning_rate": 0.0002, + "loss": 0.7472, + "step": 16690 + }, + { + "epoch": 1.199281867145422, + "grad_norm": 0.7966971397399902, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 16700 + }, + { + "epoch": 1.2, + "grad_norm": 0.6605724692344666, + "learning_rate": 0.0002, + "loss": 0.8051, + "step": 16710 + }, + { + "epoch": 1.2007181328545782, + "grad_norm": 0.6499220728874207, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16720 + }, + { + "epoch": 1.2014362657091562, + "grad_norm": 0.7422114610671997, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 16730 + }, + { + "epoch": 1.2021543985637342, + "grad_norm": 0.6652370095252991, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 16740 + }, + { + "epoch": 1.2028725314183124, + "grad_norm": 0.8761070370674133, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 16750 + }, + { + "epoch": 1.2035906642728904, + "grad_norm": 0.7294463515281677, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 16760 + }, + { + "epoch": 1.2043087971274686, + "grad_norm": 0.7725599408149719, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 16770 + }, + { + "epoch": 1.2050269299820466, + "grad_norm": 0.5630005598068237, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 16780 + }, + { + "epoch": 1.2057450628366249, + "grad_norm": 0.7601404786109924, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16790 + }, + { + "epoch": 1.2064631956912029, + "grad_norm": 0.6859985589981079, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 16800 + }, + { + "epoch": 1.207181328545781, + "grad_norm": 0.7040054798126221, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 16810 + }, + { + "epoch": 1.207899461400359, + "grad_norm": 0.7058989405632019, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 16820 + }, + { + "epoch": 1.208617594254937, + "grad_norm": 0.7646133899688721, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 16830 + }, + { + "epoch": 1.2093357271095153, + "grad_norm": 0.669550359249115, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 16840 + }, + { + "epoch": 1.2100538599640933, + "grad_norm": 0.6613401174545288, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 16850 + }, + { + "epoch": 1.2107719928186715, + "grad_norm": 0.8636519312858582, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 16860 + }, + { + "epoch": 1.2114901256732495, + "grad_norm": 0.6077507138252258, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 16870 + }, + { + "epoch": 1.2122082585278275, + "grad_norm": 0.7892228364944458, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 16880 + }, + { + "epoch": 1.2129263913824058, + "grad_norm": 0.7424154877662659, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 16890 + }, + { + "epoch": 1.2136445242369838, + "grad_norm": 0.6525408029556274, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 16900 + }, + { + "epoch": 1.214362657091562, + "grad_norm": 0.6178015470504761, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 16910 + }, + { + "epoch": 1.21508078994614, + "grad_norm": 0.7319437861442566, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 16920 + }, + { + "epoch": 1.2157989228007182, + "grad_norm": 0.6823344826698303, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 16930 + }, + { + "epoch": 1.2165170556552962, + "grad_norm": 0.5681257843971252, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 16940 + }, + { + "epoch": 1.2172351885098744, + "grad_norm": 0.7939814925193787, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 16950 + }, + { + "epoch": 1.2179533213644524, + "grad_norm": 0.7031611800193787, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 16960 + }, + { + "epoch": 1.2186714542190304, + "grad_norm": 0.7610133290290833, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16970 + }, + { + "epoch": 1.2193895870736087, + "grad_norm": 0.8707142472267151, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 16980 + }, + { + "epoch": 1.2201077199281867, + "grad_norm": 0.6603384017944336, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 16990 + }, + { + "epoch": 1.220825852782765, + "grad_norm": 0.7218315005302429, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 17000 + }, + { + "epoch": 1.221543985637343, + "grad_norm": 0.8043148517608643, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17010 + }, + { + "epoch": 1.222262118491921, + "grad_norm": 0.7232559323310852, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17020 + }, + { + "epoch": 1.2229802513464991, + "grad_norm": 0.690376341342926, + "learning_rate": 0.0002, + "loss": 0.7681, + "step": 17030 + }, + { + "epoch": 1.2236983842010771, + "grad_norm": 0.602436363697052, + "learning_rate": 0.0002, + "loss": 0.7042, + "step": 17040 + }, + { + "epoch": 1.2244165170556554, + "grad_norm": 0.7610493898391724, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 17050 + }, + { + "epoch": 1.2251346499102334, + "grad_norm": 0.7504690885543823, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 17060 + }, + { + "epoch": 1.2258527827648116, + "grad_norm": 0.8080246448516846, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 17070 + }, + { + "epoch": 1.2265709156193896, + "grad_norm": 1.0240572690963745, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 17080 + }, + { + "epoch": 1.2272890484739678, + "grad_norm": 0.6874111294746399, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 17090 + }, + { + "epoch": 1.2280071813285458, + "grad_norm": 0.800069272518158, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 17100 + }, + { + "epoch": 1.2287253141831238, + "grad_norm": 0.8628103137016296, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 17110 + }, + { + "epoch": 1.229443447037702, + "grad_norm": 0.7408499121665955, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 17120 + }, + { + "epoch": 1.23016157989228, + "grad_norm": 0.6494335532188416, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 17130 + }, + { + "epoch": 1.2308797127468583, + "grad_norm": 0.6493549942970276, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17140 + }, + { + "epoch": 1.2315978456014363, + "grad_norm": 0.6972658038139343, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 17150 + }, + { + "epoch": 1.2323159784560143, + "grad_norm": 0.6877315044403076, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 17160 + }, + { + "epoch": 1.2330341113105925, + "grad_norm": 0.7569024562835693, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 17170 + }, + { + "epoch": 1.2337522441651705, + "grad_norm": 0.696260392665863, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 17180 + }, + { + "epoch": 1.2344703770197487, + "grad_norm": 0.6150345802307129, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 17190 + }, + { + "epoch": 1.2351885098743267, + "grad_norm": 0.69009929895401, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 17200 + }, + { + "epoch": 1.235906642728905, + "grad_norm": 0.7035185098648071, + "learning_rate": 0.0002, + "loss": 0.787, + "step": 17210 + }, + { + "epoch": 1.236624775583483, + "grad_norm": 0.6792506575584412, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17220 + }, + { + "epoch": 1.2373429084380612, + "grad_norm": 0.6310356855392456, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 17230 + }, + { + "epoch": 1.2380610412926392, + "grad_norm": 0.647026538848877, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 17240 + }, + { + "epoch": 1.2387791741472172, + "grad_norm": 0.7609930038452148, + "learning_rate": 0.0002, + "loss": 0.8014, + "step": 17250 + }, + { + "epoch": 1.2394973070017954, + "grad_norm": 0.791890561580658, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 17260 + }, + { + "epoch": 1.2402154398563734, + "grad_norm": 0.7126715183258057, + "learning_rate": 0.0002, + "loss": 0.7445, + "step": 17270 + }, + { + "epoch": 1.2409335727109516, + "grad_norm": 0.7850401401519775, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 17280 + }, + { + "epoch": 1.2416517055655296, + "grad_norm": 0.6694281697273254, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 17290 + }, + { + "epoch": 1.2423698384201076, + "grad_norm": 0.6418080925941467, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 17300 + }, + { + "epoch": 1.2430879712746858, + "grad_norm": 0.7308132648468018, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 17310 + }, + { + "epoch": 1.2438061041292638, + "grad_norm": 0.8322312235832214, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17320 + }, + { + "epoch": 1.244524236983842, + "grad_norm": 0.6959006190299988, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 17330 + }, + { + "epoch": 1.24524236983842, + "grad_norm": 0.7110121846199036, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 17340 + }, + { + "epoch": 1.2459605026929983, + "grad_norm": 0.6496296525001526, + "learning_rate": 0.0002, + "loss": 0.7858, + "step": 17350 + }, + { + "epoch": 1.2466786355475763, + "grad_norm": 0.7649076581001282, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 17360 + }, + { + "epoch": 1.2473967684021545, + "grad_norm": 0.7139049172401428, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 17370 + }, + { + "epoch": 1.2481149012567325, + "grad_norm": 0.7709113955497742, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 17380 + }, + { + "epoch": 1.2488330341113105, + "grad_norm": 0.7160373330116272, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 17390 + }, + { + "epoch": 1.2495511669658887, + "grad_norm": 0.5608301162719727, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17400 + }, + { + "epoch": 1.2502692998204668, + "grad_norm": 0.6913180351257324, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 17410 + }, + { + "epoch": 1.250987432675045, + "grad_norm": 0.6980322599411011, + "learning_rate": 0.0002, + "loss": 0.7167, + "step": 17420 + }, + { + "epoch": 1.251705565529623, + "grad_norm": 0.8155394792556763, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 17430 + }, + { + "epoch": 1.252423698384201, + "grad_norm": 0.8015886545181274, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 17440 + }, + { + "epoch": 1.2531418312387792, + "grad_norm": 0.5985556244850159, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17450 + }, + { + "epoch": 1.2538599640933572, + "grad_norm": 0.70317143201828, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17460 + }, + { + "epoch": 1.2545780969479354, + "grad_norm": 0.612501323223114, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 17470 + }, + { + "epoch": 1.2552962298025134, + "grad_norm": 0.7347102165222168, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 17480 + }, + { + "epoch": 1.2560143626570914, + "grad_norm": 0.9189441800117493, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 17490 + }, + { + "epoch": 1.2567324955116697, + "grad_norm": 0.7727932929992676, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 17500 + }, + { + "epoch": 1.2574506283662479, + "grad_norm": 0.6782869696617126, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 17510 + }, + { + "epoch": 1.2581687612208259, + "grad_norm": 0.5710638761520386, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 17520 + }, + { + "epoch": 1.2588868940754039, + "grad_norm": 0.6856266856193542, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 17530 + }, + { + "epoch": 1.259605026929982, + "grad_norm": 0.7257347702980042, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 17540 + }, + { + "epoch": 1.26032315978456, + "grad_norm": 0.6343092918395996, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 17550 + }, + { + "epoch": 1.2610412926391383, + "grad_norm": 0.6482594013214111, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 17560 + }, + { + "epoch": 1.2617594254937163, + "grad_norm": 0.6542837619781494, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 17570 + }, + { + "epoch": 1.2624775583482943, + "grad_norm": 0.7106123566627502, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 17580 + }, + { + "epoch": 1.2631956912028726, + "grad_norm": 0.9081960320472717, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 17590 + }, + { + "epoch": 1.2639138240574506, + "grad_norm": 0.7010290026664734, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 17600 + }, + { + "epoch": 1.2646319569120288, + "grad_norm": 0.9973132610321045, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 17610 + }, + { + "epoch": 1.2653500897666068, + "grad_norm": 0.8003297448158264, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 17620 + }, + { + "epoch": 1.2660682226211848, + "grad_norm": 0.7383468151092529, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 17630 + }, + { + "epoch": 1.266786355475763, + "grad_norm": 0.6337200999259949, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 17640 + }, + { + "epoch": 1.2675044883303412, + "grad_norm": 0.6371761560440063, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 17650 + }, + { + "epoch": 1.2682226211849192, + "grad_norm": 0.7283522486686707, + "learning_rate": 0.0002, + "loss": 0.7348, + "step": 17660 + }, + { + "epoch": 1.2689407540394972, + "grad_norm": 0.8191015720367432, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 17670 + }, + { + "epoch": 1.2696588868940755, + "grad_norm": 0.6210351586341858, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 17680 + }, + { + "epoch": 1.2703770197486535, + "grad_norm": 0.6563277840614319, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 17690 + }, + { + "epoch": 1.2710951526032317, + "grad_norm": 0.7111260294914246, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 17700 + }, + { + "epoch": 1.2718132854578097, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 17710 + }, + { + "epoch": 1.2725314183123877, + "grad_norm": 0.7657744884490967, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 17720 + }, + { + "epoch": 1.273249551166966, + "grad_norm": 0.6952996850013733, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 17730 + }, + { + "epoch": 1.273967684021544, + "grad_norm": 0.5678043961524963, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 17740 + }, + { + "epoch": 1.2746858168761221, + "grad_norm": 0.8608036041259766, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 17750 + }, + { + "epoch": 1.2754039497307001, + "grad_norm": 0.7184045910835266, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 17760 + }, + { + "epoch": 1.2761220825852782, + "grad_norm": 0.6647557616233826, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 17770 + }, + { + "epoch": 1.2768402154398564, + "grad_norm": 0.6899349093437195, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17780 + }, + { + "epoch": 1.2775583482944346, + "grad_norm": 0.7073346972465515, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 17790 + }, + { + "epoch": 1.2782764811490126, + "grad_norm": 0.8896707892417908, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 17800 + }, + { + "epoch": 1.2789946140035906, + "grad_norm": 0.5072778463363647, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 17810 + }, + { + "epoch": 1.2797127468581688, + "grad_norm": 0.8889711499214172, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 17820 + }, + { + "epoch": 1.2804308797127468, + "grad_norm": 0.5583778619766235, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 17830 + }, + { + "epoch": 1.281149012567325, + "grad_norm": 0.6526148915290833, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 17840 + }, + { + "epoch": 1.281867145421903, + "grad_norm": 0.7658175826072693, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 17850 + }, + { + "epoch": 1.282585278276481, + "grad_norm": 0.5547847151756287, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 17860 + }, + { + "epoch": 1.2833034111310593, + "grad_norm": 0.6153780817985535, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17870 + }, + { + "epoch": 1.2840215439856373, + "grad_norm": 0.8474061489105225, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 17880 + }, + { + "epoch": 1.2847396768402155, + "grad_norm": 0.859260618686676, + "learning_rate": 0.0002, + "loss": 0.7597, + "step": 17890 + }, + { + "epoch": 1.2854578096947935, + "grad_norm": 0.7270520329475403, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 17900 + }, + { + "epoch": 1.2861759425493715, + "grad_norm": 0.8166249394416809, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 17910 + }, + { + "epoch": 1.2868940754039497, + "grad_norm": 0.9158982038497925, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17920 + }, + { + "epoch": 1.287612208258528, + "grad_norm": 0.8132565021514893, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17930 + }, + { + "epoch": 1.288330341113106, + "grad_norm": 0.7914409637451172, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17940 + }, + { + "epoch": 1.289048473967684, + "grad_norm": 0.6256071329116821, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 17950 + }, + { + "epoch": 1.2897666068222622, + "grad_norm": 0.6463542580604553, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 17960 + }, + { + "epoch": 1.2904847396768402, + "grad_norm": 0.6702672839164734, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 17970 + }, + { + "epoch": 1.2912028725314184, + "grad_norm": 0.8666605949401855, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 17980 + }, + { + "epoch": 1.2919210053859964, + "grad_norm": 0.8055952787399292, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17990 + }, + { + "epoch": 1.2926391382405744, + "grad_norm": 0.6909741163253784, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 18000 + }, + { + "epoch": 1.2933572710951526, + "grad_norm": 0.663702130317688, + "learning_rate": 0.0002, + "loss": 0.7766, + "step": 18010 + }, + { + "epoch": 1.2940754039497306, + "grad_norm": 0.6952448487281799, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 18020 + }, + { + "epoch": 1.2947935368043089, + "grad_norm": 0.5722854137420654, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18030 + }, + { + "epoch": 1.2955116696588869, + "grad_norm": 0.7987681031227112, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 18040 + }, + { + "epoch": 1.2962298025134649, + "grad_norm": 0.661133348941803, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 18050 + }, + { + "epoch": 1.296947935368043, + "grad_norm": 0.6025064587593079, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 18060 + }, + { + "epoch": 1.2976660682226213, + "grad_norm": 0.7569907903671265, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 18070 + }, + { + "epoch": 1.2983842010771993, + "grad_norm": 0.7222012281417847, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18080 + }, + { + "epoch": 1.2991023339317773, + "grad_norm": 0.5291963815689087, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 18090 + }, + { + "epoch": 1.2998204667863555, + "grad_norm": 0.6808363199234009, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 18100 + }, + { + "epoch": 1.3005385996409335, + "grad_norm": 0.6797927618026733, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 18110 + }, + { + "epoch": 1.3012567324955118, + "grad_norm": 0.7775542140007019, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 18120 + }, + { + "epoch": 1.3019748653500898, + "grad_norm": 0.7369466423988342, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18130 + }, + { + "epoch": 1.3026929982046678, + "grad_norm": 0.6822494864463806, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 18140 + }, + { + "epoch": 1.303411131059246, + "grad_norm": 0.9222138524055481, + "learning_rate": 0.0002, + "loss": 0.7675, + "step": 18150 + }, + { + "epoch": 1.304129263913824, + "grad_norm": 0.7485767006874084, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 18160 + }, + { + "epoch": 1.3048473967684022, + "grad_norm": 0.6383684277534485, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 18170 + }, + { + "epoch": 1.3055655296229802, + "grad_norm": 0.5934187173843384, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 18180 + }, + { + "epoch": 1.3062836624775582, + "grad_norm": 0.7265770435333252, + "learning_rate": 0.0002, + "loss": 0.7576, + "step": 18190 + }, + { + "epoch": 1.3070017953321365, + "grad_norm": 0.8149140477180481, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 18200 + }, + { + "epoch": 1.3077199281867147, + "grad_norm": 0.8067880272865295, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 18210 + }, + { + "epoch": 1.3084380610412927, + "grad_norm": 0.6109178066253662, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18220 + }, + { + "epoch": 1.3091561938958707, + "grad_norm": 0.7194176316261292, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 18230 + }, + { + "epoch": 1.309874326750449, + "grad_norm": 0.6452242136001587, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 18240 + }, + { + "epoch": 1.310592459605027, + "grad_norm": 0.680550217628479, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 18250 + }, + { + "epoch": 1.3113105924596051, + "grad_norm": 0.7005740404129028, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 18260 + }, + { + "epoch": 1.3120287253141831, + "grad_norm": 0.7217825055122375, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 18270 + }, + { + "epoch": 1.3127468581687611, + "grad_norm": 0.7730209231376648, + "learning_rate": 0.0002, + "loss": 0.7797, + "step": 18280 + }, + { + "epoch": 1.3134649910233394, + "grad_norm": 0.8291956186294556, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18290 + }, + { + "epoch": 1.3141831238779174, + "grad_norm": 0.758528470993042, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 18300 + }, + { + "epoch": 1.3149012567324956, + "grad_norm": 0.9682782292366028, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 18310 + }, + { + "epoch": 1.3156193895870736, + "grad_norm": 0.5784780979156494, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 18320 + }, + { + "epoch": 1.3163375224416516, + "grad_norm": 0.5870532393455505, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 18330 + }, + { + "epoch": 1.3170556552962298, + "grad_norm": 0.5950172543525696, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 18340 + }, + { + "epoch": 1.317773788150808, + "grad_norm": 0.7625961899757385, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 18350 + }, + { + "epoch": 1.318491921005386, + "grad_norm": 0.8027397394180298, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 18360 + }, + { + "epoch": 1.319210053859964, + "grad_norm": 0.8424779772758484, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 18370 + }, + { + "epoch": 1.3199281867145423, + "grad_norm": 0.5741737484931946, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 18380 + }, + { + "epoch": 1.3206463195691203, + "grad_norm": 0.7363710999488831, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 18390 + }, + { + "epoch": 1.3213644524236985, + "grad_norm": 0.7900536060333252, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 18400 + }, + { + "epoch": 1.3220825852782765, + "grad_norm": 0.6273105144500732, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 18410 + }, + { + "epoch": 1.3228007181328545, + "grad_norm": 0.7612496018409729, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 18420 + }, + { + "epoch": 1.3235188509874327, + "grad_norm": 0.729653537273407, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 18430 + }, + { + "epoch": 1.3242369838420107, + "grad_norm": 0.6599212288856506, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 18440 + }, + { + "epoch": 1.324955116696589, + "grad_norm": 0.762320876121521, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18450 + }, + { + "epoch": 1.325673249551167, + "grad_norm": 0.7468838095664978, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18460 + }, + { + "epoch": 1.326391382405745, + "grad_norm": 0.6376237273216248, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 18470 + }, + { + "epoch": 1.3271095152603232, + "grad_norm": 0.6722603440284729, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 18480 + }, + { + "epoch": 1.3278276481149014, + "grad_norm": 0.7011231780052185, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 18490 + }, + { + "epoch": 1.3285457809694794, + "grad_norm": 0.5325027108192444, + "learning_rate": 0.0002, + "loss": 0.7942, + "step": 18500 + }, + { + "epoch": 1.3292639138240574, + "grad_norm": 0.6916731595993042, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 18510 + }, + { + "epoch": 1.3299820466786356, + "grad_norm": 0.6529106497764587, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18520 + }, + { + "epoch": 1.3307001795332136, + "grad_norm": 0.7708640694618225, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 18530 + }, + { + "epoch": 1.3314183123877918, + "grad_norm": 0.7125861048698425, + "learning_rate": 0.0002, + "loss": 0.7688, + "step": 18540 + }, + { + "epoch": 1.3321364452423698, + "grad_norm": 0.7663969993591309, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 18550 + }, + { + "epoch": 1.3328545780969479, + "grad_norm": 0.601141631603241, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 18560 + }, + { + "epoch": 1.333572710951526, + "grad_norm": 0.6185581088066101, + "learning_rate": 0.0002, + "loss": 0.734, + "step": 18570 + }, + { + "epoch": 1.334290843806104, + "grad_norm": 0.6136596202850342, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 18580 + }, + { + "epoch": 1.3350089766606823, + "grad_norm": 0.8377187252044678, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 18590 + }, + { + "epoch": 1.3357271095152603, + "grad_norm": 0.7649989724159241, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 18600 + }, + { + "epoch": 1.3364452423698383, + "grad_norm": 0.7944515347480774, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 18610 + }, + { + "epoch": 1.3371633752244165, + "grad_norm": 0.619024395942688, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 18620 + }, + { + "epoch": 1.3378815080789948, + "grad_norm": 0.7849082946777344, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 18630 + }, + { + "epoch": 1.3385996409335728, + "grad_norm": 0.5740780830383301, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18640 + }, + { + "epoch": 1.3393177737881508, + "grad_norm": 0.6897456645965576, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 18650 + }, + { + "epoch": 1.340035906642729, + "grad_norm": 0.6263600587844849, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 18660 + }, + { + "epoch": 1.340754039497307, + "grad_norm": 0.5744550824165344, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 18670 + }, + { + "epoch": 1.3414721723518852, + "grad_norm": 0.7785728573799133, + "learning_rate": 0.0002, + "loss": 0.7773, + "step": 18680 + }, + { + "epoch": 1.3421903052064632, + "grad_norm": 0.6944230198860168, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 18690 + }, + { + "epoch": 1.3429084380610412, + "grad_norm": 0.7388073801994324, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 18700 + }, + { + "epoch": 1.3436265709156194, + "grad_norm": 0.9555586576461792, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 18710 + }, + { + "epoch": 1.3443447037701974, + "grad_norm": 0.8510582447052002, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 18720 + }, + { + "epoch": 1.3450628366247757, + "grad_norm": 0.6093049645423889, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 18730 + }, + { + "epoch": 1.3457809694793537, + "grad_norm": 0.9159273505210876, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 18740 + }, + { + "epoch": 1.3464991023339317, + "grad_norm": 0.7188084721565247, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 18750 + }, + { + "epoch": 1.3472172351885099, + "grad_norm": 0.7228650450706482, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 18760 + }, + { + "epoch": 1.347935368043088, + "grad_norm": 0.8160615563392639, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 18770 + }, + { + "epoch": 1.3486535008976661, + "grad_norm": 0.6485389471054077, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 18780 + }, + { + "epoch": 1.3493716337522441, + "grad_norm": 0.6755139827728271, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 18790 + }, + { + "epoch": 1.3500897666068223, + "grad_norm": 0.6923297643661499, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 18800 + }, + { + "epoch": 1.3508078994614003, + "grad_norm": 0.6954510807991028, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 18810 + }, + { + "epoch": 1.3515260323159786, + "grad_norm": 0.9948558807373047, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 18820 + }, + { + "epoch": 1.3522441651705566, + "grad_norm": 0.708381175994873, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 18830 + }, + { + "epoch": 1.3529622980251346, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 18840 + }, + { + "epoch": 1.3536804308797128, + "grad_norm": 0.6365936994552612, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 18850 + }, + { + "epoch": 1.3543985637342908, + "grad_norm": 0.7620742917060852, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 18860 + }, + { + "epoch": 1.355116696588869, + "grad_norm": 0.6849071383476257, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 18870 + }, + { + "epoch": 1.355834829443447, + "grad_norm": 0.5776316523551941, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18880 + }, + { + "epoch": 1.356552962298025, + "grad_norm": 0.597236156463623, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 18890 + }, + { + "epoch": 1.3572710951526032, + "grad_norm": 0.6569282412528992, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 18900 + }, + { + "epoch": 1.3579892280071812, + "grad_norm": 0.6384802460670471, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 18910 + }, + { + "epoch": 1.3587073608617595, + "grad_norm": 0.6623879671096802, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 18920 + }, + { + "epoch": 1.3594254937163375, + "grad_norm": 0.6149632334709167, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 18930 + }, + { + "epoch": 1.3601436265709157, + "grad_norm": 0.6978002190589905, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 18940 + }, + { + "epoch": 1.3608617594254937, + "grad_norm": 0.7579124569892883, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 18950 + }, + { + "epoch": 1.361579892280072, + "grad_norm": 0.7138084173202515, + "learning_rate": 0.0002, + "loss": 0.7589, + "step": 18960 + }, + { + "epoch": 1.36229802513465, + "grad_norm": 0.678322434425354, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 18970 + }, + { + "epoch": 1.363016157989228, + "grad_norm": 0.694346010684967, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 18980 + }, + { + "epoch": 1.3637342908438062, + "grad_norm": 0.682262659072876, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 18990 + }, + { + "epoch": 1.3644524236983842, + "grad_norm": 0.9068194627761841, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 19000 + }, + { + "epoch": 1.3651705565529624, + "grad_norm": 0.6691566705703735, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 19010 + }, + { + "epoch": 1.3658886894075404, + "grad_norm": 0.7791378498077393, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 19020 + }, + { + "epoch": 1.3666068222621184, + "grad_norm": 0.717107355594635, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 19030 + }, + { + "epoch": 1.3673249551166966, + "grad_norm": 0.7897566556930542, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 19040 + }, + { + "epoch": 1.3680430879712746, + "grad_norm": 0.8823844790458679, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 19050 + }, + { + "epoch": 1.3687612208258528, + "grad_norm": 0.6512053608894348, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 19060 + }, + { + "epoch": 1.3694793536804308, + "grad_norm": 0.6871389150619507, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 19070 + }, + { + "epoch": 1.370197486535009, + "grad_norm": 0.6795603036880493, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 19080 + }, + { + "epoch": 1.370915619389587, + "grad_norm": 0.6569121479988098, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 19090 + }, + { + "epoch": 1.3716337522441653, + "grad_norm": 0.6769960522651672, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 19100 + }, + { + "epoch": 1.3723518850987433, + "grad_norm": 0.726613461971283, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 19110 + }, + { + "epoch": 1.3730700179533213, + "grad_norm": 0.7287817001342773, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 19120 + }, + { + "epoch": 1.3737881508078995, + "grad_norm": 0.6169242858886719, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 19130 + }, + { + "epoch": 1.3745062836624775, + "grad_norm": 0.6537347435951233, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 19140 + }, + { + "epoch": 1.3752244165170557, + "grad_norm": 0.6113879680633545, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 19150 + }, + { + "epoch": 1.3759425493716337, + "grad_norm": 0.6415297985076904, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 19160 + }, + { + "epoch": 1.3766606822262117, + "grad_norm": 0.6812838315963745, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 19170 + }, + { + "epoch": 1.37737881508079, + "grad_norm": 0.7331814169883728, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 19180 + }, + { + "epoch": 1.378096947935368, + "grad_norm": 0.7265108823776245, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 19190 + }, + { + "epoch": 1.3788150807899462, + "grad_norm": 0.6233167052268982, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19200 + }, + { + "epoch": 1.3795332136445242, + "grad_norm": 0.6841492652893066, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 19210 + }, + { + "epoch": 1.3802513464991024, + "grad_norm": 0.822853684425354, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 19220 + }, + { + "epoch": 1.3809694793536804, + "grad_norm": 0.8078812956809998, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 19230 + }, + { + "epoch": 1.3816876122082586, + "grad_norm": 0.7269898056983948, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 19240 + }, + { + "epoch": 1.3824057450628366, + "grad_norm": 0.6297033429145813, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 19250 + }, + { + "epoch": 1.3831238779174146, + "grad_norm": 0.8097442388534546, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 19260 + }, + { + "epoch": 1.3838420107719929, + "grad_norm": 0.6442803740501404, + "learning_rate": 0.0002, + "loss": 0.7281, + "step": 19270 + }, + { + "epoch": 1.3845601436265709, + "grad_norm": 0.659866213798523, + "learning_rate": 0.0002, + "loss": 0.7598, + "step": 19280 + }, + { + "epoch": 1.385278276481149, + "grad_norm": 0.7537921667098999, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 19290 + }, + { + "epoch": 1.385996409335727, + "grad_norm": 0.8441828489303589, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 19300 + }, + { + "epoch": 1.386714542190305, + "grad_norm": 0.8506057262420654, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19310 + }, + { + "epoch": 1.3874326750448833, + "grad_norm": 0.6747094392776489, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 19320 + }, + { + "epoch": 1.3881508078994613, + "grad_norm": 0.7906509041786194, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 19330 + }, + { + "epoch": 1.3888689407540395, + "grad_norm": 0.6784867644309998, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 19340 + }, + { + "epoch": 1.3895870736086176, + "grad_norm": 0.6371709108352661, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 19350 + }, + { + "epoch": 1.3903052064631956, + "grad_norm": 0.7858285307884216, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 19360 + }, + { + "epoch": 1.3910233393177738, + "grad_norm": 0.711395263671875, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19370 + }, + { + "epoch": 1.391741472172352, + "grad_norm": 0.7023257613182068, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 19380 + }, + { + "epoch": 1.39245960502693, + "grad_norm": 0.7036022543907166, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19390 + }, + { + "epoch": 1.393177737881508, + "grad_norm": 0.6418436169624329, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 19400 + }, + { + "epoch": 1.3938958707360862, + "grad_norm": 0.7108847498893738, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 19410 + }, + { + "epoch": 1.3946140035906642, + "grad_norm": 0.6940230131149292, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 19420 + }, + { + "epoch": 1.3953321364452425, + "grad_norm": 0.6750220656394958, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 19430 + }, + { + "epoch": 1.3960502692998205, + "grad_norm": 0.7479177713394165, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 19440 + }, + { + "epoch": 1.3967684021543985, + "grad_norm": 0.626124918460846, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 19450 + }, + { + "epoch": 1.3974865350089767, + "grad_norm": 0.8908559083938599, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 19460 + }, + { + "epoch": 1.3982046678635547, + "grad_norm": 0.6163712739944458, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 19470 + }, + { + "epoch": 1.398922800718133, + "grad_norm": 0.6993312239646912, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 19480 + }, + { + "epoch": 1.399640933572711, + "grad_norm": 0.6162890791893005, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 19490 + }, + { + "epoch": 1.400359066427289, + "grad_norm": 0.7797643542289734, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 19500 + }, + { + "epoch": 1.4010771992818671, + "grad_norm": 0.7038744688034058, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 19510 + }, + { + "epoch": 1.4017953321364454, + "grad_norm": 0.6902393698692322, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 19520 + }, + { + "epoch": 1.4025134649910234, + "grad_norm": 0.5436386466026306, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 19530 + }, + { + "epoch": 1.4032315978456014, + "grad_norm": 0.6537990570068359, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19540 + }, + { + "epoch": 1.4039497307001796, + "grad_norm": 0.739691972732544, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 19550 + }, + { + "epoch": 1.4046678635547576, + "grad_norm": 0.7287635803222656, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 19560 + }, + { + "epoch": 1.4053859964093358, + "grad_norm": 0.6809501051902771, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 19570 + }, + { + "epoch": 1.4061041292639138, + "grad_norm": 0.8302195072174072, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 19580 + }, + { + "epoch": 1.4068222621184918, + "grad_norm": 0.6613629460334778, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 19590 + }, + { + "epoch": 1.40754039497307, + "grad_norm": 0.7897207736968994, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 19600 + }, + { + "epoch": 1.408258527827648, + "grad_norm": 0.8368293642997742, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 19610 + }, + { + "epoch": 1.4089766606822263, + "grad_norm": 0.665109395980835, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 19620 + }, + { + "epoch": 1.4096947935368043, + "grad_norm": 0.7359302639961243, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 19630 + }, + { + "epoch": 1.4104129263913823, + "grad_norm": 0.8048052787780762, + "learning_rate": 0.0002, + "loss": 0.7775, + "step": 19640 + }, + { + "epoch": 1.4111310592459605, + "grad_norm": 0.7414906620979309, + "learning_rate": 0.0002, + "loss": 0.7668, + "step": 19650 + }, + { + "epoch": 1.4118491921005387, + "grad_norm": 0.7894161343574524, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 19660 + }, + { + "epoch": 1.4125673249551167, + "grad_norm": 0.6724628210067749, + "learning_rate": 0.0002, + "loss": 0.7371, + "step": 19670 + }, + { + "epoch": 1.4132854578096947, + "grad_norm": 0.9397756457328796, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 19680 + }, + { + "epoch": 1.414003590664273, + "grad_norm": 0.6684842109680176, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 19690 + }, + { + "epoch": 1.414721723518851, + "grad_norm": 0.7753993272781372, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 19700 + }, + { + "epoch": 1.4154398563734292, + "grad_norm": 0.6934253573417664, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 19710 + }, + { + "epoch": 1.4161579892280072, + "grad_norm": 0.8567284941673279, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 19720 + }, + { + "epoch": 1.4168761220825852, + "grad_norm": 0.9471787214279175, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 19730 + }, + { + "epoch": 1.4175942549371634, + "grad_norm": 0.6664855480194092, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 19740 + }, + { + "epoch": 1.4183123877917414, + "grad_norm": 0.6713361740112305, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 19750 + }, + { + "epoch": 1.4190305206463196, + "grad_norm": 0.6488258838653564, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 19760 + }, + { + "epoch": 1.4197486535008976, + "grad_norm": 0.7089938521385193, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 19770 + }, + { + "epoch": 1.4204667863554756, + "grad_norm": 0.6433218717575073, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 19780 + }, + { + "epoch": 1.4211849192100539, + "grad_norm": 0.7025160193443298, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 19790 + }, + { + "epoch": 1.421903052064632, + "grad_norm": 0.7030544877052307, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 19800 + }, + { + "epoch": 1.42262118491921, + "grad_norm": 0.6515552401542664, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 19810 + }, + { + "epoch": 1.423339317773788, + "grad_norm": 0.6463841795921326, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 19820 + }, + { + "epoch": 1.4240574506283663, + "grad_norm": 0.6654344201087952, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 19830 + }, + { + "epoch": 1.4247755834829443, + "grad_norm": 0.7223384380340576, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 19840 + }, + { + "epoch": 1.4254937163375225, + "grad_norm": 0.6575722694396973, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 19850 + }, + { + "epoch": 1.4262118491921005, + "grad_norm": 0.6216059327125549, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 19860 + }, + { + "epoch": 1.4269299820466785, + "grad_norm": 0.7451487183570862, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 19870 + }, + { + "epoch": 1.4276481149012568, + "grad_norm": 0.6563336253166199, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 19880 + }, + { + "epoch": 1.4283662477558348, + "grad_norm": 0.8021975159645081, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 19890 + }, + { + "epoch": 1.429084380610413, + "grad_norm": 0.7474712133407593, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 1.429802513464991, + "grad_norm": 0.7316377758979797, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 19910 + }, + { + "epoch": 1.430520646319569, + "grad_norm": 0.646892786026001, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 19920 + }, + { + "epoch": 1.4312387791741472, + "grad_norm": 0.6268765926361084, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 19930 + }, + { + "epoch": 1.4319569120287254, + "grad_norm": 0.7104699611663818, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 19940 + }, + { + "epoch": 1.4326750448833034, + "grad_norm": 0.6742063760757446, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 19950 + }, + { + "epoch": 1.4333931777378814, + "grad_norm": 0.6973381638526917, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 19960 + }, + { + "epoch": 1.4341113105924597, + "grad_norm": 0.5819381475448608, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 19970 + }, + { + "epoch": 1.4348294434470377, + "grad_norm": 0.680623471736908, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 19980 + }, + { + "epoch": 1.435547576301616, + "grad_norm": 0.5899890661239624, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 19990 + }, + { + "epoch": 1.436265709156194, + "grad_norm": 0.6225098371505737, + "learning_rate": 0.0002, + "loss": 0.7438, + "step": 20000 + }, + { + "epoch": 1.436983842010772, + "grad_norm": 0.6314228773117065, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 20010 + }, + { + "epoch": 1.4377019748653501, + "grad_norm": 0.8690667152404785, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 20020 + }, + { + "epoch": 1.4384201077199281, + "grad_norm": 0.7166543006896973, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 20030 + }, + { + "epoch": 1.4391382405745063, + "grad_norm": 0.7051591873168945, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 20040 + }, + { + "epoch": 1.4398563734290843, + "grad_norm": 0.7606652975082397, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 20050 + }, + { + "epoch": 1.4405745062836623, + "grad_norm": 0.6343185305595398, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 20060 + }, + { + "epoch": 1.4412926391382406, + "grad_norm": 0.5625789761543274, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 20070 + }, + { + "epoch": 1.4420107719928188, + "grad_norm": 0.6081897020339966, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 20080 + }, + { + "epoch": 1.4427289048473968, + "grad_norm": 0.9571536779403687, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 20090 + }, + { + "epoch": 1.4434470377019748, + "grad_norm": 0.869531512260437, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 20100 + }, + { + "epoch": 1.444165170556553, + "grad_norm": 0.6865507960319519, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 20110 + }, + { + "epoch": 1.444883303411131, + "grad_norm": 0.7572755813598633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 20120 + }, + { + "epoch": 1.4456014362657092, + "grad_norm": 0.79011070728302, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 20130 + }, + { + "epoch": 1.4463195691202873, + "grad_norm": 0.8297342658042908, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 20140 + }, + { + "epoch": 1.4470377019748653, + "grad_norm": 0.6593490839004517, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 20150 + }, + { + "epoch": 1.4477558348294435, + "grad_norm": 1.0264687538146973, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 20160 + }, + { + "epoch": 1.4484739676840215, + "grad_norm": 0.7032888531684875, + "learning_rate": 0.0002, + "loss": 0.7804, + "step": 20170 + }, + { + "epoch": 1.4491921005385997, + "grad_norm": 0.6438494920730591, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 20180 + }, + { + "epoch": 1.4499102333931777, + "grad_norm": 0.7448790669441223, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 20190 + }, + { + "epoch": 1.4506283662477557, + "grad_norm": 0.7551555037498474, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 20200 + }, + { + "epoch": 1.451346499102334, + "grad_norm": 0.6677857041358948, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 20210 + }, + { + "epoch": 1.4520646319569122, + "grad_norm": 0.7888486385345459, + "learning_rate": 0.0002, + "loss": 0.7261, + "step": 20220 + }, + { + "epoch": 1.4527827648114902, + "grad_norm": 0.6658565402030945, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 20230 + }, + { + "epoch": 1.4535008976660682, + "grad_norm": 0.6800249814987183, + "learning_rate": 0.0002, + "loss": 0.7862, + "step": 20240 + }, + { + "epoch": 1.4542190305206464, + "grad_norm": 0.7419682741165161, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 20250 + }, + { + "epoch": 1.4549371633752244, + "grad_norm": 0.8848792910575867, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 20260 + }, + { + "epoch": 1.4556552962298026, + "grad_norm": 0.6513857245445251, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 20270 + }, + { + "epoch": 1.4563734290843806, + "grad_norm": 0.5605742335319519, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 20280 + }, + { + "epoch": 1.4570915619389586, + "grad_norm": 0.6737141013145447, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 20290 + }, + { + "epoch": 1.4578096947935368, + "grad_norm": 0.6663289666175842, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 20300 + }, + { + "epoch": 1.4585278276481148, + "grad_norm": 0.7157106995582581, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20310 + }, + { + "epoch": 1.459245960502693, + "grad_norm": 0.7713354825973511, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 20320 + }, + { + "epoch": 1.459964093357271, + "grad_norm": 0.8334044218063354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 20330 + }, + { + "epoch": 1.460682226211849, + "grad_norm": 0.7268327474594116, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 20340 + }, + { + "epoch": 1.4614003590664273, + "grad_norm": 0.6791431903839111, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 20350 + }, + { + "epoch": 1.4621184919210055, + "grad_norm": 0.8177870512008667, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 20360 + }, + { + "epoch": 1.4628366247755835, + "grad_norm": 0.8064364790916443, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 20370 + }, + { + "epoch": 1.4635547576301615, + "grad_norm": 0.6547006964683533, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 20380 + }, + { + "epoch": 1.4642728904847397, + "grad_norm": 0.6381436586380005, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 20390 + }, + { + "epoch": 1.4649910233393177, + "grad_norm": 0.7351248264312744, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 20400 + }, + { + "epoch": 1.465709156193896, + "grad_norm": 0.7037558555603027, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 20410 + }, + { + "epoch": 1.466427289048474, + "grad_norm": 0.6294074654579163, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 20420 + }, + { + "epoch": 1.467145421903052, + "grad_norm": 0.9722632765769958, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 20430 + }, + { + "epoch": 1.4678635547576302, + "grad_norm": 0.753065824508667, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 20440 + }, + { + "epoch": 1.4685816876122082, + "grad_norm": 0.7317194938659668, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20450 + }, + { + "epoch": 1.4692998204667864, + "grad_norm": 0.6862193942070007, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 20460 + }, + { + "epoch": 1.4700179533213644, + "grad_norm": 0.7643225193023682, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 20470 + }, + { + "epoch": 1.4707360861759424, + "grad_norm": 0.5904353260993958, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 20480 + }, + { + "epoch": 1.4714542190305206, + "grad_norm": 0.5812238454818726, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20490 + }, + { + "epoch": 1.4721723518850989, + "grad_norm": 0.7478151321411133, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 20500 + }, + { + "epoch": 1.4728904847396769, + "grad_norm": 0.7625645399093628, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 20510 + }, + { + "epoch": 1.4736086175942549, + "grad_norm": 0.6354498267173767, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 20520 + }, + { + "epoch": 1.474326750448833, + "grad_norm": 0.8731162548065186, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 20530 + }, + { + "epoch": 1.475044883303411, + "grad_norm": 0.7346670627593994, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 20540 + }, + { + "epoch": 1.4757630161579893, + "grad_norm": 1.038447618484497, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 20550 + }, + { + "epoch": 1.4764811490125673, + "grad_norm": 0.7032809257507324, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 20560 + }, + { + "epoch": 1.4771992818671453, + "grad_norm": 0.8008337020874023, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 20570 + }, + { + "epoch": 1.4779174147217236, + "grad_norm": 0.6735056638717651, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 20580 + }, + { + "epoch": 1.4786355475763016, + "grad_norm": 0.622056245803833, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 20590 + }, + { + "epoch": 1.4793536804308798, + "grad_norm": 0.6580422520637512, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 20600 + }, + { + "epoch": 1.4800718132854578, + "grad_norm": 0.8401153087615967, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 20610 + }, + { + "epoch": 1.4807899461400358, + "grad_norm": 0.7564560770988464, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 20620 + }, + { + "epoch": 1.481508078994614, + "grad_norm": 0.8319511413574219, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 20630 + }, + { + "epoch": 1.4822262118491922, + "grad_norm": 0.7430182695388794, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 20640 + }, + { + "epoch": 1.4829443447037702, + "grad_norm": 0.7996522784233093, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 20650 + }, + { + "epoch": 1.4836624775583482, + "grad_norm": 0.6993277072906494, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 20660 + }, + { + "epoch": 1.4843806104129265, + "grad_norm": 0.8621185421943665, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 20670 + }, + { + "epoch": 1.4850987432675045, + "grad_norm": 0.7709757685661316, + "learning_rate": 0.0002, + "loss": 0.7327, + "step": 20680 + }, + { + "epoch": 1.4858168761220827, + "grad_norm": 0.743760347366333, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 20690 + }, + { + "epoch": 1.4865350089766607, + "grad_norm": 0.8353745341300964, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 20700 + }, + { + "epoch": 1.4872531418312387, + "grad_norm": 0.8510433435440063, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 20710 + }, + { + "epoch": 1.487971274685817, + "grad_norm": 0.7065894603729248, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 20720 + }, + { + "epoch": 1.488689407540395, + "grad_norm": 0.6878955960273743, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 20730 + }, + { + "epoch": 1.4894075403949731, + "grad_norm": 0.7861111760139465, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 20740 + }, + { + "epoch": 1.4901256732495511, + "grad_norm": 0.4810725152492523, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 20750 + }, + { + "epoch": 1.4908438061041291, + "grad_norm": 0.7246082425117493, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 20760 + }, + { + "epoch": 1.4915619389587074, + "grad_norm": 0.7101936340332031, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 20770 + }, + { + "epoch": 1.4922800718132856, + "grad_norm": 0.7508591413497925, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 20780 + }, + { + "epoch": 1.4929982046678636, + "grad_norm": 0.8872039914131165, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 20790 + }, + { + "epoch": 1.4937163375224416, + "grad_norm": 0.7257922887802124, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 20800 + }, + { + "epoch": 1.4944344703770198, + "grad_norm": 0.7886278629302979, + "learning_rate": 0.0002, + "loss": 0.7497, + "step": 20810 + }, + { + "epoch": 1.4951526032315978, + "grad_norm": 0.6746290922164917, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 20820 + }, + { + "epoch": 1.495870736086176, + "grad_norm": 0.8118207454681396, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 20830 + }, + { + "epoch": 1.496588868940754, + "grad_norm": 0.7337301969528198, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 20840 + }, + { + "epoch": 1.497307001795332, + "grad_norm": 0.5451242327690125, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 20850 + }, + { + "epoch": 1.4980251346499103, + "grad_norm": 0.8398377299308777, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 20860 + }, + { + "epoch": 1.4987432675044883, + "grad_norm": 0.7196659445762634, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 20870 + }, + { + "epoch": 1.4994614003590665, + "grad_norm": 0.6659539937973022, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 20880 + }, + { + "epoch": 1.5001795332136445, + "grad_norm": 0.6071978807449341, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 20890 + }, + { + "epoch": 1.5008976660682225, + "grad_norm": 0.6704870462417603, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 20900 + }, + { + "epoch": 1.5016157989228007, + "grad_norm": 0.7216639518737793, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 20910 + }, + { + "epoch": 1.502333931777379, + "grad_norm": 0.6050528287887573, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 20920 + }, + { + "epoch": 1.503052064631957, + "grad_norm": 0.7422218918800354, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 20930 + }, + { + "epoch": 1.503770197486535, + "grad_norm": 0.7157148122787476, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 20940 + }, + { + "epoch": 1.504488330341113, + "grad_norm": 0.6704899668693542, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 20950 + }, + { + "epoch": 1.5052064631956912, + "grad_norm": 0.7573544979095459, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 20960 + }, + { + "epoch": 1.5059245960502694, + "grad_norm": 0.6710506677627563, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 20970 + }, + { + "epoch": 1.5066427289048474, + "grad_norm": 0.7559793591499329, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 20980 + }, + { + "epoch": 1.5073608617594254, + "grad_norm": 0.6705940961837769, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 20990 + }, + { + "epoch": 1.5080789946140036, + "grad_norm": 0.8016680479049683, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21000 + }, + { + "epoch": 1.5087971274685816, + "grad_norm": 0.8154481649398804, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 21010 + }, + { + "epoch": 1.5095152603231599, + "grad_norm": 0.5830582976341248, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 21020 + }, + { + "epoch": 1.5102333931777379, + "grad_norm": 0.7088601589202881, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 21030 + }, + { + "epoch": 1.5109515260323159, + "grad_norm": 0.7499658465385437, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 21040 + }, + { + "epoch": 1.511669658886894, + "grad_norm": 0.7684667706489563, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 21050 + }, + { + "epoch": 1.5123877917414723, + "grad_norm": 0.7183627486228943, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 21060 + }, + { + "epoch": 1.5131059245960503, + "grad_norm": 0.8201524615287781, + "learning_rate": 0.0002, + "loss": 0.7777, + "step": 21070 + }, + { + "epoch": 1.5138240574506283, + "grad_norm": 0.6359647512435913, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 21080 + }, + { + "epoch": 1.5145421903052063, + "grad_norm": 0.7419124245643616, + "learning_rate": 0.0002, + "loss": 0.7231, + "step": 21090 + }, + { + "epoch": 1.5152603231597845, + "grad_norm": 0.6145808696746826, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 21100 + }, + { + "epoch": 1.5159784560143628, + "grad_norm": 0.7116656303405762, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 21110 + }, + { + "epoch": 1.5166965888689408, + "grad_norm": 0.8927125334739685, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 21120 + }, + { + "epoch": 1.5174147217235188, + "grad_norm": 0.7527788877487183, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 21130 + }, + { + "epoch": 1.518132854578097, + "grad_norm": 0.7537266612052917, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 21140 + }, + { + "epoch": 1.518850987432675, + "grad_norm": 0.9051724672317505, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 21150 + }, + { + "epoch": 1.5195691202872532, + "grad_norm": 0.7258086800575256, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 21160 + }, + { + "epoch": 1.5202872531418312, + "grad_norm": 0.60377436876297, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 21170 + }, + { + "epoch": 1.5210053859964092, + "grad_norm": 0.613362729549408, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 21180 + }, + { + "epoch": 1.5217235188509874, + "grad_norm": 0.6311782002449036, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 21190 + }, + { + "epoch": 1.5224416517055657, + "grad_norm": 0.7814380526542664, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 21200 + }, + { + "epoch": 1.5231597845601437, + "grad_norm": 0.8482790589332581, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 21210 + }, + { + "epoch": 1.5238779174147217, + "grad_norm": 0.6767336130142212, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 21220 + }, + { + "epoch": 1.5245960502692997, + "grad_norm": 0.7000219821929932, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 21230 + }, + { + "epoch": 1.525314183123878, + "grad_norm": 0.8848617076873779, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 21240 + }, + { + "epoch": 1.5260323159784561, + "grad_norm": 0.692258894443512, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 21250 + }, + { + "epoch": 1.5267504488330341, + "grad_norm": 0.7701950073242188, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 21260 + }, + { + "epoch": 1.5274685816876121, + "grad_norm": 0.7454132437705994, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 21270 + }, + { + "epoch": 1.5281867145421903, + "grad_norm": 0.7299574613571167, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 21280 + }, + { + "epoch": 1.5289048473967684, + "grad_norm": 0.6693950891494751, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 21290 + }, + { + "epoch": 1.5296229802513466, + "grad_norm": 0.8323785066604614, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 21300 + }, + { + "epoch": 1.5303411131059246, + "grad_norm": 0.8998763561248779, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 21310 + }, + { + "epoch": 1.5310592459605026, + "grad_norm": 0.8118193745613098, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 21320 + }, + { + "epoch": 1.5317773788150808, + "grad_norm": 0.8966332077980042, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 21330 + }, + { + "epoch": 1.532495511669659, + "grad_norm": 0.7849827408790588, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 21340 + }, + { + "epoch": 1.533213644524237, + "grad_norm": 0.897583544254303, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 21350 + }, + { + "epoch": 1.533931777378815, + "grad_norm": 0.7998009324073792, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21360 + }, + { + "epoch": 1.534649910233393, + "grad_norm": 0.5890361070632935, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 21370 + }, + { + "epoch": 1.5353680430879713, + "grad_norm": 0.7321302890777588, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 21380 + }, + { + "epoch": 1.5360861759425495, + "grad_norm": 0.7746050357818604, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 21390 + }, + { + "epoch": 1.5368043087971275, + "grad_norm": 0.7033910155296326, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 21400 + }, + { + "epoch": 1.5375224416517055, + "grad_norm": 0.7229148149490356, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 21410 + }, + { + "epoch": 1.5382405745062837, + "grad_norm": 0.8055810928344727, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 21420 + }, + { + "epoch": 1.5389587073608617, + "grad_norm": 0.9411654472351074, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 21430 + }, + { + "epoch": 1.53967684021544, + "grad_norm": 0.7297126650810242, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21440 + }, + { + "epoch": 1.540394973070018, + "grad_norm": 0.7316457629203796, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 21450 + }, + { + "epoch": 1.541113105924596, + "grad_norm": 0.8568798303604126, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 21460 + }, + { + "epoch": 1.5418312387791742, + "grad_norm": 0.7829580307006836, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 21470 + }, + { + "epoch": 1.5425493716337524, + "grad_norm": 0.6679823398590088, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 21480 + }, + { + "epoch": 1.5432675044883304, + "grad_norm": 0.5680868029594421, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 21490 + }, + { + "epoch": 1.5439856373429084, + "grad_norm": 0.6878862380981445, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 21500 + }, + { + "epoch": 1.5447037701974864, + "grad_norm": 0.7391727566719055, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 21510 + }, + { + "epoch": 1.5454219030520646, + "grad_norm": 0.844994843006134, + "learning_rate": 0.0002, + "loss": 0.7781, + "step": 21520 + }, + { + "epoch": 1.5461400359066428, + "grad_norm": 0.7852550148963928, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 21530 + }, + { + "epoch": 1.5468581687612208, + "grad_norm": 0.8370407223701477, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 21540 + }, + { + "epoch": 1.5475763016157988, + "grad_norm": 0.7138169407844543, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 21550 + }, + { + "epoch": 1.548294434470377, + "grad_norm": 0.7660839557647705, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 1.549012567324955, + "grad_norm": 0.6628666520118713, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 21570 + }, + { + "epoch": 1.5497307001795333, + "grad_norm": 0.602262020111084, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 21580 + }, + { + "epoch": 1.5504488330341113, + "grad_norm": 0.6120333671569824, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 21590 + }, + { + "epoch": 1.5511669658886893, + "grad_norm": 0.6742582321166992, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 21600 + }, + { + "epoch": 1.5518850987432675, + "grad_norm": 0.6788192391395569, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 21610 + }, + { + "epoch": 1.5526032315978457, + "grad_norm": 0.7124713659286499, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 21620 + }, + { + "epoch": 1.5533213644524237, + "grad_norm": 0.6297248005867004, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 21630 + }, + { + "epoch": 1.5540394973070017, + "grad_norm": 0.8977078199386597, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21640 + }, + { + "epoch": 1.5547576301615798, + "grad_norm": 0.7543209791183472, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 21650 + }, + { + "epoch": 1.555475763016158, + "grad_norm": 0.8704302310943604, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 21660 + }, + { + "epoch": 1.5561938958707362, + "grad_norm": 0.7848012447357178, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 21670 + }, + { + "epoch": 1.5569120287253142, + "grad_norm": 0.7496278285980225, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 21680 + }, + { + "epoch": 1.5576301615798922, + "grad_norm": 0.7305200099945068, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 21690 + }, + { + "epoch": 1.5583482944344704, + "grad_norm": 0.6671105623245239, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 21700 + }, + { + "epoch": 1.5590664272890484, + "grad_norm": 0.8536111116409302, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 21710 + }, + { + "epoch": 1.5597845601436267, + "grad_norm": 0.7360461354255676, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 21720 + }, + { + "epoch": 1.5605026929982047, + "grad_norm": 0.6665109395980835, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 21730 + }, + { + "epoch": 1.5612208258527827, + "grad_norm": 0.5879628658294678, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 21740 + }, + { + "epoch": 1.5619389587073609, + "grad_norm": 0.6937240958213806, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 21750 + }, + { + "epoch": 1.562657091561939, + "grad_norm": 0.7118659019470215, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 21760 + }, + { + "epoch": 1.563375224416517, + "grad_norm": 0.7858866453170776, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 21770 + }, + { + "epoch": 1.564093357271095, + "grad_norm": 0.8691372871398926, + "learning_rate": 0.0002, + "loss": 0.7552, + "step": 21780 + }, + { + "epoch": 1.564811490125673, + "grad_norm": 0.8884942531585693, + "learning_rate": 0.0002, + "loss": 0.7684, + "step": 21790 + }, + { + "epoch": 1.5655296229802513, + "grad_norm": 0.6335656046867371, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 21800 + }, + { + "epoch": 1.5662477558348296, + "grad_norm": 0.8666166067123413, + "learning_rate": 0.0002, + "loss": 0.7233, + "step": 21810 + }, + { + "epoch": 1.5669658886894076, + "grad_norm": 0.7961624264717102, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 21820 + }, + { + "epoch": 1.5676840215439856, + "grad_norm": 0.6331174373626709, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 21830 + }, + { + "epoch": 1.5684021543985638, + "grad_norm": 0.6476998925209045, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 21840 + }, + { + "epoch": 1.5691202872531418, + "grad_norm": 0.8279129266738892, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 21850 + }, + { + "epoch": 1.56983842010772, + "grad_norm": 0.6997109651565552, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 21860 + }, + { + "epoch": 1.570556552962298, + "grad_norm": 0.6992211937904358, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 21870 + }, + { + "epoch": 1.571274685816876, + "grad_norm": 0.7766915559768677, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 21880 + }, + { + "epoch": 1.5719928186714542, + "grad_norm": 0.6845845580101013, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 21890 + }, + { + "epoch": 1.5727109515260325, + "grad_norm": 0.7247874140739441, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 21900 + }, + { + "epoch": 1.5734290843806105, + "grad_norm": 0.802342414855957, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 21910 + }, + { + "epoch": 1.5741472172351885, + "grad_norm": 0.7797709107398987, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 21920 + }, + { + "epoch": 1.5748653500897665, + "grad_norm": 0.6534958481788635, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 21930 + }, + { + "epoch": 1.5755834829443447, + "grad_norm": 0.6003528237342834, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 21940 + }, + { + "epoch": 1.576301615798923, + "grad_norm": 0.6920075416564941, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 21950 + }, + { + "epoch": 1.577019748653501, + "grad_norm": 0.7213456034660339, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 21960 + }, + { + "epoch": 1.577737881508079, + "grad_norm": 0.7101914286613464, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 21970 + }, + { + "epoch": 1.5784560143626571, + "grad_norm": 0.9531592130661011, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 21980 + }, + { + "epoch": 1.5791741472172351, + "grad_norm": 0.7690590023994446, + "learning_rate": 0.0002, + "loss": 0.7454, + "step": 21990 + }, + { + "epoch": 1.5798922800718134, + "grad_norm": 0.8226363062858582, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 22000 + }, + { + "epoch": 1.5806104129263914, + "grad_norm": 0.6128851175308228, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 22010 + }, + { + "epoch": 1.5813285457809694, + "grad_norm": 0.827008068561554, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 22020 + }, + { + "epoch": 1.5820466786355476, + "grad_norm": 0.6729007363319397, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 22030 + }, + { + "epoch": 1.5827648114901258, + "grad_norm": 0.6397014260292053, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 22040 + }, + { + "epoch": 1.5834829443447038, + "grad_norm": 0.6927793622016907, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 22050 + }, + { + "epoch": 1.5842010771992818, + "grad_norm": 0.7527112364768982, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 22060 + }, + { + "epoch": 1.5849192100538598, + "grad_norm": 0.6418012380599976, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 22070 + }, + { + "epoch": 1.585637342908438, + "grad_norm": 0.7627281546592712, + "learning_rate": 0.0002, + "loss": 0.727, + "step": 22080 + }, + { + "epoch": 1.5863554757630163, + "grad_norm": 0.753851592540741, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22090 + }, + { + "epoch": 1.5870736086175943, + "grad_norm": 0.6049349904060364, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 22100 + }, + { + "epoch": 1.5877917414721723, + "grad_norm": 0.6677758693695068, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 22110 + }, + { + "epoch": 1.5885098743267505, + "grad_norm": 0.913489818572998, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22120 + }, + { + "epoch": 1.5892280071813285, + "grad_norm": 0.6779162883758545, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 22130 + }, + { + "epoch": 1.5899461400359067, + "grad_norm": 0.910076916217804, + "learning_rate": 0.0002, + "loss": 0.7674, + "step": 22140 + }, + { + "epoch": 1.5906642728904847, + "grad_norm": 0.9506068229675293, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 22150 + }, + { + "epoch": 1.5913824057450627, + "grad_norm": 0.6552460789680481, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 22160 + }, + { + "epoch": 1.592100538599641, + "grad_norm": 0.6855819821357727, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22170 + }, + { + "epoch": 1.5928186714542192, + "grad_norm": 0.6713384985923767, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 22180 + }, + { + "epoch": 1.5935368043087972, + "grad_norm": 0.7168547511100769, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 22190 + }, + { + "epoch": 1.5942549371633752, + "grad_norm": 0.8395482897758484, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 22200 + }, + { + "epoch": 1.5949730700179532, + "grad_norm": 0.6676998138427734, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 22210 + }, + { + "epoch": 1.5956912028725314, + "grad_norm": 0.5837140083312988, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 22220 + }, + { + "epoch": 1.5964093357271096, + "grad_norm": 0.8399306535720825, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 22230 + }, + { + "epoch": 1.5971274685816876, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22240 + }, + { + "epoch": 1.5978456014362656, + "grad_norm": 0.768604040145874, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 22250 + }, + { + "epoch": 1.5985637342908436, + "grad_norm": 0.6382646560668945, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 22260 + }, + { + "epoch": 1.5992818671454219, + "grad_norm": 0.7244897484779358, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 22270 + }, + { + "epoch": 1.6, + "grad_norm": 0.6250987648963928, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 22280 + }, + { + "epoch": 1.600718132854578, + "grad_norm": 0.8731992244720459, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 22290 + }, + { + "epoch": 1.601436265709156, + "grad_norm": 0.5861822962760925, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 22300 + }, + { + "epoch": 1.6021543985637343, + "grad_norm": 0.716805100440979, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 22310 + }, + { + "epoch": 1.6028725314183125, + "grad_norm": 0.6650034189224243, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 22320 + }, + { + "epoch": 1.6035906642728905, + "grad_norm": 0.6944432854652405, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 22330 + }, + { + "epoch": 1.6043087971274685, + "grad_norm": 0.7411999106407166, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 22340 + }, + { + "epoch": 1.6050269299820465, + "grad_norm": 0.831828773021698, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 22350 + }, + { + "epoch": 1.6057450628366248, + "grad_norm": 0.6252152919769287, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 22360 + }, + { + "epoch": 1.606463195691203, + "grad_norm": 0.8643325567245483, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22370 + }, + { + "epoch": 1.607181328545781, + "grad_norm": 0.7330279350280762, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 22380 + }, + { + "epoch": 1.607899461400359, + "grad_norm": 0.7235422730445862, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 22390 + }, + { + "epoch": 1.608617594254937, + "grad_norm": 0.6940887570381165, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 22400 + }, + { + "epoch": 1.6093357271095152, + "grad_norm": 0.7907325625419617, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 22410 + }, + { + "epoch": 1.6100538599640934, + "grad_norm": 0.6899075508117676, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 22420 + }, + { + "epoch": 1.6107719928186714, + "grad_norm": 0.7057487368583679, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 22430 + }, + { + "epoch": 1.6114901256732495, + "grad_norm": 0.9235003590583801, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 22440 + }, + { + "epoch": 1.6122082585278277, + "grad_norm": 0.7238173484802246, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 22450 + }, + { + "epoch": 1.612926391382406, + "grad_norm": 0.5931997299194336, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 22460 + }, + { + "epoch": 1.613644524236984, + "grad_norm": 0.6705866456031799, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 22470 + }, + { + "epoch": 1.614362657091562, + "grad_norm": 0.7392773032188416, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 22480 + }, + { + "epoch": 1.61508078994614, + "grad_norm": 0.6286543607711792, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 22490 + }, + { + "epoch": 1.6157989228007181, + "grad_norm": 0.7467446327209473, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 22500 + }, + { + "epoch": 1.6165170556552964, + "grad_norm": 0.8353021740913391, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 22510 + }, + { + "epoch": 1.6172351885098744, + "grad_norm": 0.7333045601844788, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 22520 + }, + { + "epoch": 1.6179533213644524, + "grad_norm": 0.6203709244728088, + "learning_rate": 0.0002, + "loss": 0.7567, + "step": 22530 + }, + { + "epoch": 1.6186714542190304, + "grad_norm": 0.5585690140724182, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 22540 + }, + { + "epoch": 1.6193895870736086, + "grad_norm": 0.7157222032546997, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 22550 + }, + { + "epoch": 1.6201077199281868, + "grad_norm": 0.8129993677139282, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 22560 + }, + { + "epoch": 1.6208258527827648, + "grad_norm": 0.6745335459709167, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 22570 + }, + { + "epoch": 1.6215439856373428, + "grad_norm": 0.7684996724128723, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 22580 + }, + { + "epoch": 1.622262118491921, + "grad_norm": 0.6735436916351318, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 22590 + }, + { + "epoch": 1.6229802513464993, + "grad_norm": 0.7394272089004517, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 22600 + }, + { + "epoch": 1.6236983842010773, + "grad_norm": 0.7268046140670776, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 22610 + }, + { + "epoch": 1.6244165170556553, + "grad_norm": 0.8338810205459595, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 22620 + }, + { + "epoch": 1.6251346499102333, + "grad_norm": 0.9293080568313599, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 22630 + }, + { + "epoch": 1.6258527827648115, + "grad_norm": 0.8084996938705444, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 22640 + }, + { + "epoch": 1.6265709156193897, + "grad_norm": 0.6605180501937866, + "learning_rate": 0.0002, + "loss": 0.7053, + "step": 22650 + }, + { + "epoch": 1.6272890484739677, + "grad_norm": 0.8402717113494873, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 22660 + }, + { + "epoch": 1.6280071813285457, + "grad_norm": 0.653055727481842, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 22670 + }, + { + "epoch": 1.6287253141831237, + "grad_norm": 0.6477823257446289, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 22680 + }, + { + "epoch": 1.629443447037702, + "grad_norm": 0.9053590893745422, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 22690 + }, + { + "epoch": 1.6301615798922802, + "grad_norm": 0.90384441614151, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 22700 + }, + { + "epoch": 1.6308797127468582, + "grad_norm": 0.6789469122886658, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 22710 + }, + { + "epoch": 1.6315978456014362, + "grad_norm": 0.7221854329109192, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 22720 + }, + { + "epoch": 1.6323159784560144, + "grad_norm": 0.7724022269248962, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 22730 + }, + { + "epoch": 1.6330341113105926, + "grad_norm": 0.8213715553283691, + "learning_rate": 0.0002, + "loss": 0.7864, + "step": 22740 + }, + { + "epoch": 1.6337522441651706, + "grad_norm": 0.7102876305580139, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 22750 + }, + { + "epoch": 1.6344703770197486, + "grad_norm": 0.8817880749702454, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 22760 + }, + { + "epoch": 1.6351885098743266, + "grad_norm": 0.8446506857872009, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 22770 + }, + { + "epoch": 1.6359066427289048, + "grad_norm": 0.6749029755592346, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 22780 + }, + { + "epoch": 1.636624775583483, + "grad_norm": 0.7013556957244873, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 22790 + }, + { + "epoch": 1.637342908438061, + "grad_norm": 0.7767965793609619, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 22800 + }, + { + "epoch": 1.638061041292639, + "grad_norm": 0.7354073524475098, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 22810 + }, + { + "epoch": 1.638779174147217, + "grad_norm": 0.8871088027954102, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 22820 + }, + { + "epoch": 1.6394973070017953, + "grad_norm": 0.6573871374130249, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 22830 + }, + { + "epoch": 1.6402154398563735, + "grad_norm": 0.5679349303245544, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 22840 + }, + { + "epoch": 1.6409335727109515, + "grad_norm": 0.7072559595108032, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 22850 + }, + { + "epoch": 1.6416517055655295, + "grad_norm": 0.7639257311820984, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 22860 + }, + { + "epoch": 1.6423698384201078, + "grad_norm": 0.6699341535568237, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 22870 + }, + { + "epoch": 1.643087971274686, + "grad_norm": 0.8285767436027527, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 22880 + }, + { + "epoch": 1.643806104129264, + "grad_norm": 0.7328150272369385, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 22890 + }, + { + "epoch": 1.644524236983842, + "grad_norm": 0.8122354745864868, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 22900 + }, + { + "epoch": 1.64524236983842, + "grad_norm": 0.7322969436645508, + "learning_rate": 0.0002, + "loss": 0.7853, + "step": 22910 + }, + { + "epoch": 1.6459605026929982, + "grad_norm": 0.7269576191902161, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 22920 + }, + { + "epoch": 1.6466786355475764, + "grad_norm": 0.7037042379379272, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 22930 + }, + { + "epoch": 1.6473967684021544, + "grad_norm": 0.6960355639457703, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 22940 + }, + { + "epoch": 1.6481149012567324, + "grad_norm": 0.7446839213371277, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 22950 + }, + { + "epoch": 1.6488330341113104, + "grad_norm": 0.7201664447784424, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 22960 + }, + { + "epoch": 1.6495511669658887, + "grad_norm": 0.7062349319458008, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 22970 + }, + { + "epoch": 1.6502692998204669, + "grad_norm": 0.7666636109352112, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 22980 + }, + { + "epoch": 1.6509874326750449, + "grad_norm": 0.7872112393379211, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 22990 + }, + { + "epoch": 1.6517055655296229, + "grad_norm": 0.7428551316261292, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 23000 + }, + { + "epoch": 1.6524236983842011, + "grad_norm": 0.6087952852249146, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 23010 + }, + { + "epoch": 1.6531418312387793, + "grad_norm": 0.7191354036331177, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 23020 + }, + { + "epoch": 1.6538599640933573, + "grad_norm": 0.8679710626602173, + "learning_rate": 0.0002, + "loss": 0.7517, + "step": 23030 + }, + { + "epoch": 1.6545780969479353, + "grad_norm": 0.7232310175895691, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 23040 + }, + { + "epoch": 1.6552962298025133, + "grad_norm": 0.5695104002952576, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 23050 + }, + { + "epoch": 1.6560143626570916, + "grad_norm": 0.6363076567649841, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 23060 + }, + { + "epoch": 1.6567324955116698, + "grad_norm": 0.8168749809265137, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23070 + }, + { + "epoch": 1.6574506283662478, + "grad_norm": 0.7664111852645874, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 23080 + }, + { + "epoch": 1.6581687612208258, + "grad_norm": 0.6748140454292297, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 23090 + }, + { + "epoch": 1.6588868940754038, + "grad_norm": 0.6258183121681213, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 23100 + }, + { + "epoch": 1.659605026929982, + "grad_norm": 0.8669735193252563, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 23110 + }, + { + "epoch": 1.6603231597845602, + "grad_norm": 0.5606119632720947, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 23120 + }, + { + "epoch": 1.6610412926391382, + "grad_norm": 0.6602507829666138, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 23130 + }, + { + "epoch": 1.6617594254937162, + "grad_norm": 0.7237988710403442, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 23140 + }, + { + "epoch": 1.6624775583482945, + "grad_norm": 0.9054415225982666, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 23150 + }, + { + "epoch": 1.6631956912028727, + "grad_norm": 0.5186660289764404, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 23160 + }, + { + "epoch": 1.6639138240574507, + "grad_norm": 0.719584584236145, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 23170 + }, + { + "epoch": 1.6646319569120287, + "grad_norm": 0.7583617568016052, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 23180 + }, + { + "epoch": 1.6653500897666067, + "grad_norm": 0.7985982298851013, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 23190 + }, + { + "epoch": 1.666068222621185, + "grad_norm": 0.6952691674232483, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23200 + }, + { + "epoch": 1.6667863554757631, + "grad_norm": 0.7184221744537354, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 23210 + }, + { + "epoch": 1.6675044883303412, + "grad_norm": 0.8256361484527588, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 23220 + }, + { + "epoch": 1.6682226211849192, + "grad_norm": 0.7534128427505493, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 23230 + }, + { + "epoch": 1.6689407540394972, + "grad_norm": 0.7711095213890076, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 23240 + }, + { + "epoch": 1.6696588868940754, + "grad_norm": 0.6326615810394287, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 23250 + }, + { + "epoch": 1.6703770197486536, + "grad_norm": 0.8345766663551331, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 23260 + }, + { + "epoch": 1.6710951526032316, + "grad_norm": 0.9079837203025818, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 23270 + }, + { + "epoch": 1.6718132854578096, + "grad_norm": 0.7310197353363037, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 23280 + }, + { + "epoch": 1.6725314183123878, + "grad_norm": 0.7573344707489014, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 23290 + }, + { + "epoch": 1.673249551166966, + "grad_norm": 0.7708047032356262, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 23300 + }, + { + "epoch": 1.673967684021544, + "grad_norm": 0.7665812969207764, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 23310 + }, + { + "epoch": 1.674685816876122, + "grad_norm": 0.7988788485527039, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 23320 + }, + { + "epoch": 1.6754039497307, + "grad_norm": 0.755042552947998, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 23330 + }, + { + "epoch": 1.6761220825852783, + "grad_norm": 0.6605848670005798, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 1.6768402154398565, + "grad_norm": 0.8762016296386719, + "learning_rate": 0.0002, + "loss": 0.7394, + "step": 23350 + }, + { + "epoch": 1.6775583482944345, + "grad_norm": 0.604742169380188, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 23360 + }, + { + "epoch": 1.6782764811490125, + "grad_norm": 0.7479172945022583, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 23370 + }, + { + "epoch": 1.6789946140035905, + "grad_norm": 0.6418702602386475, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 23380 + }, + { + "epoch": 1.6797127468581687, + "grad_norm": 0.6783933639526367, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 23390 + }, + { + "epoch": 1.680430879712747, + "grad_norm": 0.7036024928092957, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 23400 + }, + { + "epoch": 1.681149012567325, + "grad_norm": 0.6833266615867615, + "learning_rate": 0.0002, + "loss": 0.7439, + "step": 23410 + }, + { + "epoch": 1.681867145421903, + "grad_norm": 0.8867062330245972, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 23420 + }, + { + "epoch": 1.6825852782764812, + "grad_norm": 0.7825753092765808, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 23430 + }, + { + "epoch": 1.6833034111310592, + "grad_norm": 0.6396880745887756, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 23440 + }, + { + "epoch": 1.6840215439856374, + "grad_norm": 0.5723230242729187, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 23450 + }, + { + "epoch": 1.6847396768402154, + "grad_norm": 0.6949231624603271, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 23460 + }, + { + "epoch": 1.6854578096947934, + "grad_norm": 0.8290650248527527, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 23470 + }, + { + "epoch": 1.6861759425493716, + "grad_norm": 0.7765078544616699, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 23480 + }, + { + "epoch": 1.6868940754039499, + "grad_norm": 0.7084149718284607, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 23490 + }, + { + "epoch": 1.6876122082585279, + "grad_norm": 0.6916654109954834, + "learning_rate": 0.0002, + "loss": 0.8188, + "step": 23500 + }, + { + "epoch": 1.6883303411131059, + "grad_norm": 0.5615179538726807, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 23510 + }, + { + "epoch": 1.6890484739676839, + "grad_norm": 0.7996105551719666, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 23520 + }, + { + "epoch": 1.689766606822262, + "grad_norm": 0.7010168433189392, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23530 + }, + { + "epoch": 1.6904847396768403, + "grad_norm": 0.7876442074775696, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 23540 + }, + { + "epoch": 1.6912028725314183, + "grad_norm": 0.7508043646812439, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 23550 + }, + { + "epoch": 1.6919210053859963, + "grad_norm": 0.8125874400138855, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 23560 + }, + { + "epoch": 1.6926391382405745, + "grad_norm": 0.711840808391571, + "learning_rate": 0.0002, + "loss": 0.774, + "step": 23570 + }, + { + "epoch": 1.6933572710951525, + "grad_norm": 0.6540026068687439, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 23580 + }, + { + "epoch": 1.6940754039497308, + "grad_norm": 0.8376550078392029, + "learning_rate": 0.0002, + "loss": 0.7578, + "step": 23590 + }, + { + "epoch": 1.6947935368043088, + "grad_norm": 0.7075366973876953, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 23600 + }, + { + "epoch": 1.6955116696588868, + "grad_norm": 0.7522266507148743, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 23610 + }, + { + "epoch": 1.696229802513465, + "grad_norm": 0.7572667002677917, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 23620 + }, + { + "epoch": 1.6969479353680432, + "grad_norm": 0.6126907467842102, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 23630 + }, + { + "epoch": 1.6976660682226212, + "grad_norm": 0.7473152875900269, + "learning_rate": 0.0002, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 1.6983842010771992, + "grad_norm": 0.6630390286445618, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 23650 + }, + { + "epoch": 1.6991023339317772, + "grad_norm": 0.5848073363304138, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 23660 + }, + { + "epoch": 1.6998204667863555, + "grad_norm": 0.5901942849159241, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 23670 + }, + { + "epoch": 1.7005385996409337, + "grad_norm": 0.7896918058395386, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 23680 + }, + { + "epoch": 1.7012567324955117, + "grad_norm": 0.705362856388092, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 23690 + }, + { + "epoch": 1.7019748653500897, + "grad_norm": 0.9917470812797546, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 23700 + }, + { + "epoch": 1.702692998204668, + "grad_norm": 0.7550538778305054, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 23710 + }, + { + "epoch": 1.703411131059246, + "grad_norm": 0.8348238468170166, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23720 + }, + { + "epoch": 1.7041292639138241, + "grad_norm": 0.5979694128036499, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 23730 + }, + { + "epoch": 1.7048473967684021, + "grad_norm": 0.7451775670051575, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 23740 + }, + { + "epoch": 1.7055655296229801, + "grad_norm": 0.7614818215370178, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 23750 + }, + { + "epoch": 1.7062836624775584, + "grad_norm": 0.5590742826461792, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 23760 + }, + { + "epoch": 1.7070017953321366, + "grad_norm": 0.7039094567298889, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 23770 + }, + { + "epoch": 1.7077199281867146, + "grad_norm": 0.7963233590126038, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 23780 + }, + { + "epoch": 1.7084380610412926, + "grad_norm": 0.7214934825897217, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 23790 + }, + { + "epoch": 1.7091561938958706, + "grad_norm": 0.7310500741004944, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 23800 + }, + { + "epoch": 1.7098743267504488, + "grad_norm": 0.6653284430503845, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 23810 + }, + { + "epoch": 1.710592459605027, + "grad_norm": 0.6632702946662903, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 23820 + }, + { + "epoch": 1.711310592459605, + "grad_norm": 0.6314955949783325, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 23830 + }, + { + "epoch": 1.712028725314183, + "grad_norm": 0.73652583360672, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 23840 + }, + { + "epoch": 1.7127468581687613, + "grad_norm": 0.5685144662857056, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 23850 + }, + { + "epoch": 1.7134649910233393, + "grad_norm": 0.7010223865509033, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 23860 + }, + { + "epoch": 1.7141831238779175, + "grad_norm": 0.7643879652023315, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 23870 + }, + { + "epoch": 1.7149012567324955, + "grad_norm": 0.7543165683746338, + "learning_rate": 0.0002, + "loss": 0.7449, + "step": 23880 + }, + { + "epoch": 1.7156193895870735, + "grad_norm": 0.8816508054733276, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 23890 + }, + { + "epoch": 1.7163375224416517, + "grad_norm": 0.7979614734649658, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 23900 + }, + { + "epoch": 1.71705565529623, + "grad_norm": 0.7631057500839233, + "learning_rate": 0.0002, + "loss": 0.7844, + "step": 23910 + }, + { + "epoch": 1.717773788150808, + "grad_norm": 0.6349977254867554, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 23920 + }, + { + "epoch": 1.718491921005386, + "grad_norm": 0.7464412450790405, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 23930 + }, + { + "epoch": 1.719210053859964, + "grad_norm": 0.6985567212104797, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 23940 + }, + { + "epoch": 1.7199281867145422, + "grad_norm": 0.6641302704811096, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 23950 + }, + { + "epoch": 1.7206463195691204, + "grad_norm": 0.7299597263336182, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 23960 + }, + { + "epoch": 1.7213644524236984, + "grad_norm": 0.7812355756759644, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 23970 + }, + { + "epoch": 1.7220825852782764, + "grad_norm": 0.667571485042572, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 23980 + }, + { + "epoch": 1.7228007181328546, + "grad_norm": 0.8244081735610962, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 23990 + }, + { + "epoch": 1.7235188509874326, + "grad_norm": 0.6684445738792419, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 24000 + }, + { + "epoch": 1.7242369838420109, + "grad_norm": 0.7002949118614197, + "learning_rate": 0.0002, + "loss": 0.8042, + "step": 24010 + }, + { + "epoch": 1.7249551166965889, + "grad_norm": 0.6249772906303406, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 24020 + }, + { + "epoch": 1.7256732495511669, + "grad_norm": 0.7279905080795288, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 24030 + }, + { + "epoch": 1.726391382405745, + "grad_norm": 0.631148636341095, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 24040 + }, + { + "epoch": 1.7271095152603233, + "grad_norm": 0.7486464977264404, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 24050 + }, + { + "epoch": 1.7278276481149013, + "grad_norm": 0.7494347095489502, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 24060 + }, + { + "epoch": 1.7285457809694793, + "grad_norm": 0.7821264863014221, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 24070 + }, + { + "epoch": 1.7292639138240573, + "grad_norm": 0.7211608290672302, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 24080 + }, + { + "epoch": 1.7299820466786355, + "grad_norm": 0.7028553485870361, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 24090 + }, + { + "epoch": 1.7307001795332138, + "grad_norm": 0.6189247369766235, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 24100 + }, + { + "epoch": 1.7314183123877918, + "grad_norm": 0.7339756488800049, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 24110 + }, + { + "epoch": 1.7321364452423698, + "grad_norm": 0.6700502038002014, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 24120 + }, + { + "epoch": 1.732854578096948, + "grad_norm": 0.6139533519744873, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 24130 + }, + { + "epoch": 1.733572710951526, + "grad_norm": 0.7249825596809387, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 24140 + }, + { + "epoch": 1.7342908438061042, + "grad_norm": 0.6531777381896973, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 24150 + }, + { + "epoch": 1.7350089766606822, + "grad_norm": 0.8443833589553833, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 24160 + }, + { + "epoch": 1.7357271095152602, + "grad_norm": 0.7040373086929321, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 24170 + }, + { + "epoch": 1.7364452423698384, + "grad_norm": 0.8647749423980713, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24180 + }, + { + "epoch": 1.7371633752244167, + "grad_norm": 0.7297305464744568, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 24190 + }, + { + "epoch": 1.7378815080789947, + "grad_norm": 0.8191218376159668, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 24200 + }, + { + "epoch": 1.7385996409335727, + "grad_norm": 0.7315607666969299, + "learning_rate": 0.0002, + "loss": 0.7665, + "step": 24210 + }, + { + "epoch": 1.7393177737881507, + "grad_norm": 0.694486677646637, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 24220 + }, + { + "epoch": 1.740035906642729, + "grad_norm": 0.8115953207015991, + "learning_rate": 0.0002, + "loss": 0.7476, + "step": 24230 + }, + { + "epoch": 1.7407540394973071, + "grad_norm": 0.7379186153411865, + "learning_rate": 0.0002, + "loss": 0.7792, + "step": 24240 + }, + { + "epoch": 1.7414721723518851, + "grad_norm": 0.6820309162139893, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 24250 + }, + { + "epoch": 1.7421903052064631, + "grad_norm": 0.8210766911506653, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 24260 + }, + { + "epoch": 1.7429084380610413, + "grad_norm": 0.724466860294342, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 24270 + }, + { + "epoch": 1.7436265709156193, + "grad_norm": 0.8768740296363831, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 24280 + }, + { + "epoch": 1.7443447037701976, + "grad_norm": 0.6691206097602844, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24290 + }, + { + "epoch": 1.7450628366247756, + "grad_norm": 0.6529893279075623, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 24300 + }, + { + "epoch": 1.7457809694793536, + "grad_norm": 0.904729962348938, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 24310 + }, + { + "epoch": 1.7464991023339318, + "grad_norm": 0.655235230922699, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24320 + }, + { + "epoch": 1.74721723518851, + "grad_norm": 0.9476361274719238, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 24330 + }, + { + "epoch": 1.747935368043088, + "grad_norm": 0.55366051197052, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 24340 + }, + { + "epoch": 1.748653500897666, + "grad_norm": 0.7192568182945251, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 24350 + }, + { + "epoch": 1.749371633752244, + "grad_norm": 0.7193983793258667, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 24360 + }, + { + "epoch": 1.7500897666068223, + "grad_norm": 0.753998339176178, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24370 + }, + { + "epoch": 1.7508078994614005, + "grad_norm": 1.1058299541473389, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 24380 + }, + { + "epoch": 1.7515260323159785, + "grad_norm": 0.7213007211685181, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 24390 + }, + { + "epoch": 1.7522441651705565, + "grad_norm": 0.972494900226593, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 24400 + }, + { + "epoch": 1.7529622980251347, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 24410 + }, + { + "epoch": 1.7536804308797127, + "grad_norm": 0.82415372133255, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 24420 + }, + { + "epoch": 1.754398563734291, + "grad_norm": 0.72683185338974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 24430 + }, + { + "epoch": 1.755116696588869, + "grad_norm": 0.687907338142395, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 24440 + }, + { + "epoch": 1.755834829443447, + "grad_norm": 0.6616531610488892, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 24450 + }, + { + "epoch": 1.7565529622980252, + "grad_norm": 0.7225571870803833, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 24460 + }, + { + "epoch": 1.7572710951526034, + "grad_norm": 0.7597603797912598, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 24470 + }, + { + "epoch": 1.7579892280071814, + "grad_norm": 0.7850660681724548, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 24480 + }, + { + "epoch": 1.7587073608617594, + "grad_norm": 0.9843530058860779, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 24490 + }, + { + "epoch": 1.7594254937163374, + "grad_norm": 0.7010256052017212, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 24500 + }, + { + "epoch": 1.7601436265709156, + "grad_norm": 0.5669383406639099, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 24510 + }, + { + "epoch": 1.7608617594254938, + "grad_norm": 0.7043302655220032, + "learning_rate": 0.0002, + "loss": 0.7511, + "step": 24520 + }, + { + "epoch": 1.7615798922800718, + "grad_norm": 0.8000741600990295, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 24530 + }, + { + "epoch": 1.7622980251346498, + "grad_norm": 0.7084416747093201, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 24540 + }, + { + "epoch": 1.763016157989228, + "grad_norm": 0.7290608882904053, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 24550 + }, + { + "epoch": 1.763734290843806, + "grad_norm": 0.8710007071495056, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 24560 + }, + { + "epoch": 1.7644524236983843, + "grad_norm": 0.6346535682678223, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 24570 + }, + { + "epoch": 1.7651705565529623, + "grad_norm": 0.8990599513053894, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 24580 + }, + { + "epoch": 1.7658886894075403, + "grad_norm": 0.7823857665061951, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 24590 + }, + { + "epoch": 1.7666068222621185, + "grad_norm": 0.6250144839286804, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 24600 + }, + { + "epoch": 1.7673249551166967, + "grad_norm": 0.715657114982605, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 24610 + }, + { + "epoch": 1.7680430879712747, + "grad_norm": 0.6254874467849731, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 24620 + }, + { + "epoch": 1.7687612208258527, + "grad_norm": 0.6873717904090881, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 24630 + }, + { + "epoch": 1.7694793536804307, + "grad_norm": 0.7273038625717163, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 24640 + }, + { + "epoch": 1.770197486535009, + "grad_norm": 0.9079981446266174, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 24650 + }, + { + "epoch": 1.7709156193895872, + "grad_norm": 0.6262510418891907, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 24660 + }, + { + "epoch": 1.7716337522441652, + "grad_norm": 0.7326231002807617, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 24670 + }, + { + "epoch": 1.7723518850987432, + "grad_norm": 0.7828301787376404, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 24680 + }, + { + "epoch": 1.7730700179533212, + "grad_norm": 0.5881586670875549, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 24690 + }, + { + "epoch": 1.7737881508078994, + "grad_norm": 0.7101683020591736, + "learning_rate": 0.0002, + "loss": 0.744, + "step": 24700 + }, + { + "epoch": 1.7745062836624776, + "grad_norm": 0.8466469049453735, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 24710 + }, + { + "epoch": 1.7752244165170556, + "grad_norm": 0.7770822644233704, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 24720 + }, + { + "epoch": 1.7759425493716336, + "grad_norm": 0.7259120345115662, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 24730 + }, + { + "epoch": 1.7766606822262119, + "grad_norm": 0.7696824669837952, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 24740 + }, + { + "epoch": 1.77737881508079, + "grad_norm": 0.7603837847709656, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 24750 + }, + { + "epoch": 1.778096947935368, + "grad_norm": 0.6166595220565796, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 24760 + }, + { + "epoch": 1.778815080789946, + "grad_norm": 0.7493758797645569, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 24770 + }, + { + "epoch": 1.779533213644524, + "grad_norm": 0.7177459597587585, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 24780 + }, + { + "epoch": 1.7802513464991023, + "grad_norm": 0.6666781306266785, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 24790 + }, + { + "epoch": 1.7809694793536806, + "grad_norm": 0.6556468605995178, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 24800 + }, + { + "epoch": 1.7816876122082586, + "grad_norm": 0.6119393706321716, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 24810 + }, + { + "epoch": 1.7824057450628366, + "grad_norm": 0.8573325276374817, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 24820 + }, + { + "epoch": 1.7831238779174146, + "grad_norm": 0.8017005920410156, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 24830 + }, + { + "epoch": 1.7838420107719928, + "grad_norm": 0.7337947487831116, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 24840 + }, + { + "epoch": 1.784560143626571, + "grad_norm": 0.6717178225517273, + "learning_rate": 0.0002, + "loss": 0.7325, + "step": 24850 + }, + { + "epoch": 1.785278276481149, + "grad_norm": 0.8243708610534668, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 24860 + }, + { + "epoch": 1.785996409335727, + "grad_norm": 0.8111547827720642, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 24870 + }, + { + "epoch": 1.7867145421903052, + "grad_norm": 0.8577823042869568, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 24880 + }, + { + "epoch": 1.7874326750448835, + "grad_norm": 0.6488644480705261, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 24890 + }, + { + "epoch": 1.7881508078994615, + "grad_norm": 0.6446744799613953, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 24900 + }, + { + "epoch": 1.7888689407540395, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 24910 + }, + { + "epoch": 1.7895870736086175, + "grad_norm": 0.8059108853340149, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 24920 + }, + { + "epoch": 1.7903052064631957, + "grad_norm": 0.7101734280586243, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 24930 + }, + { + "epoch": 1.791023339317774, + "grad_norm": 1.0397762060165405, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 24940 + }, + { + "epoch": 1.791741472172352, + "grad_norm": 0.6231128573417664, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 24950 + }, + { + "epoch": 1.79245960502693, + "grad_norm": 5.905253887176514, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 24960 + }, + { + "epoch": 1.793177737881508, + "grad_norm": 0.8003911375999451, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 24970 + }, + { + "epoch": 1.7938958707360861, + "grad_norm": 0.6340393424034119, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 24980 + }, + { + "epoch": 1.7946140035906644, + "grad_norm": 0.8701013922691345, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 24990 + }, + { + "epoch": 1.7953321364452424, + "grad_norm": 0.9085575342178345, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 25000 + }, + { + "epoch": 1.7960502692998204, + "grad_norm": 0.6306625604629517, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 25010 + }, + { + "epoch": 1.7967684021543986, + "grad_norm": 0.6985056400299072, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25020 + }, + { + "epoch": 1.7974865350089768, + "grad_norm": 0.7309113144874573, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 25030 + }, + { + "epoch": 1.7982046678635548, + "grad_norm": 0.6795042157173157, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 25040 + }, + { + "epoch": 1.7989228007181328, + "grad_norm": 0.6920178532600403, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25050 + }, + { + "epoch": 1.7996409335727108, + "grad_norm": 0.6578564047813416, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25060 + }, + { + "epoch": 1.800359066427289, + "grad_norm": 0.6718358993530273, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 25070 + }, + { + "epoch": 1.8010771992818673, + "grad_norm": 0.9086750149726868, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 25080 + }, + { + "epoch": 1.8017953321364453, + "grad_norm": 0.6102437973022461, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 25090 + }, + { + "epoch": 1.8025134649910233, + "grad_norm": 0.6391313076019287, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 25100 + }, + { + "epoch": 1.8032315978456013, + "grad_norm": 0.7150128483772278, + "learning_rate": 0.0002, + "loss": 0.766, + "step": 25110 + }, + { + "epoch": 1.8039497307001795, + "grad_norm": 0.9833421111106873, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 25120 + }, + { + "epoch": 1.8046678635547577, + "grad_norm": 0.774002194404602, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 25130 + }, + { + "epoch": 1.8053859964093357, + "grad_norm": 0.644443154335022, + "learning_rate": 0.0002, + "loss": 0.7329, + "step": 25140 + }, + { + "epoch": 1.8061041292639137, + "grad_norm": 0.6996100544929504, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 25150 + }, + { + "epoch": 1.806822262118492, + "grad_norm": 0.7545985579490662, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 25160 + }, + { + "epoch": 1.8075403949730702, + "grad_norm": 0.7505226731300354, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 25170 + }, + { + "epoch": 1.8082585278276482, + "grad_norm": 0.800681471824646, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 25180 + }, + { + "epoch": 1.8089766606822262, + "grad_norm": 0.8268337845802307, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 25190 + }, + { + "epoch": 1.8096947935368042, + "grad_norm": 0.6436594128608704, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 25200 + }, + { + "epoch": 1.8104129263913824, + "grad_norm": 0.6961014270782471, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 25210 + }, + { + "epoch": 1.8111310592459606, + "grad_norm": 0.6649489998817444, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 25220 + }, + { + "epoch": 1.8118491921005386, + "grad_norm": 0.7071637511253357, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 25230 + }, + { + "epoch": 1.8125673249551166, + "grad_norm": 0.9082241654396057, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 25240 + }, + { + "epoch": 1.8132854578096946, + "grad_norm": 0.6318159103393555, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 25250 + }, + { + "epoch": 1.8140035906642729, + "grad_norm": 0.8006597757339478, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 25260 + }, + { + "epoch": 1.814721723518851, + "grad_norm": 0.7950259447097778, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 25270 + }, + { + "epoch": 1.815439856373429, + "grad_norm": 0.8376588821411133, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 25280 + }, + { + "epoch": 1.816157989228007, + "grad_norm": 0.8343217968940735, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 25290 + }, + { + "epoch": 1.8168761220825853, + "grad_norm": 0.6240017414093018, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 25300 + }, + { + "epoch": 1.8175942549371635, + "grad_norm": 0.7079808712005615, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 25310 + }, + { + "epoch": 1.8183123877917415, + "grad_norm": 0.5930073261260986, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 25320 + }, + { + "epoch": 1.8190305206463195, + "grad_norm": 0.6994491815567017, + "learning_rate": 0.0002, + "loss": 0.7647, + "step": 25330 + }, + { + "epoch": 1.8197486535008975, + "grad_norm": 0.8285305500030518, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 25340 + }, + { + "epoch": 1.8204667863554758, + "grad_norm": 0.6880194544792175, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 25350 + }, + { + "epoch": 1.821184919210054, + "grad_norm": 0.7301307916641235, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 25360 + }, + { + "epoch": 1.821903052064632, + "grad_norm": 0.8117532730102539, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 25370 + }, + { + "epoch": 1.82262118491921, + "grad_norm": 0.8098701238632202, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 25380 + }, + { + "epoch": 1.823339317773788, + "grad_norm": 0.6899038553237915, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 25390 + }, + { + "epoch": 1.8240574506283662, + "grad_norm": 0.7350431084632874, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 25400 + }, + { + "epoch": 1.8247755834829444, + "grad_norm": 0.8723382949829102, + "learning_rate": 0.0002, + "loss": 0.7389, + "step": 25410 + }, + { + "epoch": 1.8254937163375224, + "grad_norm": 0.7448108196258545, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 25420 + }, + { + "epoch": 1.8262118491921004, + "grad_norm": 0.7525040507316589, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25430 + }, + { + "epoch": 1.8269299820466787, + "grad_norm": 0.7148599028587341, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25440 + }, + { + "epoch": 1.827648114901257, + "grad_norm": 1.1802153587341309, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 25450 + }, + { + "epoch": 1.828366247755835, + "grad_norm": 0.619945764541626, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 25460 + }, + { + "epoch": 1.829084380610413, + "grad_norm": 0.7065792679786682, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 25470 + }, + { + "epoch": 1.829802513464991, + "grad_norm": 0.6626001596450806, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 25480 + }, + { + "epoch": 1.8305206463195691, + "grad_norm": 0.8368920087814331, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 25490 + }, + { + "epoch": 1.8312387791741473, + "grad_norm": 0.7528934478759766, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 25500 + }, + { + "epoch": 1.8319569120287253, + "grad_norm": 0.6472136378288269, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 25510 + }, + { + "epoch": 1.8326750448833034, + "grad_norm": 0.7818671464920044, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 25520 + }, + { + "epoch": 1.8333931777378814, + "grad_norm": 0.8280798196792603, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 25530 + }, + { + "epoch": 1.8341113105924596, + "grad_norm": 0.7038599252700806, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 25540 + }, + { + "epoch": 1.8348294434470378, + "grad_norm": 0.6345962882041931, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 25550 + }, + { + "epoch": 1.8355475763016158, + "grad_norm": 0.6891741752624512, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 25560 + }, + { + "epoch": 1.8362657091561938, + "grad_norm": 0.7753492593765259, + "learning_rate": 0.0002, + "loss": 0.754, + "step": 25570 + }, + { + "epoch": 1.836983842010772, + "grad_norm": 0.6907210946083069, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 25580 + }, + { + "epoch": 1.8377019748653503, + "grad_norm": 0.7483090162277222, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 25590 + }, + { + "epoch": 1.8384201077199283, + "grad_norm": 0.8749029636383057, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 25600 + }, + { + "epoch": 1.8391382405745063, + "grad_norm": 0.6936851143836975, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 25610 + }, + { + "epoch": 1.8398563734290843, + "grad_norm": 0.7273763418197632, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 25620 + }, + { + "epoch": 1.8405745062836625, + "grad_norm": 0.7655298113822937, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 25630 + }, + { + "epoch": 1.8412926391382407, + "grad_norm": 0.7207344770431519, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 25640 + }, + { + "epoch": 1.8420107719928187, + "grad_norm": 0.6970131397247314, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 25650 + }, + { + "epoch": 1.8427289048473967, + "grad_norm": 0.7777560353279114, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 25660 + }, + { + "epoch": 1.8434470377019747, + "grad_norm": 0.7070116400718689, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 25670 + }, + { + "epoch": 1.844165170556553, + "grad_norm": 0.6980257630348206, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 25680 + }, + { + "epoch": 1.8448833034111312, + "grad_norm": 0.906563401222229, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 25690 + }, + { + "epoch": 1.8456014362657092, + "grad_norm": 0.567991316318512, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 25700 + }, + { + "epoch": 1.8463195691202872, + "grad_norm": 0.5954506993293762, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 25710 + }, + { + "epoch": 1.8470377019748654, + "grad_norm": 0.8073318600654602, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 25720 + }, + { + "epoch": 1.8477558348294436, + "grad_norm": 0.7439551949501038, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 25730 + }, + { + "epoch": 1.8484739676840216, + "grad_norm": 0.8091771602630615, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 25740 + }, + { + "epoch": 1.8491921005385996, + "grad_norm": 0.6584576964378357, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 25750 + }, + { + "epoch": 1.8499102333931776, + "grad_norm": 0.8161963224411011, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 25760 + }, + { + "epoch": 1.8506283662477558, + "grad_norm": 0.7337122559547424, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 25770 + }, + { + "epoch": 1.851346499102334, + "grad_norm": 0.8968114256858826, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 25780 + }, + { + "epoch": 1.852064631956912, + "grad_norm": 0.8647686839103699, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 25790 + }, + { + "epoch": 1.85278276481149, + "grad_norm": 0.7775349020957947, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 25800 + }, + { + "epoch": 1.853500897666068, + "grad_norm": 0.686072587966919, + "learning_rate": 0.0002, + "loss": 0.7739, + "step": 25810 + }, + { + "epoch": 1.8542190305206463, + "grad_norm": 0.7053380012512207, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 25820 + }, + { + "epoch": 1.8549371633752245, + "grad_norm": 0.7899979948997498, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 25830 + }, + { + "epoch": 1.8556552962298025, + "grad_norm": 0.6970776915550232, + "learning_rate": 0.0002, + "loss": 0.7633, + "step": 25840 + }, + { + "epoch": 1.8563734290843805, + "grad_norm": 0.7210841774940491, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 25850 + }, + { + "epoch": 1.8570915619389587, + "grad_norm": 0.7297208905220032, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 25860 + }, + { + "epoch": 1.857809694793537, + "grad_norm": 0.7782729268074036, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 25870 + }, + { + "epoch": 1.858527827648115, + "grad_norm": 0.7227505445480347, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 25880 + }, + { + "epoch": 1.859245960502693, + "grad_norm": 0.7489684224128723, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 25890 + }, + { + "epoch": 1.859964093357271, + "grad_norm": 0.7447289824485779, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 25900 + }, + { + "epoch": 1.8606822262118492, + "grad_norm": 0.8516317009925842, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 25910 + }, + { + "epoch": 1.8614003590664274, + "grad_norm": 0.6864543557167053, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 25920 + }, + { + "epoch": 1.8621184919210054, + "grad_norm": 0.6753451824188232, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 25930 + }, + { + "epoch": 1.8628366247755834, + "grad_norm": 0.631679117679596, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 25940 + }, + { + "epoch": 1.8635547576301614, + "grad_norm": 0.7715049982070923, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 25950 + }, + { + "epoch": 1.8642728904847397, + "grad_norm": 0.7354850769042969, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 25960 + }, + { + "epoch": 1.8649910233393179, + "grad_norm": 0.7443442940711975, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 25970 + }, + { + "epoch": 1.8657091561938959, + "grad_norm": 0.6880337595939636, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 25980 + }, + { + "epoch": 1.8664272890484739, + "grad_norm": 0.843941867351532, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 25990 + }, + { + "epoch": 1.867145421903052, + "grad_norm": 0.6904318928718567, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 26000 + }, + { + "epoch": 1.86786355475763, + "grad_norm": 0.9041751623153687, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 26010 + }, + { + "epoch": 1.8685816876122083, + "grad_norm": 0.7470057010650635, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 26020 + }, + { + "epoch": 1.8692998204667863, + "grad_norm": 0.6921331882476807, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 26030 + }, + { + "epoch": 1.8700179533213643, + "grad_norm": 0.7627376914024353, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 26040 + }, + { + "epoch": 1.8707360861759426, + "grad_norm": 0.7784932851791382, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 26050 + }, + { + "epoch": 1.8714542190305208, + "grad_norm": 0.6399524807929993, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 26060 + }, + { + "epoch": 1.8721723518850988, + "grad_norm": 0.6478492617607117, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26070 + }, + { + "epoch": 1.8728904847396768, + "grad_norm": 0.6376804113388062, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 26080 + }, + { + "epoch": 1.8736086175942548, + "grad_norm": 0.6976892352104187, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 26090 + }, + { + "epoch": 1.874326750448833, + "grad_norm": 0.7997903227806091, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 26100 + }, + { + "epoch": 1.8750448833034112, + "grad_norm": 0.6984273791313171, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 26110 + }, + { + "epoch": 1.8757630161579892, + "grad_norm": 0.7020659446716309, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26120 + }, + { + "epoch": 1.8764811490125672, + "grad_norm": 0.784986138343811, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 26130 + }, + { + "epoch": 1.8771992818671455, + "grad_norm": 0.7369210124015808, + "learning_rate": 0.0002, + "loss": 0.7224, + "step": 26140 + }, + { + "epoch": 1.8779174147217235, + "grad_norm": 0.7730622291564941, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 26150 + }, + { + "epoch": 1.8786355475763017, + "grad_norm": 0.7253434658050537, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 26160 + }, + { + "epoch": 1.8793536804308797, + "grad_norm": 0.8019800186157227, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 26170 + }, + { + "epoch": 1.8800718132854577, + "grad_norm": 0.7337628602981567, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 26180 + }, + { + "epoch": 1.880789946140036, + "grad_norm": 0.7049200534820557, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 26190 + }, + { + "epoch": 1.8815080789946141, + "grad_norm": 0.6451525092124939, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 26200 + }, + { + "epoch": 1.8822262118491921, + "grad_norm": 0.7660874724388123, + "learning_rate": 0.0002, + "loss": 0.749, + "step": 26210 + }, + { + "epoch": 1.8829443447037701, + "grad_norm": 0.8464223146438599, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26220 + }, + { + "epoch": 1.8836624775583481, + "grad_norm": 0.859503984451294, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 26230 + }, + { + "epoch": 1.8843806104129264, + "grad_norm": 0.6969478726387024, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 26240 + }, + { + "epoch": 1.8850987432675046, + "grad_norm": 0.6860285997390747, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 26250 + }, + { + "epoch": 1.8858168761220826, + "grad_norm": 0.5873110294342041, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 26260 + }, + { + "epoch": 1.8865350089766606, + "grad_norm": 0.6959530115127563, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 26270 + }, + { + "epoch": 1.8872531418312388, + "grad_norm": 0.8734689950942993, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 26280 + }, + { + "epoch": 1.8879712746858168, + "grad_norm": 0.7385509014129639, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 26290 + }, + { + "epoch": 1.888689407540395, + "grad_norm": 0.6702063083648682, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 1.889407540394973, + "grad_norm": 0.8177255988121033, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 26310 + }, + { + "epoch": 1.890125673249551, + "grad_norm": 0.6638466715812683, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 26320 + }, + { + "epoch": 1.8908438061041293, + "grad_norm": 0.8584128618240356, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 26330 + }, + { + "epoch": 1.8915619389587075, + "grad_norm": 0.677561342716217, + "learning_rate": 0.0002, + "loss": 0.7216, + "step": 26340 + }, + { + "epoch": 1.8922800718132855, + "grad_norm": 0.6931864619255066, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 26350 + }, + { + "epoch": 1.8929982046678635, + "grad_norm": 0.6583828330039978, + "learning_rate": 0.0002, + "loss": 0.7548, + "step": 26360 + }, + { + "epoch": 1.8937163375224415, + "grad_norm": 0.6708519458770752, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 26370 + }, + { + "epoch": 1.8944344703770197, + "grad_norm": 0.7684788107872009, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 26380 + }, + { + "epoch": 1.895152603231598, + "grad_norm": 0.703217625617981, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 26390 + }, + { + "epoch": 1.895870736086176, + "grad_norm": 0.6686710119247437, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 26400 + }, + { + "epoch": 1.896588868940754, + "grad_norm": 0.7429705262184143, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 26410 + }, + { + "epoch": 1.8973070017953322, + "grad_norm": 0.7835305333137512, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 26420 + }, + { + "epoch": 1.8980251346499102, + "grad_norm": 0.7793689370155334, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 26430 + }, + { + "epoch": 1.8987432675044884, + "grad_norm": 0.7337237000465393, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 26440 + }, + { + "epoch": 1.8994614003590664, + "grad_norm": 0.5734546780586243, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 26450 + }, + { + "epoch": 1.9001795332136444, + "grad_norm": 0.655937135219574, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 26460 + }, + { + "epoch": 1.9008976660682226, + "grad_norm": 1.0200905799865723, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 26470 + }, + { + "epoch": 1.9016157989228009, + "grad_norm": 0.6118829250335693, + "learning_rate": 0.0002, + "loss": 0.733, + "step": 26480 + }, + { + "epoch": 1.9023339317773789, + "grad_norm": 0.7459297776222229, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 26490 + }, + { + "epoch": 1.9030520646319569, + "grad_norm": 0.9451959729194641, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 26500 + }, + { + "epoch": 1.9037701974865349, + "grad_norm": 0.9694880247116089, + "learning_rate": 0.0002, + "loss": 0.7911, + "step": 26510 + }, + { + "epoch": 1.904488330341113, + "grad_norm": 0.806532084941864, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 26520 + }, + { + "epoch": 1.9052064631956913, + "grad_norm": 0.7016968727111816, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 26530 + }, + { + "epoch": 1.9059245960502693, + "grad_norm": 0.7707533836364746, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 26540 + }, + { + "epoch": 1.9066427289048473, + "grad_norm": 0.716044545173645, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 26550 + }, + { + "epoch": 1.9073608617594255, + "grad_norm": 0.7904782295227051, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 26560 + }, + { + "epoch": 1.9080789946140035, + "grad_norm": 0.8557461500167847, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 26570 + }, + { + "epoch": 1.9087971274685818, + "grad_norm": 0.6807048916816711, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 26580 + }, + { + "epoch": 1.9095152603231598, + "grad_norm": 0.8374032974243164, + "learning_rate": 0.0002, + "loss": 0.7066, + "step": 26590 + }, + { + "epoch": 1.9102333931777378, + "grad_norm": 0.7936834692955017, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 26600 + }, + { + "epoch": 1.910951526032316, + "grad_norm": 0.6342210173606873, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 26610 + }, + { + "epoch": 1.9116696588868942, + "grad_norm": 0.8222208023071289, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 26620 + }, + { + "epoch": 1.9123877917414722, + "grad_norm": 0.7890012860298157, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 26630 + }, + { + "epoch": 1.9131059245960502, + "grad_norm": 0.6415254473686218, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 26640 + }, + { + "epoch": 1.9138240574506282, + "grad_norm": 0.7936763763427734, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 26650 + }, + { + "epoch": 1.9145421903052064, + "grad_norm": 0.7174334526062012, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 26660 + }, + { + "epoch": 1.9152603231597847, + "grad_norm": 0.6503710746765137, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 26670 + }, + { + "epoch": 1.9159784560143627, + "grad_norm": 0.7618577480316162, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 26680 + }, + { + "epoch": 1.9166965888689407, + "grad_norm": 0.7984131574630737, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 26690 + }, + { + "epoch": 1.917414721723519, + "grad_norm": 0.6863887906074524, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 26700 + }, + { + "epoch": 1.918132854578097, + "grad_norm": 0.7621138691902161, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 26710 + }, + { + "epoch": 1.9188509874326751, + "grad_norm": 0.7855543494224548, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 26720 + }, + { + "epoch": 1.9195691202872531, + "grad_norm": 0.7045016288757324, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 26730 + }, + { + "epoch": 1.9202872531418311, + "grad_norm": 0.7799559235572815, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 26740 + }, + { + "epoch": 1.9210053859964094, + "grad_norm": 0.7999796271324158, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 26750 + }, + { + "epoch": 1.9217235188509876, + "grad_norm": 0.5479980111122131, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 26760 + }, + { + "epoch": 1.9224416517055656, + "grad_norm": 0.7192868590354919, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 26770 + }, + { + "epoch": 1.9231597845601436, + "grad_norm": 0.7642375826835632, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 26780 + }, + { + "epoch": 1.9238779174147216, + "grad_norm": 0.7015959620475769, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 26790 + }, + { + "epoch": 1.9245960502692998, + "grad_norm": 0.6685634851455688, + "learning_rate": 0.0002, + "loss": 0.8291, + "step": 26800 + }, + { + "epoch": 1.925314183123878, + "grad_norm": 0.674363911151886, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 26810 + }, + { + "epoch": 1.926032315978456, + "grad_norm": 0.769318163394928, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 26820 + }, + { + "epoch": 1.926750448833034, + "grad_norm": 0.7397989630699158, + "learning_rate": 0.0002, + "loss": 0.7323, + "step": 26830 + }, + { + "epoch": 1.9274685816876123, + "grad_norm": 0.7603814601898193, + "learning_rate": 0.0002, + "loss": 0.7399, + "step": 26840 + }, + { + "epoch": 1.9281867145421903, + "grad_norm": 0.5960564613342285, + "learning_rate": 0.0002, + "loss": 0.7147, + "step": 26850 + }, + { + "epoch": 1.9289048473967685, + "grad_norm": 0.8158858418464661, + "learning_rate": 0.0002, + "loss": 0.7292, + "step": 26860 + }, + { + "epoch": 1.9296229802513465, + "grad_norm": 0.7022058367729187, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 26870 + }, + { + "epoch": 1.9303411131059245, + "grad_norm": 0.7249060273170471, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 26880 + }, + { + "epoch": 1.9310592459605027, + "grad_norm": 0.7613264322280884, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 26890 + }, + { + "epoch": 1.931777378815081, + "grad_norm": 0.6857499480247498, + "learning_rate": 0.0002, + "loss": 0.7238, + "step": 26900 + }, + { + "epoch": 1.932495511669659, + "grad_norm": 0.6968346834182739, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 26910 + }, + { + "epoch": 1.933213644524237, + "grad_norm": 0.7079267501831055, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 26920 + }, + { + "epoch": 1.933931777378815, + "grad_norm": 0.6571618914604187, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 26930 + }, + { + "epoch": 1.9346499102333932, + "grad_norm": 0.7460548281669617, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 26940 + }, + { + "epoch": 1.9353680430879714, + "grad_norm": 0.7954307794570923, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 26950 + }, + { + "epoch": 1.9360861759425494, + "grad_norm": 0.8696223497390747, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 26960 + }, + { + "epoch": 1.9368043087971274, + "grad_norm": 0.726004421710968, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 26970 + }, + { + "epoch": 1.9375224416517056, + "grad_norm": 0.8760337829589844, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 26980 + }, + { + "epoch": 1.9382405745062836, + "grad_norm": 0.7308675646781921, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 26990 + }, + { + "epoch": 1.9389587073608618, + "grad_norm": 0.5900304317474365, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 27000 + }, + { + "epoch": 1.9396768402154398, + "grad_norm": 0.8839457631111145, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 27010 + }, + { + "epoch": 1.9403949730700178, + "grad_norm": 0.7239173650741577, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 27020 + }, + { + "epoch": 1.941113105924596, + "grad_norm": 0.8972901701927185, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 27030 + }, + { + "epoch": 1.9418312387791743, + "grad_norm": 0.7140652537345886, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 27040 + }, + { + "epoch": 1.9425493716337523, + "grad_norm": 0.7502743005752563, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 27050 + }, + { + "epoch": 1.9432675044883303, + "grad_norm": 0.6420751810073853, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 27060 + }, + { + "epoch": 1.9439856373429083, + "grad_norm": 0.6671820282936096, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 27070 + }, + { + "epoch": 1.9447037701974865, + "grad_norm": 0.6268796324729919, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 27080 + }, + { + "epoch": 1.9454219030520647, + "grad_norm": 0.6850021481513977, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 27090 + }, + { + "epoch": 1.9461400359066428, + "grad_norm": 0.6380038261413574, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 27100 + }, + { + "epoch": 1.9468581687612208, + "grad_norm": 0.5806204080581665, + "learning_rate": 0.0002, + "loss": 0.7638, + "step": 27110 + }, + { + "epoch": 1.947576301615799, + "grad_norm": 0.8236927390098572, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 27120 + }, + { + "epoch": 1.948294434470377, + "grad_norm": 0.7915826439857483, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27130 + }, + { + "epoch": 1.9490125673249552, + "grad_norm": 0.7467429041862488, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 27140 + }, + { + "epoch": 1.9497307001795332, + "grad_norm": 0.6278707981109619, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27150 + }, + { + "epoch": 1.9504488330341112, + "grad_norm": 0.7353739142417908, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 27160 + }, + { + "epoch": 1.9511669658886894, + "grad_norm": 0.6443645358085632, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27170 + }, + { + "epoch": 1.9518850987432677, + "grad_norm": 0.770800769329071, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 27180 + }, + { + "epoch": 1.9526032315978457, + "grad_norm": 0.8982598781585693, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 27190 + }, + { + "epoch": 1.9533213644524237, + "grad_norm": 0.775017499923706, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 27200 + }, + { + "epoch": 1.9540394973070017, + "grad_norm": 0.8271628618240356, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 27210 + }, + { + "epoch": 1.9547576301615799, + "grad_norm": 0.7460184693336487, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 27220 + }, + { + "epoch": 1.955475763016158, + "grad_norm": 0.7732188105583191, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 27230 + }, + { + "epoch": 1.956193895870736, + "grad_norm": 0.7398577332496643, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 27240 + }, + { + "epoch": 1.9569120287253141, + "grad_norm": 0.7132339477539062, + "learning_rate": 0.0002, + "loss": 0.7347, + "step": 27250 + }, + { + "epoch": 1.9576301615798921, + "grad_norm": 0.6718965768814087, + "learning_rate": 0.0002, + "loss": 0.7731, + "step": 27260 + }, + { + "epoch": 1.9583482944344703, + "grad_norm": 0.7914422154426575, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 27270 + }, + { + "epoch": 1.9590664272890486, + "grad_norm": 0.8314110636711121, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 27280 + }, + { + "epoch": 1.9597845601436266, + "grad_norm": 0.7810674905776978, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 27290 + }, + { + "epoch": 1.9605026929982046, + "grad_norm": 0.7691007256507874, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 27300 + }, + { + "epoch": 1.9612208258527828, + "grad_norm": 0.6753138899803162, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 27310 + }, + { + "epoch": 1.961938958707361, + "grad_norm": 0.5881175994873047, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 27320 + }, + { + "epoch": 1.962657091561939, + "grad_norm": 0.8414133191108704, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 27330 + }, + { + "epoch": 1.963375224416517, + "grad_norm": 0.7363715171813965, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 27340 + }, + { + "epoch": 1.964093357271095, + "grad_norm": 0.6526232361793518, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 27350 + }, + { + "epoch": 1.9648114901256732, + "grad_norm": 0.6821389198303223, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 27360 + }, + { + "epoch": 1.9655296229802515, + "grad_norm": 0.7306062579154968, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 27370 + }, + { + "epoch": 1.9662477558348295, + "grad_norm": 0.6458130478858948, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 27380 + }, + { + "epoch": 1.9669658886894075, + "grad_norm": 0.7243196368217468, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 27390 + }, + { + "epoch": 1.9676840215439855, + "grad_norm": 0.8062235713005066, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 27400 + }, + { + "epoch": 1.9684021543985637, + "grad_norm": 0.68441241979599, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 27410 + }, + { + "epoch": 1.969120287253142, + "grad_norm": 0.7504498958587646, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 27420 + }, + { + "epoch": 1.96983842010772, + "grad_norm": 0.7469466328620911, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 27430 + }, + { + "epoch": 1.970556552962298, + "grad_norm": 0.7109853625297546, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 27440 + }, + { + "epoch": 1.9712746858168761, + "grad_norm": 0.6964903473854065, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 27450 + }, + { + "epoch": 1.9719928186714544, + "grad_norm": 0.8224200010299683, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 27460 + }, + { + "epoch": 1.9727109515260324, + "grad_norm": 0.6195617318153381, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 27470 + }, + { + "epoch": 1.9734290843806104, + "grad_norm": 0.691511332988739, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 27480 + }, + { + "epoch": 1.9741472172351884, + "grad_norm": 0.7437900304794312, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 27490 + }, + { + "epoch": 1.9748653500897666, + "grad_norm": 0.7987960577011108, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 27500 + }, + { + "epoch": 1.9755834829443448, + "grad_norm": 0.7117776274681091, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 27510 + }, + { + "epoch": 1.9763016157989228, + "grad_norm": 0.8473866581916809, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 27520 + }, + { + "epoch": 1.9770197486535008, + "grad_norm": 0.7178242802619934, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 27530 + }, + { + "epoch": 1.9777378815080788, + "grad_norm": 0.760145902633667, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 27540 + }, + { + "epoch": 1.978456014362657, + "grad_norm": 0.764436662197113, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 27550 + }, + { + "epoch": 1.9791741472172353, + "grad_norm": 0.7245904803276062, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 27560 + }, + { + "epoch": 1.9798922800718133, + "grad_norm": 0.6317000389099121, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 27570 + }, + { + "epoch": 1.9806104129263913, + "grad_norm": 0.8764704465866089, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 27580 + }, + { + "epoch": 1.9813285457809695, + "grad_norm": 0.6111825108528137, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 27590 + }, + { + "epoch": 1.9820466786355477, + "grad_norm": 0.6797714233398438, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 27600 + }, + { + "epoch": 1.9827648114901257, + "grad_norm": 0.7754142880439758, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 27610 + }, + { + "epoch": 1.9834829443447037, + "grad_norm": 0.7243061661720276, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 27620 + }, + { + "epoch": 1.9842010771992817, + "grad_norm": 0.6194812655448914, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 27630 + }, + { + "epoch": 1.98491921005386, + "grad_norm": 0.6399638056755066, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27640 + }, + { + "epoch": 1.9856373429084382, + "grad_norm": 0.7637218832969666, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 27650 + }, + { + "epoch": 1.9863554757630162, + "grad_norm": 0.9099404811859131, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 27660 + }, + { + "epoch": 1.9870736086175942, + "grad_norm": 0.6892596483230591, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 27670 + }, + { + "epoch": 1.9877917414721722, + "grad_norm": 0.5962418913841248, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 27680 + }, + { + "epoch": 1.9885098743267504, + "grad_norm": 0.5750163197517395, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 27690 + }, + { + "epoch": 1.9892280071813286, + "grad_norm": 0.6740097403526306, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 27700 + }, + { + "epoch": 1.9899461400359066, + "grad_norm": 0.6968644857406616, + "learning_rate": 0.0002, + "loss": 0.7444, + "step": 27710 + }, + { + "epoch": 1.9906642728904846, + "grad_norm": 0.6788132190704346, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 27720 + }, + { + "epoch": 1.9913824057450629, + "grad_norm": 0.8600544929504395, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 27730 + }, + { + "epoch": 1.992100538599641, + "grad_norm": 0.6227671504020691, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 27740 + }, + { + "epoch": 1.992818671454219, + "grad_norm": 0.6611875295639038, + "learning_rate": 0.0002, + "loss": 0.7815, + "step": 27750 + }, + { + "epoch": 1.993536804308797, + "grad_norm": 0.714568018913269, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 27760 + }, + { + "epoch": 1.994254937163375, + "grad_norm": 0.6328669190406799, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 27770 + }, + { + "epoch": 1.9949730700179533, + "grad_norm": 0.8673429489135742, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 27780 + }, + { + "epoch": 1.9956912028725315, + "grad_norm": 0.820620059967041, + "learning_rate": 0.0002, + "loss": 0.7301, + "step": 27790 + }, + { + "epoch": 1.9964093357271095, + "grad_norm": 0.8748094439506531, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 27800 + }, + { + "epoch": 1.9971274685816875, + "grad_norm": 0.8118113875389099, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 27810 + }, + { + "epoch": 1.9978456014362656, + "grad_norm": 0.6886725425720215, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 27820 + }, + { + "epoch": 1.9985637342908438, + "grad_norm": 0.7101268768310547, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 27830 + }, + { + "epoch": 1.999281867145422, + "grad_norm": 0.7823781967163086, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 27840 + }, + { + "epoch": 2.0, + "grad_norm": 0.8491085767745972, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 27850 + }, + { + "epoch": 2.0, + "eval_loss": 1.0868422985076904, + "eval_runtime": 55.1699, + "eval_samples_per_second": 13.286, + "eval_steps_per_second": 1.668, + "step": 27850 + }, + { + "epoch": 2.000718132854578, + "grad_norm": 0.9003389477729797, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 27860 + }, + { + "epoch": 2.001436265709156, + "grad_norm": 0.8898349404335022, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 27870 + }, + { + "epoch": 2.0021543985637344, + "grad_norm": 0.7525973320007324, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 27880 + }, + { + "epoch": 2.0028725314183125, + "grad_norm": 0.7821497321128845, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 27890 + }, + { + "epoch": 2.0035906642728905, + "grad_norm": 0.6334691047668457, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 27900 + }, + { + "epoch": 2.0043087971274685, + "grad_norm": 0.732991099357605, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 27910 + }, + { + "epoch": 2.0050269299820465, + "grad_norm": 0.949942946434021, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 27920 + }, + { + "epoch": 2.005745062836625, + "grad_norm": 0.657267689704895, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 27930 + }, + { + "epoch": 2.006463195691203, + "grad_norm": 0.8329252004623413, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 27940 + }, + { + "epoch": 2.007181328545781, + "grad_norm": 0.7816959023475647, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 27950 + }, + { + "epoch": 2.007899461400359, + "grad_norm": 0.7546323537826538, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 27960 + }, + { + "epoch": 2.0086175942549374, + "grad_norm": 0.9519657492637634, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 27970 + }, + { + "epoch": 2.0093357271095154, + "grad_norm": 0.7934315800666809, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 27980 + }, + { + "epoch": 2.0100538599640934, + "grad_norm": 0.9579764604568481, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 27990 + }, + { + "epoch": 2.0107719928186714, + "grad_norm": 0.764167070388794, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 28000 + }, + { + "epoch": 2.0114901256732494, + "grad_norm": 0.7380000948905945, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 28010 + }, + { + "epoch": 2.012208258527828, + "grad_norm": 0.7220044732093811, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 28020 + }, + { + "epoch": 2.012926391382406, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 28030 + }, + { + "epoch": 2.013644524236984, + "grad_norm": 0.7507190704345703, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28040 + }, + { + "epoch": 2.014362657091562, + "grad_norm": 0.9488387703895569, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 28050 + }, + { + "epoch": 2.01508078994614, + "grad_norm": 0.9092940092086792, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 28060 + }, + { + "epoch": 2.0157989228007183, + "grad_norm": 0.7859629392623901, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28070 + }, + { + "epoch": 2.0165170556552963, + "grad_norm": 0.7636393904685974, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 28080 + }, + { + "epoch": 2.0172351885098743, + "grad_norm": 0.8860714435577393, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 28090 + }, + { + "epoch": 2.0179533213644523, + "grad_norm": 0.6837195158004761, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 28100 + }, + { + "epoch": 2.0186714542190307, + "grad_norm": 0.7778242826461792, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 28110 + }, + { + "epoch": 2.0193895870736087, + "grad_norm": 0.7164766788482666, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 28120 + }, + { + "epoch": 2.0201077199281867, + "grad_norm": 0.8965572118759155, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 28130 + }, + { + "epoch": 2.0208258527827647, + "grad_norm": 0.8074374794960022, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 28140 + }, + { + "epoch": 2.0215439856373427, + "grad_norm": 0.8307222127914429, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 28150 + }, + { + "epoch": 2.022262118491921, + "grad_norm": 0.9600032567977905, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 28160 + }, + { + "epoch": 2.022980251346499, + "grad_norm": 0.8541040420532227, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 28170 + }, + { + "epoch": 2.023698384201077, + "grad_norm": 0.8864985704421997, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 28180 + }, + { + "epoch": 2.024416517055655, + "grad_norm": 0.7926326990127563, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 28190 + }, + { + "epoch": 2.025134649910233, + "grad_norm": 1.0548077821731567, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 28200 + }, + { + "epoch": 2.0258527827648116, + "grad_norm": 0.7468827366828918, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 28210 + }, + { + "epoch": 2.0265709156193896, + "grad_norm": 0.7683286070823669, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 28220 + }, + { + "epoch": 2.0272890484739676, + "grad_norm": 0.7307319641113281, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 28230 + }, + { + "epoch": 2.0280071813285456, + "grad_norm": 0.7813416719436646, + "learning_rate": 0.0002, + "loss": 0.7148, + "step": 28240 + }, + { + "epoch": 2.028725314183124, + "grad_norm": 0.7954556941986084, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 28250 + }, + { + "epoch": 2.029443447037702, + "grad_norm": 0.8836418986320496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 28260 + }, + { + "epoch": 2.03016157989228, + "grad_norm": 0.7092728614807129, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28270 + }, + { + "epoch": 2.030879712746858, + "grad_norm": 0.8512285351753235, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 28280 + }, + { + "epoch": 2.031597845601436, + "grad_norm": 0.8005346059799194, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 28290 + }, + { + "epoch": 2.0323159784560145, + "grad_norm": 0.8872515559196472, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 28300 + }, + { + "epoch": 2.0330341113105925, + "grad_norm": 0.7948436737060547, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 28310 + }, + { + "epoch": 2.0337522441651705, + "grad_norm": 0.7418082356452942, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 28320 + }, + { + "epoch": 2.0344703770197485, + "grad_norm": 0.9600949287414551, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 28330 + }, + { + "epoch": 2.0351885098743265, + "grad_norm": 0.9767434597015381, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 28340 + }, + { + "epoch": 2.035906642728905, + "grad_norm": 0.7435336709022522, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 28350 + }, + { + "epoch": 2.036624775583483, + "grad_norm": 0.997978925704956, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 28360 + }, + { + "epoch": 2.037342908438061, + "grad_norm": 0.9072412252426147, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 28370 + }, + { + "epoch": 2.038061041292639, + "grad_norm": 0.8396701812744141, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 28380 + }, + { + "epoch": 2.0387791741472174, + "grad_norm": 1.0449832677841187, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 28390 + }, + { + "epoch": 2.0394973070017954, + "grad_norm": 0.6471025943756104, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 28400 + }, + { + "epoch": 2.0402154398563734, + "grad_norm": 0.8147950768470764, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 28410 + }, + { + "epoch": 2.0409335727109514, + "grad_norm": 0.902508020401001, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 28420 + }, + { + "epoch": 2.0416517055655294, + "grad_norm": 0.6426262855529785, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 28430 + }, + { + "epoch": 2.042369838420108, + "grad_norm": 0.8016643524169922, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 28440 + }, + { + "epoch": 2.043087971274686, + "grad_norm": 0.6841614246368408, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 28450 + }, + { + "epoch": 2.043806104129264, + "grad_norm": 0.7713631987571716, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 28460 + }, + { + "epoch": 2.044524236983842, + "grad_norm": 0.8795675039291382, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 28470 + }, + { + "epoch": 2.04524236983842, + "grad_norm": 0.725447416305542, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 28480 + }, + { + "epoch": 2.0459605026929983, + "grad_norm": 0.806861162185669, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 28490 + }, + { + "epoch": 2.0466786355475763, + "grad_norm": 0.752953827381134, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 28500 + }, + { + "epoch": 2.0473967684021543, + "grad_norm": 0.7143173813819885, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 28510 + }, + { + "epoch": 2.0481149012567323, + "grad_norm": 0.9316226243972778, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 28520 + }, + { + "epoch": 2.048833034111311, + "grad_norm": 0.7292338609695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 28530 + }, + { + "epoch": 2.049551166965889, + "grad_norm": 0.7392885088920593, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 28540 + }, + { + "epoch": 2.050269299820467, + "grad_norm": 0.7288873195648193, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 28550 + }, + { + "epoch": 2.050987432675045, + "grad_norm": 0.7791221141815186, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 28560 + }, + { + "epoch": 2.051705565529623, + "grad_norm": 0.821983814239502, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 28570 + }, + { + "epoch": 2.0524236983842012, + "grad_norm": 0.8925826549530029, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 28580 + }, + { + "epoch": 2.0531418312387792, + "grad_norm": 0.7181646227836609, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 28590 + }, + { + "epoch": 2.0538599640933572, + "grad_norm": 0.6387725472450256, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 28600 + }, + { + "epoch": 2.0545780969479353, + "grad_norm": 0.8398096561431885, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 28610 + }, + { + "epoch": 2.0552962298025133, + "grad_norm": 1.0458195209503174, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 28620 + }, + { + "epoch": 2.0560143626570917, + "grad_norm": 0.7032150626182556, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28630 + }, + { + "epoch": 2.0567324955116697, + "grad_norm": 0.8850845098495483, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 28640 + }, + { + "epoch": 2.0574506283662477, + "grad_norm": 0.8587120175361633, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 28650 + }, + { + "epoch": 2.0581687612208257, + "grad_norm": 0.7462602853775024, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28660 + }, + { + "epoch": 2.058886894075404, + "grad_norm": 0.7355574369430542, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 28670 + }, + { + "epoch": 2.059605026929982, + "grad_norm": 0.9229736328125, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 28680 + }, + { + "epoch": 2.06032315978456, + "grad_norm": 0.7685085535049438, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 28690 + }, + { + "epoch": 2.061041292639138, + "grad_norm": 0.6749364137649536, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 28700 + }, + { + "epoch": 2.061759425493716, + "grad_norm": 0.7608520984649658, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28710 + }, + { + "epoch": 2.0624775583482946, + "grad_norm": 0.9451281428337097, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 28720 + }, + { + "epoch": 2.0631956912028726, + "grad_norm": 0.7869735360145569, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 28730 + }, + { + "epoch": 2.0639138240574506, + "grad_norm": 0.8422008156776428, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 28740 + }, + { + "epoch": 2.0646319569120286, + "grad_norm": 0.7486162781715393, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 28750 + }, + { + "epoch": 2.0653500897666066, + "grad_norm": 0.9374173879623413, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28760 + }, + { + "epoch": 2.066068222621185, + "grad_norm": 0.8749295473098755, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 28770 + }, + { + "epoch": 2.066786355475763, + "grad_norm": 0.8265942931175232, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 28780 + }, + { + "epoch": 2.067504488330341, + "grad_norm": 0.8541982769966125, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 28790 + }, + { + "epoch": 2.068222621184919, + "grad_norm": 0.8220006227493286, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 28800 + }, + { + "epoch": 2.0689407540394975, + "grad_norm": 0.7302022576332092, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 28810 + }, + { + "epoch": 2.0696588868940755, + "grad_norm": 0.7073875069618225, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 28820 + }, + { + "epoch": 2.0703770197486535, + "grad_norm": 0.7792919874191284, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 28830 + }, + { + "epoch": 2.0710951526032315, + "grad_norm": 0.8268185257911682, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 28840 + }, + { + "epoch": 2.0718132854578095, + "grad_norm": 0.7576423287391663, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 28850 + }, + { + "epoch": 2.072531418312388, + "grad_norm": 0.8255910873413086, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 28860 + }, + { + "epoch": 2.073249551166966, + "grad_norm": 0.7900934815406799, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 28870 + }, + { + "epoch": 2.073967684021544, + "grad_norm": 0.846665620803833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 28880 + }, + { + "epoch": 2.074685816876122, + "grad_norm": 0.8159831166267395, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 28890 + }, + { + "epoch": 2.0754039497307, + "grad_norm": 0.7395941615104675, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 28900 + }, + { + "epoch": 2.0761220825852784, + "grad_norm": 0.9765046238899231, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 28910 + }, + { + "epoch": 2.0768402154398564, + "grad_norm": 0.8358173966407776, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 28920 + }, + { + "epoch": 2.0775583482944344, + "grad_norm": 0.6848723292350769, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 28930 + }, + { + "epoch": 2.0782764811490124, + "grad_norm": 0.7965065836906433, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 28940 + }, + { + "epoch": 2.078994614003591, + "grad_norm": 0.7618608474731445, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 28950 + }, + { + "epoch": 2.079712746858169, + "grad_norm": 0.890615701675415, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 28960 + }, + { + "epoch": 2.080430879712747, + "grad_norm": 0.7310431003570557, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 28970 + }, + { + "epoch": 2.081149012567325, + "grad_norm": 0.8228268027305603, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 28980 + }, + { + "epoch": 2.081867145421903, + "grad_norm": 0.883577287197113, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 28990 + }, + { + "epoch": 2.0825852782764813, + "grad_norm": 0.8359243869781494, + "learning_rate": 0.0002, + "loss": 0.7232, + "step": 29000 + }, + { + "epoch": 2.0833034111310593, + "grad_norm": 0.8285391330718994, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 29010 + }, + { + "epoch": 2.0840215439856373, + "grad_norm": 0.8991064429283142, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 29020 + }, + { + "epoch": 2.0847396768402153, + "grad_norm": 0.6911244988441467, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 29030 + }, + { + "epoch": 2.0854578096947933, + "grad_norm": 0.8462249636650085, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 29040 + }, + { + "epoch": 2.0861759425493718, + "grad_norm": 0.9149548411369324, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 29050 + }, + { + "epoch": 2.0868940754039498, + "grad_norm": 0.7365630269050598, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 29060 + }, + { + "epoch": 2.087612208258528, + "grad_norm": 0.8439079523086548, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 29070 + }, + { + "epoch": 2.088330341113106, + "grad_norm": 0.7123780846595764, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 29080 + }, + { + "epoch": 2.0890484739676842, + "grad_norm": 0.6854261755943298, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 29090 + }, + { + "epoch": 2.0897666068222622, + "grad_norm": 0.83026123046875, + "learning_rate": 0.0002, + "loss": 0.667, + "step": 29100 + }, + { + "epoch": 2.0904847396768402, + "grad_norm": 0.8413158059120178, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 29110 + }, + { + "epoch": 2.0912028725314182, + "grad_norm": 0.9646758437156677, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 29120 + }, + { + "epoch": 2.0919210053859962, + "grad_norm": 0.8421565890312195, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 29130 + }, + { + "epoch": 2.0926391382405747, + "grad_norm": 0.7748899459838867, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 29140 + }, + { + "epoch": 2.0933572710951527, + "grad_norm": 0.5973830819129944, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 29150 + }, + { + "epoch": 2.0940754039497307, + "grad_norm": 0.8440837860107422, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 29160 + }, + { + "epoch": 2.0947935368043087, + "grad_norm": 0.7392688989639282, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 29170 + }, + { + "epoch": 2.0955116696588867, + "grad_norm": 1.0522996187210083, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 29180 + }, + { + "epoch": 2.096229802513465, + "grad_norm": 0.7330273389816284, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 29190 + }, + { + "epoch": 2.096947935368043, + "grad_norm": 1.11064875125885, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 29200 + }, + { + "epoch": 2.097666068222621, + "grad_norm": 0.795446515083313, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 29210 + }, + { + "epoch": 2.098384201077199, + "grad_norm": 0.5552594661712646, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 29220 + }, + { + "epoch": 2.0991023339317776, + "grad_norm": 0.7327710390090942, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 29230 + }, + { + "epoch": 2.0998204667863556, + "grad_norm": 0.7474247217178345, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 29240 + }, + { + "epoch": 2.1005385996409336, + "grad_norm": 0.7775853276252747, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 29250 + }, + { + "epoch": 2.1012567324955116, + "grad_norm": 0.769527018070221, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29260 + }, + { + "epoch": 2.1019748653500896, + "grad_norm": 0.8350797891616821, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 29270 + }, + { + "epoch": 2.102692998204668, + "grad_norm": 0.8749061822891235, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 29280 + }, + { + "epoch": 2.103411131059246, + "grad_norm": 0.7838778495788574, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 29290 + }, + { + "epoch": 2.104129263913824, + "grad_norm": 0.8144710063934326, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 29300 + }, + { + "epoch": 2.104847396768402, + "grad_norm": 0.7965250015258789, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 29310 + }, + { + "epoch": 2.10556552962298, + "grad_norm": 0.7075945138931274, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 29320 + }, + { + "epoch": 2.1062836624775585, + "grad_norm": 0.9449555277824402, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 29330 + }, + { + "epoch": 2.1070017953321365, + "grad_norm": 0.9114580750465393, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 29340 + }, + { + "epoch": 2.1077199281867145, + "grad_norm": 0.8768125176429749, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 29350 + }, + { + "epoch": 2.1084380610412925, + "grad_norm": 0.8586908578872681, + "learning_rate": 0.0002, + "loss": 0.7134, + "step": 29360 + }, + { + "epoch": 2.109156193895871, + "grad_norm": 0.8351234793663025, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 29370 + }, + { + "epoch": 2.109874326750449, + "grad_norm": 0.686488687992096, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 29380 + }, + { + "epoch": 2.110592459605027, + "grad_norm": 0.7910184264183044, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 29390 + }, + { + "epoch": 2.111310592459605, + "grad_norm": 0.7649612426757812, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 29400 + }, + { + "epoch": 2.112028725314183, + "grad_norm": 0.7790259122848511, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29410 + }, + { + "epoch": 2.1127468581687614, + "grad_norm": 0.8386351466178894, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 29420 + }, + { + "epoch": 2.1134649910233394, + "grad_norm": 0.8605695366859436, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 29430 + }, + { + "epoch": 2.1141831238779174, + "grad_norm": 0.6808947920799255, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 29440 + }, + { + "epoch": 2.1149012567324954, + "grad_norm": 0.8310001492500305, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 29450 + }, + { + "epoch": 2.1156193895870734, + "grad_norm": 1.289986252784729, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 29460 + }, + { + "epoch": 2.116337522441652, + "grad_norm": 0.8679313659667969, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 29470 + }, + { + "epoch": 2.11705565529623, + "grad_norm": 0.9149175882339478, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 29480 + }, + { + "epoch": 2.117773788150808, + "grad_norm": 0.8405622839927673, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 29490 + }, + { + "epoch": 2.118491921005386, + "grad_norm": 0.9174691438674927, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 29500 + }, + { + "epoch": 2.1192100538599643, + "grad_norm": 0.8865614533424377, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29510 + }, + { + "epoch": 2.1199281867145423, + "grad_norm": 0.645301342010498, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29520 + }, + { + "epoch": 2.1206463195691203, + "grad_norm": 0.7612960338592529, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 29530 + }, + { + "epoch": 2.1213644524236983, + "grad_norm": 0.7575576305389404, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 29540 + }, + { + "epoch": 2.1220825852782763, + "grad_norm": 0.8746156096458435, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 29550 + }, + { + "epoch": 2.1228007181328548, + "grad_norm": 0.8488934636116028, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 29560 + }, + { + "epoch": 2.1235188509874328, + "grad_norm": 0.8064972162246704, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 29570 + }, + { + "epoch": 2.1242369838420108, + "grad_norm": 0.7410933971405029, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 29580 + }, + { + "epoch": 2.1249551166965888, + "grad_norm": 0.7023535966873169, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 29590 + }, + { + "epoch": 2.1256732495511668, + "grad_norm": 0.8591743111610413, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 29600 + }, + { + "epoch": 2.126391382405745, + "grad_norm": 0.7270186543464661, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 29610 + }, + { + "epoch": 2.127109515260323, + "grad_norm": 0.9639726281166077, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 29620 + }, + { + "epoch": 2.127827648114901, + "grad_norm": 0.8519027829170227, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 29630 + }, + { + "epoch": 2.128545780969479, + "grad_norm": 0.8786447048187256, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 29640 + }, + { + "epoch": 2.129263913824057, + "grad_norm": 0.7452822923660278, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 29650 + }, + { + "epoch": 2.1299820466786357, + "grad_norm": 0.9385744333267212, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 29660 + }, + { + "epoch": 2.1307001795332137, + "grad_norm": 0.7650160193443298, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 29670 + }, + { + "epoch": 2.1314183123877917, + "grad_norm": 0.7581976652145386, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 29680 + }, + { + "epoch": 2.1321364452423697, + "grad_norm": 0.8455183506011963, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 29690 + }, + { + "epoch": 2.132854578096948, + "grad_norm": 0.7200509905815125, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 29700 + }, + { + "epoch": 2.133572710951526, + "grad_norm": 0.7071877121925354, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 29710 + }, + { + "epoch": 2.134290843806104, + "grad_norm": 0.9197220802307129, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 29720 + }, + { + "epoch": 2.135008976660682, + "grad_norm": 0.6787277460098267, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 29730 + }, + { + "epoch": 2.13572710951526, + "grad_norm": 0.8183788061141968, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 29740 + }, + { + "epoch": 2.1364452423698386, + "grad_norm": 0.7958994507789612, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29750 + }, + { + "epoch": 2.1371633752244166, + "grad_norm": 0.8803889155387878, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 29760 + }, + { + "epoch": 2.1378815080789946, + "grad_norm": 0.6682677268981934, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 29770 + }, + { + "epoch": 2.1385996409335726, + "grad_norm": 1.0198085308074951, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 29780 + }, + { + "epoch": 2.139317773788151, + "grad_norm": 1.0258227586746216, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 29790 + }, + { + "epoch": 2.140035906642729, + "grad_norm": 0.8920917510986328, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 29800 + }, + { + "epoch": 2.140754039497307, + "grad_norm": 0.8352635502815247, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 29810 + }, + { + "epoch": 2.141472172351885, + "grad_norm": 0.8422067165374756, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 29820 + }, + { + "epoch": 2.142190305206463, + "grad_norm": 0.8845202326774597, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 29830 + }, + { + "epoch": 2.1429084380610415, + "grad_norm": 0.659397542476654, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 29840 + }, + { + "epoch": 2.1436265709156195, + "grad_norm": 0.6233306527137756, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 29850 + }, + { + "epoch": 2.1443447037701975, + "grad_norm": 0.8951199054718018, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 29860 + }, + { + "epoch": 2.1450628366247755, + "grad_norm": 0.6980211734771729, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 29870 + }, + { + "epoch": 2.1457809694793535, + "grad_norm": 0.8463385105133057, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 29880 + }, + { + "epoch": 2.146499102333932, + "grad_norm": 0.682183027267456, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 29890 + }, + { + "epoch": 2.14721723518851, + "grad_norm": 0.8491033911705017, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 29900 + }, + { + "epoch": 2.147935368043088, + "grad_norm": 0.8112631440162659, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 29910 + }, + { + "epoch": 2.148653500897666, + "grad_norm": 1.0186359882354736, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 29920 + }, + { + "epoch": 2.149371633752244, + "grad_norm": 0.7904929518699646, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 29930 + }, + { + "epoch": 2.1500897666068224, + "grad_norm": 0.8381312489509583, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 29940 + }, + { + "epoch": 2.1508078994614004, + "grad_norm": 0.7596192359924316, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 29950 + }, + { + "epoch": 2.1515260323159784, + "grad_norm": 0.7532448768615723, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 29960 + }, + { + "epoch": 2.1522441651705564, + "grad_norm": 0.7877430319786072, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 29970 + }, + { + "epoch": 2.152962298025135, + "grad_norm": 0.6870610117912292, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 29980 + }, + { + "epoch": 2.153680430879713, + "grad_norm": 0.7154987454414368, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 29990 + }, + { + "epoch": 2.154398563734291, + "grad_norm": 0.7692370414733887, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 30000 + }, + { + "epoch": 2.155116696588869, + "grad_norm": 0.7745859026908875, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 30010 + }, + { + "epoch": 2.155834829443447, + "grad_norm": 0.718207061290741, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 30020 + }, + { + "epoch": 2.1565529622980253, + "grad_norm": 0.8851615786552429, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30030 + }, + { + "epoch": 2.1572710951526033, + "grad_norm": 0.736194372177124, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 30040 + }, + { + "epoch": 2.1579892280071813, + "grad_norm": 0.9908117055892944, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 30050 + }, + { + "epoch": 2.1587073608617593, + "grad_norm": 0.6772316694259644, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 30060 + }, + { + "epoch": 2.1594254937163377, + "grad_norm": 0.7474411725997925, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 30070 + }, + { + "epoch": 2.1601436265709157, + "grad_norm": 0.8140033483505249, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 30080 + }, + { + "epoch": 2.1608617594254937, + "grad_norm": 0.912555992603302, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 30090 + }, + { + "epoch": 2.1615798922800717, + "grad_norm": 0.8189636468887329, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 30100 + }, + { + "epoch": 2.1622980251346497, + "grad_norm": 0.7520000338554382, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 30110 + }, + { + "epoch": 2.163016157989228, + "grad_norm": 0.9635465741157532, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 30120 + }, + { + "epoch": 2.163734290843806, + "grad_norm": 0.9139830470085144, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 30130 + }, + { + "epoch": 2.164452423698384, + "grad_norm": 0.844384491443634, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 30140 + }, + { + "epoch": 2.165170556552962, + "grad_norm": 0.8296793103218079, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 30150 + }, + { + "epoch": 2.16588868940754, + "grad_norm": 0.7929309606552124, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30160 + }, + { + "epoch": 2.1666068222621186, + "grad_norm": 0.8046507239341736, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 30170 + }, + { + "epoch": 2.1673249551166966, + "grad_norm": 0.8161377310752869, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 30180 + }, + { + "epoch": 2.1680430879712747, + "grad_norm": 0.6984363794326782, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 30190 + }, + { + "epoch": 2.1687612208258527, + "grad_norm": 0.8578489422798157, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30200 + }, + { + "epoch": 2.1694793536804307, + "grad_norm": 0.8051524758338928, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 30210 + }, + { + "epoch": 2.170197486535009, + "grad_norm": 0.6775792241096497, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 30220 + }, + { + "epoch": 2.170915619389587, + "grad_norm": 0.7102242708206177, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 30230 + }, + { + "epoch": 2.171633752244165, + "grad_norm": 0.9038975238800049, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 30240 + }, + { + "epoch": 2.172351885098743, + "grad_norm": 0.8509918451309204, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 30250 + }, + { + "epoch": 2.1730700179533216, + "grad_norm": 0.8816375732421875, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 30260 + }, + { + "epoch": 2.1737881508078996, + "grad_norm": 0.7907037138938904, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 30270 + }, + { + "epoch": 2.1745062836624776, + "grad_norm": 0.7104434967041016, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 30280 + }, + { + "epoch": 2.1752244165170556, + "grad_norm": 1.028658151626587, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 30290 + }, + { + "epoch": 2.1759425493716336, + "grad_norm": 0.8542430400848389, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 30300 + }, + { + "epoch": 2.176660682226212, + "grad_norm": 0.7438064813613892, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30310 + }, + { + "epoch": 2.17737881508079, + "grad_norm": 0.8384708762168884, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 30320 + }, + { + "epoch": 2.178096947935368, + "grad_norm": 0.9034163355827332, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 30330 + }, + { + "epoch": 2.178815080789946, + "grad_norm": 0.9659526944160461, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 30340 + }, + { + "epoch": 2.1795332136445245, + "grad_norm": 0.6685642600059509, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 30350 + }, + { + "epoch": 2.1802513464991025, + "grad_norm": 0.9180589318275452, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 30360 + }, + { + "epoch": 2.1809694793536805, + "grad_norm": 0.9550795555114746, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 30370 + }, + { + "epoch": 2.1816876122082585, + "grad_norm": 0.8517686724662781, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 30380 + }, + { + "epoch": 2.1824057450628365, + "grad_norm": 0.7351927161216736, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 30390 + }, + { + "epoch": 2.183123877917415, + "grad_norm": 0.8439408540725708, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 30400 + }, + { + "epoch": 2.183842010771993, + "grad_norm": 0.8322570323944092, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 30410 + }, + { + "epoch": 2.184560143626571, + "grad_norm": 0.6735888123512268, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 30420 + }, + { + "epoch": 2.185278276481149, + "grad_norm": 0.7273133397102356, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 30430 + }, + { + "epoch": 2.185996409335727, + "grad_norm": 0.7841959595680237, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 30440 + }, + { + "epoch": 2.1867145421903054, + "grad_norm": 0.67259281873703, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 30450 + }, + { + "epoch": 2.1874326750448834, + "grad_norm": 0.7646223306655884, + "learning_rate": 0.0002, + "loss": 0.6857, + "step": 30460 + }, + { + "epoch": 2.1881508078994614, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 30470 + }, + { + "epoch": 2.1888689407540394, + "grad_norm": 0.8818342685699463, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 30480 + }, + { + "epoch": 2.1895870736086174, + "grad_norm": 0.7421377897262573, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 30490 + }, + { + "epoch": 2.190305206463196, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 30500 + }, + { + "epoch": 2.191023339317774, + "grad_norm": 0.8003571033477783, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 30510 + }, + { + "epoch": 2.191741472172352, + "grad_norm": 0.8200605511665344, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 30520 + }, + { + "epoch": 2.19245960502693, + "grad_norm": 0.8878887295722961, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 30530 + }, + { + "epoch": 2.1931777378815083, + "grad_norm": 0.8518163561820984, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 30540 + }, + { + "epoch": 2.1938958707360863, + "grad_norm": 0.8182454705238342, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 30550 + }, + { + "epoch": 2.1946140035906643, + "grad_norm": 0.9395919442176819, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 30560 + }, + { + "epoch": 2.1953321364452423, + "grad_norm": 0.7916256189346313, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 30570 + }, + { + "epoch": 2.1960502692998203, + "grad_norm": 0.7303445339202881, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 30580 + }, + { + "epoch": 2.1967684021543987, + "grad_norm": 0.7407387495040894, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 30590 + }, + { + "epoch": 2.1974865350089767, + "grad_norm": 0.7410500645637512, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 30600 + }, + { + "epoch": 2.1982046678635547, + "grad_norm": 0.9176440834999084, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 30610 + }, + { + "epoch": 2.1989228007181327, + "grad_norm": 0.8823038935661316, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 30620 + }, + { + "epoch": 2.199640933572711, + "grad_norm": 0.9263436198234558, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 30630 + }, + { + "epoch": 2.200359066427289, + "grad_norm": 0.6753571033477783, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 30640 + }, + { + "epoch": 2.201077199281867, + "grad_norm": 0.841160774230957, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 30650 + }, + { + "epoch": 2.201795332136445, + "grad_norm": 0.8786441683769226, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 30660 + }, + { + "epoch": 2.202513464991023, + "grad_norm": 0.8833681344985962, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 30670 + }, + { + "epoch": 2.2032315978456016, + "grad_norm": 0.6609824299812317, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 30680 + }, + { + "epoch": 2.2039497307001796, + "grad_norm": 0.7308626174926758, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 30690 + }, + { + "epoch": 2.2046678635547576, + "grad_norm": 0.8854711055755615, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 30700 + }, + { + "epoch": 2.2053859964093356, + "grad_norm": 0.839043140411377, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 30710 + }, + { + "epoch": 2.2061041292639136, + "grad_norm": 0.9030174016952515, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 30720 + }, + { + "epoch": 2.206822262118492, + "grad_norm": 0.6856667399406433, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 30730 + }, + { + "epoch": 2.20754039497307, + "grad_norm": 0.8823501467704773, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 30740 + }, + { + "epoch": 2.208258527827648, + "grad_norm": 0.8501278162002563, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 30750 + }, + { + "epoch": 2.208976660682226, + "grad_norm": 0.8099446892738342, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 30760 + }, + { + "epoch": 2.209694793536804, + "grad_norm": 0.7203072905540466, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 30770 + }, + { + "epoch": 2.2104129263913825, + "grad_norm": 1.0898563861846924, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 30780 + }, + { + "epoch": 2.2111310592459605, + "grad_norm": 0.8157216906547546, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 30790 + }, + { + "epoch": 2.2118491921005385, + "grad_norm": 0.7617478966712952, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 30800 + }, + { + "epoch": 2.2125673249551165, + "grad_norm": 0.790503978729248, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 30810 + }, + { + "epoch": 2.213285457809695, + "grad_norm": 0.9289199113845825, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 30820 + }, + { + "epoch": 2.214003590664273, + "grad_norm": 0.9267001748085022, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 30830 + }, + { + "epoch": 2.214721723518851, + "grad_norm": 0.716023862361908, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 30840 + }, + { + "epoch": 2.215439856373429, + "grad_norm": 0.8733863234519958, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 30850 + }, + { + "epoch": 2.216157989228007, + "grad_norm": 0.7743660807609558, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 30860 + }, + { + "epoch": 2.2168761220825854, + "grad_norm": 0.7974567413330078, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 30870 + }, + { + "epoch": 2.2175942549371634, + "grad_norm": 0.6617984771728516, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 30880 + }, + { + "epoch": 2.2183123877917414, + "grad_norm": 0.6925143003463745, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 30890 + }, + { + "epoch": 2.2190305206463194, + "grad_norm": 0.6853532195091248, + "learning_rate": 0.0002, + "loss": 0.6986, + "step": 30900 + }, + { + "epoch": 2.219748653500898, + "grad_norm": 0.7964699268341064, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 30910 + }, + { + "epoch": 2.220466786355476, + "grad_norm": 0.8116228580474854, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 30920 + }, + { + "epoch": 2.221184919210054, + "grad_norm": 1.0121010541915894, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 30930 + }, + { + "epoch": 2.221903052064632, + "grad_norm": 0.7348445653915405, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 30940 + }, + { + "epoch": 2.22262118491921, + "grad_norm": 0.8998047709465027, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 30950 + }, + { + "epoch": 2.2233393177737883, + "grad_norm": 0.6108106970787048, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 30960 + }, + { + "epoch": 2.2240574506283664, + "grad_norm": 1.287834882736206, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 30970 + }, + { + "epoch": 2.2247755834829444, + "grad_norm": 0.8584468960762024, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 30980 + }, + { + "epoch": 2.2254937163375224, + "grad_norm": 0.865276038646698, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 30990 + }, + { + "epoch": 2.2262118491921004, + "grad_norm": 0.8713302612304688, + "learning_rate": 0.0002, + "loss": 0.7516, + "step": 31000 + }, + { + "epoch": 2.226929982046679, + "grad_norm": 0.9210535883903503, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 31010 + }, + { + "epoch": 2.227648114901257, + "grad_norm": 0.8578430414199829, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 31020 + }, + { + "epoch": 2.228366247755835, + "grad_norm": 0.7128387093544006, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 31030 + }, + { + "epoch": 2.229084380610413, + "grad_norm": 0.8059941530227661, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 31040 + }, + { + "epoch": 2.229802513464991, + "grad_norm": 0.8043261170387268, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 31050 + }, + { + "epoch": 2.2305206463195693, + "grad_norm": 0.9260253310203552, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 31060 + }, + { + "epoch": 2.2312387791741473, + "grad_norm": 0.7908085584640503, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 31070 + }, + { + "epoch": 2.2319569120287253, + "grad_norm": 0.7860442996025085, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 31080 + }, + { + "epoch": 2.2326750448833033, + "grad_norm": 0.8388702273368835, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 31090 + }, + { + "epoch": 2.2333931777378817, + "grad_norm": 0.835686206817627, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 31100 + }, + { + "epoch": 2.2341113105924597, + "grad_norm": 0.8148298859596252, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 31110 + }, + { + "epoch": 2.2348294434470377, + "grad_norm": 0.8501878976821899, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 31120 + }, + { + "epoch": 2.2355475763016157, + "grad_norm": 0.793323278427124, + "learning_rate": 0.0002, + "loss": 0.7262, + "step": 31130 + }, + { + "epoch": 2.2362657091561937, + "grad_norm": 0.8234742879867554, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31140 + }, + { + "epoch": 2.236983842010772, + "grad_norm": 0.8691303133964539, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 31150 + }, + { + "epoch": 2.23770197486535, + "grad_norm": 0.8707090020179749, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 31160 + }, + { + "epoch": 2.238420107719928, + "grad_norm": 0.8468940854072571, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 31170 + }, + { + "epoch": 2.239138240574506, + "grad_norm": 0.7275772094726562, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 31180 + }, + { + "epoch": 2.2398563734290846, + "grad_norm": 0.8765808939933777, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 31190 + }, + { + "epoch": 2.2405745062836626, + "grad_norm": 1.02803635597229, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 31200 + }, + { + "epoch": 2.2412926391382406, + "grad_norm": 0.7999185919761658, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 31210 + }, + { + "epoch": 2.2420107719928186, + "grad_norm": 0.5711870789527893, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 31220 + }, + { + "epoch": 2.2427289048473966, + "grad_norm": 0.7183604836463928, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 31230 + }, + { + "epoch": 2.243447037701975, + "grad_norm": 0.8819206357002258, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 31240 + }, + { + "epoch": 2.244165170556553, + "grad_norm": 0.9078969955444336, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 31250 + }, + { + "epoch": 2.244883303411131, + "grad_norm": 1.184506893157959, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 31260 + }, + { + "epoch": 2.245601436265709, + "grad_norm": 0.8660752177238464, + "learning_rate": 0.0002, + "loss": 0.7682, + "step": 31270 + }, + { + "epoch": 2.246319569120287, + "grad_norm": 1.011796236038208, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 31280 + }, + { + "epoch": 2.2470377019748655, + "grad_norm": 0.9168157577514648, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 31290 + }, + { + "epoch": 2.2477558348294435, + "grad_norm": 0.7798577547073364, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 31300 + }, + { + "epoch": 2.2484739676840215, + "grad_norm": 0.6609913110733032, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 31310 + }, + { + "epoch": 2.2491921005385995, + "grad_norm": 0.64737868309021, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 31320 + }, + { + "epoch": 2.2499102333931775, + "grad_norm": 1.0700385570526123, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 31330 + }, + { + "epoch": 2.250628366247756, + "grad_norm": 0.7838551998138428, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 31340 + }, + { + "epoch": 2.251346499102334, + "grad_norm": 0.9225728511810303, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 31350 + }, + { + "epoch": 2.252064631956912, + "grad_norm": 0.7956384420394897, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 31360 + }, + { + "epoch": 2.25278276481149, + "grad_norm": 0.7645466923713684, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 31370 + }, + { + "epoch": 2.2535008976660684, + "grad_norm": 0.9595549702644348, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 31380 + }, + { + "epoch": 2.2542190305206464, + "grad_norm": 0.6124163866043091, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 31390 + }, + { + "epoch": 2.2549371633752244, + "grad_norm": 0.7531530261039734, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 31400 + }, + { + "epoch": 2.2556552962298024, + "grad_norm": 0.6904721856117249, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 31410 + }, + { + "epoch": 2.2563734290843804, + "grad_norm": 0.7644204497337341, + "learning_rate": 0.0002, + "loss": 0.6661, + "step": 31420 + }, + { + "epoch": 2.257091561938959, + "grad_norm": 0.7879737019538879, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 31430 + }, + { + "epoch": 2.257809694793537, + "grad_norm": 0.796450138092041, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 31440 + }, + { + "epoch": 2.258527827648115, + "grad_norm": 0.7536656856536865, + "learning_rate": 0.0002, + "loss": 0.722, + "step": 31450 + }, + { + "epoch": 2.259245960502693, + "grad_norm": 0.6797451376914978, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 31460 + }, + { + "epoch": 2.2599640933572713, + "grad_norm": 0.7833347320556641, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 31470 + }, + { + "epoch": 2.2606822262118493, + "grad_norm": 0.7571428418159485, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 31480 + }, + { + "epoch": 2.2614003590664273, + "grad_norm": 0.7028690576553345, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 31490 + }, + { + "epoch": 2.2621184919210053, + "grad_norm": 0.7854651212692261, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 31500 + }, + { + "epoch": 2.2628366247755833, + "grad_norm": 1.1924974918365479, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 31510 + }, + { + "epoch": 2.2635547576301613, + "grad_norm": 0.8087588548660278, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 31520 + }, + { + "epoch": 2.26427289048474, + "grad_norm": 0.8521981835365295, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31530 + }, + { + "epoch": 2.264991023339318, + "grad_norm": 0.754585862159729, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 31540 + }, + { + "epoch": 2.265709156193896, + "grad_norm": 0.8403395414352417, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 31550 + }, + { + "epoch": 2.266427289048474, + "grad_norm": 0.9724786877632141, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 31560 + }, + { + "epoch": 2.2671454219030522, + "grad_norm": 0.7568767070770264, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 31570 + }, + { + "epoch": 2.2678635547576302, + "grad_norm": 0.712009608745575, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 31580 + }, + { + "epoch": 2.2685816876122082, + "grad_norm": 0.7649937868118286, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 31590 + }, + { + "epoch": 2.2692998204667862, + "grad_norm": 0.7319537997245789, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 31600 + }, + { + "epoch": 2.2700179533213642, + "grad_norm": 0.9597942233085632, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 31610 + }, + { + "epoch": 2.2707360861759427, + "grad_norm": 0.7403358817100525, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 31620 + }, + { + "epoch": 2.2714542190305207, + "grad_norm": 0.7395114898681641, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 31630 + }, + { + "epoch": 2.2721723518850987, + "grad_norm": 0.8835344314575195, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 31640 + }, + { + "epoch": 2.2728904847396767, + "grad_norm": 0.76587975025177, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 31650 + }, + { + "epoch": 2.273608617594255, + "grad_norm": 0.6472584009170532, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 31660 + }, + { + "epoch": 2.274326750448833, + "grad_norm": 1.0170460939407349, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 31670 + }, + { + "epoch": 2.275044883303411, + "grad_norm": 0.8170912265777588, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 31680 + }, + { + "epoch": 2.275763016157989, + "grad_norm": 0.6821279525756836, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 31690 + }, + { + "epoch": 2.276481149012567, + "grad_norm": 0.8150709867477417, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 31700 + }, + { + "epoch": 2.2771992818671456, + "grad_norm": 0.6786386370658875, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 31710 + }, + { + "epoch": 2.2779174147217236, + "grad_norm": 0.8871912360191345, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 31720 + }, + { + "epoch": 2.2786355475763016, + "grad_norm": 0.7710220813751221, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 31730 + }, + { + "epoch": 2.2793536804308796, + "grad_norm": 0.8073079586029053, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 31740 + }, + { + "epoch": 2.280071813285458, + "grad_norm": 0.8228550553321838, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 31750 + }, + { + "epoch": 2.280789946140036, + "grad_norm": 0.7987996339797974, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 31760 + }, + { + "epoch": 2.281508078994614, + "grad_norm": 0.744326651096344, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 31770 + }, + { + "epoch": 2.282226211849192, + "grad_norm": 0.7672302722930908, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 31780 + }, + { + "epoch": 2.28294434470377, + "grad_norm": 0.8079774975776672, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 31790 + }, + { + "epoch": 2.283662477558348, + "grad_norm": 0.7383643984794617, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 31800 + }, + { + "epoch": 2.2843806104129265, + "grad_norm": 0.8542332649230957, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 31810 + }, + { + "epoch": 2.2850987432675045, + "grad_norm": 0.7657321691513062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 31820 + }, + { + "epoch": 2.2858168761220825, + "grad_norm": 0.7485944628715515, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 31830 + }, + { + "epoch": 2.2865350089766605, + "grad_norm": 0.7817596793174744, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 31840 + }, + { + "epoch": 2.287253141831239, + "grad_norm": 0.840421736240387, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 31850 + }, + { + "epoch": 2.287971274685817, + "grad_norm": 0.8190447688102722, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 31860 + }, + { + "epoch": 2.288689407540395, + "grad_norm": 0.9582287669181824, + "learning_rate": 0.0002, + "loss": 0.7343, + "step": 31870 + }, + { + "epoch": 2.289407540394973, + "grad_norm": 1.0939116477966309, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 31880 + }, + { + "epoch": 2.290125673249551, + "grad_norm": 1.0901678800582886, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 31890 + }, + { + "epoch": 2.2908438061041294, + "grad_norm": 0.8025168776512146, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 31900 + }, + { + "epoch": 2.2915619389587074, + "grad_norm": 0.8157371878623962, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 31910 + }, + { + "epoch": 2.2922800718132854, + "grad_norm": 0.7735328078269958, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 31920 + }, + { + "epoch": 2.2929982046678634, + "grad_norm": 0.7501550316810608, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 31930 + }, + { + "epoch": 2.293716337522442, + "grad_norm": 0.76664799451828, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 31940 + }, + { + "epoch": 2.29443447037702, + "grad_norm": 1.0044599771499634, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 31950 + }, + { + "epoch": 2.295152603231598, + "grad_norm": 0.7773551344871521, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 31960 + }, + { + "epoch": 2.295870736086176, + "grad_norm": 0.9021226763725281, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 31970 + }, + { + "epoch": 2.296588868940754, + "grad_norm": 0.9075915813446045, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 31980 + }, + { + "epoch": 2.2973070017953323, + "grad_norm": 0.9109290242195129, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 31990 + }, + { + "epoch": 2.2980251346499103, + "grad_norm": 0.7742900252342224, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32000 + }, + { + "epoch": 2.2987432675044883, + "grad_norm": 0.633260190486908, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 32010 + }, + { + "epoch": 2.2994614003590663, + "grad_norm": 0.8593834042549133, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 32020 + }, + { + "epoch": 2.3001795332136448, + "grad_norm": 0.88165283203125, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32030 + }, + { + "epoch": 2.3008976660682228, + "grad_norm": 0.7840633988380432, + "learning_rate": 0.0002, + "loss": 0.7779, + "step": 32040 + }, + { + "epoch": 2.3016157989228008, + "grad_norm": 0.8150764107704163, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 32050 + }, + { + "epoch": 2.3023339317773788, + "grad_norm": 0.7683324813842773, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32060 + }, + { + "epoch": 2.3030520646319568, + "grad_norm": 0.7581049799919128, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 32070 + }, + { + "epoch": 2.3037701974865348, + "grad_norm": 0.911687970161438, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32080 + }, + { + "epoch": 2.3044883303411132, + "grad_norm": 1.0596355199813843, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32090 + }, + { + "epoch": 2.3052064631956912, + "grad_norm": 0.7329661846160889, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 32100 + }, + { + "epoch": 2.3059245960502692, + "grad_norm": 0.8251074552536011, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 32110 + }, + { + "epoch": 2.3066427289048472, + "grad_norm": 0.7765523195266724, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 32120 + }, + { + "epoch": 2.3073608617594257, + "grad_norm": 0.8246980905532837, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 32130 + }, + { + "epoch": 2.3080789946140037, + "grad_norm": 0.833387017250061, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 32140 + }, + { + "epoch": 2.3087971274685817, + "grad_norm": 0.9558065533638, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 32150 + }, + { + "epoch": 2.3095152603231597, + "grad_norm": 0.788151204586029, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 32160 + }, + { + "epoch": 2.3102333931777377, + "grad_norm": 0.8662320971488953, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 32170 + }, + { + "epoch": 2.310951526032316, + "grad_norm": 0.7079060673713684, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 32180 + }, + { + "epoch": 2.311669658886894, + "grad_norm": 0.8477022647857666, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 32190 + }, + { + "epoch": 2.312387791741472, + "grad_norm": 0.6549711227416992, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 32200 + }, + { + "epoch": 2.31310592459605, + "grad_norm": 0.8274375796318054, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 32210 + }, + { + "epoch": 2.3138240574506286, + "grad_norm": 0.6305822730064392, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 32220 + }, + { + "epoch": 2.3145421903052066, + "grad_norm": 0.8105725049972534, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 32230 + }, + { + "epoch": 2.3152603231597846, + "grad_norm": 0.7317119240760803, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 32240 + }, + { + "epoch": 2.3159784560143626, + "grad_norm": 0.7729924917221069, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 32250 + }, + { + "epoch": 2.3166965888689406, + "grad_norm": 0.8092145919799805, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 32260 + }, + { + "epoch": 2.317414721723519, + "grad_norm": 0.8723762035369873, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 32270 + }, + { + "epoch": 2.318132854578097, + "grad_norm": 0.9699533581733704, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 32280 + }, + { + "epoch": 2.318850987432675, + "grad_norm": 1.2972444295883179, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 32290 + }, + { + "epoch": 2.319569120287253, + "grad_norm": 0.7888450622558594, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 32300 + }, + { + "epoch": 2.3202872531418315, + "grad_norm": 0.7457000017166138, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 32310 + }, + { + "epoch": 2.3210053859964095, + "grad_norm": 0.7270606756210327, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 32320 + }, + { + "epoch": 2.3217235188509875, + "grad_norm": 0.7930711507797241, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32330 + }, + { + "epoch": 2.3224416517055655, + "grad_norm": 0.9015030264854431, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 32340 + }, + { + "epoch": 2.3231597845601435, + "grad_norm": 0.9385523796081543, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 32350 + }, + { + "epoch": 2.3238779174147215, + "grad_norm": 0.7293606400489807, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 32360 + }, + { + "epoch": 2.3245960502693, + "grad_norm": 0.797618567943573, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 32370 + }, + { + "epoch": 2.325314183123878, + "grad_norm": 0.8588258028030396, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 32380 + }, + { + "epoch": 2.326032315978456, + "grad_norm": 0.7490078210830688, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 32390 + }, + { + "epoch": 2.326750448833034, + "grad_norm": 0.7569956183433533, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 32400 + }, + { + "epoch": 2.3274685816876124, + "grad_norm": 0.8754122853279114, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 32410 + }, + { + "epoch": 2.3281867145421904, + "grad_norm": 0.9410699605941772, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 32420 + }, + { + "epoch": 2.3289048473967684, + "grad_norm": 1.1309062242507935, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 32430 + }, + { + "epoch": 2.3296229802513464, + "grad_norm": 0.7923168540000916, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 32440 + }, + { + "epoch": 2.3303411131059244, + "grad_norm": 0.830387532711029, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 32450 + }, + { + "epoch": 2.331059245960503, + "grad_norm": 0.9087454080581665, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 32460 + }, + { + "epoch": 2.331777378815081, + "grad_norm": 0.8892660737037659, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 32470 + }, + { + "epoch": 2.332495511669659, + "grad_norm": 0.84930819272995, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 32480 + }, + { + "epoch": 2.333213644524237, + "grad_norm": 0.7736781239509583, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 32490 + }, + { + "epoch": 2.3339317773788153, + "grad_norm": 0.7396222352981567, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 32500 + }, + { + "epoch": 2.3346499102333933, + "grad_norm": 0.7710241079330444, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 32510 + }, + { + "epoch": 2.3353680430879713, + "grad_norm": 0.7297301888465881, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 32520 + }, + { + "epoch": 2.3360861759425493, + "grad_norm": 0.9084094166755676, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 32530 + }, + { + "epoch": 2.3368043087971273, + "grad_norm": 0.6425859332084656, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 32540 + }, + { + "epoch": 2.3375224416517058, + "grad_norm": 0.8646581172943115, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 32550 + }, + { + "epoch": 2.3382405745062838, + "grad_norm": 0.91925048828125, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 32560 + }, + { + "epoch": 2.3389587073608618, + "grad_norm": 0.8687716722488403, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 32570 + }, + { + "epoch": 2.3396768402154398, + "grad_norm": 0.9769517183303833, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 32580 + }, + { + "epoch": 2.340394973070018, + "grad_norm": 0.7240557074546814, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 32590 + }, + { + "epoch": 2.341113105924596, + "grad_norm": 0.6631549000740051, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 32600 + }, + { + "epoch": 2.341831238779174, + "grad_norm": 0.9103635549545288, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 32610 + }, + { + "epoch": 2.342549371633752, + "grad_norm": 0.8718403577804565, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 32620 + }, + { + "epoch": 2.34326750448833, + "grad_norm": 0.8020271062850952, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 32630 + }, + { + "epoch": 2.343985637342908, + "grad_norm": 0.7834265232086182, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 32640 + }, + { + "epoch": 2.3447037701974867, + "grad_norm": 0.8909988403320312, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 32650 + }, + { + "epoch": 2.3454219030520647, + "grad_norm": 0.6915582418441772, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 32660 + }, + { + "epoch": 2.3461400359066427, + "grad_norm": 0.8829401135444641, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 32670 + }, + { + "epoch": 2.3468581687612207, + "grad_norm": 0.8869150876998901, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 32680 + }, + { + "epoch": 2.347576301615799, + "grad_norm": 0.8348933458328247, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 32690 + }, + { + "epoch": 2.348294434470377, + "grad_norm": 0.7591108679771423, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32700 + }, + { + "epoch": 2.349012567324955, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.0002, + "loss": 0.7155, + "step": 32710 + }, + { + "epoch": 2.349730700179533, + "grad_norm": 0.8537896275520325, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 32720 + }, + { + "epoch": 2.350448833034111, + "grad_norm": 0.7750797867774963, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 32730 + }, + { + "epoch": 2.3511669658886896, + "grad_norm": 0.7553941607475281, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 32740 + }, + { + "epoch": 2.3518850987432676, + "grad_norm": 0.8083372712135315, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 32750 + }, + { + "epoch": 2.3526032315978456, + "grad_norm": 0.8016324043273926, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 32760 + }, + { + "epoch": 2.3533213644524236, + "grad_norm": 0.7524061799049377, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 32770 + }, + { + "epoch": 2.354039497307002, + "grad_norm": 0.9046763777732849, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 32780 + }, + { + "epoch": 2.35475763016158, + "grad_norm": 0.9704324007034302, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 32790 + }, + { + "epoch": 2.355475763016158, + "grad_norm": 0.8756019473075867, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 32800 + }, + { + "epoch": 2.356193895870736, + "grad_norm": 0.7345646023750305, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32810 + }, + { + "epoch": 2.356912028725314, + "grad_norm": 0.8022899031639099, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 32820 + }, + { + "epoch": 2.3576301615798925, + "grad_norm": 0.7663353085517883, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 32830 + }, + { + "epoch": 2.3583482944344705, + "grad_norm": 0.7802956104278564, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 32840 + }, + { + "epoch": 2.3590664272890485, + "grad_norm": 0.8130960464477539, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 32850 + }, + { + "epoch": 2.3597845601436265, + "grad_norm": 0.9671252369880676, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 32860 + }, + { + "epoch": 2.3605026929982045, + "grad_norm": 0.8806724548339844, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 32870 + }, + { + "epoch": 2.361220825852783, + "grad_norm": 0.9378283619880676, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 32880 + }, + { + "epoch": 2.361938958707361, + "grad_norm": 0.8638162612915039, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 32890 + }, + { + "epoch": 2.362657091561939, + "grad_norm": 0.7321885228157043, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 32900 + }, + { + "epoch": 2.363375224416517, + "grad_norm": 0.8445415496826172, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 32910 + }, + { + "epoch": 2.364093357271095, + "grad_norm": 0.915715754032135, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 32920 + }, + { + "epoch": 2.3648114901256734, + "grad_norm": 0.8674854040145874, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 32930 + }, + { + "epoch": 2.3655296229802514, + "grad_norm": 0.7577189207077026, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 32940 + }, + { + "epoch": 2.3662477558348294, + "grad_norm": 0.8649988174438477, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 32950 + }, + { + "epoch": 2.3669658886894074, + "grad_norm": 0.9760734438896179, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 32960 + }, + { + "epoch": 2.367684021543986, + "grad_norm": 0.8909491300582886, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 32970 + }, + { + "epoch": 2.368402154398564, + "grad_norm": 0.6970168948173523, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 32980 + }, + { + "epoch": 2.369120287253142, + "grad_norm": 0.8208426237106323, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 32990 + }, + { + "epoch": 2.36983842010772, + "grad_norm": 0.8477405309677124, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 33000 + }, + { + "epoch": 2.370556552962298, + "grad_norm": 0.7771625518798828, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 33010 + }, + { + "epoch": 2.3712746858168763, + "grad_norm": 0.7811821103096008, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33020 + }, + { + "epoch": 2.3719928186714543, + "grad_norm": 0.6280415654182434, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33030 + }, + { + "epoch": 2.3727109515260323, + "grad_norm": 0.8733929395675659, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 33040 + }, + { + "epoch": 2.3734290843806103, + "grad_norm": 0.6169558167457581, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33050 + }, + { + "epoch": 2.3741472172351887, + "grad_norm": 0.7414724826812744, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33060 + }, + { + "epoch": 2.3748653500897667, + "grad_norm": 0.7484683990478516, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 33070 + }, + { + "epoch": 2.3755834829443447, + "grad_norm": 0.8495098948478699, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 33080 + }, + { + "epoch": 2.3763016157989227, + "grad_norm": 0.9057353734970093, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 33090 + }, + { + "epoch": 2.3770197486535007, + "grad_norm": 0.8028274178504944, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 33100 + }, + { + "epoch": 2.377737881508079, + "grad_norm": 1.2398128509521484, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 33110 + }, + { + "epoch": 2.378456014362657, + "grad_norm": 0.7894110679626465, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 33120 + }, + { + "epoch": 2.379174147217235, + "grad_norm": 0.8530096411705017, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 33130 + }, + { + "epoch": 2.379892280071813, + "grad_norm": 0.892613410949707, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 33140 + }, + { + "epoch": 2.380610412926391, + "grad_norm": 0.868606448173523, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 33150 + }, + { + "epoch": 2.3813285457809696, + "grad_norm": 0.6801115870475769, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 33160 + }, + { + "epoch": 2.3820466786355476, + "grad_norm": 0.9517148733139038, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 33170 + }, + { + "epoch": 2.3827648114901256, + "grad_norm": 0.8986499309539795, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 33180 + }, + { + "epoch": 2.3834829443447036, + "grad_norm": 0.8467642068862915, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33190 + }, + { + "epoch": 2.3842010771992816, + "grad_norm": 0.8400940299034119, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 33200 + }, + { + "epoch": 2.38491921005386, + "grad_norm": 0.86443030834198, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 33210 + }, + { + "epoch": 2.385637342908438, + "grad_norm": 0.8599014282226562, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 33220 + }, + { + "epoch": 2.386355475763016, + "grad_norm": 0.868735134601593, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 33230 + }, + { + "epoch": 2.387073608617594, + "grad_norm": 0.941734790802002, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 33240 + }, + { + "epoch": 2.3877917414721725, + "grad_norm": 0.9342881441116333, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 33250 + }, + { + "epoch": 2.3885098743267505, + "grad_norm": 1.012920618057251, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 33260 + }, + { + "epoch": 2.3892280071813286, + "grad_norm": 0.6949151754379272, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 33270 + }, + { + "epoch": 2.3899461400359066, + "grad_norm": 0.8283912539482117, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 33280 + }, + { + "epoch": 2.3906642728904846, + "grad_norm": 0.807273805141449, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 33290 + }, + { + "epoch": 2.391382405745063, + "grad_norm": 0.8109124302864075, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 33300 + }, + { + "epoch": 2.392100538599641, + "grad_norm": 0.7477563619613647, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 33310 + }, + { + "epoch": 2.392818671454219, + "grad_norm": 0.6961637735366821, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 33320 + }, + { + "epoch": 2.393536804308797, + "grad_norm": 0.9424173831939697, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 33330 + }, + { + "epoch": 2.3942549371633755, + "grad_norm": 0.8289623856544495, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 33340 + }, + { + "epoch": 2.3949730700179535, + "grad_norm": 0.8106551170349121, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 33350 + }, + { + "epoch": 2.3956912028725315, + "grad_norm": 0.8800507187843323, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33360 + }, + { + "epoch": 2.3964093357271095, + "grad_norm": 0.7662274241447449, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 33370 + }, + { + "epoch": 2.3971274685816875, + "grad_norm": 0.889204740524292, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 33380 + }, + { + "epoch": 2.3978456014362655, + "grad_norm": 0.7991349697113037, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 33390 + }, + { + "epoch": 2.398563734290844, + "grad_norm": 0.8210278749465942, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 33400 + }, + { + "epoch": 2.399281867145422, + "grad_norm": 0.91801917552948, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 33410 + }, + { + "epoch": 2.4, + "grad_norm": 0.8086220622062683, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 33420 + }, + { + "epoch": 2.400718132854578, + "grad_norm": 0.901613175868988, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 33430 + }, + { + "epoch": 2.4014362657091564, + "grad_norm": 0.9865965247154236, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 33440 + }, + { + "epoch": 2.4021543985637344, + "grad_norm": 0.8160675168037415, + "learning_rate": 0.0002, + "loss": 0.7543, + "step": 33450 + }, + { + "epoch": 2.4028725314183124, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33460 + }, + { + "epoch": 2.4035906642728904, + "grad_norm": 0.8490013480186462, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 33470 + }, + { + "epoch": 2.4043087971274684, + "grad_norm": 0.6947163939476013, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 33480 + }, + { + "epoch": 2.405026929982047, + "grad_norm": 0.7984827756881714, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 33490 + }, + { + "epoch": 2.405745062836625, + "grad_norm": 0.7826083302497864, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 33500 + }, + { + "epoch": 2.406463195691203, + "grad_norm": 0.8213959336280823, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 33510 + }, + { + "epoch": 2.407181328545781, + "grad_norm": 0.8790069818496704, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 33520 + }, + { + "epoch": 2.4078994614003593, + "grad_norm": 0.9093378782272339, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 33530 + }, + { + "epoch": 2.4086175942549373, + "grad_norm": 0.8085389137268066, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 33540 + }, + { + "epoch": 2.4093357271095153, + "grad_norm": 0.7952343225479126, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 33550 + }, + { + "epoch": 2.4100538599640933, + "grad_norm": 0.9576563835144043, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 33560 + }, + { + "epoch": 2.4107719928186713, + "grad_norm": 0.7722929120063782, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 33570 + }, + { + "epoch": 2.4114901256732497, + "grad_norm": 0.8634604215621948, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 33580 + }, + { + "epoch": 2.4122082585278277, + "grad_norm": 0.7805271148681641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 33590 + }, + { + "epoch": 2.4129263913824057, + "grad_norm": 0.8274481296539307, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 33600 + }, + { + "epoch": 2.4136445242369837, + "grad_norm": 0.9265141487121582, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 33610 + }, + { + "epoch": 2.414362657091562, + "grad_norm": 0.7497374415397644, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 33620 + }, + { + "epoch": 2.41508078994614, + "grad_norm": 0.7048972249031067, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 33630 + }, + { + "epoch": 2.415798922800718, + "grad_norm": 0.8449550271034241, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 33640 + }, + { + "epoch": 2.416517055655296, + "grad_norm": 0.7581984400749207, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 33650 + }, + { + "epoch": 2.417235188509874, + "grad_norm": 0.7744191288948059, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 33660 + }, + { + "epoch": 2.417953321364452, + "grad_norm": 0.6736614108085632, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 33670 + }, + { + "epoch": 2.4186714542190306, + "grad_norm": 0.985431432723999, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 33680 + }, + { + "epoch": 2.4193895870736086, + "grad_norm": 0.8027978539466858, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 33690 + }, + { + "epoch": 2.4201077199281866, + "grad_norm": 0.6809377074241638, + "learning_rate": 0.0002, + "loss": 0.7107, + "step": 33700 + }, + { + "epoch": 2.4208258527827646, + "grad_norm": 0.8305349946022034, + "learning_rate": 0.0002, + "loss": 0.7332, + "step": 33710 + }, + { + "epoch": 2.421543985637343, + "grad_norm": 0.7632496356964111, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 33720 + }, + { + "epoch": 2.422262118491921, + "grad_norm": 0.7241050601005554, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 33730 + }, + { + "epoch": 2.422980251346499, + "grad_norm": 0.6729857325553894, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 33740 + }, + { + "epoch": 2.423698384201077, + "grad_norm": 0.7741881012916565, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 33750 + }, + { + "epoch": 2.424416517055655, + "grad_norm": 0.7844415903091431, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 33760 + }, + { + "epoch": 2.4251346499102335, + "grad_norm": 0.7960098385810852, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 33770 + }, + { + "epoch": 2.4258527827648115, + "grad_norm": 0.8267978429794312, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 33780 + }, + { + "epoch": 2.4265709156193895, + "grad_norm": 0.7498974204063416, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 33790 + }, + { + "epoch": 2.4272890484739675, + "grad_norm": 0.8357859253883362, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 33800 + }, + { + "epoch": 2.428007181328546, + "grad_norm": 0.8056104779243469, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 33810 + }, + { + "epoch": 2.428725314183124, + "grad_norm": 0.806897759437561, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 33820 + }, + { + "epoch": 2.429443447037702, + "grad_norm": 0.7770048975944519, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 33830 + }, + { + "epoch": 2.43016157989228, + "grad_norm": 0.8311458230018616, + "learning_rate": 0.0002, + "loss": 0.7096, + "step": 33840 + }, + { + "epoch": 2.430879712746858, + "grad_norm": 0.9201730489730835, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 33850 + }, + { + "epoch": 2.4315978456014364, + "grad_norm": 0.83509761095047, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 33860 + }, + { + "epoch": 2.4323159784560144, + "grad_norm": 0.7680139541625977, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 33870 + }, + { + "epoch": 2.4330341113105924, + "grad_norm": 0.8956670165061951, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 33880 + }, + { + "epoch": 2.4337522441651704, + "grad_norm": 0.717941164970398, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 33890 + }, + { + "epoch": 2.434470377019749, + "grad_norm": 0.777206540107727, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 33900 + }, + { + "epoch": 2.435188509874327, + "grad_norm": 0.90232914686203, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 33910 + }, + { + "epoch": 2.435906642728905, + "grad_norm": 1.0817158222198486, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 33920 + }, + { + "epoch": 2.436624775583483, + "grad_norm": 0.7890931367874146, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 33930 + }, + { + "epoch": 2.437342908438061, + "grad_norm": 0.9279449582099915, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 33940 + }, + { + "epoch": 2.438061041292639, + "grad_norm": 0.8313823342323303, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 33950 + }, + { + "epoch": 2.4387791741472173, + "grad_norm": 1.0510340929031372, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 33960 + }, + { + "epoch": 2.4394973070017953, + "grad_norm": 0.8002574443817139, + "learning_rate": 0.0002, + "loss": 0.7203, + "step": 33970 + }, + { + "epoch": 2.4402154398563733, + "grad_norm": 0.7822834253311157, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 33980 + }, + { + "epoch": 2.4409335727109513, + "grad_norm": 0.9050403237342834, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 33990 + }, + { + "epoch": 2.44165170556553, + "grad_norm": 0.7569652199745178, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 34000 + }, + { + "epoch": 2.442369838420108, + "grad_norm": 0.6609470844268799, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 34010 + }, + { + "epoch": 2.443087971274686, + "grad_norm": 0.8090947866439819, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34020 + }, + { + "epoch": 2.443806104129264, + "grad_norm": 0.647814929485321, + "learning_rate": 0.0002, + "loss": 0.6621, + "step": 34030 + }, + { + "epoch": 2.444524236983842, + "grad_norm": 0.9308601021766663, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 34040 + }, + { + "epoch": 2.4452423698384202, + "grad_norm": 0.8259239792823792, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34050 + }, + { + "epoch": 2.4459605026929983, + "grad_norm": 0.9410025477409363, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 34060 + }, + { + "epoch": 2.4466786355475763, + "grad_norm": 0.7446974515914917, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 34070 + }, + { + "epoch": 2.4473967684021543, + "grad_norm": 0.7093849182128906, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 34080 + }, + { + "epoch": 2.4481149012567327, + "grad_norm": 0.8726152181625366, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 34090 + }, + { + "epoch": 2.4488330341113107, + "grad_norm": 0.808300793170929, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 34100 + }, + { + "epoch": 2.4495511669658887, + "grad_norm": 0.6884859800338745, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 34110 + }, + { + "epoch": 2.4502692998204667, + "grad_norm": 0.7151864767074585, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 34120 + }, + { + "epoch": 2.4509874326750447, + "grad_norm": 0.9261866807937622, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 34130 + }, + { + "epoch": 2.451705565529623, + "grad_norm": 0.8069018125534058, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 34140 + }, + { + "epoch": 2.452423698384201, + "grad_norm": 0.8001297116279602, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 34150 + }, + { + "epoch": 2.453141831238779, + "grad_norm": 0.8547799587249756, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 34160 + }, + { + "epoch": 2.453859964093357, + "grad_norm": 0.6693823337554932, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 34170 + }, + { + "epoch": 2.4545780969479356, + "grad_norm": 0.6646198630332947, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34180 + }, + { + "epoch": 2.4552962298025136, + "grad_norm": 0.9330950975418091, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 34190 + }, + { + "epoch": 2.4560143626570916, + "grad_norm": 0.7738645672798157, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 34200 + }, + { + "epoch": 2.4567324955116696, + "grad_norm": 0.7929846048355103, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 34210 + }, + { + "epoch": 2.4574506283662476, + "grad_norm": 0.8936280012130737, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34220 + }, + { + "epoch": 2.4581687612208256, + "grad_norm": 0.9099360108375549, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 34230 + }, + { + "epoch": 2.458886894075404, + "grad_norm": 0.7941291928291321, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 34240 + }, + { + "epoch": 2.459605026929982, + "grad_norm": 0.7169737219810486, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 34250 + }, + { + "epoch": 2.46032315978456, + "grad_norm": 0.8994171023368835, + "learning_rate": 0.0002, + "loss": 0.7417, + "step": 34260 + }, + { + "epoch": 2.461041292639138, + "grad_norm": 0.8087331056594849, + "learning_rate": 0.0002, + "loss": 0.6807, + "step": 34270 + }, + { + "epoch": 2.4617594254937165, + "grad_norm": 0.935502827167511, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 34280 + }, + { + "epoch": 2.4624775583482945, + "grad_norm": 0.8957464694976807, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 34290 + }, + { + "epoch": 2.4631956912028725, + "grad_norm": 0.9017183780670166, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 34300 + }, + { + "epoch": 2.4639138240574505, + "grad_norm": 0.7778640389442444, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34310 + }, + { + "epoch": 2.4646319569120285, + "grad_norm": 0.8870323896408081, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 34320 + }, + { + "epoch": 2.465350089766607, + "grad_norm": 0.7660176753997803, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 34330 + }, + { + "epoch": 2.466068222621185, + "grad_norm": 0.8442226648330688, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 34340 + }, + { + "epoch": 2.466786355475763, + "grad_norm": 0.7522561550140381, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 34350 + }, + { + "epoch": 2.467504488330341, + "grad_norm": 0.9355213046073914, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 34360 + }, + { + "epoch": 2.4682226211849194, + "grad_norm": 0.8487382531166077, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 34370 + }, + { + "epoch": 2.4689407540394974, + "grad_norm": 0.7869813442230225, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 34380 + }, + { + "epoch": 2.4696588868940754, + "grad_norm": 0.7562848329544067, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 34390 + }, + { + "epoch": 2.4703770197486534, + "grad_norm": 0.740829586982727, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 34400 + }, + { + "epoch": 2.4710951526032314, + "grad_norm": 1.0862116813659668, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 2.47181328545781, + "grad_norm": 0.9633645415306091, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 34420 + }, + { + "epoch": 2.472531418312388, + "grad_norm": 0.8467186093330383, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 34430 + }, + { + "epoch": 2.473249551166966, + "grad_norm": 0.9972147941589355, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 34440 + }, + { + "epoch": 2.473967684021544, + "grad_norm": 0.8086632490158081, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 34450 + }, + { + "epoch": 2.4746858168761223, + "grad_norm": 0.9043704271316528, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 34460 + }, + { + "epoch": 2.4754039497307003, + "grad_norm": 0.8275330662727356, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34470 + }, + { + "epoch": 2.4761220825852783, + "grad_norm": 0.8142464756965637, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 34480 + }, + { + "epoch": 2.4768402154398563, + "grad_norm": 0.7116754651069641, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 34490 + }, + { + "epoch": 2.4775583482944343, + "grad_norm": 0.8742281198501587, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 34500 + }, + { + "epoch": 2.4782764811490123, + "grad_norm": 0.7545657157897949, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 34510 + }, + { + "epoch": 2.478994614003591, + "grad_norm": 0.7586482167243958, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 34520 + }, + { + "epoch": 2.479712746858169, + "grad_norm": 0.9212547540664673, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 34530 + }, + { + "epoch": 2.480430879712747, + "grad_norm": 0.9391530752182007, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 34540 + }, + { + "epoch": 2.481149012567325, + "grad_norm": 1.119698166847229, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 34550 + }, + { + "epoch": 2.4818671454219032, + "grad_norm": 0.8499019145965576, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 34560 + }, + { + "epoch": 2.4825852782764812, + "grad_norm": 0.7629778385162354, + "learning_rate": 0.0002, + "loss": 0.7043, + "step": 34570 + }, + { + "epoch": 2.4833034111310592, + "grad_norm": 0.7667021155357361, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 34580 + }, + { + "epoch": 2.4840215439856372, + "grad_norm": 0.6711493730545044, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 34590 + }, + { + "epoch": 2.4847396768402152, + "grad_norm": 0.7354223728179932, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 34600 + }, + { + "epoch": 2.4854578096947937, + "grad_norm": 0.875295102596283, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 34610 + }, + { + "epoch": 2.4861759425493717, + "grad_norm": 0.7341493964195251, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 34620 + }, + { + "epoch": 2.4868940754039497, + "grad_norm": 0.9049216508865356, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 34630 + }, + { + "epoch": 2.4876122082585277, + "grad_norm": 0.7214788198471069, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 34640 + }, + { + "epoch": 2.488330341113106, + "grad_norm": 0.7514070868492126, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 34650 + }, + { + "epoch": 2.489048473967684, + "grad_norm": 0.6929763555526733, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 34660 + }, + { + "epoch": 2.489766606822262, + "grad_norm": 1.11346435546875, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 34670 + }, + { + "epoch": 2.49048473967684, + "grad_norm": 0.9285556674003601, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 34680 + }, + { + "epoch": 2.491202872531418, + "grad_norm": 0.7699695825576782, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 34690 + }, + { + "epoch": 2.4919210053859966, + "grad_norm": 0.872349739074707, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 34700 + }, + { + "epoch": 2.4926391382405746, + "grad_norm": 0.8692147135734558, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 34710 + }, + { + "epoch": 2.4933572710951526, + "grad_norm": 0.799740195274353, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 34720 + }, + { + "epoch": 2.4940754039497306, + "grad_norm": 0.7320986986160278, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 34730 + }, + { + "epoch": 2.494793536804309, + "grad_norm": 0.8233383893966675, + "learning_rate": 0.0002, + "loss": 0.7138, + "step": 34740 + }, + { + "epoch": 2.495511669658887, + "grad_norm": 0.9605086445808411, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 34750 + }, + { + "epoch": 2.496229802513465, + "grad_norm": 0.8597773909568787, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 34760 + }, + { + "epoch": 2.496947935368043, + "grad_norm": 0.7459201812744141, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 34770 + }, + { + "epoch": 2.497666068222621, + "grad_norm": 0.778457522392273, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 34780 + }, + { + "epoch": 2.498384201077199, + "grad_norm": 0.8591375946998596, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 34790 + }, + { + "epoch": 2.4991023339317775, + "grad_norm": 0.9689867496490479, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 34800 + }, + { + "epoch": 2.4998204667863555, + "grad_norm": 0.7430615425109863, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 34810 + }, + { + "epoch": 2.5005385996409335, + "grad_norm": 0.8545114994049072, + "learning_rate": 0.0002, + "loss": 0.7207, + "step": 34820 + }, + { + "epoch": 2.5012567324955115, + "grad_norm": 0.7115356922149658, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 34830 + }, + { + "epoch": 2.50197486535009, + "grad_norm": 0.7616795301437378, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 34840 + }, + { + "epoch": 2.502692998204668, + "grad_norm": 0.8097891211509705, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 34850 + }, + { + "epoch": 2.503411131059246, + "grad_norm": 0.7397396564483643, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 34860 + }, + { + "epoch": 2.504129263913824, + "grad_norm": 0.7531594038009644, + "learning_rate": 0.0002, + "loss": 0.7213, + "step": 34870 + }, + { + "epoch": 2.504847396768402, + "grad_norm": 0.8050091862678528, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 34880 + }, + { + "epoch": 2.5055655296229804, + "grad_norm": 0.7550507187843323, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 34890 + }, + { + "epoch": 2.5062836624775584, + "grad_norm": 1.0131759643554688, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 34900 + }, + { + "epoch": 2.5070017953321364, + "grad_norm": 0.9275356531143188, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 34910 + }, + { + "epoch": 2.5077199281867144, + "grad_norm": 0.6655791997909546, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 34920 + }, + { + "epoch": 2.508438061041293, + "grad_norm": 0.79361891746521, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 34930 + }, + { + "epoch": 2.509156193895871, + "grad_norm": 0.8223658800125122, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 34940 + }, + { + "epoch": 2.509874326750449, + "grad_norm": 1.0070416927337646, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 34950 + }, + { + "epoch": 2.510592459605027, + "grad_norm": 0.8408986330032349, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 34960 + }, + { + "epoch": 2.511310592459605, + "grad_norm": 0.8178259134292603, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 34970 + }, + { + "epoch": 2.512028725314183, + "grad_norm": 0.747876763343811, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 34980 + }, + { + "epoch": 2.5127468581687613, + "grad_norm": 0.8551825881004333, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 34990 + }, + { + "epoch": 2.5134649910233393, + "grad_norm": 0.8366564512252808, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 35000 + }, + { + "epoch": 2.5141831238779173, + "grad_norm": 0.8491294384002686, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 35010 + }, + { + "epoch": 2.5149012567324958, + "grad_norm": 0.8854562640190125, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 35020 + }, + { + "epoch": 2.5156193895870738, + "grad_norm": 0.8652133345603943, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 35030 + }, + { + "epoch": 2.5163375224416518, + "grad_norm": 0.8734033107757568, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 35040 + }, + { + "epoch": 2.5170556552962298, + "grad_norm": 0.8613446950912476, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 35050 + }, + { + "epoch": 2.5177737881508078, + "grad_norm": 0.762395441532135, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 35060 + }, + { + "epoch": 2.5184919210053858, + "grad_norm": 0.806220293045044, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 35070 + }, + { + "epoch": 2.519210053859964, + "grad_norm": 0.7781713008880615, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 35080 + }, + { + "epoch": 2.519928186714542, + "grad_norm": 0.8639848828315735, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 35090 + }, + { + "epoch": 2.52064631956912, + "grad_norm": 0.7331740260124207, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 35100 + }, + { + "epoch": 2.521364452423698, + "grad_norm": 0.8148137927055359, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 35110 + }, + { + "epoch": 2.5220825852782767, + "grad_norm": 0.6939297914505005, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 35120 + }, + { + "epoch": 2.5228007181328547, + "grad_norm": 0.8151076436042786, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 35130 + }, + { + "epoch": 2.5235188509874327, + "grad_norm": 0.9193238019943237, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 35140 + }, + { + "epoch": 2.5242369838420107, + "grad_norm": 0.8230985403060913, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 35150 + }, + { + "epoch": 2.5249551166965887, + "grad_norm": 0.865492582321167, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 35160 + }, + { + "epoch": 2.525673249551167, + "grad_norm": 0.7673570513725281, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35170 + }, + { + "epoch": 2.526391382405745, + "grad_norm": 0.8296313881874084, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 35180 + }, + { + "epoch": 2.527109515260323, + "grad_norm": 0.6531317234039307, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 35190 + }, + { + "epoch": 2.527827648114901, + "grad_norm": 0.9865642189979553, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 35200 + }, + { + "epoch": 2.5285457809694796, + "grad_norm": 0.8001098036766052, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 35210 + }, + { + "epoch": 2.5292639138240576, + "grad_norm": 0.7523218393325806, + "learning_rate": 0.0002, + "loss": 0.6737, + "step": 35220 + }, + { + "epoch": 2.5299820466786356, + "grad_norm": 1.061640977859497, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 35230 + }, + { + "epoch": 2.5307001795332136, + "grad_norm": 0.9668078422546387, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35240 + }, + { + "epoch": 2.5314183123877916, + "grad_norm": 0.9554983973503113, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 35250 + }, + { + "epoch": 2.5321364452423696, + "grad_norm": 0.8343066573143005, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 35260 + }, + { + "epoch": 2.532854578096948, + "grad_norm": 0.8408095240592957, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 35270 + }, + { + "epoch": 2.533572710951526, + "grad_norm": 0.8593984842300415, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 35280 + }, + { + "epoch": 2.534290843806104, + "grad_norm": 0.7593855261802673, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 35290 + }, + { + "epoch": 2.5350089766606825, + "grad_norm": 0.9179701209068298, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 35300 + }, + { + "epoch": 2.5357271095152605, + "grad_norm": 0.749022901058197, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 35310 + }, + { + "epoch": 2.5364452423698385, + "grad_norm": 0.7172152400016785, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 35320 + }, + { + "epoch": 2.5371633752244165, + "grad_norm": 0.8228873610496521, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 35330 + }, + { + "epoch": 2.5378815080789945, + "grad_norm": 0.9663547277450562, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 35340 + }, + { + "epoch": 2.5385996409335725, + "grad_norm": 0.8446536660194397, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 35350 + }, + { + "epoch": 2.539317773788151, + "grad_norm": 0.9751029014587402, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 35360 + }, + { + "epoch": 2.540035906642729, + "grad_norm": 0.7460315823554993, + "learning_rate": 0.0002, + "loss": 0.7001, + "step": 35370 + }, + { + "epoch": 2.540754039497307, + "grad_norm": 0.8269246816635132, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 35380 + }, + { + "epoch": 2.541472172351885, + "grad_norm": 0.7200030088424683, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 35390 + }, + { + "epoch": 2.5421903052064634, + "grad_norm": 0.9586671590805054, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 2.5429084380610414, + "grad_norm": 0.7872378826141357, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 35410 + }, + { + "epoch": 2.5436265709156194, + "grad_norm": 0.8257358074188232, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 35420 + }, + { + "epoch": 2.5443447037701974, + "grad_norm": 0.6924505829811096, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 35430 + }, + { + "epoch": 2.5450628366247754, + "grad_norm": 1.1171481609344482, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 35440 + }, + { + "epoch": 2.545780969479354, + "grad_norm": 0.9635605216026306, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 35450 + }, + { + "epoch": 2.546499102333932, + "grad_norm": 0.9760567545890808, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 35460 + }, + { + "epoch": 2.54721723518851, + "grad_norm": 0.8523460030555725, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 35470 + }, + { + "epoch": 2.547935368043088, + "grad_norm": 0.9316970109939575, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 35480 + }, + { + "epoch": 2.5486535008976663, + "grad_norm": 0.7401485443115234, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 35490 + }, + { + "epoch": 2.5493716337522443, + "grad_norm": 1.0627065896987915, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 35500 + }, + { + "epoch": 2.5500897666068223, + "grad_norm": 0.7463156580924988, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 35510 + }, + { + "epoch": 2.5508078994614003, + "grad_norm": 0.9935570359230042, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 35520 + }, + { + "epoch": 2.5515260323159783, + "grad_norm": 0.8824051022529602, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 35530 + }, + { + "epoch": 2.5522441651705563, + "grad_norm": 0.8018375635147095, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 35540 + }, + { + "epoch": 2.5529622980251347, + "grad_norm": 0.7523182034492493, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 35550 + }, + { + "epoch": 2.5536804308797127, + "grad_norm": 0.6771712303161621, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 35560 + }, + { + "epoch": 2.5543985637342908, + "grad_norm": 0.7903336882591248, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 35570 + }, + { + "epoch": 2.555116696588869, + "grad_norm": 0.7973808646202087, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 35580 + }, + { + "epoch": 2.555834829443447, + "grad_norm": 0.9082772731781006, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 35590 + }, + { + "epoch": 2.556552962298025, + "grad_norm": 0.779671311378479, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 35600 + }, + { + "epoch": 2.557271095152603, + "grad_norm": 0.710058331489563, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 35610 + }, + { + "epoch": 2.557989228007181, + "grad_norm": 0.8217873573303223, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 35620 + }, + { + "epoch": 2.558707360861759, + "grad_norm": 0.8017855286598206, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 35630 + }, + { + "epoch": 2.5594254937163377, + "grad_norm": 0.6671402454376221, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 35640 + }, + { + "epoch": 2.5601436265709157, + "grad_norm": 0.9357045292854309, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 35650 + }, + { + "epoch": 2.5608617594254937, + "grad_norm": 0.7676312327384949, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35660 + }, + { + "epoch": 2.5615798922800717, + "grad_norm": 0.7602545619010925, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 35670 + }, + { + "epoch": 2.56229802513465, + "grad_norm": 0.8112275004386902, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35680 + }, + { + "epoch": 2.563016157989228, + "grad_norm": 0.73296719789505, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 35690 + }, + { + "epoch": 2.563734290843806, + "grad_norm": 0.9007818102836609, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 35700 + }, + { + "epoch": 2.564452423698384, + "grad_norm": 0.7526060938835144, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 35710 + }, + { + "epoch": 2.565170556552962, + "grad_norm": 0.813875675201416, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 35720 + }, + { + "epoch": 2.5658886894075406, + "grad_norm": 0.7767695784568787, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 35730 + }, + { + "epoch": 2.5666068222621186, + "grad_norm": 0.7840573787689209, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 35740 + }, + { + "epoch": 2.5673249551166966, + "grad_norm": 0.7400487661361694, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 35750 + }, + { + "epoch": 2.5680430879712746, + "grad_norm": 0.7424315810203552, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 35760 + }, + { + "epoch": 2.568761220825853, + "grad_norm": 0.7812185883522034, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 35770 + }, + { + "epoch": 2.569479353680431, + "grad_norm": 0.8397669196128845, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 35780 + }, + { + "epoch": 2.570197486535009, + "grad_norm": 0.7543849945068359, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 35790 + }, + { + "epoch": 2.570915619389587, + "grad_norm": 0.903634786605835, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 35800 + }, + { + "epoch": 2.571633752244165, + "grad_norm": 0.853335976600647, + "learning_rate": 0.0002, + "loss": 0.6884, + "step": 35810 + }, + { + "epoch": 2.572351885098743, + "grad_norm": 0.8441029787063599, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 35820 + }, + { + "epoch": 2.5730700179533215, + "grad_norm": 0.9072228670120239, + "learning_rate": 0.0002, + "loss": 0.6874, + "step": 35830 + }, + { + "epoch": 2.5737881508078995, + "grad_norm": 0.7720168828964233, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 35840 + }, + { + "epoch": 2.5745062836624775, + "grad_norm": 0.8719366788864136, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 35850 + }, + { + "epoch": 2.575224416517056, + "grad_norm": 0.766209065914154, + "learning_rate": 0.0002, + "loss": 0.7842, + "step": 35860 + }, + { + "epoch": 2.575942549371634, + "grad_norm": 0.7814549207687378, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 35870 + }, + { + "epoch": 2.576660682226212, + "grad_norm": 0.8068482875823975, + "learning_rate": 0.0002, + "loss": 0.7309, + "step": 35880 + }, + { + "epoch": 2.57737881508079, + "grad_norm": 0.8321225643157959, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 35890 + }, + { + "epoch": 2.578096947935368, + "grad_norm": 0.9787611961364746, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 35900 + }, + { + "epoch": 2.578815080789946, + "grad_norm": 0.6955108642578125, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 35910 + }, + { + "epoch": 2.5795332136445244, + "grad_norm": 0.8309195637702942, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 35920 + }, + { + "epoch": 2.5802513464991024, + "grad_norm": 0.9309390783309937, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 35930 + }, + { + "epoch": 2.5809694793536804, + "grad_norm": 0.903537392616272, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 35940 + }, + { + "epoch": 2.5816876122082584, + "grad_norm": 0.9530633091926575, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 35950 + }, + { + "epoch": 2.582405745062837, + "grad_norm": 1.0140212774276733, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 35960 + }, + { + "epoch": 2.583123877917415, + "grad_norm": 0.8224637508392334, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 35970 + }, + { + "epoch": 2.583842010771993, + "grad_norm": 0.7952998280525208, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 35980 + }, + { + "epoch": 2.584560143626571, + "grad_norm": 0.6057878136634827, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 35990 + }, + { + "epoch": 2.585278276481149, + "grad_norm": 0.9172457456588745, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 36000 + }, + { + "epoch": 2.5859964093357273, + "grad_norm": 1.0061585903167725, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36010 + }, + { + "epoch": 2.5867145421903053, + "grad_norm": 0.8555058240890503, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 36020 + }, + { + "epoch": 2.5874326750448833, + "grad_norm": 0.7732099890708923, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 36030 + }, + { + "epoch": 2.5881508078994613, + "grad_norm": 0.9026121497154236, + "learning_rate": 0.0002, + "loss": 0.7383, + "step": 36040 + }, + { + "epoch": 2.5888689407540397, + "grad_norm": 0.7477090954780579, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 36050 + }, + { + "epoch": 2.5895870736086177, + "grad_norm": 0.8835780024528503, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 36060 + }, + { + "epoch": 2.5903052064631957, + "grad_norm": 0.7555899024009705, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 36070 + }, + { + "epoch": 2.5910233393177737, + "grad_norm": 0.7983574867248535, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 36080 + }, + { + "epoch": 2.5917414721723517, + "grad_norm": 0.9261698722839355, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 36090 + }, + { + "epoch": 2.5924596050269297, + "grad_norm": 0.6834031343460083, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 36100 + }, + { + "epoch": 2.593177737881508, + "grad_norm": 0.9528526067733765, + "learning_rate": 0.0002, + "loss": 0.702, + "step": 36110 + }, + { + "epoch": 2.593895870736086, + "grad_norm": 0.7469993233680725, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 36120 + }, + { + "epoch": 2.594614003590664, + "grad_norm": 0.6750355362892151, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 36130 + }, + { + "epoch": 2.5953321364452426, + "grad_norm": 0.8591015338897705, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 36140 + }, + { + "epoch": 2.5960502692998206, + "grad_norm": 0.7359472513198853, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 36150 + }, + { + "epoch": 2.5967684021543986, + "grad_norm": 0.8450608253479004, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36160 + }, + { + "epoch": 2.5974865350089766, + "grad_norm": 0.9069468975067139, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36170 + }, + { + "epoch": 2.5982046678635546, + "grad_norm": 0.9261118173599243, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 36180 + }, + { + "epoch": 2.5989228007181326, + "grad_norm": 0.7164715528488159, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 36190 + }, + { + "epoch": 2.599640933572711, + "grad_norm": 0.8809511661529541, + "learning_rate": 0.0002, + "loss": 0.7044, + "step": 36200 + }, + { + "epoch": 2.600359066427289, + "grad_norm": 0.9872701168060303, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 36210 + }, + { + "epoch": 2.601077199281867, + "grad_norm": 0.7544043064117432, + "learning_rate": 0.0002, + "loss": 0.689, + "step": 36220 + }, + { + "epoch": 2.601795332136445, + "grad_norm": 0.9890767335891724, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 36230 + }, + { + "epoch": 2.6025134649910235, + "grad_norm": 0.907865047454834, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 36240 + }, + { + "epoch": 2.6032315978456015, + "grad_norm": 0.7724096179008484, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 36250 + }, + { + "epoch": 2.6039497307001795, + "grad_norm": 0.7996655106544495, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 36260 + }, + { + "epoch": 2.6046678635547575, + "grad_norm": 0.7184412479400635, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 36270 + }, + { + "epoch": 2.6053859964093355, + "grad_norm": 0.7781601548194885, + "learning_rate": 0.0002, + "loss": 0.7133, + "step": 36280 + }, + { + "epoch": 2.6061041292639135, + "grad_norm": 0.8972102403640747, + "learning_rate": 0.0002, + "loss": 0.6975, + "step": 36290 + }, + { + "epoch": 2.606822262118492, + "grad_norm": 0.6831884980201721, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 36300 + }, + { + "epoch": 2.60754039497307, + "grad_norm": 0.9049789905548096, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 36310 + }, + { + "epoch": 2.608258527827648, + "grad_norm": 0.8062970042228699, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 36320 + }, + { + "epoch": 2.6089766606822264, + "grad_norm": 0.94797682762146, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 36330 + }, + { + "epoch": 2.6096947935368044, + "grad_norm": 0.7907559275627136, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 36340 + }, + { + "epoch": 2.6104129263913824, + "grad_norm": 0.6720156073570251, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 36350 + }, + { + "epoch": 2.6111310592459605, + "grad_norm": 0.729228138923645, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 36360 + }, + { + "epoch": 2.6118491921005385, + "grad_norm": 0.9072836637496948, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 36370 + }, + { + "epoch": 2.6125673249551165, + "grad_norm": 0.8022173643112183, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36380 + }, + { + "epoch": 2.613285457809695, + "grad_norm": 0.7475612163543701, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 36390 + }, + { + "epoch": 2.614003590664273, + "grad_norm": 0.7976534366607666, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 36400 + }, + { + "epoch": 2.614721723518851, + "grad_norm": 0.7118260860443115, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36410 + }, + { + "epoch": 2.6154398563734294, + "grad_norm": 0.666500985622406, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 36420 + }, + { + "epoch": 2.6161579892280074, + "grad_norm": 0.8776089549064636, + "learning_rate": 0.0002, + "loss": 0.719, + "step": 36430 + }, + { + "epoch": 2.6168761220825854, + "grad_norm": 0.9375919699668884, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 36440 + }, + { + "epoch": 2.6175942549371634, + "grad_norm": 0.8162244558334351, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 36450 + }, + { + "epoch": 2.6183123877917414, + "grad_norm": 0.8459304571151733, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 36460 + }, + { + "epoch": 2.6190305206463194, + "grad_norm": 0.7731037735939026, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 36470 + }, + { + "epoch": 2.619748653500898, + "grad_norm": 0.7857680320739746, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 36480 + }, + { + "epoch": 2.620466786355476, + "grad_norm": 0.8415161371231079, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 36490 + }, + { + "epoch": 2.621184919210054, + "grad_norm": 0.8103558421134949, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 36500 + }, + { + "epoch": 2.621903052064632, + "grad_norm": 0.7876150608062744, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 36510 + }, + { + "epoch": 2.6226211849192103, + "grad_norm": 0.7316484451293945, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 36520 + }, + { + "epoch": 2.6233393177737883, + "grad_norm": 0.7209784984588623, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 36530 + }, + { + "epoch": 2.6240574506283663, + "grad_norm": 0.8933016657829285, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 36540 + }, + { + "epoch": 2.6247755834829443, + "grad_norm": 0.8078171610832214, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 36550 + }, + { + "epoch": 2.6254937163375223, + "grad_norm": 0.9134724736213684, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 36560 + }, + { + "epoch": 2.6262118491921003, + "grad_norm": 0.8691368699073792, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 36570 + }, + { + "epoch": 2.6269299820466787, + "grad_norm": 0.706479012966156, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 36580 + }, + { + "epoch": 2.6276481149012567, + "grad_norm": 0.9333644509315491, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 36590 + }, + { + "epoch": 2.6283662477558347, + "grad_norm": 0.8156154155731201, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 36600 + }, + { + "epoch": 2.629084380610413, + "grad_norm": 0.812745213508606, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 36610 + }, + { + "epoch": 2.629802513464991, + "grad_norm": 0.8898148536682129, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 36620 + }, + { + "epoch": 2.630520646319569, + "grad_norm": 0.8083946108818054, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 36630 + }, + { + "epoch": 2.631238779174147, + "grad_norm": 0.7050122618675232, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 36640 + }, + { + "epoch": 2.631956912028725, + "grad_norm": 0.8155789971351624, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 36650 + }, + { + "epoch": 2.632675044883303, + "grad_norm": 0.9102175235748291, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 36660 + }, + { + "epoch": 2.6333931777378816, + "grad_norm": 0.6621248126029968, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 36670 + }, + { + "epoch": 2.6341113105924596, + "grad_norm": 0.7338519096374512, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 36680 + }, + { + "epoch": 2.6348294434470376, + "grad_norm": 0.7536506652832031, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 36690 + }, + { + "epoch": 2.635547576301616, + "grad_norm": 0.9357436299324036, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 36700 + }, + { + "epoch": 2.636265709156194, + "grad_norm": 0.7732111215591431, + "learning_rate": 0.0002, + "loss": 0.7729, + "step": 36710 + }, + { + "epoch": 2.636983842010772, + "grad_norm": 0.6863537430763245, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36720 + }, + { + "epoch": 2.63770197486535, + "grad_norm": 0.8014764785766602, + "learning_rate": 0.0002, + "loss": 0.7058, + "step": 36730 + }, + { + "epoch": 2.638420107719928, + "grad_norm": 0.8103911280632019, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 36740 + }, + { + "epoch": 2.639138240574506, + "grad_norm": 0.882652997970581, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 36750 + }, + { + "epoch": 2.6398563734290845, + "grad_norm": 0.8705278038978577, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 36760 + }, + { + "epoch": 2.6405745062836625, + "grad_norm": 0.80764240026474, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 36770 + }, + { + "epoch": 2.6412926391382405, + "grad_norm": 0.9668620824813843, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 36780 + }, + { + "epoch": 2.6420107719928185, + "grad_norm": 0.7477577328681946, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 36790 + }, + { + "epoch": 2.642728904847397, + "grad_norm": 0.8344516754150391, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 36800 + }, + { + "epoch": 2.643447037701975, + "grad_norm": 0.9520720839500427, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 36810 + }, + { + "epoch": 2.644165170556553, + "grad_norm": 0.5942372679710388, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 36820 + }, + { + "epoch": 2.644883303411131, + "grad_norm": 0.7411555051803589, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 36830 + }, + { + "epoch": 2.645601436265709, + "grad_norm": 0.6597771048545837, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 36840 + }, + { + "epoch": 2.646319569120287, + "grad_norm": 0.8636548519134521, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 36850 + }, + { + "epoch": 2.6470377019748654, + "grad_norm": 0.8557497262954712, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 36860 + }, + { + "epoch": 2.6477558348294434, + "grad_norm": 0.8535996675491333, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 36870 + }, + { + "epoch": 2.6484739676840214, + "grad_norm": 0.7996463775634766, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 36880 + }, + { + "epoch": 2.6491921005386, + "grad_norm": 0.6462067365646362, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 36890 + }, + { + "epoch": 2.649910233393178, + "grad_norm": 0.8849772214889526, + "learning_rate": 0.0002, + "loss": 0.6905, + "step": 36900 + }, + { + "epoch": 2.650628366247756, + "grad_norm": 0.999173641204834, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 36910 + }, + { + "epoch": 2.651346499102334, + "grad_norm": 0.7221724987030029, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 36920 + }, + { + "epoch": 2.652064631956912, + "grad_norm": 0.8122989535331726, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 36930 + }, + { + "epoch": 2.65278276481149, + "grad_norm": 0.724267840385437, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 36940 + }, + { + "epoch": 2.6535008976660683, + "grad_norm": 0.8250583410263062, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 36950 + }, + { + "epoch": 2.6542190305206463, + "grad_norm": 0.7623526453971863, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 36960 + }, + { + "epoch": 2.6549371633752243, + "grad_norm": 0.6474025845527649, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 36970 + }, + { + "epoch": 2.655655296229803, + "grad_norm": 0.9751694202423096, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 36980 + }, + { + "epoch": 2.656373429084381, + "grad_norm": 0.8338939547538757, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 36990 + }, + { + "epoch": 2.657091561938959, + "grad_norm": 0.8877421021461487, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 37000 + }, + { + "epoch": 2.657809694793537, + "grad_norm": 0.9590298533439636, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 37010 + }, + { + "epoch": 2.658527827648115, + "grad_norm": 0.8224121928215027, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 37020 + }, + { + "epoch": 2.659245960502693, + "grad_norm": 0.9871236681938171, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 37030 + }, + { + "epoch": 2.6599640933572712, + "grad_norm": 0.8729037046432495, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 37040 + }, + { + "epoch": 2.6606822262118492, + "grad_norm": 0.6279319524765015, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 37050 + }, + { + "epoch": 2.6614003590664272, + "grad_norm": 1.0278962850570679, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37060 + }, + { + "epoch": 2.6621184919210052, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 37070 + }, + { + "epoch": 2.6628366247755837, + "grad_norm": 0.7432018518447876, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 37080 + }, + { + "epoch": 2.6635547576301617, + "grad_norm": 0.9425008296966553, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 37090 + }, + { + "epoch": 2.6642728904847397, + "grad_norm": 0.7542579174041748, + "learning_rate": 0.0002, + "loss": 0.716, + "step": 37100 + }, + { + "epoch": 2.6649910233393177, + "grad_norm": 0.8469315767288208, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 37110 + }, + { + "epoch": 2.6657091561938957, + "grad_norm": 0.865777313709259, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 37120 + }, + { + "epoch": 2.6664272890484737, + "grad_norm": 0.7293250560760498, + "learning_rate": 0.0002, + "loss": 0.741, + "step": 37130 + }, + { + "epoch": 2.667145421903052, + "grad_norm": 0.7199395895004272, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 37140 + }, + { + "epoch": 2.66786355475763, + "grad_norm": 0.7801268100738525, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 37150 + }, + { + "epoch": 2.668581687612208, + "grad_norm": 0.8706921935081482, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 37160 + }, + { + "epoch": 2.6692998204667866, + "grad_norm": 0.7124722599983215, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 37170 + }, + { + "epoch": 2.6700179533213646, + "grad_norm": 0.8333015441894531, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 37180 + }, + { + "epoch": 2.6707360861759426, + "grad_norm": 0.8822736740112305, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 37190 + }, + { + "epoch": 2.6714542190305206, + "grad_norm": 0.8300906419754028, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 37200 + }, + { + "epoch": 2.6721723518850986, + "grad_norm": 0.887126088142395, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37210 + }, + { + "epoch": 2.6728904847396766, + "grad_norm": 0.7473671436309814, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 37220 + }, + { + "epoch": 2.673608617594255, + "grad_norm": 0.8121018409729004, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 37230 + }, + { + "epoch": 2.674326750448833, + "grad_norm": 0.7882586717605591, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 37240 + }, + { + "epoch": 2.675044883303411, + "grad_norm": 0.797060489654541, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 37250 + }, + { + "epoch": 2.6757630161579895, + "grad_norm": 0.9776935577392578, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 37260 + }, + { + "epoch": 2.6764811490125675, + "grad_norm": 0.9527283906936646, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 37270 + }, + { + "epoch": 2.6771992818671455, + "grad_norm": 0.7232038974761963, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 37280 + }, + { + "epoch": 2.6779174147217235, + "grad_norm": 0.8514575362205505, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 37290 + }, + { + "epoch": 2.6786355475763015, + "grad_norm": 0.8951214551925659, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 37300 + }, + { + "epoch": 2.6793536804308795, + "grad_norm": 0.7569643259048462, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 37310 + }, + { + "epoch": 2.680071813285458, + "grad_norm": 1.0522346496582031, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 37320 + }, + { + "epoch": 2.680789946140036, + "grad_norm": 0.8914180994033813, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 37330 + }, + { + "epoch": 2.681508078994614, + "grad_norm": 0.8251807689666748, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 37340 + }, + { + "epoch": 2.682226211849192, + "grad_norm": 0.8215394020080566, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 37350 + }, + { + "epoch": 2.6829443447037704, + "grad_norm": 0.8043696880340576, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 37360 + }, + { + "epoch": 2.6836624775583484, + "grad_norm": 0.767250657081604, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 37370 + }, + { + "epoch": 2.6843806104129264, + "grad_norm": 0.817740261554718, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 37380 + }, + { + "epoch": 2.6850987432675044, + "grad_norm": 0.7963255047798157, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 37390 + }, + { + "epoch": 2.6858168761220824, + "grad_norm": 0.839271605014801, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 37400 + }, + { + "epoch": 2.6865350089766604, + "grad_norm": 0.7882823348045349, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 37410 + }, + { + "epoch": 2.687253141831239, + "grad_norm": 0.8316412568092346, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 37420 + }, + { + "epoch": 2.687971274685817, + "grad_norm": 1.0044993162155151, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 37430 + }, + { + "epoch": 2.688689407540395, + "grad_norm": 0.8342832326889038, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 37440 + }, + { + "epoch": 2.6894075403949733, + "grad_norm": 0.6743215322494507, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 37450 + }, + { + "epoch": 2.6901256732495513, + "grad_norm": 0.6872923970222473, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 37460 + }, + { + "epoch": 2.6908438061041293, + "grad_norm": 0.7377792596817017, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 37470 + }, + { + "epoch": 2.6915619389587073, + "grad_norm": 0.7677304744720459, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 37480 + }, + { + "epoch": 2.6922800718132853, + "grad_norm": 0.9951061010360718, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 37490 + }, + { + "epoch": 2.6929982046678633, + "grad_norm": 0.7452111840248108, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 37500 + }, + { + "epoch": 2.6937163375224418, + "grad_norm": 0.9663393497467041, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 37510 + }, + { + "epoch": 2.6944344703770198, + "grad_norm": 0.7919635772705078, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 37520 + }, + { + "epoch": 2.6951526032315978, + "grad_norm": 0.9977981448173523, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 37530 + }, + { + "epoch": 2.695870736086176, + "grad_norm": 0.7279480695724487, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 37540 + }, + { + "epoch": 2.6965888689407542, + "grad_norm": 0.7218075394630432, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 37550 + }, + { + "epoch": 2.6973070017953322, + "grad_norm": 0.9041047096252441, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 37560 + }, + { + "epoch": 2.6980251346499102, + "grad_norm": 0.7689407467842102, + "learning_rate": 0.0002, + "loss": 0.6848, + "step": 37570 + }, + { + "epoch": 2.6987432675044882, + "grad_norm": 0.8184728622436523, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 37580 + }, + { + "epoch": 2.6994614003590662, + "grad_norm": 0.7536661624908447, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 37590 + }, + { + "epoch": 2.7001795332136447, + "grad_norm": 0.8371431231498718, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 37600 + }, + { + "epoch": 2.7008976660682227, + "grad_norm": 0.8562723994255066, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 37610 + }, + { + "epoch": 2.7016157989228007, + "grad_norm": 0.8227898478507996, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 37620 + }, + { + "epoch": 2.7023339317773787, + "grad_norm": 0.764792799949646, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 37630 + }, + { + "epoch": 2.703052064631957, + "grad_norm": 0.7782649993896484, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 37640 + }, + { + "epoch": 2.703770197486535, + "grad_norm": 0.7669944167137146, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 37650 + }, + { + "epoch": 2.704488330341113, + "grad_norm": 0.7945750951766968, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 37660 + }, + { + "epoch": 2.705206463195691, + "grad_norm": 0.6840786337852478, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 37670 + }, + { + "epoch": 2.705924596050269, + "grad_norm": 1.0565117597579956, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 37680 + }, + { + "epoch": 2.706642728904847, + "grad_norm": 0.7407042384147644, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 37690 + }, + { + "epoch": 2.7073608617594256, + "grad_norm": 0.7862113118171692, + "learning_rate": 0.0002, + "loss": 0.712, + "step": 37700 + }, + { + "epoch": 2.7080789946140036, + "grad_norm": 0.7487596273422241, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 37710 + }, + { + "epoch": 2.7087971274685816, + "grad_norm": 0.9416596293449402, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 37720 + }, + { + "epoch": 2.70951526032316, + "grad_norm": 0.8943207263946533, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 37730 + }, + { + "epoch": 2.710233393177738, + "grad_norm": 0.9263445138931274, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 37740 + }, + { + "epoch": 2.710951526032316, + "grad_norm": 0.6869737505912781, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 37750 + }, + { + "epoch": 2.711669658886894, + "grad_norm": 0.9186407923698425, + "learning_rate": 0.0002, + "loss": 0.724, + "step": 37760 + }, + { + "epoch": 2.712387791741472, + "grad_norm": 0.8379335999488831, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 37770 + }, + { + "epoch": 2.71310592459605, + "grad_norm": 0.7248736023902893, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 37780 + }, + { + "epoch": 2.7138240574506285, + "grad_norm": 0.8636229038238525, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 37790 + }, + { + "epoch": 2.7145421903052065, + "grad_norm": 0.7590767741203308, + "learning_rate": 0.0002, + "loss": 0.726, + "step": 37800 + }, + { + "epoch": 2.7152603231597845, + "grad_norm": 0.8946404457092285, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 37810 + }, + { + "epoch": 2.7159784560143625, + "grad_norm": 0.7822132706642151, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 37820 + }, + { + "epoch": 2.716696588868941, + "grad_norm": 0.7882820963859558, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 37830 + }, + { + "epoch": 2.717414721723519, + "grad_norm": 0.8025872707366943, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 37840 + }, + { + "epoch": 2.718132854578097, + "grad_norm": 0.8618839979171753, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 37850 + }, + { + "epoch": 2.718850987432675, + "grad_norm": 0.6975733637809753, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 37860 + }, + { + "epoch": 2.719569120287253, + "grad_norm": 0.7952182292938232, + "learning_rate": 0.0002, + "loss": 0.6858, + "step": 37870 + }, + { + "epoch": 2.7202872531418314, + "grad_norm": 0.7580680251121521, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 37880 + }, + { + "epoch": 2.7210053859964094, + "grad_norm": 0.9504257440567017, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 37890 + }, + { + "epoch": 2.7217235188509874, + "grad_norm": 0.856614351272583, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 37900 + }, + { + "epoch": 2.7224416517055654, + "grad_norm": 1.0092085599899292, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 37910 + }, + { + "epoch": 2.723159784560144, + "grad_norm": 0.9009839296340942, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 37920 + }, + { + "epoch": 2.723877917414722, + "grad_norm": 0.9247435331344604, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 37930 + }, + { + "epoch": 2.7245960502693, + "grad_norm": 1.0774317979812622, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 37940 + }, + { + "epoch": 2.725314183123878, + "grad_norm": 0.9104372262954712, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 37950 + }, + { + "epoch": 2.726032315978456, + "grad_norm": 0.7904245257377625, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 37960 + }, + { + "epoch": 2.726750448833034, + "grad_norm": 0.9555521607398987, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 37970 + }, + { + "epoch": 2.7274685816876123, + "grad_norm": 0.7769099473953247, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 37980 + }, + { + "epoch": 2.7281867145421903, + "grad_norm": 0.9202065467834473, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 37990 + }, + { + "epoch": 2.7289048473967683, + "grad_norm": 0.732510507106781, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 38000 + }, + { + "epoch": 2.7296229802513468, + "grad_norm": 0.7723771929740906, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 38010 + }, + { + "epoch": 2.7303411131059248, + "grad_norm": 0.7948567867279053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 38020 + }, + { + "epoch": 2.7310592459605028, + "grad_norm": 0.7702966928482056, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 38030 + }, + { + "epoch": 2.7317773788150808, + "grad_norm": 0.689098060131073, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 38040 + }, + { + "epoch": 2.7324955116696588, + "grad_norm": 0.7951080203056335, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 38050 + }, + { + "epoch": 2.7332136445242368, + "grad_norm": 0.7284924983978271, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 38060 + }, + { + "epoch": 2.733931777378815, + "grad_norm": 0.9198044538497925, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 38070 + }, + { + "epoch": 2.734649910233393, + "grad_norm": 0.8653260469436646, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 38080 + }, + { + "epoch": 2.735368043087971, + "grad_norm": 0.8503400683403015, + "learning_rate": 0.0002, + "loss": 0.6832, + "step": 38090 + }, + { + "epoch": 2.736086175942549, + "grad_norm": 0.8388783931732178, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 38100 + }, + { + "epoch": 2.7368043087971277, + "grad_norm": 0.7636904716491699, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 38110 + }, + { + "epoch": 2.7375224416517057, + "grad_norm": 0.8990790247917175, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 38120 + }, + { + "epoch": 2.7382405745062837, + "grad_norm": 0.8878970742225647, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 38130 + }, + { + "epoch": 2.7389587073608617, + "grad_norm": 0.7684310078620911, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 38140 + }, + { + "epoch": 2.7396768402154397, + "grad_norm": 1.0777359008789062, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 38150 + }, + { + "epoch": 2.740394973070018, + "grad_norm": 0.768764317035675, + "learning_rate": 0.0002, + "loss": 0.6904, + "step": 38160 + }, + { + "epoch": 2.741113105924596, + "grad_norm": 0.7490760087966919, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 38170 + }, + { + "epoch": 2.741831238779174, + "grad_norm": 0.860373854637146, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 38180 + }, + { + "epoch": 2.742549371633752, + "grad_norm": 0.7145599722862244, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 38190 + }, + { + "epoch": 2.7432675044883306, + "grad_norm": 0.8347760438919067, + "learning_rate": 0.0002, + "loss": 0.6798, + "step": 38200 + }, + { + "epoch": 2.7439856373429086, + "grad_norm": 0.8425729274749756, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 38210 + }, + { + "epoch": 2.7447037701974866, + "grad_norm": 0.9289436936378479, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 38220 + }, + { + "epoch": 2.7454219030520646, + "grad_norm": 0.7608675360679626, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 38230 + }, + { + "epoch": 2.7461400359066426, + "grad_norm": 0.8067167401313782, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 38240 + }, + { + "epoch": 2.7468581687612206, + "grad_norm": 0.8599629402160645, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 38250 + }, + { + "epoch": 2.747576301615799, + "grad_norm": 0.8425742387771606, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 38260 + }, + { + "epoch": 2.748294434470377, + "grad_norm": 0.8626754283905029, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 38270 + }, + { + "epoch": 2.749012567324955, + "grad_norm": 0.797652006149292, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 38280 + }, + { + "epoch": 2.7497307001795335, + "grad_norm": 0.7971500754356384, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 38290 + }, + { + "epoch": 2.7504488330341115, + "grad_norm": 0.9786333441734314, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 38300 + }, + { + "epoch": 2.7511669658886895, + "grad_norm": 0.7146100997924805, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 38310 + }, + { + "epoch": 2.7518850987432675, + "grad_norm": 0.8436099886894226, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 38320 + }, + { + "epoch": 2.7526032315978455, + "grad_norm": 0.8943847417831421, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 38330 + }, + { + "epoch": 2.7533213644524235, + "grad_norm": 0.8170148730278015, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 38340 + }, + { + "epoch": 2.754039497307002, + "grad_norm": 0.7804728746414185, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 38350 + }, + { + "epoch": 2.75475763016158, + "grad_norm": 0.9139971137046814, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38360 + }, + { + "epoch": 2.755475763016158, + "grad_norm": 0.835332453250885, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 38370 + }, + { + "epoch": 2.756193895870736, + "grad_norm": 1.0904794931411743, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 38380 + }, + { + "epoch": 2.7569120287253144, + "grad_norm": 0.7443365454673767, + "learning_rate": 0.0002, + "loss": 0.6881, + "step": 38390 + }, + { + "epoch": 2.7576301615798924, + "grad_norm": 1.1336839199066162, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 38400 + }, + { + "epoch": 2.7583482944344704, + "grad_norm": 0.9024015665054321, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 38410 + }, + { + "epoch": 2.7590664272890484, + "grad_norm": 0.7380578517913818, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 38420 + }, + { + "epoch": 2.7597845601436264, + "grad_norm": 0.9860634207725525, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 38430 + }, + { + "epoch": 2.760502692998205, + "grad_norm": 0.7928970456123352, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 38440 + }, + { + "epoch": 2.761220825852783, + "grad_norm": 1.0357221364974976, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 38450 + }, + { + "epoch": 2.761938958707361, + "grad_norm": 0.8110901117324829, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 38460 + }, + { + "epoch": 2.762657091561939, + "grad_norm": 0.8420981764793396, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 38470 + }, + { + "epoch": 2.7633752244165173, + "grad_norm": 0.858955979347229, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 38480 + }, + { + "epoch": 2.7640933572710953, + "grad_norm": 0.9851368069648743, + "learning_rate": 0.0002, + "loss": 0.7387, + "step": 38490 + }, + { + "epoch": 2.7648114901256733, + "grad_norm": 0.8073325753211975, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 38500 + }, + { + "epoch": 2.7655296229802513, + "grad_norm": 1.0654062032699585, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38510 + }, + { + "epoch": 2.7662477558348293, + "grad_norm": 0.719603955745697, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 38520 + }, + { + "epoch": 2.7669658886894073, + "grad_norm": 0.9790831804275513, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38530 + }, + { + "epoch": 2.7676840215439857, + "grad_norm": 0.907619833946228, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 38540 + }, + { + "epoch": 2.7684021543985637, + "grad_norm": 0.7463719248771667, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 38550 + }, + { + "epoch": 2.7691202872531417, + "grad_norm": 1.0687178373336792, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 38560 + }, + { + "epoch": 2.76983842010772, + "grad_norm": 0.7397776246070862, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 38570 + }, + { + "epoch": 2.770556552962298, + "grad_norm": 0.7392559051513672, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 38580 + }, + { + "epoch": 2.771274685816876, + "grad_norm": 0.9774793982505798, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 38590 + }, + { + "epoch": 2.771992818671454, + "grad_norm": 0.9502208828926086, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 38600 + }, + { + "epoch": 2.772710951526032, + "grad_norm": 0.776108980178833, + "learning_rate": 0.0002, + "loss": 0.6908, + "step": 38610 + }, + { + "epoch": 2.77342908438061, + "grad_norm": 0.7633077502250671, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 38620 + }, + { + "epoch": 2.7741472172351886, + "grad_norm": 0.9445580244064331, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 38630 + }, + { + "epoch": 2.7748653500897666, + "grad_norm": 0.943165123462677, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 38640 + }, + { + "epoch": 2.7755834829443446, + "grad_norm": 0.9045929908752441, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 38650 + }, + { + "epoch": 2.7763016157989227, + "grad_norm": 0.9425684213638306, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 38660 + }, + { + "epoch": 2.777019748653501, + "grad_norm": 0.9106295704841614, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 38670 + }, + { + "epoch": 2.777737881508079, + "grad_norm": 0.6264749765396118, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 38680 + }, + { + "epoch": 2.778456014362657, + "grad_norm": 0.9156801700592041, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 38690 + }, + { + "epoch": 2.779174147217235, + "grad_norm": 0.9752956032752991, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 38700 + }, + { + "epoch": 2.779892280071813, + "grad_norm": 0.7849555611610413, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 38710 + }, + { + "epoch": 2.780610412926391, + "grad_norm": 0.8109981417655945, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 38720 + }, + { + "epoch": 2.7813285457809696, + "grad_norm": 0.7882387638092041, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 38730 + }, + { + "epoch": 2.7820466786355476, + "grad_norm": 0.9049678444862366, + "learning_rate": 0.0002, + "loss": 0.6948, + "step": 38740 + }, + { + "epoch": 2.7827648114901256, + "grad_norm": 0.7678212523460388, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 38750 + }, + { + "epoch": 2.783482944344704, + "grad_norm": 0.9754453301429749, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 38760 + }, + { + "epoch": 2.784201077199282, + "grad_norm": 0.7643493413925171, + "learning_rate": 0.0002, + "loss": 0.7071, + "step": 38770 + }, + { + "epoch": 2.78491921005386, + "grad_norm": 0.7440303564071655, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 38780 + }, + { + "epoch": 2.785637342908438, + "grad_norm": 0.8870946168899536, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 38790 + }, + { + "epoch": 2.786355475763016, + "grad_norm": 0.8100579977035522, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 38800 + }, + { + "epoch": 2.787073608617594, + "grad_norm": 0.7082616090774536, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 38810 + }, + { + "epoch": 2.7877917414721725, + "grad_norm": 0.7880047559738159, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 38820 + }, + { + "epoch": 2.7885098743267505, + "grad_norm": 0.7217963337898254, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 38830 + }, + { + "epoch": 2.7892280071813285, + "grad_norm": 0.799124002456665, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 38840 + }, + { + "epoch": 2.789946140035907, + "grad_norm": 1.0004022121429443, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 38850 + }, + { + "epoch": 2.790664272890485, + "grad_norm": 0.7866547107696533, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 38860 + }, + { + "epoch": 2.791382405745063, + "grad_norm": 0.891603410243988, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 38870 + }, + { + "epoch": 2.792100538599641, + "grad_norm": 0.7687129378318787, + "learning_rate": 0.0002, + "loss": 0.6949, + "step": 38880 + }, + { + "epoch": 2.792818671454219, + "grad_norm": 0.7549769282341003, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 38890 + }, + { + "epoch": 2.793536804308797, + "grad_norm": 0.7792351245880127, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 38900 + }, + { + "epoch": 2.7942549371633754, + "grad_norm": 0.7352819442749023, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 38910 + }, + { + "epoch": 2.7949730700179534, + "grad_norm": 0.8758018612861633, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 38920 + }, + { + "epoch": 2.7956912028725314, + "grad_norm": 0.8213023543357849, + "learning_rate": 0.0002, + "loss": 0.7033, + "step": 38930 + }, + { + "epoch": 2.7964093357271094, + "grad_norm": 0.899368941783905, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 38940 + }, + { + "epoch": 2.797127468581688, + "grad_norm": 0.7497758269309998, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 38950 + }, + { + "epoch": 2.797845601436266, + "grad_norm": 0.870704710483551, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 38960 + }, + { + "epoch": 2.798563734290844, + "grad_norm": 0.8021528720855713, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 38970 + }, + { + "epoch": 2.799281867145422, + "grad_norm": 0.7541360855102539, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 38980 + }, + { + "epoch": 2.8, + "grad_norm": 0.8909788131713867, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 38990 + }, + { + "epoch": 2.800718132854578, + "grad_norm": 0.8175999522209167, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 39000 + }, + { + "epoch": 2.8014362657091563, + "grad_norm": 0.7336044311523438, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 39010 + }, + { + "epoch": 2.8021543985637343, + "grad_norm": 0.7354168891906738, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 39020 + }, + { + "epoch": 2.8028725314183123, + "grad_norm": 0.8771968483924866, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 39030 + }, + { + "epoch": 2.8035906642728907, + "grad_norm": 0.8073309063911438, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39040 + }, + { + "epoch": 2.8043087971274687, + "grad_norm": 0.8475365042686462, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39050 + }, + { + "epoch": 2.8050269299820467, + "grad_norm": 0.7233281135559082, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 39060 + }, + { + "epoch": 2.8057450628366247, + "grad_norm": 0.9850572347640991, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39070 + }, + { + "epoch": 2.8064631956912027, + "grad_norm": 1.0635435581207275, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 39080 + }, + { + "epoch": 2.8071813285457807, + "grad_norm": 0.8183665871620178, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 39090 + }, + { + "epoch": 2.807899461400359, + "grad_norm": 0.802228569984436, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 39100 + }, + { + "epoch": 2.808617594254937, + "grad_norm": 0.9861624836921692, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 39110 + }, + { + "epoch": 2.809335727109515, + "grad_norm": 0.675205409526825, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 39120 + }, + { + "epoch": 2.8100538599640936, + "grad_norm": 0.7503975629806519, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 39130 + }, + { + "epoch": 2.8107719928186716, + "grad_norm": 0.8266825675964355, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 39140 + }, + { + "epoch": 2.8114901256732496, + "grad_norm": 0.6956485509872437, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 39150 + }, + { + "epoch": 2.8122082585278276, + "grad_norm": 0.7363799214363098, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 39160 + }, + { + "epoch": 2.8129263913824056, + "grad_norm": 1.3893407583236694, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 39170 + }, + { + "epoch": 2.8136445242369836, + "grad_norm": 1.0619654655456543, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 39180 + }, + { + "epoch": 2.814362657091562, + "grad_norm": 0.7924326062202454, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 39190 + }, + { + "epoch": 2.81508078994614, + "grad_norm": 0.8838121294975281, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 39200 + }, + { + "epoch": 2.815798922800718, + "grad_norm": 0.9059016108512878, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 39210 + }, + { + "epoch": 2.816517055655296, + "grad_norm": 0.9284590482711792, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 39220 + }, + { + "epoch": 2.8172351885098745, + "grad_norm": 0.7992225289344788, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 39230 + }, + { + "epoch": 2.8179533213644525, + "grad_norm": 0.816376805305481, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 39240 + }, + { + "epoch": 2.8186714542190305, + "grad_norm": 0.9183637499809265, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 39250 + }, + { + "epoch": 2.8193895870736085, + "grad_norm": 0.7232057452201843, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 39260 + }, + { + "epoch": 2.8201077199281865, + "grad_norm": 0.9012457728385925, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 39270 + }, + { + "epoch": 2.8208258527827645, + "grad_norm": 0.7796093821525574, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 39280 + }, + { + "epoch": 2.821543985637343, + "grad_norm": 0.8331146836280823, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 39290 + }, + { + "epoch": 2.822262118491921, + "grad_norm": 0.8031269907951355, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 39300 + }, + { + "epoch": 2.822980251346499, + "grad_norm": 0.8563299179077148, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 39310 + }, + { + "epoch": 2.8236983842010774, + "grad_norm": 0.8083387613296509, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 39320 + }, + { + "epoch": 2.8244165170556554, + "grad_norm": 0.8132631182670593, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 39330 + }, + { + "epoch": 2.8251346499102334, + "grad_norm": 0.9071316719055176, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39340 + }, + { + "epoch": 2.8258527827648114, + "grad_norm": 0.8224168419837952, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 39350 + }, + { + "epoch": 2.8265709156193894, + "grad_norm": 1.073014497756958, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 39360 + }, + { + "epoch": 2.8272890484739674, + "grad_norm": 0.9466553926467896, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 39370 + }, + { + "epoch": 2.828007181328546, + "grad_norm": 0.8946257829666138, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 39380 + }, + { + "epoch": 2.828725314183124, + "grad_norm": 0.8497758507728577, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 39390 + }, + { + "epoch": 2.829443447037702, + "grad_norm": 0.8952143788337708, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 39400 + }, + { + "epoch": 2.8301615798922803, + "grad_norm": 0.8839313983917236, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 39410 + }, + { + "epoch": 2.8308797127468583, + "grad_norm": 0.7576757669448853, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 39420 + }, + { + "epoch": 2.8315978456014363, + "grad_norm": 0.8212469816207886, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 39430 + }, + { + "epoch": 2.8323159784560143, + "grad_norm": 0.9289504885673523, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 39440 + }, + { + "epoch": 2.8330341113105924, + "grad_norm": 0.8745405077934265, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 39450 + }, + { + "epoch": 2.8337522441651704, + "grad_norm": 0.7974533438682556, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 39460 + }, + { + "epoch": 2.834470377019749, + "grad_norm": 0.914289116859436, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 39470 + }, + { + "epoch": 2.835188509874327, + "grad_norm": 0.7686914801597595, + "learning_rate": 0.0002, + "loss": 0.7009, + "step": 39480 + }, + { + "epoch": 2.835906642728905, + "grad_norm": 0.9289370179176331, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39490 + }, + { + "epoch": 2.836624775583483, + "grad_norm": 0.8851973414421082, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 39500 + }, + { + "epoch": 2.8373429084380613, + "grad_norm": 0.7754096388816833, + "learning_rate": 0.0002, + "loss": 0.7012, + "step": 39510 + }, + { + "epoch": 2.8380610412926393, + "grad_norm": 0.8801632523536682, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 39520 + }, + { + "epoch": 2.8387791741472173, + "grad_norm": 0.9031528234481812, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 39530 + }, + { + "epoch": 2.8394973070017953, + "grad_norm": 0.7113721966743469, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 39540 + }, + { + "epoch": 2.8402154398563733, + "grad_norm": 0.7880923748016357, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 39550 + }, + { + "epoch": 2.8409335727109513, + "grad_norm": 2.4828813076019287, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 39560 + }, + { + "epoch": 2.8416517055655297, + "grad_norm": 0.9174619913101196, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 39570 + }, + { + "epoch": 2.8423698384201077, + "grad_norm": 0.9708074927330017, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 39580 + }, + { + "epoch": 2.8430879712746857, + "grad_norm": 0.7968248724937439, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 39590 + }, + { + "epoch": 2.843806104129264, + "grad_norm": 0.7967682480812073, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 39600 + }, + { + "epoch": 2.844524236983842, + "grad_norm": 0.7487651109695435, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 39610 + }, + { + "epoch": 2.84524236983842, + "grad_norm": 0.6997556686401367, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 39620 + }, + { + "epoch": 2.845960502692998, + "grad_norm": 0.7639351487159729, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39630 + }, + { + "epoch": 2.846678635547576, + "grad_norm": 0.9086648225784302, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 39640 + }, + { + "epoch": 2.847396768402154, + "grad_norm": 0.91103196144104, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 39650 + }, + { + "epoch": 2.8481149012567326, + "grad_norm": 0.8096913695335388, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 39660 + }, + { + "epoch": 2.8488330341113106, + "grad_norm": 0.8961427807807922, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 39670 + }, + { + "epoch": 2.8495511669658886, + "grad_norm": 0.7489904761314392, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 39680 + }, + { + "epoch": 2.850269299820467, + "grad_norm": 0.7893617749214172, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 39690 + }, + { + "epoch": 2.850987432675045, + "grad_norm": 0.8259761929512024, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 39700 + }, + { + "epoch": 2.851705565529623, + "grad_norm": 0.7006617188453674, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 39710 + }, + { + "epoch": 2.852423698384201, + "grad_norm": 0.8922327756881714, + "learning_rate": 0.0002, + "loss": 0.7095, + "step": 39720 + }, + { + "epoch": 2.853141831238779, + "grad_norm": 0.9058550000190735, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 39730 + }, + { + "epoch": 2.853859964093357, + "grad_norm": 0.7627129554748535, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 39740 + }, + { + "epoch": 2.8545780969479355, + "grad_norm": 0.9316968321800232, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 39750 + }, + { + "epoch": 2.8552962298025135, + "grad_norm": 0.8424679040908813, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 39760 + }, + { + "epoch": 2.8560143626570915, + "grad_norm": 0.6185386776924133, + "learning_rate": 0.0002, + "loss": 0.7018, + "step": 39770 + }, + { + "epoch": 2.8567324955116695, + "grad_norm": 0.709902286529541, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 39780 + }, + { + "epoch": 2.857450628366248, + "grad_norm": 0.93730229139328, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 39790 + }, + { + "epoch": 2.858168761220826, + "grad_norm": 0.875989556312561, + "learning_rate": 0.0002, + "loss": 0.6973, + "step": 39800 + }, + { + "epoch": 2.858886894075404, + "grad_norm": 0.7424131631851196, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 39810 + }, + { + "epoch": 2.859605026929982, + "grad_norm": 0.9108477830886841, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 39820 + }, + { + "epoch": 2.86032315978456, + "grad_norm": 0.8248386383056641, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 39830 + }, + { + "epoch": 2.861041292639138, + "grad_norm": 0.8739979863166809, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 39840 + }, + { + "epoch": 2.8617594254937164, + "grad_norm": 0.7940961122512817, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 39850 + }, + { + "epoch": 2.8624775583482944, + "grad_norm": 0.7594687938690186, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 39860 + }, + { + "epoch": 2.8631956912028724, + "grad_norm": 0.9884313941001892, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 39870 + }, + { + "epoch": 2.863913824057451, + "grad_norm": 0.8537741303443909, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 39880 + }, + { + "epoch": 2.864631956912029, + "grad_norm": 0.7407512664794922, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 39890 + }, + { + "epoch": 2.865350089766607, + "grad_norm": 1.0179548263549805, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 39900 + }, + { + "epoch": 2.866068222621185, + "grad_norm": 0.8822470307350159, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 39910 + }, + { + "epoch": 2.866786355475763, + "grad_norm": 0.794448733329773, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 39920 + }, + { + "epoch": 2.867504488330341, + "grad_norm": 0.8115299940109253, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 39930 + }, + { + "epoch": 2.8682226211849193, + "grad_norm": 0.7998958826065063, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 39940 + }, + { + "epoch": 2.8689407540394973, + "grad_norm": 0.8222435116767883, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 39950 + }, + { + "epoch": 2.8696588868940753, + "grad_norm": 0.9495923519134521, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 39960 + }, + { + "epoch": 2.8703770197486533, + "grad_norm": 0.6749192476272583, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 39970 + }, + { + "epoch": 2.871095152603232, + "grad_norm": 0.8910874128341675, + "learning_rate": 0.0002, + "loss": 0.7003, + "step": 39980 + }, + { + "epoch": 2.87181328545781, + "grad_norm": 0.7051638960838318, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 39990 + }, + { + "epoch": 2.872531418312388, + "grad_norm": 0.8456535339355469, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 40000 + }, + { + "epoch": 2.873249551166966, + "grad_norm": 0.934894859790802, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 40010 + }, + { + "epoch": 2.873967684021544, + "grad_norm": 0.6740477681159973, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 40020 + }, + { + "epoch": 2.8746858168761222, + "grad_norm": 0.6632325649261475, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 40030 + }, + { + "epoch": 2.8754039497307002, + "grad_norm": 0.8889022469520569, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 40040 + }, + { + "epoch": 2.8761220825852782, + "grad_norm": 0.7460705637931824, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 40050 + }, + { + "epoch": 2.8768402154398562, + "grad_norm": 0.9795911908149719, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 40060 + }, + { + "epoch": 2.8775583482944347, + "grad_norm": 1.0002509355545044, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 40070 + }, + { + "epoch": 2.8782764811490127, + "grad_norm": 0.7867239713668823, + "learning_rate": 0.0002, + "loss": 0.7191, + "step": 40080 + }, + { + "epoch": 2.8789946140035907, + "grad_norm": 1.0221471786499023, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 40090 + }, + { + "epoch": 2.8797127468581687, + "grad_norm": 0.8091005086898804, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 40100 + }, + { + "epoch": 2.8804308797127467, + "grad_norm": 0.8485820293426514, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 40110 + }, + { + "epoch": 2.8811490125673247, + "grad_norm": 0.7850196957588196, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 40120 + }, + { + "epoch": 2.881867145421903, + "grad_norm": 0.7906134128570557, + "learning_rate": 0.0002, + "loss": 0.6696, + "step": 40130 + }, + { + "epoch": 2.882585278276481, + "grad_norm": 0.7957962155342102, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 40140 + }, + { + "epoch": 2.883303411131059, + "grad_norm": 1.0687522888183594, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 40150 + }, + { + "epoch": 2.8840215439856376, + "grad_norm": 0.713752031326294, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 40160 + }, + { + "epoch": 2.8847396768402156, + "grad_norm": 1.1603864431381226, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 40170 + }, + { + "epoch": 2.8854578096947936, + "grad_norm": 0.8423245549201965, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 40180 + }, + { + "epoch": 2.8861759425493716, + "grad_norm": 0.7554550766944885, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40190 + }, + { + "epoch": 2.8868940754039496, + "grad_norm": 0.6006978750228882, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 40200 + }, + { + "epoch": 2.8876122082585276, + "grad_norm": 0.923068106174469, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 40210 + }, + { + "epoch": 2.888330341113106, + "grad_norm": 0.7659787535667419, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 40220 + }, + { + "epoch": 2.889048473967684, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 40230 + }, + { + "epoch": 2.889766606822262, + "grad_norm": 1.1267355680465698, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 40240 + }, + { + "epoch": 2.89048473967684, + "grad_norm": 0.8548554182052612, + "learning_rate": 0.0002, + "loss": 0.7636, + "step": 40250 + }, + { + "epoch": 2.8912028725314185, + "grad_norm": 0.7846875786781311, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 40260 + }, + { + "epoch": 2.8919210053859965, + "grad_norm": 0.8606904745101929, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 40270 + }, + { + "epoch": 2.8926391382405745, + "grad_norm": 0.6508898138999939, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 40280 + }, + { + "epoch": 2.8933572710951525, + "grad_norm": 0.7903237342834473, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 40290 + }, + { + "epoch": 2.8940754039497305, + "grad_norm": 0.7320941686630249, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 40300 + }, + { + "epoch": 2.894793536804309, + "grad_norm": 1.0031821727752686, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 40310 + }, + { + "epoch": 2.895511669658887, + "grad_norm": 0.7463554739952087, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 40320 + }, + { + "epoch": 2.896229802513465, + "grad_norm": 0.8455599546432495, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 40330 + }, + { + "epoch": 2.896947935368043, + "grad_norm": 0.7645914554595947, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 40340 + }, + { + "epoch": 2.8976660682226214, + "grad_norm": 0.9074810147285461, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 40350 + }, + { + "epoch": 2.8983842010771994, + "grad_norm": 0.9070153832435608, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 40360 + }, + { + "epoch": 2.8991023339317774, + "grad_norm": 0.8649221658706665, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 40370 + }, + { + "epoch": 2.8998204667863554, + "grad_norm": 1.0325016975402832, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 40380 + }, + { + "epoch": 2.9005385996409334, + "grad_norm": 0.8688622713088989, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 40390 + }, + { + "epoch": 2.9012567324955114, + "grad_norm": 0.83316969871521, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 40400 + }, + { + "epoch": 2.90197486535009, + "grad_norm": 1.0146536827087402, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 40410 + }, + { + "epoch": 2.902692998204668, + "grad_norm": 6.21811580657959, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 40420 + }, + { + "epoch": 2.903411131059246, + "grad_norm": 0.8747655749320984, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 40430 + }, + { + "epoch": 2.9041292639138243, + "grad_norm": 0.8671547174453735, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 40440 + }, + { + "epoch": 2.9048473967684023, + "grad_norm": 0.7888760566711426, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 40450 + }, + { + "epoch": 2.9055655296229803, + "grad_norm": 0.7182217240333557, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 40460 + }, + { + "epoch": 2.9062836624775583, + "grad_norm": 0.8802227973937988, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 40470 + }, + { + "epoch": 2.9070017953321363, + "grad_norm": 0.8106126189231873, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 40480 + }, + { + "epoch": 2.9077199281867143, + "grad_norm": 0.7313538789749146, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 40490 + }, + { + "epoch": 2.9084380610412928, + "grad_norm": 0.6098655462265015, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40500 + }, + { + "epoch": 2.9091561938958708, + "grad_norm": 0.8849560618400574, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 40510 + }, + { + "epoch": 2.9098743267504488, + "grad_norm": 0.8761322498321533, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 40520 + }, + { + "epoch": 2.9105924596050268, + "grad_norm": 0.8259703516960144, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 40530 + }, + { + "epoch": 2.911310592459605, + "grad_norm": 0.6613079309463501, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 40540 + }, + { + "epoch": 2.912028725314183, + "grad_norm": 0.825678825378418, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 40550 + }, + { + "epoch": 2.912746858168761, + "grad_norm": 0.824850857257843, + "learning_rate": 0.0002, + "loss": 0.7052, + "step": 40560 + }, + { + "epoch": 2.9134649910233392, + "grad_norm": 0.9629682898521423, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 40570 + }, + { + "epoch": 2.9141831238779172, + "grad_norm": 0.7446485161781311, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 40580 + }, + { + "epoch": 2.9149012567324957, + "grad_norm": 0.9028317928314209, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 40590 + }, + { + "epoch": 2.9156193895870737, + "grad_norm": 0.9646022319793701, + "learning_rate": 0.0002, + "loss": 0.7128, + "step": 40600 + }, + { + "epoch": 2.9163375224416517, + "grad_norm": 0.8845045566558838, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 40610 + }, + { + "epoch": 2.9170556552962297, + "grad_norm": 0.9660372734069824, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 40620 + }, + { + "epoch": 2.917773788150808, + "grad_norm": 0.8914347290992737, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 40630 + }, + { + "epoch": 2.918491921005386, + "grad_norm": 0.7789235711097717, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 40640 + }, + { + "epoch": 2.919210053859964, + "grad_norm": 0.8221206665039062, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 40650 + }, + { + "epoch": 2.919928186714542, + "grad_norm": 0.9550618529319763, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 40660 + }, + { + "epoch": 2.92064631956912, + "grad_norm": 0.868315577507019, + "learning_rate": 0.0002, + "loss": 0.6911, + "step": 40670 + }, + { + "epoch": 2.921364452423698, + "grad_norm": 0.852878749370575, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 40680 + }, + { + "epoch": 2.9220825852782766, + "grad_norm": 0.8388790488243103, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 40690 + }, + { + "epoch": 2.9228007181328546, + "grad_norm": 0.9897602200508118, + "learning_rate": 0.0002, + "loss": 0.7299, + "step": 40700 + }, + { + "epoch": 2.9235188509874326, + "grad_norm": 0.8050527572631836, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 40710 + }, + { + "epoch": 2.924236983842011, + "grad_norm": 0.7296929955482483, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 40720 + }, + { + "epoch": 2.924955116696589, + "grad_norm": 0.917475700378418, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 40730 + }, + { + "epoch": 2.925673249551167, + "grad_norm": 0.9118483662605286, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 40740 + }, + { + "epoch": 2.926391382405745, + "grad_norm": 0.7722473740577698, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 40750 + }, + { + "epoch": 2.927109515260323, + "grad_norm": 0.7950358986854553, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 40760 + }, + { + "epoch": 2.927827648114901, + "grad_norm": 0.8868561387062073, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 40770 + }, + { + "epoch": 2.9285457809694795, + "grad_norm": 0.7923154830932617, + "learning_rate": 0.0002, + "loss": 0.7513, + "step": 40780 + }, + { + "epoch": 2.9292639138240575, + "grad_norm": 0.7285428047180176, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 40790 + }, + { + "epoch": 2.9299820466786355, + "grad_norm": 0.794775664806366, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 40800 + }, + { + "epoch": 2.9307001795332135, + "grad_norm": 0.8351698517799377, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 40810 + }, + { + "epoch": 2.931418312387792, + "grad_norm": 0.853082001209259, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 40820 + }, + { + "epoch": 2.93213644524237, + "grad_norm": 0.8209722638130188, + "learning_rate": 0.0002, + "loss": 0.7047, + "step": 40830 + }, + { + "epoch": 2.932854578096948, + "grad_norm": 0.8982136845588684, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 40840 + }, + { + "epoch": 2.933572710951526, + "grad_norm": 0.8373305201530457, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 40850 + }, + { + "epoch": 2.934290843806104, + "grad_norm": 0.8326864242553711, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 40860 + }, + { + "epoch": 2.9350089766606824, + "grad_norm": 0.7232590317726135, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 40870 + }, + { + "epoch": 2.9357271095152604, + "grad_norm": 0.823615312576294, + "learning_rate": 0.0002, + "loss": 0.7311, + "step": 40880 + }, + { + "epoch": 2.9364452423698384, + "grad_norm": 0.7532811760902405, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 40890 + }, + { + "epoch": 2.9371633752244164, + "grad_norm": 0.9594773650169373, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 40900 + }, + { + "epoch": 2.937881508078995, + "grad_norm": 0.8368398547172546, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 40910 + }, + { + "epoch": 2.938599640933573, + "grad_norm": 0.8336817026138306, + "learning_rate": 0.0002, + "loss": 0.7201, + "step": 40920 + }, + { + "epoch": 2.939317773788151, + "grad_norm": 0.8413758277893066, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 40930 + }, + { + "epoch": 2.940035906642729, + "grad_norm": 0.7117549180984497, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 40940 + }, + { + "epoch": 2.940754039497307, + "grad_norm": 0.8741925954818726, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 40950 + }, + { + "epoch": 2.941472172351885, + "grad_norm": 0.8476088047027588, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 40960 + }, + { + "epoch": 2.9421903052064633, + "grad_norm": 0.674659788608551, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 40970 + }, + { + "epoch": 2.9429084380610413, + "grad_norm": 0.7087500691413879, + "learning_rate": 0.0002, + "loss": 0.6714, + "step": 40980 + }, + { + "epoch": 2.9436265709156193, + "grad_norm": 0.9202252626419067, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 40990 + }, + { + "epoch": 2.9443447037701977, + "grad_norm": 0.9775124192237854, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 41000 + }, + { + "epoch": 2.9450628366247757, + "grad_norm": 0.7465068101882935, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 41010 + }, + { + "epoch": 2.9457809694793538, + "grad_norm": 0.7229986786842346, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 41020 + }, + { + "epoch": 2.9464991023339318, + "grad_norm": 0.7228954434394836, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 41030 + }, + { + "epoch": 2.9472172351885098, + "grad_norm": 0.9396149516105652, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 41040 + }, + { + "epoch": 2.9479353680430878, + "grad_norm": 0.9458696842193604, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 41050 + }, + { + "epoch": 2.948653500897666, + "grad_norm": 0.8276246190071106, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 41060 + }, + { + "epoch": 2.949371633752244, + "grad_norm": 0.7927420139312744, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 41070 + }, + { + "epoch": 2.950089766606822, + "grad_norm": 0.7403103709220886, + "learning_rate": 0.0002, + "loss": 0.7442, + "step": 41080 + }, + { + "epoch": 2.9508078994614, + "grad_norm": 0.9813524484634399, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 41090 + }, + { + "epoch": 2.9515260323159787, + "grad_norm": 0.8560924530029297, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 41100 + }, + { + "epoch": 2.9522441651705567, + "grad_norm": 0.6937443017959595, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 41110 + }, + { + "epoch": 2.9529622980251347, + "grad_norm": 0.8440476655960083, + "learning_rate": 0.0002, + "loss": 0.6817, + "step": 41120 + }, + { + "epoch": 2.9536804308797127, + "grad_norm": 1.1260770559310913, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 41130 + }, + { + "epoch": 2.9543985637342907, + "grad_norm": 0.8789936900138855, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 41140 + }, + { + "epoch": 2.9551166965888687, + "grad_norm": 0.8205832839012146, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 41150 + }, + { + "epoch": 2.955834829443447, + "grad_norm": 0.8148444294929504, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 41160 + }, + { + "epoch": 2.956552962298025, + "grad_norm": 0.791296660900116, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41170 + }, + { + "epoch": 2.957271095152603, + "grad_norm": 1.3229854106903076, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 41180 + }, + { + "epoch": 2.9579892280071816, + "grad_norm": 0.906423807144165, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 41190 + }, + { + "epoch": 2.9587073608617596, + "grad_norm": 0.8707411289215088, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 41200 + }, + { + "epoch": 2.9594254937163376, + "grad_norm": 1.0362473726272583, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 41210 + }, + { + "epoch": 2.9601436265709156, + "grad_norm": 0.818546712398529, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 41220 + }, + { + "epoch": 2.9608617594254936, + "grad_norm": 0.8558517098426819, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 41230 + }, + { + "epoch": 2.9615798922800716, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0002, + "loss": 0.7056, + "step": 41240 + }, + { + "epoch": 2.96229802513465, + "grad_norm": 0.9603250026702881, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 41250 + }, + { + "epoch": 2.963016157989228, + "grad_norm": 0.891610860824585, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 41260 + }, + { + "epoch": 2.963734290843806, + "grad_norm": 0.9823883175849915, + "learning_rate": 0.0002, + "loss": 0.7732, + "step": 41270 + }, + { + "epoch": 2.9644524236983845, + "grad_norm": 0.8783510327339172, + "learning_rate": 0.0002, + "loss": 0.7144, + "step": 41280 + }, + { + "epoch": 2.9651705565529625, + "grad_norm": 0.873656690120697, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 41290 + }, + { + "epoch": 2.9658886894075405, + "grad_norm": 0.8281165957450867, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 41300 + }, + { + "epoch": 2.9666068222621185, + "grad_norm": 0.8008899092674255, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 41310 + }, + { + "epoch": 2.9673249551166965, + "grad_norm": 0.8564065098762512, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 41320 + }, + { + "epoch": 2.9680430879712745, + "grad_norm": 0.786119818687439, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41330 + }, + { + "epoch": 2.968761220825853, + "grad_norm": 1.3152399063110352, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 41340 + }, + { + "epoch": 2.969479353680431, + "grad_norm": 0.7551527619361877, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 41350 + }, + { + "epoch": 2.970197486535009, + "grad_norm": 1.1397290229797363, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 41360 + }, + { + "epoch": 2.970915619389587, + "grad_norm": 0.8333854079246521, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 41370 + }, + { + "epoch": 2.9716337522441654, + "grad_norm": 0.8096165657043457, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 41380 + }, + { + "epoch": 2.9723518850987434, + "grad_norm": 0.8378547430038452, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 41390 + }, + { + "epoch": 2.9730700179533214, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 41400 + }, + { + "epoch": 2.9737881508078994, + "grad_norm": 0.8722409605979919, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 41410 + }, + { + "epoch": 2.9745062836624774, + "grad_norm": 0.6680061221122742, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 41420 + }, + { + "epoch": 2.9752244165170554, + "grad_norm": 0.7666152715682983, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 41430 + }, + { + "epoch": 2.975942549371634, + "grad_norm": 0.8489957451820374, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 41440 + }, + { + "epoch": 2.976660682226212, + "grad_norm": 0.8516127467155457, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 41450 + }, + { + "epoch": 2.97737881508079, + "grad_norm": 0.8836804628372192, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 41460 + }, + { + "epoch": 2.9780969479353683, + "grad_norm": 1.0963364839553833, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 41470 + }, + { + "epoch": 2.9788150807899463, + "grad_norm": 0.9908610582351685, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 41480 + }, + { + "epoch": 2.9795332136445243, + "grad_norm": 0.8822041153907776, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 41490 + }, + { + "epoch": 2.9802513464991023, + "grad_norm": 0.717723548412323, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 41500 + }, + { + "epoch": 2.9809694793536803, + "grad_norm": 0.8413400053977966, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 41510 + }, + { + "epoch": 2.9816876122082583, + "grad_norm": 0.8771023750305176, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 41520 + }, + { + "epoch": 2.9824057450628367, + "grad_norm": 0.7185000777244568, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 41530 + }, + { + "epoch": 2.9831238779174147, + "grad_norm": 0.8299767374992371, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 41540 + }, + { + "epoch": 2.9838420107719927, + "grad_norm": 0.9309971928596497, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 41550 + }, + { + "epoch": 2.984560143626571, + "grad_norm": 0.7644693851470947, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 41560 + }, + { + "epoch": 2.985278276481149, + "grad_norm": 0.7888111472129822, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 41570 + }, + { + "epoch": 2.985996409335727, + "grad_norm": 1.0921967029571533, + "learning_rate": 0.0002, + "loss": 0.6984, + "step": 41580 + }, + { + "epoch": 2.986714542190305, + "grad_norm": 0.8116785883903503, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 41590 + }, + { + "epoch": 2.987432675044883, + "grad_norm": 0.983269214630127, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 41600 + }, + { + "epoch": 2.988150807899461, + "grad_norm": 0.81700599193573, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 41610 + }, + { + "epoch": 2.9888689407540396, + "grad_norm": 0.7545617818832397, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 41620 + }, + { + "epoch": 2.9895870736086176, + "grad_norm": 0.8695791363716125, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 41630 + }, + { + "epoch": 2.9903052064631956, + "grad_norm": 0.8980445861816406, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 41640 + }, + { + "epoch": 2.9910233393177736, + "grad_norm": 0.7884747982025146, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 41650 + }, + { + "epoch": 2.991741472172352, + "grad_norm": 0.8347880840301514, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 41660 + }, + { + "epoch": 2.99245960502693, + "grad_norm": 0.7786261439323425, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 41670 + }, + { + "epoch": 2.993177737881508, + "grad_norm": 0.7830624580383301, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 41680 + }, + { + "epoch": 2.993895870736086, + "grad_norm": 0.8293532133102417, + "learning_rate": 0.0002, + "loss": 0.7116, + "step": 41690 + }, + { + "epoch": 2.994614003590664, + "grad_norm": 0.8476244211196899, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 41700 + }, + { + "epoch": 2.995332136445242, + "grad_norm": 0.7218726873397827, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 41710 + }, + { + "epoch": 2.9960502692998205, + "grad_norm": 0.8144199252128601, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 41720 + }, + { + "epoch": 2.9967684021543985, + "grad_norm": 0.7047123312950134, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 41730 + }, + { + "epoch": 2.9974865350089765, + "grad_norm": 0.8412184715270996, + "learning_rate": 0.0002, + "loss": 0.6555, + "step": 41740 + }, + { + "epoch": 2.998204667863555, + "grad_norm": 0.8840848207473755, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 41750 + }, + { + "epoch": 2.998922800718133, + "grad_norm": 0.7302142977714539, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 41760 + }, + { + "epoch": 2.999640933572711, + "grad_norm": 0.7075994610786438, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 41770 + }, + { + "epoch": 3.0, + "eval_loss": 1.1079821586608887, + "eval_runtime": 55.1897, + "eval_samples_per_second": 13.281, + "eval_steps_per_second": 1.667, + "step": 41775 + }, + { + "epoch": 3.000359066427289, + "grad_norm": 0.8630077838897705, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 41780 + }, + { + "epoch": 3.001077199281867, + "grad_norm": 0.8901806473731995, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 41790 + }, + { + "epoch": 3.0017953321364454, + "grad_norm": 0.8291767835617065, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 41800 + }, + { + "epoch": 3.0025134649910235, + "grad_norm": 0.792519211769104, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 41810 + }, + { + "epoch": 3.0032315978456015, + "grad_norm": 1.1330063343048096, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 41820 + }, + { + "epoch": 3.0039497307001795, + "grad_norm": 0.9401350617408752, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 41830 + }, + { + "epoch": 3.0046678635547575, + "grad_norm": 0.8065463304519653, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 41840 + }, + { + "epoch": 3.005385996409336, + "grad_norm": 0.8309979438781738, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 41850 + }, + { + "epoch": 3.006104129263914, + "grad_norm": 0.7432689070701599, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 41860 + }, + { + "epoch": 3.006822262118492, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 41870 + }, + { + "epoch": 3.00754039497307, + "grad_norm": 1.4364255666732788, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 41880 + }, + { + "epoch": 3.008258527827648, + "grad_norm": 0.9023072123527527, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 41890 + }, + { + "epoch": 3.0089766606822264, + "grad_norm": 0.7790587544441223, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 41900 + }, + { + "epoch": 3.0096947935368044, + "grad_norm": 0.9163706302642822, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 41910 + }, + { + "epoch": 3.0104129263913824, + "grad_norm": 0.8147963285446167, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 41920 + }, + { + "epoch": 3.0111310592459604, + "grad_norm": 0.8432748913764954, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 41930 + }, + { + "epoch": 3.011849192100539, + "grad_norm": 0.9216182231903076, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 41940 + }, + { + "epoch": 3.012567324955117, + "grad_norm": 0.62154221534729, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 41950 + }, + { + "epoch": 3.013285457809695, + "grad_norm": 0.8902392387390137, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 41960 + }, + { + "epoch": 3.014003590664273, + "grad_norm": 0.9601083993911743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 41970 + }, + { + "epoch": 3.014721723518851, + "grad_norm": 0.8938809037208557, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 41980 + }, + { + "epoch": 3.0154398563734293, + "grad_norm": 1.0621999502182007, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 41990 + }, + { + "epoch": 3.0161579892280073, + "grad_norm": 0.7310585379600525, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 42000 + }, + { + "epoch": 3.0168761220825853, + "grad_norm": 0.8475853800773621, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 42010 + }, + { + "epoch": 3.0175942549371633, + "grad_norm": 0.8509864807128906, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 42020 + }, + { + "epoch": 3.0183123877917413, + "grad_norm": 0.7461876273155212, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 42030 + }, + { + "epoch": 3.0190305206463197, + "grad_norm": 0.7734265327453613, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 42040 + }, + { + "epoch": 3.0197486535008977, + "grad_norm": 0.9056455492973328, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 42050 + }, + { + "epoch": 3.0204667863554757, + "grad_norm": 0.9183889031410217, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 42060 + }, + { + "epoch": 3.0211849192100537, + "grad_norm": 1.0777326822280884, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 42070 + }, + { + "epoch": 3.021903052064632, + "grad_norm": 0.9217308163642883, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 42080 + }, + { + "epoch": 3.02262118491921, + "grad_norm": 0.8220202326774597, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42090 + }, + { + "epoch": 3.023339317773788, + "grad_norm": 0.8454978466033936, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 42100 + }, + { + "epoch": 3.024057450628366, + "grad_norm": 0.8116370439529419, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 42110 + }, + { + "epoch": 3.024775583482944, + "grad_norm": 0.8064935207366943, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 42120 + }, + { + "epoch": 3.0254937163375226, + "grad_norm": 0.9718650579452515, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 42130 + }, + { + "epoch": 3.0262118491921006, + "grad_norm": 0.8817588090896606, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 42140 + }, + { + "epoch": 3.0269299820466786, + "grad_norm": 0.7757318615913391, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 42150 + }, + { + "epoch": 3.0276481149012566, + "grad_norm": 0.7500545382499695, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 42160 + }, + { + "epoch": 3.0283662477558346, + "grad_norm": 0.72913658618927, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 42170 + }, + { + "epoch": 3.029084380610413, + "grad_norm": 0.7641891837120056, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 42180 + }, + { + "epoch": 3.029802513464991, + "grad_norm": 0.7682021856307983, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 42190 + }, + { + "epoch": 3.030520646319569, + "grad_norm": 0.8145958781242371, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 42200 + }, + { + "epoch": 3.031238779174147, + "grad_norm": 1.0546396970748901, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 42210 + }, + { + "epoch": 3.0319569120287255, + "grad_norm": 0.8222804665565491, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 42220 + }, + { + "epoch": 3.0326750448833035, + "grad_norm": 0.8245829343795776, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 42230 + }, + { + "epoch": 3.0333931777378815, + "grad_norm": 0.9059963822364807, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 42240 + }, + { + "epoch": 3.0341113105924595, + "grad_norm": 1.026747465133667, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 42250 + }, + { + "epoch": 3.0348294434470375, + "grad_norm": 0.9108404517173767, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42260 + }, + { + "epoch": 3.035547576301616, + "grad_norm": 0.9828516840934753, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 42270 + }, + { + "epoch": 3.036265709156194, + "grad_norm": 0.9664266705513, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 42280 + }, + { + "epoch": 3.036983842010772, + "grad_norm": 0.7577654719352722, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42290 + }, + { + "epoch": 3.03770197486535, + "grad_norm": 0.8331853151321411, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 42300 + }, + { + "epoch": 3.038420107719928, + "grad_norm": 0.8017228245735168, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 42310 + }, + { + "epoch": 3.0391382405745064, + "grad_norm": 1.0316718816757202, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 42320 + }, + { + "epoch": 3.0398563734290844, + "grad_norm": 0.9379803538322449, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 42330 + }, + { + "epoch": 3.0405745062836624, + "grad_norm": 0.7554476857185364, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 42340 + }, + { + "epoch": 3.0412926391382404, + "grad_norm": 0.7377917766571045, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 42350 + }, + { + "epoch": 3.042010771992819, + "grad_norm": 1.0655276775360107, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 42360 + }, + { + "epoch": 3.042728904847397, + "grad_norm": 0.7748511433601379, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 42370 + }, + { + "epoch": 3.043447037701975, + "grad_norm": 0.848649799823761, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 42380 + }, + { + "epoch": 3.044165170556553, + "grad_norm": 0.7754636406898499, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 42390 + }, + { + "epoch": 3.044883303411131, + "grad_norm": 0.8173656463623047, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 42400 + }, + { + "epoch": 3.0456014362657093, + "grad_norm": 0.7881983518600464, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 42410 + }, + { + "epoch": 3.0463195691202873, + "grad_norm": 0.971072256565094, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 42420 + }, + { + "epoch": 3.0470377019748653, + "grad_norm": 0.8400143384933472, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 42430 + }, + { + "epoch": 3.0477558348294433, + "grad_norm": 1.0028647184371948, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 42440 + }, + { + "epoch": 3.0484739676840213, + "grad_norm": 0.9728034734725952, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 42450 + }, + { + "epoch": 3.0491921005386, + "grad_norm": 0.937633752822876, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 42460 + }, + { + "epoch": 3.049910233393178, + "grad_norm": 1.0265642404556274, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 42470 + }, + { + "epoch": 3.050628366247756, + "grad_norm": 0.9733216762542725, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 42480 + }, + { + "epoch": 3.051346499102334, + "grad_norm": 0.7039174437522888, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 42490 + }, + { + "epoch": 3.0520646319569122, + "grad_norm": 0.7515231370925903, + "learning_rate": 0.0002, + "loss": 0.6422, + "step": 42500 + }, + { + "epoch": 3.0527827648114902, + "grad_norm": 0.9115300178527832, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 42510 + }, + { + "epoch": 3.0535008976660682, + "grad_norm": 0.7403655648231506, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 42520 + }, + { + "epoch": 3.0542190305206462, + "grad_norm": 0.7826810479164124, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 42530 + }, + { + "epoch": 3.0549371633752243, + "grad_norm": 0.8007349371910095, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 42540 + }, + { + "epoch": 3.0556552962298027, + "grad_norm": 0.7975959777832031, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 42550 + }, + { + "epoch": 3.0563734290843807, + "grad_norm": 0.9665228128433228, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 42560 + }, + { + "epoch": 3.0570915619389587, + "grad_norm": 0.8386123180389404, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 42570 + }, + { + "epoch": 3.0578096947935367, + "grad_norm": 0.7437782287597656, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 42580 + }, + { + "epoch": 3.0585278276481147, + "grad_norm": 0.8360698223114014, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 42590 + }, + { + "epoch": 3.059245960502693, + "grad_norm": 0.8982073664665222, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42600 + }, + { + "epoch": 3.059964093357271, + "grad_norm": 0.9425758719444275, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 42610 + }, + { + "epoch": 3.060682226211849, + "grad_norm": 0.8567131161689758, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42620 + }, + { + "epoch": 3.061400359066427, + "grad_norm": 0.9322942495346069, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 42630 + }, + { + "epoch": 3.0621184919210056, + "grad_norm": 0.8283235430717468, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 42640 + }, + { + "epoch": 3.0628366247755836, + "grad_norm": 0.8457967638969421, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 42650 + }, + { + "epoch": 3.0635547576301616, + "grad_norm": 0.8205100893974304, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 42660 + }, + { + "epoch": 3.0642728904847396, + "grad_norm": 0.8385181427001953, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 42670 + }, + { + "epoch": 3.0649910233393176, + "grad_norm": 1.2959390878677368, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 42680 + }, + { + "epoch": 3.065709156193896, + "grad_norm": 0.7150540351867676, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 42690 + }, + { + "epoch": 3.066427289048474, + "grad_norm": 0.6647360920906067, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 42700 + }, + { + "epoch": 3.067145421903052, + "grad_norm": 0.9148316979408264, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 42710 + }, + { + "epoch": 3.06786355475763, + "grad_norm": 0.8606209754943848, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 42720 + }, + { + "epoch": 3.068581687612208, + "grad_norm": 1.4255632162094116, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 42730 + }, + { + "epoch": 3.0692998204667865, + "grad_norm": 0.9131710529327393, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 42740 + }, + { + "epoch": 3.0700179533213645, + "grad_norm": 0.9560360908508301, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 42750 + }, + { + "epoch": 3.0707360861759425, + "grad_norm": 0.9278100728988647, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 42760 + }, + { + "epoch": 3.0714542190305205, + "grad_norm": 0.7258471846580505, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 42770 + }, + { + "epoch": 3.072172351885099, + "grad_norm": 1.1537690162658691, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 42780 + }, + { + "epoch": 3.072890484739677, + "grad_norm": 0.8562588691711426, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 42790 + }, + { + "epoch": 3.073608617594255, + "grad_norm": 1.0271626710891724, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 42800 + }, + { + "epoch": 3.074326750448833, + "grad_norm": 0.85148024559021, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 42810 + }, + { + "epoch": 3.075044883303411, + "grad_norm": 0.805772602558136, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 42820 + }, + { + "epoch": 3.0757630161579894, + "grad_norm": 0.8057122230529785, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 42830 + }, + { + "epoch": 3.0764811490125674, + "grad_norm": 0.7997274994850159, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 42840 + }, + { + "epoch": 3.0771992818671454, + "grad_norm": 0.8739321231842041, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 42850 + }, + { + "epoch": 3.0779174147217234, + "grad_norm": 0.833951473236084, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 42860 + }, + { + "epoch": 3.0786355475763014, + "grad_norm": 0.8813839554786682, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 42870 + }, + { + "epoch": 3.07935368043088, + "grad_norm": 0.9020521640777588, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 42880 + }, + { + "epoch": 3.080071813285458, + "grad_norm": 0.888148844242096, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 42890 + }, + { + "epoch": 3.080789946140036, + "grad_norm": 0.8110589385032654, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 42900 + }, + { + "epoch": 3.081508078994614, + "grad_norm": 0.818738579750061, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 42910 + }, + { + "epoch": 3.082226211849192, + "grad_norm": 0.9607479572296143, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 42920 + }, + { + "epoch": 3.0829443447037703, + "grad_norm": 0.8162698745727539, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 42930 + }, + { + "epoch": 3.0836624775583483, + "grad_norm": 0.8170801997184753, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 42940 + }, + { + "epoch": 3.0843806104129263, + "grad_norm": 0.9250763654708862, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 42950 + }, + { + "epoch": 3.0850987432675043, + "grad_norm": 0.898097813129425, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 42960 + }, + { + "epoch": 3.0858168761220828, + "grad_norm": 0.9398433566093445, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 42970 + }, + { + "epoch": 3.0865350089766608, + "grad_norm": 1.052808165550232, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 42980 + }, + { + "epoch": 3.087253141831239, + "grad_norm": 0.8974723219871521, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 42990 + }, + { + "epoch": 3.087971274685817, + "grad_norm": 0.7517408728599548, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 43000 + }, + { + "epoch": 3.088689407540395, + "grad_norm": 0.8054485321044922, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 43010 + }, + { + "epoch": 3.0894075403949732, + "grad_norm": 0.9896154999732971, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 43020 + }, + { + "epoch": 3.0901256732495512, + "grad_norm": 0.7887356281280518, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 43030 + }, + { + "epoch": 3.0908438061041292, + "grad_norm": 1.0119125843048096, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 43040 + }, + { + "epoch": 3.0915619389587072, + "grad_norm": 0.8753892779350281, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 43050 + }, + { + "epoch": 3.0922800718132857, + "grad_norm": 0.8322654962539673, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43060 + }, + { + "epoch": 3.0929982046678637, + "grad_norm": 1.0605992078781128, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 43070 + }, + { + "epoch": 3.0937163375224417, + "grad_norm": 0.8783912062644958, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 43080 + }, + { + "epoch": 3.0944344703770197, + "grad_norm": 0.8839107751846313, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 43090 + }, + { + "epoch": 3.0951526032315977, + "grad_norm": 1.1655086278915405, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 43100 + }, + { + "epoch": 3.095870736086176, + "grad_norm": 0.7051523327827454, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 43110 + }, + { + "epoch": 3.096588868940754, + "grad_norm": 0.7793807983398438, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43120 + }, + { + "epoch": 3.097307001795332, + "grad_norm": 0.8352194428443909, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 43130 + }, + { + "epoch": 3.09802513464991, + "grad_norm": 0.9684847593307495, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 43140 + }, + { + "epoch": 3.098743267504488, + "grad_norm": 1.1106340885162354, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 43150 + }, + { + "epoch": 3.0994614003590666, + "grad_norm": 0.7814911603927612, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 43160 + }, + { + "epoch": 3.1001795332136446, + "grad_norm": 0.7923110723495483, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 43170 + }, + { + "epoch": 3.1008976660682226, + "grad_norm": 0.87022864818573, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 43180 + }, + { + "epoch": 3.1016157989228006, + "grad_norm": 0.9352855682373047, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 43190 + }, + { + "epoch": 3.1023339317773786, + "grad_norm": 0.8548445105552673, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 43200 + }, + { + "epoch": 3.103052064631957, + "grad_norm": 0.9576025009155273, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 43210 + }, + { + "epoch": 3.103770197486535, + "grad_norm": 0.7430430054664612, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 43220 + }, + { + "epoch": 3.104488330341113, + "grad_norm": 0.9619144797325134, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 43230 + }, + { + "epoch": 3.105206463195691, + "grad_norm": 0.8622338771820068, + "learning_rate": 0.0002, + "loss": 0.6171, + "step": 43240 + }, + { + "epoch": 3.1059245960502695, + "grad_norm": 0.853489339351654, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43250 + }, + { + "epoch": 3.1066427289048475, + "grad_norm": 0.9253206849098206, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 43260 + }, + { + "epoch": 3.1073608617594255, + "grad_norm": 0.9700671434402466, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 43270 + }, + { + "epoch": 3.1080789946140035, + "grad_norm": 1.0550731420516968, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 43280 + }, + { + "epoch": 3.1087971274685815, + "grad_norm": 0.939452052116394, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 43290 + }, + { + "epoch": 3.10951526032316, + "grad_norm": 0.8855276107788086, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 43300 + }, + { + "epoch": 3.110233393177738, + "grad_norm": 0.92197185754776, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 43310 + }, + { + "epoch": 3.110951526032316, + "grad_norm": 0.8825578689575195, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 43320 + }, + { + "epoch": 3.111669658886894, + "grad_norm": 0.9964608550071716, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 43330 + }, + { + "epoch": 3.1123877917414724, + "grad_norm": 0.9070520401000977, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 43340 + }, + { + "epoch": 3.1131059245960504, + "grad_norm": 0.9699633717536926, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 43350 + }, + { + "epoch": 3.1138240574506284, + "grad_norm": 0.7384091019630432, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 43360 + }, + { + "epoch": 3.1145421903052064, + "grad_norm": 0.9445326328277588, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 43370 + }, + { + "epoch": 3.1152603231597844, + "grad_norm": 0.8906524181365967, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 43380 + }, + { + "epoch": 3.115978456014363, + "grad_norm": 0.8850129246711731, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 43390 + }, + { + "epoch": 3.116696588868941, + "grad_norm": 0.7091860771179199, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 43400 + }, + { + "epoch": 3.117414721723519, + "grad_norm": 0.8992764949798584, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 43410 + }, + { + "epoch": 3.118132854578097, + "grad_norm": 0.9166698455810547, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 43420 + }, + { + "epoch": 3.118850987432675, + "grad_norm": 1.1195749044418335, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 43430 + }, + { + "epoch": 3.1195691202872533, + "grad_norm": 0.9414069652557373, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 43440 + }, + { + "epoch": 3.1202872531418313, + "grad_norm": 0.7641217112541199, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 43450 + }, + { + "epoch": 3.1210053859964093, + "grad_norm": 1.2659285068511963, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 43460 + }, + { + "epoch": 3.1217235188509873, + "grad_norm": 0.9968213438987732, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 43470 + }, + { + "epoch": 3.1224416517055653, + "grad_norm": 0.8819042444229126, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 43480 + }, + { + "epoch": 3.1231597845601438, + "grad_norm": 0.9124775528907776, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 43490 + }, + { + "epoch": 3.1238779174147218, + "grad_norm": 0.868354082107544, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 3.1245960502692998, + "grad_norm": 0.7367526292800903, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 43510 + }, + { + "epoch": 3.1253141831238778, + "grad_norm": 0.7553679943084717, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 43520 + }, + { + "epoch": 3.126032315978456, + "grad_norm": 0.7970008850097656, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 43530 + }, + { + "epoch": 3.126750448833034, + "grad_norm": 0.9117488861083984, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 43540 + }, + { + "epoch": 3.127468581687612, + "grad_norm": 0.8004103899002075, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 43550 + }, + { + "epoch": 3.12818671454219, + "grad_norm": 0.736518919467926, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 43560 + }, + { + "epoch": 3.128904847396768, + "grad_norm": 0.8568395376205444, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 43570 + }, + { + "epoch": 3.1296229802513467, + "grad_norm": 0.9344052672386169, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 43580 + }, + { + "epoch": 3.1303411131059247, + "grad_norm": 0.7986525297164917, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 43590 + }, + { + "epoch": 3.1310592459605027, + "grad_norm": 0.8283242583274841, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 43600 + }, + { + "epoch": 3.1317773788150807, + "grad_norm": 0.6534292101860046, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 43610 + }, + { + "epoch": 3.132495511669659, + "grad_norm": 0.9585428833961487, + "learning_rate": 0.0002, + "loss": 0.6994, + "step": 43620 + }, + { + "epoch": 3.133213644524237, + "grad_norm": 0.8299157023429871, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 43630 + }, + { + "epoch": 3.133931777378815, + "grad_norm": 0.9050052762031555, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 43640 + }, + { + "epoch": 3.134649910233393, + "grad_norm": 1.0457062721252441, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 43650 + }, + { + "epoch": 3.135368043087971, + "grad_norm": 0.907691240310669, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 43660 + }, + { + "epoch": 3.1360861759425496, + "grad_norm": 0.8868935108184814, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 43670 + }, + { + "epoch": 3.1368043087971276, + "grad_norm": 0.8585456609725952, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 43680 + }, + { + "epoch": 3.1375224416517056, + "grad_norm": 1.0402741432189941, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 43690 + }, + { + "epoch": 3.1382405745062836, + "grad_norm": 1.0866798162460327, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 43700 + }, + { + "epoch": 3.1389587073608616, + "grad_norm": 0.7637296915054321, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 43710 + }, + { + "epoch": 3.13967684021544, + "grad_norm": 0.755235493183136, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 43720 + }, + { + "epoch": 3.140394973070018, + "grad_norm": 0.7258853316307068, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 43730 + }, + { + "epoch": 3.141113105924596, + "grad_norm": 1.0425268411636353, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 43740 + }, + { + "epoch": 3.141831238779174, + "grad_norm": 0.9171959757804871, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 43750 + }, + { + "epoch": 3.142549371633752, + "grad_norm": 0.8900150656700134, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 43760 + }, + { + "epoch": 3.1432675044883305, + "grad_norm": 0.9879246354103088, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 43770 + }, + { + "epoch": 3.1439856373429085, + "grad_norm": 0.7853389382362366, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 43780 + }, + { + "epoch": 3.1447037701974865, + "grad_norm": 1.0245232582092285, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 43790 + }, + { + "epoch": 3.1454219030520645, + "grad_norm": 0.8486390113830566, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 43800 + }, + { + "epoch": 3.146140035906643, + "grad_norm": 0.8536406755447388, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 43810 + }, + { + "epoch": 3.146858168761221, + "grad_norm": 0.9653734564781189, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 43820 + }, + { + "epoch": 3.147576301615799, + "grad_norm": 0.8292608857154846, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 43830 + }, + { + "epoch": 3.148294434470377, + "grad_norm": 1.147524118423462, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 43840 + }, + { + "epoch": 3.149012567324955, + "grad_norm": 0.9317546486854553, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 43850 + }, + { + "epoch": 3.1497307001795334, + "grad_norm": 0.8651045560836792, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 43860 + }, + { + "epoch": 3.1504488330341114, + "grad_norm": 0.8718969225883484, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 43870 + }, + { + "epoch": 3.1511669658886894, + "grad_norm": 1.0140702724456787, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 43880 + }, + { + "epoch": 3.1518850987432674, + "grad_norm": 0.75941401720047, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 43890 + }, + { + "epoch": 3.152603231597846, + "grad_norm": 0.6618940234184265, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 43900 + }, + { + "epoch": 3.153321364452424, + "grad_norm": 1.0013338327407837, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 43910 + }, + { + "epoch": 3.154039497307002, + "grad_norm": 0.8735299706459045, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 43920 + }, + { + "epoch": 3.15475763016158, + "grad_norm": 1.141914963722229, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 43930 + }, + { + "epoch": 3.155475763016158, + "grad_norm": 1.0916038751602173, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 43940 + }, + { + "epoch": 3.1561938958707363, + "grad_norm": 0.7042547464370728, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 43950 + }, + { + "epoch": 3.1569120287253143, + "grad_norm": 0.9885236620903015, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 43960 + }, + { + "epoch": 3.1576301615798923, + "grad_norm": 0.8083009719848633, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 43970 + }, + { + "epoch": 3.1583482944344703, + "grad_norm": 1.082627296447754, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 43980 + }, + { + "epoch": 3.1590664272890483, + "grad_norm": 0.9293290376663208, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 43990 + }, + { + "epoch": 3.1597845601436267, + "grad_norm": 0.861003041267395, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 44000 + }, + { + "epoch": 3.1605026929982047, + "grad_norm": 0.9565994143486023, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 44010 + }, + { + "epoch": 3.1612208258527827, + "grad_norm": 0.9609305262565613, + "learning_rate": 0.0002, + "loss": 0.7038, + "step": 44020 + }, + { + "epoch": 3.1619389587073607, + "grad_norm": 0.847830593585968, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 44030 + }, + { + "epoch": 3.1626570915619387, + "grad_norm": 0.852357804775238, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 44040 + }, + { + "epoch": 3.163375224416517, + "grad_norm": 0.8634562492370605, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44050 + }, + { + "epoch": 3.164093357271095, + "grad_norm": 1.0259950160980225, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 44060 + }, + { + "epoch": 3.164811490125673, + "grad_norm": 0.9615250825881958, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 44070 + }, + { + "epoch": 3.165529622980251, + "grad_norm": 0.9892165660858154, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 44080 + }, + { + "epoch": 3.1662477558348296, + "grad_norm": 0.8827354907989502, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 44090 + }, + { + "epoch": 3.1669658886894076, + "grad_norm": 0.9258168339729309, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 44100 + }, + { + "epoch": 3.1676840215439857, + "grad_norm": 0.7983399033546448, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 44110 + }, + { + "epoch": 3.1684021543985637, + "grad_norm": 0.9917809963226318, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 44120 + }, + { + "epoch": 3.1691202872531417, + "grad_norm": 1.058927297592163, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44130 + }, + { + "epoch": 3.16983842010772, + "grad_norm": 1.0095895528793335, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44140 + }, + { + "epoch": 3.170556552962298, + "grad_norm": 0.9032495617866516, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 44150 + }, + { + "epoch": 3.171274685816876, + "grad_norm": 0.9391272664070129, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 44160 + }, + { + "epoch": 3.171992818671454, + "grad_norm": 0.990755558013916, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44170 + }, + { + "epoch": 3.172710951526032, + "grad_norm": 0.9310759902000427, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 44180 + }, + { + "epoch": 3.1734290843806106, + "grad_norm": 0.7698856592178345, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 44190 + }, + { + "epoch": 3.1741472172351886, + "grad_norm": 0.7735867500305176, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 44200 + }, + { + "epoch": 3.1748653500897666, + "grad_norm": 1.1447525024414062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 44210 + }, + { + "epoch": 3.1755834829443446, + "grad_norm": 0.8667060136795044, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 44220 + }, + { + "epoch": 3.176301615798923, + "grad_norm": 0.8596829771995544, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 44230 + }, + { + "epoch": 3.177019748653501, + "grad_norm": 0.8607654571533203, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 44240 + }, + { + "epoch": 3.177737881508079, + "grad_norm": 0.9346948266029358, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 44250 + }, + { + "epoch": 3.178456014362657, + "grad_norm": 0.852344810962677, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 44260 + }, + { + "epoch": 3.179174147217235, + "grad_norm": 0.9260450005531311, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 44270 + }, + { + "epoch": 3.1798922800718135, + "grad_norm": 0.924053430557251, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 44280 + }, + { + "epoch": 3.1806104129263915, + "grad_norm": 1.001965045928955, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 44290 + }, + { + "epoch": 3.1813285457809695, + "grad_norm": 0.943215012550354, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 44300 + }, + { + "epoch": 3.1820466786355475, + "grad_norm": 1.006977915763855, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 44310 + }, + { + "epoch": 3.1827648114901255, + "grad_norm": 0.9768950343132019, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 44320 + }, + { + "epoch": 3.183482944344704, + "grad_norm": 0.9297489523887634, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 44330 + }, + { + "epoch": 3.184201077199282, + "grad_norm": 0.9110919237136841, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 44340 + }, + { + "epoch": 3.18491921005386, + "grad_norm": 0.9821381568908691, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 44350 + }, + { + "epoch": 3.185637342908438, + "grad_norm": 0.8451243042945862, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 44360 + }, + { + "epoch": 3.1863554757630164, + "grad_norm": 0.9676638245582581, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 44370 + }, + { + "epoch": 3.1870736086175944, + "grad_norm": 0.9826035499572754, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 44380 + }, + { + "epoch": 3.1877917414721724, + "grad_norm": 0.9453121423721313, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 44390 + }, + { + "epoch": 3.1885098743267504, + "grad_norm": 0.7766330242156982, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 44400 + }, + { + "epoch": 3.1892280071813284, + "grad_norm": 0.9302349090576172, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 44410 + }, + { + "epoch": 3.189946140035907, + "grad_norm": 0.8335331082344055, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 44420 + }, + { + "epoch": 3.190664272890485, + "grad_norm": 0.6722736358642578, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 44430 + }, + { + "epoch": 3.191382405745063, + "grad_norm": 0.9047536849975586, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 44440 + }, + { + "epoch": 3.192100538599641, + "grad_norm": 0.9653822183609009, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 44450 + }, + { + "epoch": 3.192818671454219, + "grad_norm": 0.7750703692436218, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 44460 + }, + { + "epoch": 3.1935368043087973, + "grad_norm": 0.7767539024353027, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 44470 + }, + { + "epoch": 3.1942549371633753, + "grad_norm": 0.8597778081893921, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44480 + }, + { + "epoch": 3.1949730700179533, + "grad_norm": 1.1711493730545044, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 44490 + }, + { + "epoch": 3.1956912028725313, + "grad_norm": 0.9025220274925232, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 44500 + }, + { + "epoch": 3.1964093357271093, + "grad_norm": 0.8084979057312012, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44510 + }, + { + "epoch": 3.1971274685816877, + "grad_norm": 0.8475074172019958, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 44520 + }, + { + "epoch": 3.1978456014362657, + "grad_norm": 0.9915644526481628, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 44530 + }, + { + "epoch": 3.1985637342908437, + "grad_norm": 0.992231547832489, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 44540 + }, + { + "epoch": 3.1992818671454217, + "grad_norm": 0.9804556369781494, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 44550 + }, + { + "epoch": 3.2, + "grad_norm": 1.045558214187622, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 44560 + }, + { + "epoch": 3.200718132854578, + "grad_norm": 1.0880261659622192, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 44570 + }, + { + "epoch": 3.201436265709156, + "grad_norm": 0.9511138200759888, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44580 + }, + { + "epoch": 3.202154398563734, + "grad_norm": 0.9115344882011414, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 44590 + }, + { + "epoch": 3.202872531418312, + "grad_norm": 1.0738362073898315, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 44600 + }, + { + "epoch": 3.2035906642728906, + "grad_norm": 0.8209697604179382, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 44610 + }, + { + "epoch": 3.2043087971274686, + "grad_norm": 0.9220197796821594, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 44620 + }, + { + "epoch": 3.2050269299820466, + "grad_norm": 0.8859700560569763, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 44630 + }, + { + "epoch": 3.2057450628366246, + "grad_norm": 0.9772757291793823, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 44640 + }, + { + "epoch": 3.206463195691203, + "grad_norm": 0.9385574460029602, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 44650 + }, + { + "epoch": 3.207181328545781, + "grad_norm": 0.839958906173706, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 44660 + }, + { + "epoch": 3.207899461400359, + "grad_norm": 0.860478401184082, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 44670 + }, + { + "epoch": 3.208617594254937, + "grad_norm": 0.846886396408081, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 44680 + }, + { + "epoch": 3.209335727109515, + "grad_norm": 0.8591006398200989, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 44690 + }, + { + "epoch": 3.2100538599640935, + "grad_norm": 0.9236023426055908, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 44700 + }, + { + "epoch": 3.2107719928186715, + "grad_norm": 0.7348999977111816, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 44710 + }, + { + "epoch": 3.2114901256732495, + "grad_norm": 1.0041730403900146, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 44720 + }, + { + "epoch": 3.2122082585278275, + "grad_norm": 0.8382687568664551, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 44730 + }, + { + "epoch": 3.2129263913824055, + "grad_norm": 0.8253511190414429, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 44740 + }, + { + "epoch": 3.213644524236984, + "grad_norm": 0.9589242935180664, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 44750 + }, + { + "epoch": 3.214362657091562, + "grad_norm": 0.8938157558441162, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 44760 + }, + { + "epoch": 3.21508078994614, + "grad_norm": 1.0085135698318481, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 44770 + }, + { + "epoch": 3.215798922800718, + "grad_norm": 0.8647134304046631, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 44780 + }, + { + "epoch": 3.216517055655296, + "grad_norm": 1.09453284740448, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 44790 + }, + { + "epoch": 3.2172351885098744, + "grad_norm": 0.8710666298866272, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 44800 + }, + { + "epoch": 3.2179533213644524, + "grad_norm": 0.8080880641937256, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 44810 + }, + { + "epoch": 3.2186714542190304, + "grad_norm": 1.0440675020217896, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 44820 + }, + { + "epoch": 3.2193895870736084, + "grad_norm": 1.1036376953125, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 44830 + }, + { + "epoch": 3.220107719928187, + "grad_norm": 0.8783546686172485, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 44840 + }, + { + "epoch": 3.220825852782765, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 44850 + }, + { + "epoch": 3.221543985637343, + "grad_norm": 1.0099157094955444, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 44860 + }, + { + "epoch": 3.222262118491921, + "grad_norm": 1.054928183555603, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 44870 + }, + { + "epoch": 3.222980251346499, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 44880 + }, + { + "epoch": 3.2236983842010773, + "grad_norm": 0.9730798602104187, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 44890 + }, + { + "epoch": 3.2244165170556554, + "grad_norm": 0.7911382913589478, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 44900 + }, + { + "epoch": 3.2251346499102334, + "grad_norm": 0.9574400782585144, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 44910 + }, + { + "epoch": 3.2258527827648114, + "grad_norm": 0.8101068139076233, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 44920 + }, + { + "epoch": 3.22657091561939, + "grad_norm": 0.754146933555603, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 44930 + }, + { + "epoch": 3.227289048473968, + "grad_norm": 0.7471939921379089, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 44940 + }, + { + "epoch": 3.228007181328546, + "grad_norm": 1.0040855407714844, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 44950 + }, + { + "epoch": 3.228725314183124, + "grad_norm": 1.0016074180603027, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 44960 + }, + { + "epoch": 3.229443447037702, + "grad_norm": 1.0432976484298706, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 44970 + }, + { + "epoch": 3.2301615798922803, + "grad_norm": 0.8517055511474609, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 44980 + }, + { + "epoch": 3.2308797127468583, + "grad_norm": 0.9174178242683411, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 44990 + }, + { + "epoch": 3.2315978456014363, + "grad_norm": 0.9733774065971375, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 45000 + }, + { + "epoch": 3.2323159784560143, + "grad_norm": 0.9074714779853821, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 45010 + }, + { + "epoch": 3.2330341113105923, + "grad_norm": 0.8802759051322937, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 45020 + }, + { + "epoch": 3.2337522441651707, + "grad_norm": 1.0620871782302856, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 45030 + }, + { + "epoch": 3.2344703770197487, + "grad_norm": 0.8069542050361633, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 45040 + }, + { + "epoch": 3.2351885098743267, + "grad_norm": 0.9139137864112854, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 45050 + }, + { + "epoch": 3.2359066427289047, + "grad_norm": 0.8936411142349243, + "learning_rate": 0.0002, + "loss": 0.6389, + "step": 45060 + }, + { + "epoch": 3.2366247755834827, + "grad_norm": 0.9098079204559326, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 45070 + }, + { + "epoch": 3.237342908438061, + "grad_norm": 1.062953233718872, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45080 + }, + { + "epoch": 3.238061041292639, + "grad_norm": 0.8656470775604248, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 45090 + }, + { + "epoch": 3.238779174147217, + "grad_norm": 0.9299449920654297, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 45100 + }, + { + "epoch": 3.239497307001795, + "grad_norm": 1.0102022886276245, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 45110 + }, + { + "epoch": 3.2402154398563736, + "grad_norm": 0.8074561953544617, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 45120 + }, + { + "epoch": 3.2409335727109516, + "grad_norm": 1.044105887413025, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 45130 + }, + { + "epoch": 3.2416517055655296, + "grad_norm": 0.8742762207984924, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 45140 + }, + { + "epoch": 3.2423698384201076, + "grad_norm": 0.8240015506744385, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 45150 + }, + { + "epoch": 3.2430879712746856, + "grad_norm": 0.8438951373100281, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 45160 + }, + { + "epoch": 3.243806104129264, + "grad_norm": 1.02358877658844, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 45170 + }, + { + "epoch": 3.244524236983842, + "grad_norm": 0.8824774026870728, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 45180 + }, + { + "epoch": 3.24524236983842, + "grad_norm": 0.971015989780426, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 45190 + }, + { + "epoch": 3.245960502692998, + "grad_norm": 0.9282383918762207, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 45200 + }, + { + "epoch": 3.2466786355475765, + "grad_norm": 0.7908362746238708, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 45210 + }, + { + "epoch": 3.2473967684021545, + "grad_norm": 1.0721662044525146, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 45220 + }, + { + "epoch": 3.2481149012567325, + "grad_norm": 0.9516810774803162, + "learning_rate": 0.0002, + "loss": 0.7102, + "step": 45230 + }, + { + "epoch": 3.2488330341113105, + "grad_norm": 0.7914131283760071, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 45240 + }, + { + "epoch": 3.2495511669658885, + "grad_norm": 0.8492292761802673, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 45250 + }, + { + "epoch": 3.250269299820467, + "grad_norm": 0.8880114555358887, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 45260 + }, + { + "epoch": 3.250987432675045, + "grad_norm": 0.7808310985565186, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 45270 + }, + { + "epoch": 3.251705565529623, + "grad_norm": 0.8566828966140747, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 45280 + }, + { + "epoch": 3.252423698384201, + "grad_norm": 0.7929658889770508, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45290 + }, + { + "epoch": 3.253141831238779, + "grad_norm": 0.678207516670227, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 45300 + }, + { + "epoch": 3.2538599640933574, + "grad_norm": 0.9963029623031616, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45310 + }, + { + "epoch": 3.2545780969479354, + "grad_norm": 0.835304856300354, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 45320 + }, + { + "epoch": 3.2552962298025134, + "grad_norm": 0.7281617522239685, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 45330 + }, + { + "epoch": 3.2560143626570914, + "grad_norm": 1.244890570640564, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 45340 + }, + { + "epoch": 3.2567324955116694, + "grad_norm": 0.8372750282287598, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 45350 + }, + { + "epoch": 3.257450628366248, + "grad_norm": 1.0029667615890503, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 45360 + }, + { + "epoch": 3.258168761220826, + "grad_norm": 0.8561908602714539, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 45370 + }, + { + "epoch": 3.258886894075404, + "grad_norm": 1.0058085918426514, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 45380 + }, + { + "epoch": 3.259605026929982, + "grad_norm": 0.7768221497535706, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 45390 + }, + { + "epoch": 3.2603231597845603, + "grad_norm": 0.8443793058395386, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 45400 + }, + { + "epoch": 3.2610412926391383, + "grad_norm": 1.0140392780303955, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 45410 + }, + { + "epoch": 3.2617594254937163, + "grad_norm": 0.8397058248519897, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 45420 + }, + { + "epoch": 3.2624775583482943, + "grad_norm": 0.9717063903808594, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 45430 + }, + { + "epoch": 3.2631956912028723, + "grad_norm": 1.0279473066329956, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 45440 + }, + { + "epoch": 3.263913824057451, + "grad_norm": 1.207457184791565, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 45450 + }, + { + "epoch": 3.264631956912029, + "grad_norm": 0.8121998906135559, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 45460 + }, + { + "epoch": 3.265350089766607, + "grad_norm": 1.037733554840088, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 45470 + }, + { + "epoch": 3.266068222621185, + "grad_norm": 0.9305754899978638, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 45480 + }, + { + "epoch": 3.2667863554757632, + "grad_norm": 0.9733602404594421, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 45490 + }, + { + "epoch": 3.2675044883303412, + "grad_norm": 0.8345039486885071, + "learning_rate": 0.0002, + "loss": 0.6713, + "step": 45500 + }, + { + "epoch": 3.2682226211849192, + "grad_norm": 0.8601692318916321, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 45510 + }, + { + "epoch": 3.2689407540394972, + "grad_norm": 0.7921277284622192, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 45520 + }, + { + "epoch": 3.2696588868940752, + "grad_norm": 0.8324153423309326, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 45530 + }, + { + "epoch": 3.2703770197486537, + "grad_norm": 0.85141521692276, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 45540 + }, + { + "epoch": 3.2710951526032317, + "grad_norm": 0.9399608373641968, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 45550 + }, + { + "epoch": 3.2718132854578097, + "grad_norm": 0.9829166531562805, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 45560 + }, + { + "epoch": 3.2725314183123877, + "grad_norm": 0.9936266541481018, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 45570 + }, + { + "epoch": 3.2732495511669657, + "grad_norm": 1.036165714263916, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 45580 + }, + { + "epoch": 3.273967684021544, + "grad_norm": 0.8988680243492126, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45590 + }, + { + "epoch": 3.274685816876122, + "grad_norm": 0.9173405766487122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 45600 + }, + { + "epoch": 3.2754039497307, + "grad_norm": 0.9967324733734131, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 45610 + }, + { + "epoch": 3.276122082585278, + "grad_norm": 0.9097777009010315, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 45620 + }, + { + "epoch": 3.276840215439856, + "grad_norm": 1.0559430122375488, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 45630 + }, + { + "epoch": 3.2775583482944346, + "grad_norm": 0.9583360552787781, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 45640 + }, + { + "epoch": 3.2782764811490126, + "grad_norm": 0.7630334496498108, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 45650 + }, + { + "epoch": 3.2789946140035906, + "grad_norm": 0.9955230355262756, + "learning_rate": 0.0002, + "loss": 0.6873, + "step": 45660 + }, + { + "epoch": 3.2797127468581686, + "grad_norm": 0.8685793876647949, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 45670 + }, + { + "epoch": 3.280430879712747, + "grad_norm": 0.919913113117218, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 45680 + }, + { + "epoch": 3.281149012567325, + "grad_norm": 0.826144814491272, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 45690 + }, + { + "epoch": 3.281867145421903, + "grad_norm": 0.9750179052352905, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 45700 + }, + { + "epoch": 3.282585278276481, + "grad_norm": 0.7931897640228271, + "learning_rate": 0.0002, + "loss": 0.6589, + "step": 45710 + }, + { + "epoch": 3.283303411131059, + "grad_norm": 1.0380089282989502, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 45720 + }, + { + "epoch": 3.2840215439856375, + "grad_norm": 0.8220566511154175, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 45730 + }, + { + "epoch": 3.2847396768402155, + "grad_norm": 0.9688239693641663, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 45740 + }, + { + "epoch": 3.2854578096947935, + "grad_norm": 0.8760311603546143, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 45750 + }, + { + "epoch": 3.2861759425493715, + "grad_norm": 0.8103382587432861, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 45760 + }, + { + "epoch": 3.28689407540395, + "grad_norm": 0.8835865259170532, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 45770 + }, + { + "epoch": 3.287612208258528, + "grad_norm": 0.9021160006523132, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 45780 + }, + { + "epoch": 3.288330341113106, + "grad_norm": 0.8182386159896851, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 45790 + }, + { + "epoch": 3.289048473967684, + "grad_norm": 0.8555024862289429, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 45800 + }, + { + "epoch": 3.289766606822262, + "grad_norm": 1.0982348918914795, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 45810 + }, + { + "epoch": 3.2904847396768404, + "grad_norm": 1.06246817111969, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 45820 + }, + { + "epoch": 3.2912028725314184, + "grad_norm": 1.1727149486541748, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 45830 + }, + { + "epoch": 3.2919210053859964, + "grad_norm": 0.8224700093269348, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 45840 + }, + { + "epoch": 3.2926391382405744, + "grad_norm": 0.8195698261260986, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 45850 + }, + { + "epoch": 3.2933572710951524, + "grad_norm": 0.8424476981163025, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 45860 + }, + { + "epoch": 3.294075403949731, + "grad_norm": 0.9804632067680359, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 45870 + }, + { + "epoch": 3.294793536804309, + "grad_norm": 0.8701804876327515, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 45880 + }, + { + "epoch": 3.295511669658887, + "grad_norm": 0.8876864910125732, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 45890 + }, + { + "epoch": 3.296229802513465, + "grad_norm": 1.0105448961257935, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 45900 + }, + { + "epoch": 3.296947935368043, + "grad_norm": 0.847017228603363, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 45910 + }, + { + "epoch": 3.2976660682226213, + "grad_norm": 0.7610297799110413, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 45920 + }, + { + "epoch": 3.2983842010771993, + "grad_norm": 0.7272670269012451, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 45930 + }, + { + "epoch": 3.2991023339317773, + "grad_norm": 0.8243510127067566, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 45940 + }, + { + "epoch": 3.2998204667863553, + "grad_norm": 1.0113074779510498, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 45950 + }, + { + "epoch": 3.3005385996409338, + "grad_norm": 0.8578087687492371, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 45960 + }, + { + "epoch": 3.3012567324955118, + "grad_norm": 0.9511606097221375, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 45970 + }, + { + "epoch": 3.3019748653500898, + "grad_norm": 0.8612566590309143, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 45980 + }, + { + "epoch": 3.3026929982046678, + "grad_norm": 0.8702331185340881, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 45990 + }, + { + "epoch": 3.3034111310592458, + "grad_norm": 1.0229583978652954, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 46000 + }, + { + "epoch": 3.304129263913824, + "grad_norm": 1.1775577068328857, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 46010 + }, + { + "epoch": 3.3048473967684022, + "grad_norm": 0.9922171831130981, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 46020 + }, + { + "epoch": 3.3055655296229802, + "grad_norm": 0.8246880769729614, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 46030 + }, + { + "epoch": 3.3062836624775582, + "grad_norm": 0.9351653456687927, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 46040 + }, + { + "epoch": 3.3070017953321367, + "grad_norm": 0.9617429375648499, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 46050 + }, + { + "epoch": 3.3077199281867147, + "grad_norm": 0.9753885269165039, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 46060 + }, + { + "epoch": 3.3084380610412927, + "grad_norm": 0.8532425165176392, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 46070 + }, + { + "epoch": 3.3091561938958707, + "grad_norm": 0.9722012877464294, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 46080 + }, + { + "epoch": 3.3098743267504487, + "grad_norm": 0.8950021266937256, + "learning_rate": 0.0002, + "loss": 0.6629, + "step": 46090 + }, + { + "epoch": 3.3105924596050267, + "grad_norm": 0.8536333441734314, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46100 + }, + { + "epoch": 3.311310592459605, + "grad_norm": 0.9423946738243103, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46110 + }, + { + "epoch": 3.312028725314183, + "grad_norm": 0.8573169112205505, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 46120 + }, + { + "epoch": 3.312746858168761, + "grad_norm": 1.0122376680374146, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 46130 + }, + { + "epoch": 3.313464991023339, + "grad_norm": 0.7492560744285583, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 46140 + }, + { + "epoch": 3.3141831238779176, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 46150 + }, + { + "epoch": 3.3149012567324956, + "grad_norm": 1.1191970109939575, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 46160 + }, + { + "epoch": 3.3156193895870736, + "grad_norm": 0.9847373962402344, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 46170 + }, + { + "epoch": 3.3163375224416516, + "grad_norm": 0.7315911054611206, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 46180 + }, + { + "epoch": 3.3170556552962296, + "grad_norm": 0.8267890214920044, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 46190 + }, + { + "epoch": 3.317773788150808, + "grad_norm": 0.8898099064826965, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 46200 + }, + { + "epoch": 3.318491921005386, + "grad_norm": 0.8525369167327881, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 46210 + }, + { + "epoch": 3.319210053859964, + "grad_norm": 0.8074760437011719, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 46220 + }, + { + "epoch": 3.319928186714542, + "grad_norm": 0.8473616242408752, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 46230 + }, + { + "epoch": 3.3206463195691205, + "grad_norm": 0.8678314089775085, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 46240 + }, + { + "epoch": 3.3213644524236985, + "grad_norm": 0.8718782067298889, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 46250 + }, + { + "epoch": 3.3220825852782765, + "grad_norm": 0.9384858012199402, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 46260 + }, + { + "epoch": 3.3228007181328545, + "grad_norm": 0.9295032620429993, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 46270 + }, + { + "epoch": 3.3235188509874325, + "grad_norm": 0.9472482800483704, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 46280 + }, + { + "epoch": 3.324236983842011, + "grad_norm": 0.7970638275146484, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 46290 + }, + { + "epoch": 3.324955116696589, + "grad_norm": 0.9508723020553589, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 46300 + }, + { + "epoch": 3.325673249551167, + "grad_norm": 0.9153636693954468, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 46310 + }, + { + "epoch": 3.326391382405745, + "grad_norm": 0.7890323400497437, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 46320 + }, + { + "epoch": 3.3271095152603234, + "grad_norm": 0.8711825609207153, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46330 + }, + { + "epoch": 3.3278276481149014, + "grad_norm": 0.9938926696777344, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 46340 + }, + { + "epoch": 3.3285457809694794, + "grad_norm": 0.8497524857521057, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 46350 + }, + { + "epoch": 3.3292639138240574, + "grad_norm": 0.9191650748252869, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 46360 + }, + { + "epoch": 3.3299820466786354, + "grad_norm": 0.8974085450172424, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 46370 + }, + { + "epoch": 3.3307001795332134, + "grad_norm": 0.9928934574127197, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 46380 + }, + { + "epoch": 3.331418312387792, + "grad_norm": 0.9011030197143555, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46390 + }, + { + "epoch": 3.33213644524237, + "grad_norm": 0.898594856262207, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 46400 + }, + { + "epoch": 3.332854578096948, + "grad_norm": 0.7506672143936157, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 46410 + }, + { + "epoch": 3.333572710951526, + "grad_norm": 0.9239172339439392, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 46420 + }, + { + "epoch": 3.3342908438061043, + "grad_norm": 1.0749682188034058, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 46430 + }, + { + "epoch": 3.3350089766606823, + "grad_norm": 0.9262617230415344, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 46440 + }, + { + "epoch": 3.3357271095152603, + "grad_norm": 0.8681274056434631, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 46450 + }, + { + "epoch": 3.3364452423698383, + "grad_norm": 0.9558620452880859, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 46460 + }, + { + "epoch": 3.3371633752244163, + "grad_norm": 0.8907097578048706, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 46470 + }, + { + "epoch": 3.3378815080789948, + "grad_norm": 1.0941565036773682, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 46480 + }, + { + "epoch": 3.3385996409335728, + "grad_norm": 0.8971590995788574, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 46490 + }, + { + "epoch": 3.3393177737881508, + "grad_norm": 1.0315606594085693, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 46500 + }, + { + "epoch": 3.3400359066427288, + "grad_norm": 0.7717124223709106, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 46510 + }, + { + "epoch": 3.340754039497307, + "grad_norm": 0.8060970902442932, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 46520 + }, + { + "epoch": 3.341472172351885, + "grad_norm": 0.969510018825531, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 46530 + }, + { + "epoch": 3.342190305206463, + "grad_norm": 0.8837248682975769, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 46540 + }, + { + "epoch": 3.342908438061041, + "grad_norm": 0.9561076164245605, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 46550 + }, + { + "epoch": 3.343626570915619, + "grad_norm": 0.8529208898544312, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 46560 + }, + { + "epoch": 3.3443447037701977, + "grad_norm": 1.1300519704818726, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 46570 + }, + { + "epoch": 3.3450628366247757, + "grad_norm": 0.8330956101417542, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46580 + }, + { + "epoch": 3.3457809694793537, + "grad_norm": 0.7699366211891174, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 46590 + }, + { + "epoch": 3.3464991023339317, + "grad_norm": 1.0470821857452393, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 46600 + }, + { + "epoch": 3.34721723518851, + "grad_norm": 0.9933704137802124, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46610 + }, + { + "epoch": 3.347935368043088, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 46620 + }, + { + "epoch": 3.348653500897666, + "grad_norm": 0.9746946692466736, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46630 + }, + { + "epoch": 3.349371633752244, + "grad_norm": 0.8607267141342163, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 46640 + }, + { + "epoch": 3.350089766606822, + "grad_norm": 0.800335705280304, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 46650 + }, + { + "epoch": 3.3508078994614, + "grad_norm": 1.0083239078521729, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 46660 + }, + { + "epoch": 3.3515260323159786, + "grad_norm": 1.0774433612823486, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 46670 + }, + { + "epoch": 3.3522441651705566, + "grad_norm": 0.9378824234008789, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 46680 + }, + { + "epoch": 3.3529622980251346, + "grad_norm": 0.8490564227104187, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 46690 + }, + { + "epoch": 3.3536804308797126, + "grad_norm": 1.0415582656860352, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 46700 + }, + { + "epoch": 3.354398563734291, + "grad_norm": 0.8514367938041687, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 46710 + }, + { + "epoch": 3.355116696588869, + "grad_norm": 0.7691360712051392, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 46720 + }, + { + "epoch": 3.355834829443447, + "grad_norm": 0.8345438241958618, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 46730 + }, + { + "epoch": 3.356552962298025, + "grad_norm": 1.023492693901062, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 46740 + }, + { + "epoch": 3.357271095152603, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 46750 + }, + { + "epoch": 3.3579892280071815, + "grad_norm": 0.9029248356819153, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 46760 + }, + { + "epoch": 3.3587073608617595, + "grad_norm": 0.9109513759613037, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 46770 + }, + { + "epoch": 3.3594254937163375, + "grad_norm": 0.7757390141487122, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 46780 + }, + { + "epoch": 3.3601436265709155, + "grad_norm": 0.794035792350769, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 46790 + }, + { + "epoch": 3.360861759425494, + "grad_norm": 0.8211429715156555, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 46800 + }, + { + "epoch": 3.361579892280072, + "grad_norm": 0.8620322346687317, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 46810 + }, + { + "epoch": 3.36229802513465, + "grad_norm": 0.9392538070678711, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 46820 + }, + { + "epoch": 3.363016157989228, + "grad_norm": 0.8297873139381409, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 46830 + }, + { + "epoch": 3.363734290843806, + "grad_norm": 0.9158190488815308, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 46840 + }, + { + "epoch": 3.3644524236983844, + "grad_norm": 1.1449424028396606, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 46850 + }, + { + "epoch": 3.3651705565529624, + "grad_norm": 0.8718444108963013, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 46860 + }, + { + "epoch": 3.3658886894075404, + "grad_norm": 0.7744014263153076, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 46870 + }, + { + "epoch": 3.3666068222621184, + "grad_norm": 0.8392460942268372, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 46880 + }, + { + "epoch": 3.367324955116697, + "grad_norm": 1.0424989461898804, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 46890 + }, + { + "epoch": 3.368043087971275, + "grad_norm": 1.4696359634399414, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 46900 + }, + { + "epoch": 3.368761220825853, + "grad_norm": 0.9298201203346252, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 46910 + }, + { + "epoch": 3.369479353680431, + "grad_norm": 0.8965262770652771, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 46920 + }, + { + "epoch": 3.370197486535009, + "grad_norm": 0.9395381808280945, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 46930 + }, + { + "epoch": 3.370915619389587, + "grad_norm": 0.9069047570228577, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 46940 + }, + { + "epoch": 3.3716337522441653, + "grad_norm": 0.9208605885505676, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 46950 + }, + { + "epoch": 3.3723518850987433, + "grad_norm": 0.9493077397346497, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 46960 + }, + { + "epoch": 3.3730700179533213, + "grad_norm": 1.0804208517074585, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 46970 + }, + { + "epoch": 3.3737881508078993, + "grad_norm": 0.9465714693069458, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 46980 + }, + { + "epoch": 3.3745062836624777, + "grad_norm": 0.9189882278442383, + "learning_rate": 0.0002, + "loss": 0.6633, + "step": 46990 + }, + { + "epoch": 3.3752244165170557, + "grad_norm": 1.0199357271194458, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 47000 + }, + { + "epoch": 3.3759425493716337, + "grad_norm": 0.8999426960945129, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 47010 + }, + { + "epoch": 3.3766606822262117, + "grad_norm": 0.8923690319061279, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 47020 + }, + { + "epoch": 3.3773788150807897, + "grad_norm": 0.7459347248077393, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 47030 + }, + { + "epoch": 3.378096947935368, + "grad_norm": 0.7702858448028564, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 47040 + }, + { + "epoch": 3.378815080789946, + "grad_norm": 0.8296625018119812, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 47050 + }, + { + "epoch": 3.379533213644524, + "grad_norm": 1.2952555418014526, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47060 + }, + { + "epoch": 3.380251346499102, + "grad_norm": 0.7778869271278381, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 47070 + }, + { + "epoch": 3.3809694793536806, + "grad_norm": 0.9151549339294434, + "learning_rate": 0.0002, + "loss": 0.6906, + "step": 47080 + }, + { + "epoch": 3.3816876122082586, + "grad_norm": 0.7883925437927246, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 47090 + }, + { + "epoch": 3.3824057450628366, + "grad_norm": 0.9602295756340027, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 47100 + }, + { + "epoch": 3.3831238779174146, + "grad_norm": 0.7953121066093445, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47110 + }, + { + "epoch": 3.3838420107719926, + "grad_norm": 1.110148549079895, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 47120 + }, + { + "epoch": 3.384560143626571, + "grad_norm": 0.9359608888626099, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 47130 + }, + { + "epoch": 3.385278276481149, + "grad_norm": 0.7877762317657471, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 47140 + }, + { + "epoch": 3.385996409335727, + "grad_norm": 0.8586933016777039, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47150 + }, + { + "epoch": 3.386714542190305, + "grad_norm": 0.8920878767967224, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 47160 + }, + { + "epoch": 3.3874326750448835, + "grad_norm": 0.9692603349685669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 47170 + }, + { + "epoch": 3.3881508078994615, + "grad_norm": 0.9038610458374023, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 47180 + }, + { + "epoch": 3.3888689407540395, + "grad_norm": 1.6299188137054443, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 47190 + }, + { + "epoch": 3.3895870736086176, + "grad_norm": 0.9704291820526123, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 47200 + }, + { + "epoch": 3.3903052064631956, + "grad_norm": 0.9503401517868042, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 47210 + }, + { + "epoch": 3.3910233393177736, + "grad_norm": 1.0051378011703491, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 47220 + }, + { + "epoch": 3.391741472172352, + "grad_norm": 0.7336357235908508, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 47230 + }, + { + "epoch": 3.39245960502693, + "grad_norm": 0.9847398996353149, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47240 + }, + { + "epoch": 3.393177737881508, + "grad_norm": 0.8100917339324951, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 47250 + }, + { + "epoch": 3.393895870736086, + "grad_norm": 0.9752838611602783, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 47260 + }, + { + "epoch": 3.3946140035906645, + "grad_norm": 0.9400623440742493, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 47270 + }, + { + "epoch": 3.3953321364452425, + "grad_norm": 0.7310057878494263, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 47280 + }, + { + "epoch": 3.3960502692998205, + "grad_norm": 0.8898789286613464, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 47290 + }, + { + "epoch": 3.3967684021543985, + "grad_norm": 1.0157585144042969, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 47300 + }, + { + "epoch": 3.3974865350089765, + "grad_norm": 0.9108527898788452, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 47310 + }, + { + "epoch": 3.398204667863555, + "grad_norm": 0.9796249270439148, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 47320 + }, + { + "epoch": 3.398922800718133, + "grad_norm": 0.8176435232162476, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 47330 + }, + { + "epoch": 3.399640933572711, + "grad_norm": 0.9981188178062439, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 47340 + }, + { + "epoch": 3.400359066427289, + "grad_norm": 0.9774404764175415, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 47350 + }, + { + "epoch": 3.4010771992818674, + "grad_norm": 0.8624991774559021, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 47360 + }, + { + "epoch": 3.4017953321364454, + "grad_norm": 0.9191665053367615, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 47370 + }, + { + "epoch": 3.4025134649910234, + "grad_norm": 0.7971290946006775, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 47380 + }, + { + "epoch": 3.4032315978456014, + "grad_norm": 0.8336732983589172, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 47390 + }, + { + "epoch": 3.4039497307001794, + "grad_norm": 0.7730334401130676, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 47400 + }, + { + "epoch": 3.404667863554758, + "grad_norm": 0.8559145927429199, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 47410 + }, + { + "epoch": 3.405385996409336, + "grad_norm": 1.0261447429656982, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 47420 + }, + { + "epoch": 3.406104129263914, + "grad_norm": 0.9931781888008118, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 47430 + }, + { + "epoch": 3.406822262118492, + "grad_norm": 0.8971807360649109, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 47440 + }, + { + "epoch": 3.4075403949730703, + "grad_norm": 0.8886999487876892, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 47450 + }, + { + "epoch": 3.4082585278276483, + "grad_norm": 0.9551735520362854, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 47460 + }, + { + "epoch": 3.4089766606822263, + "grad_norm": 0.9066859483718872, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 47470 + }, + { + "epoch": 3.4096947935368043, + "grad_norm": 0.9192125201225281, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 47480 + }, + { + "epoch": 3.4104129263913823, + "grad_norm": 0.9332839250564575, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 47490 + }, + { + "epoch": 3.4111310592459603, + "grad_norm": 0.745563805103302, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47500 + }, + { + "epoch": 3.4118491921005387, + "grad_norm": 0.6843905448913574, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 47510 + }, + { + "epoch": 3.4125673249551167, + "grad_norm": 0.8063111305236816, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 47520 + }, + { + "epoch": 3.4132854578096947, + "grad_norm": 0.9666593670845032, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 47530 + }, + { + "epoch": 3.4140035906642727, + "grad_norm": 0.8112747073173523, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47540 + }, + { + "epoch": 3.414721723518851, + "grad_norm": 0.820807933807373, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 47550 + }, + { + "epoch": 3.415439856373429, + "grad_norm": 0.8476285338401794, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 47560 + }, + { + "epoch": 3.416157989228007, + "grad_norm": 1.0232552289962769, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47570 + }, + { + "epoch": 3.416876122082585, + "grad_norm": 0.8749372363090515, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 47580 + }, + { + "epoch": 3.417594254937163, + "grad_norm": 0.8117937445640564, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 47590 + }, + { + "epoch": 3.4183123877917416, + "grad_norm": 0.9010460376739502, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 47600 + }, + { + "epoch": 3.4190305206463196, + "grad_norm": 0.8955527544021606, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 47610 + }, + { + "epoch": 3.4197486535008976, + "grad_norm": 0.884186327457428, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 47620 + }, + { + "epoch": 3.4204667863554756, + "grad_norm": 0.8995241522789001, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 47630 + }, + { + "epoch": 3.421184919210054, + "grad_norm": 1.0627013444900513, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 47640 + }, + { + "epoch": 3.421903052064632, + "grad_norm": 0.8619979619979858, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 47650 + }, + { + "epoch": 3.42262118491921, + "grad_norm": 0.9682498574256897, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 47660 + }, + { + "epoch": 3.423339317773788, + "grad_norm": 0.9614400863647461, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 47670 + }, + { + "epoch": 3.424057450628366, + "grad_norm": 0.7986962795257568, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 47680 + }, + { + "epoch": 3.4247755834829445, + "grad_norm": 0.8255957961082458, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 47690 + }, + { + "epoch": 3.4254937163375225, + "grad_norm": 0.9139757752418518, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 47700 + }, + { + "epoch": 3.4262118491921005, + "grad_norm": 0.8086292743682861, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 47710 + }, + { + "epoch": 3.4269299820466785, + "grad_norm": 0.8852273225784302, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 47720 + }, + { + "epoch": 3.427648114901257, + "grad_norm": 0.7568784356117249, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 47730 + }, + { + "epoch": 3.428366247755835, + "grad_norm": 0.8933039903640747, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 47740 + }, + { + "epoch": 3.429084380610413, + "grad_norm": 0.8101669549942017, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 47750 + }, + { + "epoch": 3.429802513464991, + "grad_norm": 0.7021054625511169, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 47760 + }, + { + "epoch": 3.430520646319569, + "grad_norm": 0.8282538652420044, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 47770 + }, + { + "epoch": 3.431238779174147, + "grad_norm": 0.8168348670005798, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 47780 + }, + { + "epoch": 3.4319569120287254, + "grad_norm": 0.9504001140594482, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 47790 + }, + { + "epoch": 3.4326750448833034, + "grad_norm": 0.7500190734863281, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 47800 + }, + { + "epoch": 3.4333931777378814, + "grad_norm": 0.8645710945129395, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 47810 + }, + { + "epoch": 3.4341113105924594, + "grad_norm": 0.8088704943656921, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 47820 + }, + { + "epoch": 3.434829443447038, + "grad_norm": 0.9981673955917358, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 47830 + }, + { + "epoch": 3.435547576301616, + "grad_norm": 0.9363315105438232, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 47840 + }, + { + "epoch": 3.436265709156194, + "grad_norm": 0.8471030592918396, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 47850 + }, + { + "epoch": 3.436983842010772, + "grad_norm": 0.9447668790817261, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 47860 + }, + { + "epoch": 3.43770197486535, + "grad_norm": 0.9494127631187439, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 47870 + }, + { + "epoch": 3.4384201077199283, + "grad_norm": 0.8340432643890381, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 47880 + }, + { + "epoch": 3.4391382405745063, + "grad_norm": 0.8466387987136841, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 47890 + }, + { + "epoch": 3.4398563734290843, + "grad_norm": 0.9498962759971619, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 47900 + }, + { + "epoch": 3.4405745062836623, + "grad_norm": 0.8490501046180725, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 47910 + }, + { + "epoch": 3.441292639138241, + "grad_norm": 0.9506490230560303, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 47920 + }, + { + "epoch": 3.442010771992819, + "grad_norm": 0.7944257855415344, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 47930 + }, + { + "epoch": 3.442728904847397, + "grad_norm": 0.9725518226623535, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 47940 + }, + { + "epoch": 3.443447037701975, + "grad_norm": 0.7823024392127991, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 47950 + }, + { + "epoch": 3.444165170556553, + "grad_norm": 0.810565173625946, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 47960 + }, + { + "epoch": 3.4448833034111312, + "grad_norm": 0.9809024333953857, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 47970 + }, + { + "epoch": 3.4456014362657092, + "grad_norm": 0.8818578720092773, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 47980 + }, + { + "epoch": 3.4463195691202873, + "grad_norm": 0.9843092560768127, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 47990 + }, + { + "epoch": 3.4470377019748653, + "grad_norm": 0.916313886642456, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 48000 + }, + { + "epoch": 3.4477558348294433, + "grad_norm": 0.908442497253418, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 48010 + }, + { + "epoch": 3.4484739676840217, + "grad_norm": 0.9880178570747375, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 48020 + }, + { + "epoch": 3.4491921005385997, + "grad_norm": 0.9276854991912842, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 48030 + }, + { + "epoch": 3.4499102333931777, + "grad_norm": 1.0879448652267456, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 48040 + }, + { + "epoch": 3.4506283662477557, + "grad_norm": 0.7430389523506165, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 48050 + }, + { + "epoch": 3.4513464991023337, + "grad_norm": 1.0880072116851807, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 48060 + }, + { + "epoch": 3.452064631956912, + "grad_norm": 1.0424141883850098, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 48070 + }, + { + "epoch": 3.45278276481149, + "grad_norm": 0.926330029964447, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 48080 + }, + { + "epoch": 3.453500897666068, + "grad_norm": 0.8911219239234924, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 48090 + }, + { + "epoch": 3.454219030520646, + "grad_norm": 0.8727201223373413, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 48100 + }, + { + "epoch": 3.4549371633752246, + "grad_norm": 0.8573940396308899, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48110 + }, + { + "epoch": 3.4556552962298026, + "grad_norm": 1.0427064895629883, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 48120 + }, + { + "epoch": 3.4563734290843806, + "grad_norm": 0.8688231706619263, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 48130 + }, + { + "epoch": 3.4570915619389586, + "grad_norm": 0.8856009244918823, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 48140 + }, + { + "epoch": 3.4578096947935366, + "grad_norm": 0.9535353183746338, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 48150 + }, + { + "epoch": 3.458527827648115, + "grad_norm": 0.9466010928153992, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 48160 + }, + { + "epoch": 3.459245960502693, + "grad_norm": 0.9783535599708557, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 48170 + }, + { + "epoch": 3.459964093357271, + "grad_norm": 0.8010456562042236, + "learning_rate": 0.0002, + "loss": 0.6926, + "step": 48180 + }, + { + "epoch": 3.460682226211849, + "grad_norm": 0.8928955793380737, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 48190 + }, + { + "epoch": 3.4614003590664275, + "grad_norm": 0.7565838694572449, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 48200 + }, + { + "epoch": 3.4621184919210055, + "grad_norm": 1.0044180154800415, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 48210 + }, + { + "epoch": 3.4628366247755835, + "grad_norm": 0.8161038160324097, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 48220 + }, + { + "epoch": 3.4635547576301615, + "grad_norm": 1.1000211238861084, + "learning_rate": 0.0002, + "loss": 0.6869, + "step": 48230 + }, + { + "epoch": 3.4642728904847395, + "grad_norm": 0.7942240238189697, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 48240 + }, + { + "epoch": 3.464991023339318, + "grad_norm": 0.7546432018280029, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 48250 + }, + { + "epoch": 3.465709156193896, + "grad_norm": 0.7705255150794983, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 48260 + }, + { + "epoch": 3.466427289048474, + "grad_norm": 0.7958067059516907, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 48270 + }, + { + "epoch": 3.467145421903052, + "grad_norm": 0.9199120402336121, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48280 + }, + { + "epoch": 3.46786355475763, + "grad_norm": 1.118672251701355, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 48290 + }, + { + "epoch": 3.4685816876122084, + "grad_norm": 0.9161015748977661, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 48300 + }, + { + "epoch": 3.4692998204667864, + "grad_norm": 1.1086218357086182, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 48310 + }, + { + "epoch": 3.4700179533213644, + "grad_norm": 1.0123368501663208, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 48320 + }, + { + "epoch": 3.4707360861759424, + "grad_norm": 0.7380602359771729, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 48330 + }, + { + "epoch": 3.4714542190305204, + "grad_norm": 0.8967105150222778, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 48340 + }, + { + "epoch": 3.472172351885099, + "grad_norm": 1.0134044885635376, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 48350 + }, + { + "epoch": 3.472890484739677, + "grad_norm": 1.080815076828003, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 48360 + }, + { + "epoch": 3.473608617594255, + "grad_norm": 1.151721477508545, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 48370 + }, + { + "epoch": 3.474326750448833, + "grad_norm": 0.9436505436897278, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 48380 + }, + { + "epoch": 3.4750448833034113, + "grad_norm": 0.9154609441757202, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 48390 + }, + { + "epoch": 3.4757630161579893, + "grad_norm": 0.8943037986755371, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 48400 + }, + { + "epoch": 3.4764811490125673, + "grad_norm": 0.936988115310669, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 48410 + }, + { + "epoch": 3.4771992818671453, + "grad_norm": 0.826960027217865, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 48420 + }, + { + "epoch": 3.4779174147217233, + "grad_norm": 1.0487587451934814, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 48430 + }, + { + "epoch": 3.478635547576302, + "grad_norm": 0.729163646697998, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 48440 + }, + { + "epoch": 3.47935368043088, + "grad_norm": 0.8156948089599609, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 48450 + }, + { + "epoch": 3.480071813285458, + "grad_norm": 0.8004332184791565, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 48460 + }, + { + "epoch": 3.480789946140036, + "grad_norm": 0.9632692337036133, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 48470 + }, + { + "epoch": 3.4815080789946142, + "grad_norm": 1.0950212478637695, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 48480 + }, + { + "epoch": 3.4822262118491922, + "grad_norm": 0.8574318885803223, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 48490 + }, + { + "epoch": 3.4829443447037702, + "grad_norm": 0.8552606701850891, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 48500 + }, + { + "epoch": 3.4836624775583482, + "grad_norm": 0.9698445200920105, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 48510 + }, + { + "epoch": 3.4843806104129262, + "grad_norm": 0.9427815675735474, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 48520 + }, + { + "epoch": 3.4850987432675042, + "grad_norm": 0.7902070879936218, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 48530 + }, + { + "epoch": 3.4858168761220827, + "grad_norm": 1.0300066471099854, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 48540 + }, + { + "epoch": 3.4865350089766607, + "grad_norm": 1.1688778400421143, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 48550 + }, + { + "epoch": 3.4872531418312387, + "grad_norm": 1.0012071132659912, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 48560 + }, + { + "epoch": 3.4879712746858167, + "grad_norm": 1.112094759941101, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 48570 + }, + { + "epoch": 3.488689407540395, + "grad_norm": 0.8547284603118896, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 48580 + }, + { + "epoch": 3.489407540394973, + "grad_norm": 0.8827278017997742, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 48590 + }, + { + "epoch": 3.490125673249551, + "grad_norm": 0.9255490303039551, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 48600 + }, + { + "epoch": 3.490843806104129, + "grad_norm": 0.8000030517578125, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 48610 + }, + { + "epoch": 3.491561938958707, + "grad_norm": 0.9327391386032104, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 48620 + }, + { + "epoch": 3.4922800718132856, + "grad_norm": 0.9004138708114624, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 48630 + }, + { + "epoch": 3.4929982046678636, + "grad_norm": 0.9886971116065979, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 48640 + }, + { + "epoch": 3.4937163375224416, + "grad_norm": 0.9890487194061279, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 48650 + }, + { + "epoch": 3.4944344703770196, + "grad_norm": 0.7024438977241516, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 48660 + }, + { + "epoch": 3.495152603231598, + "grad_norm": 0.8397303223609924, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 48670 + }, + { + "epoch": 3.495870736086176, + "grad_norm": 0.9120950698852539, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 48680 + }, + { + "epoch": 3.496588868940754, + "grad_norm": 1.057299017906189, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48690 + }, + { + "epoch": 3.497307001795332, + "grad_norm": 0.821325957775116, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 48700 + }, + { + "epoch": 3.49802513464991, + "grad_norm": 1.0029970407485962, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 48710 + }, + { + "epoch": 3.4987432675044885, + "grad_norm": 0.9483712911605835, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 48720 + }, + { + "epoch": 3.4994614003590665, + "grad_norm": 0.9637855291366577, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 48730 + }, + { + "epoch": 3.5001795332136445, + "grad_norm": 0.6848894357681274, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 48740 + }, + { + "epoch": 3.5008976660682225, + "grad_norm": 0.7848573327064514, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 48750 + }, + { + "epoch": 3.501615798922801, + "grad_norm": 1.0341308116912842, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 48760 + }, + { + "epoch": 3.502333931777379, + "grad_norm": 0.8858218193054199, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 48770 + }, + { + "epoch": 3.503052064631957, + "grad_norm": 0.8366939425468445, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 48780 + }, + { + "epoch": 3.503770197486535, + "grad_norm": 0.7926092147827148, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 48790 + }, + { + "epoch": 3.504488330341113, + "grad_norm": 0.8503843545913696, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 48800 + }, + { + "epoch": 3.505206463195691, + "grad_norm": 0.8867869973182678, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 48810 + }, + { + "epoch": 3.5059245960502694, + "grad_norm": 1.0336930751800537, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 48820 + }, + { + "epoch": 3.5066427289048474, + "grad_norm": 0.8564051985740662, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 48830 + }, + { + "epoch": 3.5073608617594254, + "grad_norm": 0.9202605485916138, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 48840 + }, + { + "epoch": 3.508078994614004, + "grad_norm": 0.8838639855384827, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 48850 + }, + { + "epoch": 3.508797127468582, + "grad_norm": 0.8975196480751038, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 48860 + }, + { + "epoch": 3.50951526032316, + "grad_norm": 0.8842370510101318, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 48870 + }, + { + "epoch": 3.510233393177738, + "grad_norm": 0.9195886254310608, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 48880 + }, + { + "epoch": 3.510951526032316, + "grad_norm": 0.986130952835083, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 48890 + }, + { + "epoch": 3.511669658886894, + "grad_norm": 0.8119593858718872, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 48900 + }, + { + "epoch": 3.5123877917414723, + "grad_norm": 0.9027136564254761, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 48910 + }, + { + "epoch": 3.5131059245960503, + "grad_norm": 0.8560537099838257, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 48920 + }, + { + "epoch": 3.5138240574506283, + "grad_norm": 0.7073559165000916, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 48930 + }, + { + "epoch": 3.5145421903052063, + "grad_norm": 0.8753304481506348, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 48940 + }, + { + "epoch": 3.5152603231597848, + "grad_norm": 0.9151145815849304, + "learning_rate": 0.0002, + "loss": 0.6366, + "step": 48950 + }, + { + "epoch": 3.5159784560143628, + "grad_norm": 0.7794315814971924, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 48960 + }, + { + "epoch": 3.5166965888689408, + "grad_norm": 0.9226023554801941, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 48970 + }, + { + "epoch": 3.5174147217235188, + "grad_norm": 0.8442051410675049, + "learning_rate": 0.0002, + "loss": 0.6473, + "step": 48980 + }, + { + "epoch": 3.5181328545780968, + "grad_norm": 0.9769423007965088, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 48990 + }, + { + "epoch": 3.5188509874326748, + "grad_norm": 0.740347146987915, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 49000 + }, + { + "epoch": 3.519569120287253, + "grad_norm": 0.8963457345962524, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 49010 + }, + { + "epoch": 3.520287253141831, + "grad_norm": 0.8410176634788513, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 49020 + }, + { + "epoch": 3.521005385996409, + "grad_norm": 1.0486022233963013, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 49030 + }, + { + "epoch": 3.5217235188509877, + "grad_norm": 0.95393967628479, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 49040 + }, + { + "epoch": 3.5224416517055657, + "grad_norm": 0.8261157274246216, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49050 + }, + { + "epoch": 3.5231597845601437, + "grad_norm": 0.9321704506874084, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 49060 + }, + { + "epoch": 3.5238779174147217, + "grad_norm": 1.2596088647842407, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 49070 + }, + { + "epoch": 3.5245960502692997, + "grad_norm": 0.8584637641906738, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 49080 + }, + { + "epoch": 3.5253141831238777, + "grad_norm": 0.850520670413971, + "learning_rate": 0.0002, + "loss": 0.6708, + "step": 49090 + }, + { + "epoch": 3.526032315978456, + "grad_norm": 0.8915920257568359, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 49100 + }, + { + "epoch": 3.526750448833034, + "grad_norm": 0.9070239067077637, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 49110 + }, + { + "epoch": 3.527468581687612, + "grad_norm": 0.699878990650177, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 49120 + }, + { + "epoch": 3.5281867145421906, + "grad_norm": 0.9003779888153076, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 49130 + }, + { + "epoch": 3.5289048473967686, + "grad_norm": 0.7886711955070496, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 49140 + }, + { + "epoch": 3.5296229802513466, + "grad_norm": 0.7368922233581543, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 49150 + }, + { + "epoch": 3.5303411131059246, + "grad_norm": 0.8585197329521179, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 49160 + }, + { + "epoch": 3.5310592459605026, + "grad_norm": 1.0205435752868652, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 49170 + }, + { + "epoch": 3.5317773788150806, + "grad_norm": 0.8756650686264038, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 49180 + }, + { + "epoch": 3.532495511669659, + "grad_norm": 1.0278643369674683, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 49190 + }, + { + "epoch": 3.533213644524237, + "grad_norm": 0.8641911745071411, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 49200 + }, + { + "epoch": 3.533931777378815, + "grad_norm": 0.8730159401893616, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 49210 + }, + { + "epoch": 3.534649910233393, + "grad_norm": 0.918637216091156, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 49220 + }, + { + "epoch": 3.5353680430879715, + "grad_norm": 1.0467222929000854, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 49230 + }, + { + "epoch": 3.5360861759425495, + "grad_norm": 1.005009412765503, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 49240 + }, + { + "epoch": 3.5368043087971275, + "grad_norm": 0.9775063395500183, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 49250 + }, + { + "epoch": 3.5375224416517055, + "grad_norm": 0.8198322057723999, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 49260 + }, + { + "epoch": 3.5382405745062835, + "grad_norm": 0.8184829354286194, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 49270 + }, + { + "epoch": 3.5389587073608615, + "grad_norm": 0.9520270824432373, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 49280 + }, + { + "epoch": 3.53967684021544, + "grad_norm": 0.7816803455352783, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 49290 + }, + { + "epoch": 3.540394973070018, + "grad_norm": 0.6915702819824219, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 49300 + }, + { + "epoch": 3.541113105924596, + "grad_norm": 0.8282375931739807, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 49310 + }, + { + "epoch": 3.5418312387791744, + "grad_norm": 1.0797513723373413, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 49320 + }, + { + "epoch": 3.5425493716337524, + "grad_norm": 0.868671715259552, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 49330 + }, + { + "epoch": 3.5432675044883304, + "grad_norm": 0.8534455895423889, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 49340 + }, + { + "epoch": 3.5439856373429084, + "grad_norm": 0.816411554813385, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 49350 + }, + { + "epoch": 3.5447037701974864, + "grad_norm": 0.7813423275947571, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 49360 + }, + { + "epoch": 3.5454219030520644, + "grad_norm": 0.8002013564109802, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 49370 + }, + { + "epoch": 3.546140035906643, + "grad_norm": 0.9740113615989685, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 49380 + }, + { + "epoch": 3.546858168761221, + "grad_norm": 0.9046127200126648, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 49390 + }, + { + "epoch": 3.547576301615799, + "grad_norm": 0.8635150194168091, + "learning_rate": 0.0002, + "loss": 0.6444, + "step": 49400 + }, + { + "epoch": 3.5482944344703773, + "grad_norm": 0.9488558769226074, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 49410 + }, + { + "epoch": 3.5490125673249553, + "grad_norm": 0.9637090563774109, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 49420 + }, + { + "epoch": 3.5497307001795333, + "grad_norm": 1.042245626449585, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 49430 + }, + { + "epoch": 3.5504488330341113, + "grad_norm": 0.9076175689697266, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 49440 + }, + { + "epoch": 3.5511669658886893, + "grad_norm": 0.8480596542358398, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 49450 + }, + { + "epoch": 3.5518850987432673, + "grad_norm": 0.8483007550239563, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 49460 + }, + { + "epoch": 3.5526032315978457, + "grad_norm": 0.7855815887451172, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 49470 + }, + { + "epoch": 3.5533213644524237, + "grad_norm": 0.8435823917388916, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 49480 + }, + { + "epoch": 3.5540394973070017, + "grad_norm": 0.8613026142120361, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 49490 + }, + { + "epoch": 3.5547576301615798, + "grad_norm": 0.9654812812805176, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 49500 + }, + { + "epoch": 3.555475763016158, + "grad_norm": 0.8888838887214661, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 49510 + }, + { + "epoch": 3.556193895870736, + "grad_norm": 0.7718146443367004, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49520 + }, + { + "epoch": 3.556912028725314, + "grad_norm": 0.9487382173538208, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 49530 + }, + { + "epoch": 3.557630161579892, + "grad_norm": 0.9256559610366821, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 49540 + }, + { + "epoch": 3.55834829443447, + "grad_norm": 0.8879945874214172, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 49550 + }, + { + "epoch": 3.559066427289048, + "grad_norm": 0.8498744368553162, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 49560 + }, + { + "epoch": 3.5597845601436267, + "grad_norm": 0.9550948143005371, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 49570 + }, + { + "epoch": 3.5605026929982047, + "grad_norm": 0.8386164903640747, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 49580 + }, + { + "epoch": 3.5612208258527827, + "grad_norm": 0.925573468208313, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 49590 + }, + { + "epoch": 3.561938958707361, + "grad_norm": 0.8867112398147583, + "learning_rate": 0.0002, + "loss": 0.676, + "step": 49600 + }, + { + "epoch": 3.562657091561939, + "grad_norm": 0.7638537883758545, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 49610 + }, + { + "epoch": 3.563375224416517, + "grad_norm": 0.9491845965385437, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 49620 + }, + { + "epoch": 3.564093357271095, + "grad_norm": 0.8384189605712891, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 49630 + }, + { + "epoch": 3.564811490125673, + "grad_norm": 0.8850575089454651, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 49640 + }, + { + "epoch": 3.565529622980251, + "grad_norm": 1.020916223526001, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 49650 + }, + { + "epoch": 3.5662477558348296, + "grad_norm": 0.9298280477523804, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 49660 + }, + { + "epoch": 3.5669658886894076, + "grad_norm": 0.9795742034912109, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 49670 + }, + { + "epoch": 3.5676840215439856, + "grad_norm": 0.9401193261146545, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 49680 + }, + { + "epoch": 3.568402154398564, + "grad_norm": 1.0383585691452026, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 49690 + }, + { + "epoch": 3.569120287253142, + "grad_norm": 0.8370866179466248, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 49700 + }, + { + "epoch": 3.56983842010772, + "grad_norm": 0.8207486271858215, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 49710 + }, + { + "epoch": 3.570556552962298, + "grad_norm": 0.8551223278045654, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 49720 + }, + { + "epoch": 3.571274685816876, + "grad_norm": 0.8041176199913025, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 49730 + }, + { + "epoch": 3.571992818671454, + "grad_norm": 0.9862527847290039, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 49740 + }, + { + "epoch": 3.5727109515260325, + "grad_norm": 0.7557165622711182, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 49750 + }, + { + "epoch": 3.5734290843806105, + "grad_norm": 1.0908563137054443, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 49760 + }, + { + "epoch": 3.5741472172351885, + "grad_norm": 0.7245369553565979, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 49770 + }, + { + "epoch": 3.5748653500897665, + "grad_norm": 0.7851184010505676, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 49780 + }, + { + "epoch": 3.575583482944345, + "grad_norm": 0.9443599581718445, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 49790 + }, + { + "epoch": 3.576301615798923, + "grad_norm": 1.021196961402893, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 49800 + }, + { + "epoch": 3.577019748653501, + "grad_norm": 0.9099196195602417, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 49810 + }, + { + "epoch": 3.577737881508079, + "grad_norm": 0.9397716522216797, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 49820 + }, + { + "epoch": 3.578456014362657, + "grad_norm": 0.9214922785758972, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 49830 + }, + { + "epoch": 3.579174147217235, + "grad_norm": 1.0053879022598267, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 49840 + }, + { + "epoch": 3.5798922800718134, + "grad_norm": 0.9415460228919983, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 49850 + }, + { + "epoch": 3.5806104129263914, + "grad_norm": 1.0807833671569824, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 49860 + }, + { + "epoch": 3.5813285457809694, + "grad_norm": 1.0070871114730835, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 49870 + }, + { + "epoch": 3.582046678635548, + "grad_norm": 0.9707024693489075, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 49880 + }, + { + "epoch": 3.582764811490126, + "grad_norm": 0.9979593753814697, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 49890 + }, + { + "epoch": 3.583482944344704, + "grad_norm": 0.7238648533821106, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 49900 + }, + { + "epoch": 3.584201077199282, + "grad_norm": 0.8168631792068481, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 49910 + }, + { + "epoch": 3.58491921005386, + "grad_norm": 0.8156409859657288, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 49920 + }, + { + "epoch": 3.585637342908438, + "grad_norm": 0.9256414175033569, + "learning_rate": 0.0002, + "loss": 0.6248, + "step": 49930 + }, + { + "epoch": 3.5863554757630163, + "grad_norm": 1.0090070962905884, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 49940 + }, + { + "epoch": 3.5870736086175943, + "grad_norm": 0.8257701992988586, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 49950 + }, + { + "epoch": 3.5877917414721723, + "grad_norm": 0.9189013242721558, + "learning_rate": 0.0002, + "loss": 0.6996, + "step": 49960 + }, + { + "epoch": 3.5885098743267507, + "grad_norm": 0.8497788310050964, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 49970 + }, + { + "epoch": 3.5892280071813287, + "grad_norm": 0.9596505761146545, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 49980 + }, + { + "epoch": 3.5899461400359067, + "grad_norm": 0.8773331642150879, + "learning_rate": 0.0002, + "loss": 0.697, + "step": 49990 + }, + { + "epoch": 3.5906642728904847, + "grad_norm": 0.8952302932739258, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50000 + }, + { + "epoch": 3.5913824057450627, + "grad_norm": 0.7713809609413147, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 50010 + }, + { + "epoch": 3.5921005385996407, + "grad_norm": 1.0151346921920776, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 50020 + }, + { + "epoch": 3.592818671454219, + "grad_norm": 0.8793733716011047, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 50030 + }, + { + "epoch": 3.593536804308797, + "grad_norm": 0.8881325721740723, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 50040 + }, + { + "epoch": 3.594254937163375, + "grad_norm": 0.9346749782562256, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 50050 + }, + { + "epoch": 3.594973070017953, + "grad_norm": 0.8705052137374878, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 50060 + }, + { + "epoch": 3.5956912028725316, + "grad_norm": 1.039197564125061, + "learning_rate": 0.0002, + "loss": 0.6753, + "step": 50070 + }, + { + "epoch": 3.5964093357271096, + "grad_norm": 0.7053273320198059, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 50080 + }, + { + "epoch": 3.5971274685816876, + "grad_norm": 0.8268665671348572, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 50090 + }, + { + "epoch": 3.5978456014362656, + "grad_norm": 0.8921764492988586, + "learning_rate": 0.0002, + "loss": 0.6637, + "step": 50100 + }, + { + "epoch": 3.5985637342908436, + "grad_norm": 0.9756084680557251, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 50110 + }, + { + "epoch": 3.5992818671454216, + "grad_norm": 0.9275530576705933, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 50120 + }, + { + "epoch": 3.6, + "grad_norm": 0.9030009508132935, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 50130 + }, + { + "epoch": 3.600718132854578, + "grad_norm": 0.7805638909339905, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 50140 + }, + { + "epoch": 3.601436265709156, + "grad_norm": 0.7627325057983398, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 50150 + }, + { + "epoch": 3.6021543985637345, + "grad_norm": 0.7809714078903198, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 50160 + }, + { + "epoch": 3.6028725314183125, + "grad_norm": 0.7910378575325012, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 50170 + }, + { + "epoch": 3.6035906642728905, + "grad_norm": 1.004438042640686, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 50180 + }, + { + "epoch": 3.6043087971274685, + "grad_norm": 0.825969934463501, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 50190 + }, + { + "epoch": 3.6050269299820465, + "grad_norm": 0.8866565227508545, + "learning_rate": 0.0002, + "loss": 0.6788, + "step": 50200 + }, + { + "epoch": 3.6057450628366245, + "grad_norm": 0.8920543193817139, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 50210 + }, + { + "epoch": 3.606463195691203, + "grad_norm": 1.106584906578064, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 50220 + }, + { + "epoch": 3.607181328545781, + "grad_norm": 0.916607677936554, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 50230 + }, + { + "epoch": 3.607899461400359, + "grad_norm": 0.8014767169952393, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 50240 + }, + { + "epoch": 3.608617594254937, + "grad_norm": 0.9556822776794434, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 50250 + }, + { + "epoch": 3.6093357271095154, + "grad_norm": 0.9630016684532166, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50260 + }, + { + "epoch": 3.6100538599640934, + "grad_norm": 0.9862125515937805, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 50270 + }, + { + "epoch": 3.6107719928186714, + "grad_norm": 1.0043333768844604, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 50280 + }, + { + "epoch": 3.6114901256732495, + "grad_norm": 0.9255319833755493, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 50290 + }, + { + "epoch": 3.6122082585278275, + "grad_norm": 1.012023687362671, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 50300 + }, + { + "epoch": 3.612926391382406, + "grad_norm": 1.0701122283935547, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 50310 + }, + { + "epoch": 3.613644524236984, + "grad_norm": 0.8270810842514038, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 50320 + }, + { + "epoch": 3.614362657091562, + "grad_norm": 0.8881328105926514, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 50330 + }, + { + "epoch": 3.61508078994614, + "grad_norm": 0.9536844491958618, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 50340 + }, + { + "epoch": 3.6157989228007184, + "grad_norm": 0.8044326305389404, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 50350 + }, + { + "epoch": 3.6165170556552964, + "grad_norm": 0.834591805934906, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 50360 + }, + { + "epoch": 3.6172351885098744, + "grad_norm": 0.903752863407135, + "learning_rate": 0.0002, + "loss": 0.7173, + "step": 50370 + }, + { + "epoch": 3.6179533213644524, + "grad_norm": 0.9148632884025574, + "learning_rate": 0.0002, + "loss": 0.6305, + "step": 50380 + }, + { + "epoch": 3.6186714542190304, + "grad_norm": 0.9280176162719727, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 50390 + }, + { + "epoch": 3.6193895870736084, + "grad_norm": 0.9524136781692505, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 50400 + }, + { + "epoch": 3.620107719928187, + "grad_norm": 1.1751197576522827, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 50410 + }, + { + "epoch": 3.620825852782765, + "grad_norm": 1.032279133796692, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 50420 + }, + { + "epoch": 3.621543985637343, + "grad_norm": 0.790741503238678, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 50430 + }, + { + "epoch": 3.6222621184919213, + "grad_norm": 0.9584221243858337, + "learning_rate": 0.0002, + "loss": 0.695, + "step": 50440 + }, + { + "epoch": 3.6229802513464993, + "grad_norm": 0.7792508006095886, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 50450 + }, + { + "epoch": 3.6236983842010773, + "grad_norm": 0.8273448944091797, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 50460 + }, + { + "epoch": 3.6244165170556553, + "grad_norm": 0.8001132607460022, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 50470 + }, + { + "epoch": 3.6251346499102333, + "grad_norm": 1.077109694480896, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 50480 + }, + { + "epoch": 3.6258527827648113, + "grad_norm": 1.111274003982544, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 50490 + }, + { + "epoch": 3.6265709156193897, + "grad_norm": 0.7757347822189331, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 50500 + }, + { + "epoch": 3.6272890484739677, + "grad_norm": 0.9217049479484558, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 50510 + }, + { + "epoch": 3.6280071813285457, + "grad_norm": 0.9362251162528992, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 50520 + }, + { + "epoch": 3.6287253141831237, + "grad_norm": 0.9435479044914246, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 50530 + }, + { + "epoch": 3.629443447037702, + "grad_norm": 0.7748915553092957, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 50540 + }, + { + "epoch": 3.63016157989228, + "grad_norm": 0.8238945007324219, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 50550 + }, + { + "epoch": 3.630879712746858, + "grad_norm": 0.8421505093574524, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 50560 + }, + { + "epoch": 3.631597845601436, + "grad_norm": 1.0272293090820312, + "learning_rate": 0.0002, + "loss": 0.6544, + "step": 50570 + }, + { + "epoch": 3.632315978456014, + "grad_norm": 0.7643818259239197, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 50580 + }, + { + "epoch": 3.6330341113105926, + "grad_norm": 0.9756225347518921, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 50590 + }, + { + "epoch": 3.6337522441651706, + "grad_norm": 0.9311570525169373, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 50600 + }, + { + "epoch": 3.6344703770197486, + "grad_norm": 0.8829827904701233, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 50610 + }, + { + "epoch": 3.6351885098743266, + "grad_norm": 0.9473454356193542, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 50620 + }, + { + "epoch": 3.635906642728905, + "grad_norm": 1.1023668050765991, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 50630 + }, + { + "epoch": 3.636624775583483, + "grad_norm": 0.8490299582481384, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 50640 + }, + { + "epoch": 3.637342908438061, + "grad_norm": 1.1129392385482788, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 50650 + }, + { + "epoch": 3.638061041292639, + "grad_norm": 1.0334501266479492, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 50660 + }, + { + "epoch": 3.638779174147217, + "grad_norm": 0.8397296667098999, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 50670 + }, + { + "epoch": 3.639497307001795, + "grad_norm": 0.7984256744384766, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 50680 + }, + { + "epoch": 3.6402154398563735, + "grad_norm": 1.1182054281234741, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 50690 + }, + { + "epoch": 3.6409335727109515, + "grad_norm": 0.8743279576301575, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 50700 + }, + { + "epoch": 3.6416517055655295, + "grad_norm": 0.9101628661155701, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 50710 + }, + { + "epoch": 3.642369838420108, + "grad_norm": 0.8866934180259705, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 50720 + }, + { + "epoch": 3.643087971274686, + "grad_norm": 0.863945484161377, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 50730 + }, + { + "epoch": 3.643806104129264, + "grad_norm": 1.0845744609832764, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 50740 + }, + { + "epoch": 3.644524236983842, + "grad_norm": 0.8610911965370178, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 50750 + }, + { + "epoch": 3.64524236983842, + "grad_norm": 0.8502625226974487, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 50760 + }, + { + "epoch": 3.645960502692998, + "grad_norm": 0.847372829914093, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 50770 + }, + { + "epoch": 3.6466786355475764, + "grad_norm": 0.8649292588233948, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 50780 + }, + { + "epoch": 3.6473967684021544, + "grad_norm": 0.8742905855178833, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 50790 + }, + { + "epoch": 3.6481149012567324, + "grad_norm": 0.9546048641204834, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 50800 + }, + { + "epoch": 3.6488330341113104, + "grad_norm": 0.7893161773681641, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 50810 + }, + { + "epoch": 3.649551166965889, + "grad_norm": 0.9350247979164124, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 50820 + }, + { + "epoch": 3.650269299820467, + "grad_norm": 0.772149384021759, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 50830 + }, + { + "epoch": 3.650987432675045, + "grad_norm": 0.8281718492507935, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 50840 + }, + { + "epoch": 3.651705565529623, + "grad_norm": 0.8063850402832031, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 50850 + }, + { + "epoch": 3.652423698384201, + "grad_norm": 0.8101351261138916, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 50860 + }, + { + "epoch": 3.6531418312387793, + "grad_norm": 0.8747833371162415, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 50870 + }, + { + "epoch": 3.6538599640933573, + "grad_norm": 0.9634656310081482, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 50880 + }, + { + "epoch": 3.6545780969479353, + "grad_norm": 1.1646045446395874, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 50890 + }, + { + "epoch": 3.6552962298025133, + "grad_norm": 0.8538454174995422, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 50900 + }, + { + "epoch": 3.656014362657092, + "grad_norm": 0.7639184594154358, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 50910 + }, + { + "epoch": 3.65673249551167, + "grad_norm": 0.8750212788581848, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 50920 + }, + { + "epoch": 3.657450628366248, + "grad_norm": 0.9161198735237122, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 50930 + }, + { + "epoch": 3.658168761220826, + "grad_norm": 0.7987924814224243, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 50940 + }, + { + "epoch": 3.658886894075404, + "grad_norm": 0.8939290642738342, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 50950 + }, + { + "epoch": 3.659605026929982, + "grad_norm": 0.9803797602653503, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 50960 + }, + { + "epoch": 3.6603231597845602, + "grad_norm": 1.2423512935638428, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 50970 + }, + { + "epoch": 3.6610412926391382, + "grad_norm": 1.0023225545883179, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 50980 + }, + { + "epoch": 3.6617594254937162, + "grad_norm": 0.9066677689552307, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 50990 + }, + { + "epoch": 3.6624775583482947, + "grad_norm": 0.8906226754188538, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 51000 + }, + { + "epoch": 3.6631956912028727, + "grad_norm": 0.7449954152107239, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51010 + }, + { + "epoch": 3.6639138240574507, + "grad_norm": 0.812612771987915, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 51020 + }, + { + "epoch": 3.6646319569120287, + "grad_norm": 0.861818253993988, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 51030 + }, + { + "epoch": 3.6653500897666067, + "grad_norm": 0.849726676940918, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 51040 + }, + { + "epoch": 3.6660682226211847, + "grad_norm": 0.9738494753837585, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 51050 + }, + { + "epoch": 3.666786355475763, + "grad_norm": 0.928989827632904, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 51060 + }, + { + "epoch": 3.667504488330341, + "grad_norm": 0.9725563526153564, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 51070 + }, + { + "epoch": 3.668222621184919, + "grad_norm": 0.9366095066070557, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 51080 + }, + { + "epoch": 3.668940754039497, + "grad_norm": 0.8012986779212952, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 51090 + }, + { + "epoch": 3.6696588868940756, + "grad_norm": 1.0646892786026, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51100 + }, + { + "epoch": 3.6703770197486536, + "grad_norm": 0.7245157361030579, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 51110 + }, + { + "epoch": 3.6710951526032316, + "grad_norm": 0.6938936114311218, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 51120 + }, + { + "epoch": 3.6718132854578096, + "grad_norm": 0.8461366295814514, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 51130 + }, + { + "epoch": 3.6725314183123876, + "grad_norm": 0.8392583131790161, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 51140 + }, + { + "epoch": 3.673249551166966, + "grad_norm": 0.7245259284973145, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 51150 + }, + { + "epoch": 3.673967684021544, + "grad_norm": 1.0742167234420776, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 51160 + }, + { + "epoch": 3.674685816876122, + "grad_norm": 0.9553889036178589, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 51170 + }, + { + "epoch": 3.6754039497307, + "grad_norm": 0.8713715672492981, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 51180 + }, + { + "epoch": 3.6761220825852785, + "grad_norm": 0.7499800324440002, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 51190 + }, + { + "epoch": 3.6768402154398565, + "grad_norm": 1.1118139028549194, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 51200 + }, + { + "epoch": 3.6775583482944345, + "grad_norm": 0.8146613836288452, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 51210 + }, + { + "epoch": 3.6782764811490125, + "grad_norm": 0.9331285357475281, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 51220 + }, + { + "epoch": 3.6789946140035905, + "grad_norm": 1.0497597455978394, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 51230 + }, + { + "epoch": 3.6797127468581685, + "grad_norm": 0.879814863204956, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51240 + }, + { + "epoch": 3.680430879712747, + "grad_norm": 0.9896606802940369, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 51250 + }, + { + "epoch": 3.681149012567325, + "grad_norm": 0.928236186504364, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 51260 + }, + { + "epoch": 3.681867145421903, + "grad_norm": 0.8436732292175293, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 51270 + }, + { + "epoch": 3.6825852782764814, + "grad_norm": 0.93634432554245, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 51280 + }, + { + "epoch": 3.6833034111310594, + "grad_norm": 0.8477143049240112, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 51290 + }, + { + "epoch": 3.6840215439856374, + "grad_norm": 0.8720934987068176, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 51300 + }, + { + "epoch": 3.6847396768402154, + "grad_norm": 0.7322931289672852, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 51310 + }, + { + "epoch": 3.6854578096947934, + "grad_norm": 1.0064427852630615, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 51320 + }, + { + "epoch": 3.6861759425493714, + "grad_norm": 1.0197817087173462, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 51330 + }, + { + "epoch": 3.68689407540395, + "grad_norm": 0.8764060139656067, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 51340 + }, + { + "epoch": 3.687612208258528, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 51350 + }, + { + "epoch": 3.688330341113106, + "grad_norm": 0.8389105200767517, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 51360 + }, + { + "epoch": 3.689048473967684, + "grad_norm": 0.9215750694274902, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 51370 + }, + { + "epoch": 3.6897666068222623, + "grad_norm": 0.8444913625717163, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 51380 + }, + { + "epoch": 3.6904847396768403, + "grad_norm": 0.9635153412818909, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 51390 + }, + { + "epoch": 3.6912028725314183, + "grad_norm": 1.0397378206253052, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 51400 + }, + { + "epoch": 3.6919210053859963, + "grad_norm": 0.9154748320579529, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 51410 + }, + { + "epoch": 3.6926391382405743, + "grad_norm": 0.906445324420929, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 51420 + }, + { + "epoch": 3.6933572710951523, + "grad_norm": 0.9237992763519287, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 51430 + }, + { + "epoch": 3.6940754039497308, + "grad_norm": 0.8796338438987732, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 51440 + }, + { + "epoch": 3.6947935368043088, + "grad_norm": 0.8613203763961792, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 51450 + }, + { + "epoch": 3.6955116696588868, + "grad_norm": 0.7957607507705688, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 51460 + }, + { + "epoch": 3.6962298025134652, + "grad_norm": 0.9183711409568787, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 51470 + }, + { + "epoch": 3.6969479353680432, + "grad_norm": 1.0108308792114258, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 51480 + }, + { + "epoch": 3.6976660682226212, + "grad_norm": 0.7768247127532959, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 51490 + }, + { + "epoch": 3.6983842010771992, + "grad_norm": 1.0051485300064087, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 51500 + }, + { + "epoch": 3.6991023339317772, + "grad_norm": 0.82451993227005, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 51510 + }, + { + "epoch": 3.6998204667863552, + "grad_norm": 0.9542286992073059, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 51520 + }, + { + "epoch": 3.7005385996409337, + "grad_norm": 0.693890392780304, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 51530 + }, + { + "epoch": 3.7012567324955117, + "grad_norm": 0.9068924784660339, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 51540 + }, + { + "epoch": 3.7019748653500897, + "grad_norm": 0.8694922924041748, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 51550 + }, + { + "epoch": 3.702692998204668, + "grad_norm": 0.941081702709198, + "learning_rate": 0.0002, + "loss": 0.6563, + "step": 51560 + }, + { + "epoch": 3.703411131059246, + "grad_norm": 0.7385984659194946, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 51570 + }, + { + "epoch": 3.704129263913824, + "grad_norm": 1.0399216413497925, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51580 + }, + { + "epoch": 3.704847396768402, + "grad_norm": 0.9802294969558716, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 51590 + }, + { + "epoch": 3.70556552962298, + "grad_norm": 1.0409669876098633, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 51600 + }, + { + "epoch": 3.706283662477558, + "grad_norm": 0.8972786068916321, + "learning_rate": 0.0002, + "loss": 0.6812, + "step": 51610 + }, + { + "epoch": 3.7070017953321366, + "grad_norm": 1.1916245222091675, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 51620 + }, + { + "epoch": 3.7077199281867146, + "grad_norm": 0.9545385241508484, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 51630 + }, + { + "epoch": 3.7084380610412926, + "grad_norm": 1.0773427486419678, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 51640 + }, + { + "epoch": 3.7091561938958706, + "grad_norm": 1.0856024026870728, + "learning_rate": 0.0002, + "loss": 0.6768, + "step": 51650 + }, + { + "epoch": 3.709874326750449, + "grad_norm": 0.7678500413894653, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 51660 + }, + { + "epoch": 3.710592459605027, + "grad_norm": 0.7276270985603333, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 51670 + }, + { + "epoch": 3.711310592459605, + "grad_norm": 0.8859017491340637, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 51680 + }, + { + "epoch": 3.712028725314183, + "grad_norm": 0.9037614464759827, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 51690 + }, + { + "epoch": 3.712746858168761, + "grad_norm": 0.9223412275314331, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51700 + }, + { + "epoch": 3.713464991023339, + "grad_norm": 0.8812923431396484, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 51710 + }, + { + "epoch": 3.7141831238779175, + "grad_norm": 0.8242456912994385, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 51720 + }, + { + "epoch": 3.7149012567324955, + "grad_norm": 0.8368834257125854, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 51730 + }, + { + "epoch": 3.7156193895870735, + "grad_norm": 0.8624704480171204, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 51740 + }, + { + "epoch": 3.716337522441652, + "grad_norm": 0.9138273596763611, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51750 + }, + { + "epoch": 3.71705565529623, + "grad_norm": 0.8088571429252625, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 51760 + }, + { + "epoch": 3.717773788150808, + "grad_norm": 0.882808268070221, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 51770 + }, + { + "epoch": 3.718491921005386, + "grad_norm": 0.9368035197257996, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 51780 + }, + { + "epoch": 3.719210053859964, + "grad_norm": 0.8341794013977051, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 51790 + }, + { + "epoch": 3.719928186714542, + "grad_norm": 0.8692073225975037, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 51800 + }, + { + "epoch": 3.7206463195691204, + "grad_norm": 0.7566918730735779, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 51810 + }, + { + "epoch": 3.7213644524236984, + "grad_norm": 1.113138198852539, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 51820 + }, + { + "epoch": 3.7220825852782764, + "grad_norm": 0.8793158531188965, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 51830 + }, + { + "epoch": 3.722800718132855, + "grad_norm": 0.8856439590454102, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 51840 + }, + { + "epoch": 3.723518850987433, + "grad_norm": 1.0182029008865356, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 51850 + }, + { + "epoch": 3.724236983842011, + "grad_norm": 1.1177181005477905, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 51860 + }, + { + "epoch": 3.724955116696589, + "grad_norm": 0.6600990295410156, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 51870 + }, + { + "epoch": 3.725673249551167, + "grad_norm": 1.0563536882400513, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 51880 + }, + { + "epoch": 3.726391382405745, + "grad_norm": 1.1067734956741333, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 51890 + }, + { + "epoch": 3.7271095152603233, + "grad_norm": 1.0204616785049438, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 51900 + }, + { + "epoch": 3.7278276481149013, + "grad_norm": 0.8647155165672302, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 51910 + }, + { + "epoch": 3.7285457809694793, + "grad_norm": 1.0754971504211426, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 51920 + }, + { + "epoch": 3.7292639138240573, + "grad_norm": 1.0448992252349854, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 51930 + }, + { + "epoch": 3.7299820466786358, + "grad_norm": 0.963434100151062, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 51940 + }, + { + "epoch": 3.7307001795332138, + "grad_norm": 0.8112701773643494, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 51950 + }, + { + "epoch": 3.7314183123877918, + "grad_norm": 0.7975119948387146, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 51960 + }, + { + "epoch": 3.7321364452423698, + "grad_norm": 0.7953376173973083, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 51970 + }, + { + "epoch": 3.7328545780969478, + "grad_norm": 0.9519981741905212, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 51980 + }, + { + "epoch": 3.7335727109515258, + "grad_norm": 0.8705791234970093, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 51990 + }, + { + "epoch": 3.734290843806104, + "grad_norm": 0.870205283164978, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 52000 + }, + { + "epoch": 3.735008976660682, + "grad_norm": 0.9558930993080139, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 52010 + }, + { + "epoch": 3.73572710951526, + "grad_norm": 0.9330434799194336, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 52020 + }, + { + "epoch": 3.7364452423698387, + "grad_norm": 0.783620297908783, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 52030 + }, + { + "epoch": 3.7371633752244167, + "grad_norm": 0.7575166821479797, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52040 + }, + { + "epoch": 3.7378815080789947, + "grad_norm": 1.0592705011367798, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 52050 + }, + { + "epoch": 3.7385996409335727, + "grad_norm": 0.9309433102607727, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 52060 + }, + { + "epoch": 3.7393177737881507, + "grad_norm": 0.972861647605896, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 52070 + }, + { + "epoch": 3.7400359066427287, + "grad_norm": 0.9318740963935852, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 52080 + }, + { + "epoch": 3.740754039497307, + "grad_norm": 0.7938477396965027, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 52090 + }, + { + "epoch": 3.741472172351885, + "grad_norm": 1.1515966653823853, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 52100 + }, + { + "epoch": 3.742190305206463, + "grad_norm": 1.076869010925293, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 52110 + }, + { + "epoch": 3.7429084380610416, + "grad_norm": 0.8516066670417786, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 52120 + }, + { + "epoch": 3.7436265709156196, + "grad_norm": 0.6853429079055786, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 52130 + }, + { + "epoch": 3.7443447037701976, + "grad_norm": 0.8179695010185242, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52140 + }, + { + "epoch": 3.7450628366247756, + "grad_norm": 0.8395232558250427, + "learning_rate": 0.0002, + "loss": 0.6692, + "step": 52150 + }, + { + "epoch": 3.7457809694793536, + "grad_norm": 1.0178003311157227, + "learning_rate": 0.0002, + "loss": 0.6902, + "step": 52160 + }, + { + "epoch": 3.7464991023339316, + "grad_norm": 1.1801023483276367, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 52170 + }, + { + "epoch": 3.74721723518851, + "grad_norm": 0.8215751647949219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 52180 + }, + { + "epoch": 3.747935368043088, + "grad_norm": 1.17083740234375, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 52190 + }, + { + "epoch": 3.748653500897666, + "grad_norm": 0.9230290651321411, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 52200 + }, + { + "epoch": 3.749371633752244, + "grad_norm": 0.8431521058082581, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 52210 + }, + { + "epoch": 3.7500897666068225, + "grad_norm": 0.9690840244293213, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 52220 + }, + { + "epoch": 3.7508078994614005, + "grad_norm": 1.0022395849227905, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 52230 + }, + { + "epoch": 3.7515260323159785, + "grad_norm": 1.0489065647125244, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 52240 + }, + { + "epoch": 3.7522441651705565, + "grad_norm": 0.7880696058273315, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 52250 + }, + { + "epoch": 3.7529622980251345, + "grad_norm": 1.0255829095840454, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 52260 + }, + { + "epoch": 3.7536804308797125, + "grad_norm": 0.8470141291618347, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 52270 + }, + { + "epoch": 3.754398563734291, + "grad_norm": 0.9040523171424866, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 52280 + }, + { + "epoch": 3.755116696588869, + "grad_norm": 0.9564392566680908, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 52290 + }, + { + "epoch": 3.755834829443447, + "grad_norm": 0.907857358455658, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 52300 + }, + { + "epoch": 3.7565529622980254, + "grad_norm": 0.8929873704910278, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 52310 + }, + { + "epoch": 3.7572710951526034, + "grad_norm": 0.854434072971344, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 52320 + }, + { + "epoch": 3.7579892280071814, + "grad_norm": 0.8744779229164124, + "learning_rate": 0.0002, + "loss": 0.6668, + "step": 52330 + }, + { + "epoch": 3.7587073608617594, + "grad_norm": 0.9022667407989502, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52340 + }, + { + "epoch": 3.7594254937163374, + "grad_norm": 0.8884857892990112, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 52350 + }, + { + "epoch": 3.7601436265709154, + "grad_norm": 1.0228430032730103, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 52360 + }, + { + "epoch": 3.760861759425494, + "grad_norm": 0.8593528270721436, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 52370 + }, + { + "epoch": 3.761579892280072, + "grad_norm": 0.9435563087463379, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 52380 + }, + { + "epoch": 3.76229802513465, + "grad_norm": 0.7545679807662964, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52390 + }, + { + "epoch": 3.7630161579892283, + "grad_norm": 0.9411585927009583, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 52400 + }, + { + "epoch": 3.7637342908438063, + "grad_norm": 0.9764377474784851, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 52410 + }, + { + "epoch": 3.7644524236983843, + "grad_norm": 1.0718384981155396, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 52420 + }, + { + "epoch": 3.7651705565529623, + "grad_norm": 0.8765230774879456, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52430 + }, + { + "epoch": 3.7658886894075403, + "grad_norm": 0.9275036454200745, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 52440 + }, + { + "epoch": 3.7666068222621183, + "grad_norm": 0.967410147190094, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 52450 + }, + { + "epoch": 3.7673249551166967, + "grad_norm": 0.7738949060440063, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 52460 + }, + { + "epoch": 3.7680430879712747, + "grad_norm": 1.0828070640563965, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 52470 + }, + { + "epoch": 3.7687612208258527, + "grad_norm": 0.9570213556289673, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 52480 + }, + { + "epoch": 3.7694793536804307, + "grad_norm": 1.0688215494155884, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 52490 + }, + { + "epoch": 3.770197486535009, + "grad_norm": 0.7970073223114014, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 52500 + }, + { + "epoch": 3.770915619389587, + "grad_norm": 0.7132976651191711, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 52510 + }, + { + "epoch": 3.771633752244165, + "grad_norm": 1.152268648147583, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 52520 + }, + { + "epoch": 3.772351885098743, + "grad_norm": 0.8645235896110535, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52530 + }, + { + "epoch": 3.773070017953321, + "grad_norm": 0.7725570201873779, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 52540 + }, + { + "epoch": 3.773788150807899, + "grad_norm": 0.9718102812767029, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 52550 + }, + { + "epoch": 3.7745062836624776, + "grad_norm": 0.7568017840385437, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 52560 + }, + { + "epoch": 3.7752244165170556, + "grad_norm": 0.9578912854194641, + "learning_rate": 0.0002, + "loss": 0.6652, + "step": 52570 + }, + { + "epoch": 3.7759425493716336, + "grad_norm": 0.8657314777374268, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 52580 + }, + { + "epoch": 3.776660682226212, + "grad_norm": 0.7564393281936646, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 52590 + }, + { + "epoch": 3.77737881508079, + "grad_norm": 0.7631160616874695, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 52600 + }, + { + "epoch": 3.778096947935368, + "grad_norm": 1.1852056980133057, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 52610 + }, + { + "epoch": 3.778815080789946, + "grad_norm": 1.0620790719985962, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 52620 + }, + { + "epoch": 3.779533213644524, + "grad_norm": 0.8677777647972107, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 52630 + }, + { + "epoch": 3.780251346499102, + "grad_norm": 0.9913218021392822, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 52640 + }, + { + "epoch": 3.7809694793536806, + "grad_norm": 0.9868429899215698, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 52650 + }, + { + "epoch": 3.7816876122082586, + "grad_norm": 0.8791782259941101, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 52660 + }, + { + "epoch": 3.7824057450628366, + "grad_norm": 0.9503955245018005, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 52670 + }, + { + "epoch": 3.7831238779174146, + "grad_norm": 0.8647131323814392, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 52680 + }, + { + "epoch": 3.783842010771993, + "grad_norm": 0.9819629788398743, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 52690 + }, + { + "epoch": 3.784560143626571, + "grad_norm": 0.8548610210418701, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 52700 + }, + { + "epoch": 3.785278276481149, + "grad_norm": 0.8706230521202087, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 52710 + }, + { + "epoch": 3.785996409335727, + "grad_norm": 1.0032461881637573, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 52720 + }, + { + "epoch": 3.786714542190305, + "grad_norm": 1.0578246116638184, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 52730 + }, + { + "epoch": 3.7874326750448835, + "grad_norm": 0.9854007363319397, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 52740 + }, + { + "epoch": 3.7881508078994615, + "grad_norm": 0.8389187455177307, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 52750 + }, + { + "epoch": 3.7888689407540395, + "grad_norm": 0.9192399978637695, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 52760 + }, + { + "epoch": 3.7895870736086175, + "grad_norm": 0.9518283605575562, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 52770 + }, + { + "epoch": 3.790305206463196, + "grad_norm": 1.1296825408935547, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 52780 + }, + { + "epoch": 3.791023339317774, + "grad_norm": 1.0589144229888916, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 52790 + }, + { + "epoch": 3.791741472172352, + "grad_norm": 0.8954343199729919, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 52800 + }, + { + "epoch": 3.79245960502693, + "grad_norm": 0.8283370733261108, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 52810 + }, + { + "epoch": 3.793177737881508, + "grad_norm": 0.910642683506012, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 52820 + }, + { + "epoch": 3.793895870736086, + "grad_norm": 0.9255108833312988, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 52830 + }, + { + "epoch": 3.7946140035906644, + "grad_norm": 0.8773723244667053, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 52840 + }, + { + "epoch": 3.7953321364452424, + "grad_norm": 0.8454240560531616, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 52850 + }, + { + "epoch": 3.7960502692998204, + "grad_norm": 0.7636052966117859, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 52860 + }, + { + "epoch": 3.796768402154399, + "grad_norm": 0.9358382821083069, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 52870 + }, + { + "epoch": 3.797486535008977, + "grad_norm": 0.9662801623344421, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 52880 + }, + { + "epoch": 3.798204667863555, + "grad_norm": 0.995907187461853, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 52890 + }, + { + "epoch": 3.798922800718133, + "grad_norm": 0.8700127005577087, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 52900 + }, + { + "epoch": 3.799640933572711, + "grad_norm": 0.8987792134284973, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 52910 + }, + { + "epoch": 3.800359066427289, + "grad_norm": 0.9753904938697815, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 52920 + }, + { + "epoch": 3.8010771992818673, + "grad_norm": 0.7873555421829224, + "learning_rate": 0.0002, + "loss": 0.6536, + "step": 52930 + }, + { + "epoch": 3.8017953321364453, + "grad_norm": 0.8177929520606995, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 52940 + }, + { + "epoch": 3.8025134649910233, + "grad_norm": 0.8865532279014587, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 52950 + }, + { + "epoch": 3.8032315978456013, + "grad_norm": 0.9113775491714478, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 52960 + }, + { + "epoch": 3.8039497307001797, + "grad_norm": 0.9424585700035095, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 52970 + }, + { + "epoch": 3.8046678635547577, + "grad_norm": 0.8347237706184387, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 52980 + }, + { + "epoch": 3.8053859964093357, + "grad_norm": 0.826863169670105, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 52990 + }, + { + "epoch": 3.8061041292639137, + "grad_norm": 0.7313310503959656, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 53000 + }, + { + "epoch": 3.8068222621184917, + "grad_norm": 0.8352667093276978, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 53010 + }, + { + "epoch": 3.80754039497307, + "grad_norm": 0.748461127281189, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 53020 + }, + { + "epoch": 3.808258527827648, + "grad_norm": 0.943256139755249, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 53030 + }, + { + "epoch": 3.808976660682226, + "grad_norm": 1.0448410511016846, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 53040 + }, + { + "epoch": 3.809694793536804, + "grad_norm": 0.9047636985778809, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 53050 + }, + { + "epoch": 3.8104129263913826, + "grad_norm": 0.8594381213188171, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 53060 + }, + { + "epoch": 3.8111310592459606, + "grad_norm": 0.7593536972999573, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 53070 + }, + { + "epoch": 3.8118491921005386, + "grad_norm": 0.7189019918441772, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 53080 + }, + { + "epoch": 3.8125673249551166, + "grad_norm": 0.8569809198379517, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53090 + }, + { + "epoch": 3.8132854578096946, + "grad_norm": 0.923378050327301, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53100 + }, + { + "epoch": 3.8140035906642726, + "grad_norm": 0.9088824391365051, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 53110 + }, + { + "epoch": 3.814721723518851, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 53120 + }, + { + "epoch": 3.815439856373429, + "grad_norm": 0.8389552235603333, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 53130 + }, + { + "epoch": 3.816157989228007, + "grad_norm": 0.7940975427627563, + "learning_rate": 0.0002, + "loss": 0.6779, + "step": 53140 + }, + { + "epoch": 3.8168761220825855, + "grad_norm": 0.8389907479286194, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 53150 + }, + { + "epoch": 3.8175942549371635, + "grad_norm": 0.774206280708313, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 53160 + }, + { + "epoch": 3.8183123877917415, + "grad_norm": 1.189447283744812, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 53170 + }, + { + "epoch": 3.8190305206463195, + "grad_norm": 0.9875882863998413, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 53180 + }, + { + "epoch": 3.8197486535008975, + "grad_norm": 0.9205945134162903, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 53190 + }, + { + "epoch": 3.8204667863554755, + "grad_norm": 0.8312796354293823, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 53200 + }, + { + "epoch": 3.821184919210054, + "grad_norm": 0.9755756855010986, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 53210 + }, + { + "epoch": 3.821903052064632, + "grad_norm": 1.0722965002059937, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 53220 + }, + { + "epoch": 3.82262118491921, + "grad_norm": 0.7720510959625244, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 53230 + }, + { + "epoch": 3.823339317773788, + "grad_norm": 1.020147681236267, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 53240 + }, + { + "epoch": 3.8240574506283664, + "grad_norm": 0.8241816759109497, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53250 + }, + { + "epoch": 3.8247755834829444, + "grad_norm": 0.8939895629882812, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 53260 + }, + { + "epoch": 3.8254937163375224, + "grad_norm": 1.010852336883545, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 53270 + }, + { + "epoch": 3.8262118491921004, + "grad_norm": 0.8201420307159424, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 53280 + }, + { + "epoch": 3.8269299820466784, + "grad_norm": 0.8797973990440369, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 53290 + }, + { + "epoch": 3.827648114901257, + "grad_norm": 0.9034950137138367, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 53300 + }, + { + "epoch": 3.828366247755835, + "grad_norm": 0.926802933216095, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 53310 + }, + { + "epoch": 3.829084380610413, + "grad_norm": 1.0205509662628174, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 53320 + }, + { + "epoch": 3.829802513464991, + "grad_norm": 0.9524099230766296, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 53330 + }, + { + "epoch": 3.8305206463195693, + "grad_norm": 0.9692625999450684, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 53340 + }, + { + "epoch": 3.8312387791741473, + "grad_norm": 0.7255275845527649, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 53350 + }, + { + "epoch": 3.8319569120287253, + "grad_norm": 0.7199059724807739, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53360 + }, + { + "epoch": 3.8326750448833034, + "grad_norm": 1.004464864730835, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 53370 + }, + { + "epoch": 3.8333931777378814, + "grad_norm": 0.9092583060264587, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53380 + }, + { + "epoch": 3.8341113105924594, + "grad_norm": 0.945091724395752, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 53390 + }, + { + "epoch": 3.834829443447038, + "grad_norm": 0.7980135679244995, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 53400 + }, + { + "epoch": 3.835547576301616, + "grad_norm": 0.7812868356704712, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 53410 + }, + { + "epoch": 3.836265709156194, + "grad_norm": 0.8957077860832214, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 53420 + }, + { + "epoch": 3.8369838420107722, + "grad_norm": 0.9119600653648376, + "learning_rate": 0.0002, + "loss": 0.6754, + "step": 53430 + }, + { + "epoch": 3.8377019748653503, + "grad_norm": 0.8208187222480774, + "learning_rate": 0.0002, + "loss": 0.7346, + "step": 53440 + }, + { + "epoch": 3.8384201077199283, + "grad_norm": 0.7930439114570618, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 53450 + }, + { + "epoch": 3.8391382405745063, + "grad_norm": 0.8937777280807495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 53460 + }, + { + "epoch": 3.8398563734290843, + "grad_norm": 0.7583796977996826, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 53470 + }, + { + "epoch": 3.8405745062836623, + "grad_norm": 1.0735969543457031, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 53480 + }, + { + "epoch": 3.8412926391382407, + "grad_norm": 1.1106033325195312, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 53490 + }, + { + "epoch": 3.8420107719928187, + "grad_norm": 1.092631220817566, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 53500 + }, + { + "epoch": 3.8427289048473967, + "grad_norm": 0.9961787462234497, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 53510 + }, + { + "epoch": 3.8434470377019747, + "grad_norm": 0.833831250667572, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 53520 + }, + { + "epoch": 3.844165170556553, + "grad_norm": 1.0000009536743164, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 53530 + }, + { + "epoch": 3.844883303411131, + "grad_norm": 0.9784213304519653, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 53540 + }, + { + "epoch": 3.845601436265709, + "grad_norm": 0.8582558035850525, + "learning_rate": 0.0002, + "loss": 0.6816, + "step": 53550 + }, + { + "epoch": 3.846319569120287, + "grad_norm": 0.8267415761947632, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 53560 + }, + { + "epoch": 3.847037701974865, + "grad_norm": 0.8783000111579895, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 53570 + }, + { + "epoch": 3.8477558348294436, + "grad_norm": 0.9866999983787537, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 53580 + }, + { + "epoch": 3.8484739676840216, + "grad_norm": 0.8459296226501465, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 53590 + }, + { + "epoch": 3.8491921005385996, + "grad_norm": 0.9804834723472595, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 53600 + }, + { + "epoch": 3.8499102333931776, + "grad_norm": 0.951074481010437, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 53610 + }, + { + "epoch": 3.850628366247756, + "grad_norm": 0.8020104169845581, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 53620 + }, + { + "epoch": 3.851346499102334, + "grad_norm": 0.9296963214874268, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 53630 + }, + { + "epoch": 3.852064631956912, + "grad_norm": 0.8983652591705322, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 53640 + }, + { + "epoch": 3.85278276481149, + "grad_norm": 1.031858205795288, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 53650 + }, + { + "epoch": 3.853500897666068, + "grad_norm": 0.8943952918052673, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 53660 + }, + { + "epoch": 3.854219030520646, + "grad_norm": 1.0072312355041504, + "learning_rate": 0.0002, + "loss": 0.6745, + "step": 53670 + }, + { + "epoch": 3.8549371633752245, + "grad_norm": 1.0604884624481201, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 53680 + }, + { + "epoch": 3.8556552962298025, + "grad_norm": 0.834223210811615, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 53690 + }, + { + "epoch": 3.8563734290843805, + "grad_norm": 0.9872867465019226, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 53700 + }, + { + "epoch": 3.857091561938959, + "grad_norm": 0.7999459505081177, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53710 + }, + { + "epoch": 3.857809694793537, + "grad_norm": 0.717722475528717, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 53720 + }, + { + "epoch": 3.858527827648115, + "grad_norm": 1.0675442218780518, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 53730 + }, + { + "epoch": 3.859245960502693, + "grad_norm": 0.9789777398109436, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 53740 + }, + { + "epoch": 3.859964093357271, + "grad_norm": 0.9318669438362122, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 53750 + }, + { + "epoch": 3.860682226211849, + "grad_norm": 0.9848631024360657, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 53760 + }, + { + "epoch": 3.8614003590664274, + "grad_norm": 0.8754391670227051, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 53770 + }, + { + "epoch": 3.8621184919210054, + "grad_norm": 0.9024585485458374, + "learning_rate": 0.0002, + "loss": 0.6411, + "step": 53780 + }, + { + "epoch": 3.8628366247755834, + "grad_norm": 0.8974794745445251, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 53790 + }, + { + "epoch": 3.8635547576301614, + "grad_norm": 0.8342790603637695, + "learning_rate": 0.0002, + "loss": 0.6729, + "step": 53800 + }, + { + "epoch": 3.86427289048474, + "grad_norm": 0.8177682757377625, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 53810 + }, + { + "epoch": 3.864991023339318, + "grad_norm": 1.0259089469909668, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 53820 + }, + { + "epoch": 3.865709156193896, + "grad_norm": 1.042290210723877, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 53830 + }, + { + "epoch": 3.866427289048474, + "grad_norm": 0.7316540479660034, + "learning_rate": 0.0002, + "loss": 0.6963, + "step": 53840 + }, + { + "epoch": 3.867145421903052, + "grad_norm": 0.9384970664978027, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 53850 + }, + { + "epoch": 3.86786355475763, + "grad_norm": 0.9273143410682678, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 53860 + }, + { + "epoch": 3.8685816876122083, + "grad_norm": 1.1183570623397827, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 53870 + }, + { + "epoch": 3.8692998204667863, + "grad_norm": 0.9455275535583496, + "learning_rate": 0.0002, + "loss": 0.6712, + "step": 53880 + }, + { + "epoch": 3.8700179533213643, + "grad_norm": 0.8702114820480347, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 53890 + }, + { + "epoch": 3.870736086175943, + "grad_norm": 0.8751053214073181, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 53900 + }, + { + "epoch": 3.871454219030521, + "grad_norm": 0.9793110489845276, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 53910 + }, + { + "epoch": 3.872172351885099, + "grad_norm": 0.9705014824867249, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 53920 + }, + { + "epoch": 3.872890484739677, + "grad_norm": 1.051504373550415, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 53930 + }, + { + "epoch": 3.873608617594255, + "grad_norm": 0.8590622544288635, + "learning_rate": 0.0002, + "loss": 0.6606, + "step": 53940 + }, + { + "epoch": 3.874326750448833, + "grad_norm": 0.7828099727630615, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 53950 + }, + { + "epoch": 3.8750448833034112, + "grad_norm": 0.86341792345047, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 53960 + }, + { + "epoch": 3.8757630161579892, + "grad_norm": 1.114670991897583, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 53970 + }, + { + "epoch": 3.8764811490125672, + "grad_norm": 0.8559519052505493, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 53980 + }, + { + "epoch": 3.8771992818671457, + "grad_norm": 1.0518953800201416, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 53990 + }, + { + "epoch": 3.8779174147217237, + "grad_norm": 0.7157500982284546, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 54000 + }, + { + "epoch": 3.8786355475763017, + "grad_norm": 0.8390372395515442, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 54010 + }, + { + "epoch": 3.8793536804308797, + "grad_norm": 0.8486756086349487, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 54020 + }, + { + "epoch": 3.8800718132854577, + "grad_norm": 0.8361587524414062, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 54030 + }, + { + "epoch": 3.8807899461400357, + "grad_norm": 0.9490554928779602, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 54040 + }, + { + "epoch": 3.881508078994614, + "grad_norm": 1.0311323404312134, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 54050 + }, + { + "epoch": 3.882226211849192, + "grad_norm": 0.84800124168396, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54060 + }, + { + "epoch": 3.88294434470377, + "grad_norm": 0.8940879702568054, + "learning_rate": 0.0002, + "loss": 0.6995, + "step": 54070 + }, + { + "epoch": 3.883662477558348, + "grad_norm": 0.985542356967926, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 54080 + }, + { + "epoch": 3.8843806104129266, + "grad_norm": 0.8846475481987, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 54090 + }, + { + "epoch": 3.8850987432675046, + "grad_norm": 0.9186338186264038, + "learning_rate": 0.0002, + "loss": 0.6656, + "step": 54100 + }, + { + "epoch": 3.8858168761220826, + "grad_norm": 1.106598973274231, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 54110 + }, + { + "epoch": 3.8865350089766606, + "grad_norm": 0.8167300224304199, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 54120 + }, + { + "epoch": 3.8872531418312386, + "grad_norm": 0.9153622984886169, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 54130 + }, + { + "epoch": 3.8879712746858166, + "grad_norm": 0.8464475274085999, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 54140 + }, + { + "epoch": 3.888689407540395, + "grad_norm": 0.8889452815055847, + "learning_rate": 0.0002, + "loss": 0.6658, + "step": 54150 + }, + { + "epoch": 3.889407540394973, + "grad_norm": 0.7861065864562988, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 54160 + }, + { + "epoch": 3.890125673249551, + "grad_norm": 0.882674515247345, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 54170 + }, + { + "epoch": 3.8908438061041295, + "grad_norm": 0.8503835201263428, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 54180 + }, + { + "epoch": 3.8915619389587075, + "grad_norm": 0.888455331325531, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 54190 + }, + { + "epoch": 3.8922800718132855, + "grad_norm": 1.0473699569702148, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 54200 + }, + { + "epoch": 3.8929982046678635, + "grad_norm": 0.9548208713531494, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 54210 + }, + { + "epoch": 3.8937163375224415, + "grad_norm": 0.9158754944801331, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 54220 + }, + { + "epoch": 3.8944344703770195, + "grad_norm": 0.9001154899597168, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54230 + }, + { + "epoch": 3.895152603231598, + "grad_norm": 0.9736626148223877, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54240 + }, + { + "epoch": 3.895870736086176, + "grad_norm": 0.8809846043586731, + "learning_rate": 0.0002, + "loss": 0.7248, + "step": 54250 + }, + { + "epoch": 3.896588868940754, + "grad_norm": 0.887583315372467, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 54260 + }, + { + "epoch": 3.8973070017953324, + "grad_norm": 0.8395712971687317, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 54270 + }, + { + "epoch": 3.8980251346499104, + "grad_norm": 0.8391315937042236, + "learning_rate": 0.0002, + "loss": 0.681, + "step": 54280 + }, + { + "epoch": 3.8987432675044884, + "grad_norm": 0.8210049271583557, + "learning_rate": 0.0002, + "loss": 0.6352, + "step": 54290 + }, + { + "epoch": 3.8994614003590664, + "grad_norm": 1.1364530324935913, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 54300 + }, + { + "epoch": 3.9001795332136444, + "grad_norm": 0.7712056636810303, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 54310 + }, + { + "epoch": 3.9008976660682224, + "grad_norm": 0.9466049671173096, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 54320 + }, + { + "epoch": 3.901615798922801, + "grad_norm": 1.0367140769958496, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 54330 + }, + { + "epoch": 3.902333931777379, + "grad_norm": 1.0168321132659912, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 54340 + }, + { + "epoch": 3.903052064631957, + "grad_norm": 0.7830407619476318, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 54350 + }, + { + "epoch": 3.903770197486535, + "grad_norm": 0.9649789333343506, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 54360 + }, + { + "epoch": 3.9044883303411133, + "grad_norm": 0.681077778339386, + "learning_rate": 0.0002, + "loss": 0.6644, + "step": 54370 + }, + { + "epoch": 3.9052064631956913, + "grad_norm": 0.8970136046409607, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 54380 + }, + { + "epoch": 3.9059245960502693, + "grad_norm": 0.9155173301696777, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 54390 + }, + { + "epoch": 3.9066427289048473, + "grad_norm": 1.0447794198989868, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 54400 + }, + { + "epoch": 3.9073608617594253, + "grad_norm": 0.7823813557624817, + "learning_rate": 0.0002, + "loss": 0.6883, + "step": 54410 + }, + { + "epoch": 3.9080789946140033, + "grad_norm": 0.9289445877075195, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 54420 + }, + { + "epoch": 3.9087971274685818, + "grad_norm": 0.9983111619949341, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 54430 + }, + { + "epoch": 3.9095152603231598, + "grad_norm": 0.7952495813369751, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 54440 + }, + { + "epoch": 3.9102333931777378, + "grad_norm": 0.8045601844787598, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 54450 + }, + { + "epoch": 3.910951526032316, + "grad_norm": 0.936585009098053, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 54460 + }, + { + "epoch": 3.911669658886894, + "grad_norm": 0.745793879032135, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 54470 + }, + { + "epoch": 3.912387791741472, + "grad_norm": 0.9137616157531738, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 54480 + }, + { + "epoch": 3.9131059245960502, + "grad_norm": 0.826316237449646, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 54490 + }, + { + "epoch": 3.9138240574506282, + "grad_norm": 0.94313645362854, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 54500 + }, + { + "epoch": 3.9145421903052062, + "grad_norm": 1.045893907546997, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 54510 + }, + { + "epoch": 3.9152603231597847, + "grad_norm": 0.9122704863548279, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 54520 + }, + { + "epoch": 3.9159784560143627, + "grad_norm": 1.0999689102172852, + "learning_rate": 0.0002, + "loss": 0.7029, + "step": 54530 + }, + { + "epoch": 3.9166965888689407, + "grad_norm": 0.9281555414199829, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 54540 + }, + { + "epoch": 3.917414721723519, + "grad_norm": 1.1439622640609741, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 54550 + }, + { + "epoch": 3.918132854578097, + "grad_norm": 0.9375617504119873, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 54560 + }, + { + "epoch": 3.918850987432675, + "grad_norm": 0.92906653881073, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 54570 + }, + { + "epoch": 3.919569120287253, + "grad_norm": 1.0840893983840942, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 54580 + }, + { + "epoch": 3.920287253141831, + "grad_norm": 0.8145509362220764, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 54590 + }, + { + "epoch": 3.921005385996409, + "grad_norm": 0.973737895488739, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 54600 + }, + { + "epoch": 3.9217235188509876, + "grad_norm": 0.9302353858947754, + "learning_rate": 0.0002, + "loss": 0.6822, + "step": 54610 + }, + { + "epoch": 3.9224416517055656, + "grad_norm": 0.9167897701263428, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 54620 + }, + { + "epoch": 3.9231597845601436, + "grad_norm": 0.8096851706504822, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 54630 + }, + { + "epoch": 3.9238779174147216, + "grad_norm": 0.8006368279457092, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 54640 + }, + { + "epoch": 3.9245960502693, + "grad_norm": 0.7800863981246948, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 54650 + }, + { + "epoch": 3.925314183123878, + "grad_norm": 1.0331560373306274, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 54660 + }, + { + "epoch": 3.926032315978456, + "grad_norm": 1.0057517290115356, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 54670 + }, + { + "epoch": 3.926750448833034, + "grad_norm": 0.8920564651489258, + "learning_rate": 0.0002, + "loss": 0.6636, + "step": 54680 + }, + { + "epoch": 3.927468581687612, + "grad_norm": 0.7704599499702454, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 54690 + }, + { + "epoch": 3.92818671454219, + "grad_norm": 0.827032208442688, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 54700 + }, + { + "epoch": 3.9289048473967685, + "grad_norm": 1.0019268989562988, + "learning_rate": 0.0002, + "loss": 0.7083, + "step": 54710 + }, + { + "epoch": 3.9296229802513465, + "grad_norm": 0.862033486366272, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 54720 + }, + { + "epoch": 3.9303411131059245, + "grad_norm": 0.8965592980384827, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 54730 + }, + { + "epoch": 3.931059245960503, + "grad_norm": 0.7689077854156494, + "learning_rate": 0.0002, + "loss": 0.6739, + "step": 54740 + }, + { + "epoch": 3.931777378815081, + "grad_norm": 0.846276581287384, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 54750 + }, + { + "epoch": 3.932495511669659, + "grad_norm": 0.8932713866233826, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 54760 + }, + { + "epoch": 3.933213644524237, + "grad_norm": 0.9711386561393738, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 54770 + }, + { + "epoch": 3.933931777378815, + "grad_norm": 0.9290250539779663, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 54780 + }, + { + "epoch": 3.934649910233393, + "grad_norm": 1.0897367000579834, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 54790 + }, + { + "epoch": 3.9353680430879714, + "grad_norm": 0.8451842665672302, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 54800 + }, + { + "epoch": 3.9360861759425494, + "grad_norm": 0.8400090336799622, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 54810 + }, + { + "epoch": 3.9368043087971274, + "grad_norm": 0.951383650302887, + "learning_rate": 0.0002, + "loss": 0.6577, + "step": 54820 + }, + { + "epoch": 3.937522441651706, + "grad_norm": 0.848838210105896, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 54830 + }, + { + "epoch": 3.938240574506284, + "grad_norm": 0.735763669013977, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 54840 + }, + { + "epoch": 3.938958707360862, + "grad_norm": 0.979037344455719, + "learning_rate": 0.0002, + "loss": 0.6574, + "step": 54850 + }, + { + "epoch": 3.93967684021544, + "grad_norm": 0.933674693107605, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 54860 + }, + { + "epoch": 3.940394973070018, + "grad_norm": 0.835593044757843, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 54870 + }, + { + "epoch": 3.941113105924596, + "grad_norm": 1.0034281015396118, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 54880 + }, + { + "epoch": 3.9418312387791743, + "grad_norm": 0.9732975959777832, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 54890 + }, + { + "epoch": 3.9425493716337523, + "grad_norm": 0.9666336178779602, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 54900 + }, + { + "epoch": 3.9432675044883303, + "grad_norm": 0.755310595035553, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 54910 + }, + { + "epoch": 3.9439856373429083, + "grad_norm": 0.8732092976570129, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 54920 + }, + { + "epoch": 3.9447037701974867, + "grad_norm": 1.139453649520874, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 54930 + }, + { + "epoch": 3.9454219030520647, + "grad_norm": 0.9044837951660156, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 54940 + }, + { + "epoch": 3.9461400359066428, + "grad_norm": 1.0496679544448853, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 54950 + }, + { + "epoch": 3.9468581687612208, + "grad_norm": 1.0099035501480103, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 54960 + }, + { + "epoch": 3.9475763016157988, + "grad_norm": 1.0694963932037354, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 54970 + }, + { + "epoch": 3.9482944344703768, + "grad_norm": 1.0012997388839722, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 54980 + }, + { + "epoch": 3.949012567324955, + "grad_norm": 0.8910513520240784, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 54990 + }, + { + "epoch": 3.949730700179533, + "grad_norm": 1.0267579555511475, + "learning_rate": 0.0002, + "loss": 0.7184, + "step": 55000 + }, + { + "epoch": 3.950448833034111, + "grad_norm": 0.9786432385444641, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 55010 + }, + { + "epoch": 3.9511669658886897, + "grad_norm": 0.8703538775444031, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55020 + }, + { + "epoch": 3.9518850987432677, + "grad_norm": 0.8970484137535095, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 55030 + }, + { + "epoch": 3.9526032315978457, + "grad_norm": 0.8781577944755554, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 55040 + }, + { + "epoch": 3.9533213644524237, + "grad_norm": 0.8040280938148499, + "learning_rate": 0.0002, + "loss": 0.6944, + "step": 55050 + }, + { + "epoch": 3.9540394973070017, + "grad_norm": 0.851926326751709, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 55060 + }, + { + "epoch": 3.9547576301615797, + "grad_norm": 0.8597240447998047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 55070 + }, + { + "epoch": 3.955475763016158, + "grad_norm": 0.9461944699287415, + "learning_rate": 0.0002, + "loss": 0.6499, + "step": 55080 + }, + { + "epoch": 3.956193895870736, + "grad_norm": 0.7576611042022705, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 55090 + }, + { + "epoch": 3.956912028725314, + "grad_norm": 0.9484710693359375, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 55100 + }, + { + "epoch": 3.957630161579892, + "grad_norm": 0.9487117528915405, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 55110 + }, + { + "epoch": 3.9583482944344706, + "grad_norm": 0.870090663433075, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55120 + }, + { + "epoch": 3.9590664272890486, + "grad_norm": 0.8496458530426025, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 55130 + }, + { + "epoch": 3.9597845601436266, + "grad_norm": 1.0121779441833496, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 55140 + }, + { + "epoch": 3.9605026929982046, + "grad_norm": 0.8912323713302612, + "learning_rate": 0.0002, + "loss": 0.7005, + "step": 55150 + }, + { + "epoch": 3.9612208258527826, + "grad_norm": 0.8398444652557373, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 55160 + }, + { + "epoch": 3.961938958707361, + "grad_norm": 0.8046348690986633, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 55170 + }, + { + "epoch": 3.962657091561939, + "grad_norm": 1.0369254350662231, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 55180 + }, + { + "epoch": 3.963375224416517, + "grad_norm": 1.172431230545044, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 55190 + }, + { + "epoch": 3.964093357271095, + "grad_norm": 0.8093554377555847, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 55200 + }, + { + "epoch": 3.9648114901256735, + "grad_norm": 0.8851078748703003, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 55210 + }, + { + "epoch": 3.9655296229802515, + "grad_norm": 0.7494266033172607, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 55220 + }, + { + "epoch": 3.9662477558348295, + "grad_norm": 0.9556898474693298, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 55230 + }, + { + "epoch": 3.9669658886894075, + "grad_norm": 1.016017198562622, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 55240 + }, + { + "epoch": 3.9676840215439855, + "grad_norm": 0.8425998091697693, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 55250 + }, + { + "epoch": 3.9684021543985635, + "grad_norm": 0.717673122882843, + "learning_rate": 0.0002, + "loss": 0.6609, + "step": 55260 + }, + { + "epoch": 3.969120287253142, + "grad_norm": 0.8366572856903076, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 55270 + }, + { + "epoch": 3.96983842010772, + "grad_norm": 0.8981583118438721, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 55280 + }, + { + "epoch": 3.970556552962298, + "grad_norm": 0.8868781328201294, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 55290 + }, + { + "epoch": 3.9712746858168764, + "grad_norm": 1.0632785558700562, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 55300 + }, + { + "epoch": 3.9719928186714544, + "grad_norm": 0.8813109993934631, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 55310 + }, + { + "epoch": 3.9727109515260324, + "grad_norm": 0.8225542306900024, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 55320 + }, + { + "epoch": 3.9734290843806104, + "grad_norm": 1.1391420364379883, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 55330 + }, + { + "epoch": 3.9741472172351884, + "grad_norm": 1.0371832847595215, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55340 + }, + { + "epoch": 3.9748653500897664, + "grad_norm": 1.0542186498641968, + "learning_rate": 0.0002, + "loss": 0.7538, + "step": 55350 + }, + { + "epoch": 3.975583482944345, + "grad_norm": 1.0178009271621704, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 55360 + }, + { + "epoch": 3.976301615798923, + "grad_norm": 0.7927802205085754, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 55370 + }, + { + "epoch": 3.977019748653501, + "grad_norm": 0.9350495934486389, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 55380 + }, + { + "epoch": 3.977737881508079, + "grad_norm": 1.0240116119384766, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 55390 + }, + { + "epoch": 3.9784560143626573, + "grad_norm": 1.0279067754745483, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 55400 + }, + { + "epoch": 3.9791741472172353, + "grad_norm": 1.1228227615356445, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 55410 + }, + { + "epoch": 3.9798922800718133, + "grad_norm": 0.9500134587287903, + "learning_rate": 0.0002, + "loss": 0.6595, + "step": 55420 + }, + { + "epoch": 3.9806104129263913, + "grad_norm": 0.9229732155799866, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 55430 + }, + { + "epoch": 3.9813285457809693, + "grad_norm": 0.7946729063987732, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 55440 + }, + { + "epoch": 3.9820466786355477, + "grad_norm": 0.9987489581108093, + "learning_rate": 0.0002, + "loss": 0.6643, + "step": 55450 + }, + { + "epoch": 3.9827648114901257, + "grad_norm": 0.9670467972755432, + "learning_rate": 0.0002, + "loss": 0.6642, + "step": 55460 + }, + { + "epoch": 3.9834829443447037, + "grad_norm": 0.835028350353241, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 55470 + }, + { + "epoch": 3.9842010771992817, + "grad_norm": 0.8678702712059021, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 55480 + }, + { + "epoch": 3.98491921005386, + "grad_norm": 0.8581197261810303, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 55490 + }, + { + "epoch": 3.985637342908438, + "grad_norm": 0.779848039150238, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 55500 + }, + { + "epoch": 3.986355475763016, + "grad_norm": 0.8827589154243469, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 55510 + }, + { + "epoch": 3.987073608617594, + "grad_norm": 1.0108301639556885, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55520 + }, + { + "epoch": 3.987791741472172, + "grad_norm": 0.8506004214286804, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 55530 + }, + { + "epoch": 3.98850987432675, + "grad_norm": 1.0297727584838867, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 55540 + }, + { + "epoch": 3.9892280071813286, + "grad_norm": 0.8579224944114685, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 55550 + }, + { + "epoch": 3.9899461400359066, + "grad_norm": 0.8503788113594055, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 55560 + }, + { + "epoch": 3.9906642728904846, + "grad_norm": 1.1144801378250122, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 55570 + }, + { + "epoch": 3.991382405745063, + "grad_norm": 0.8418305516242981, + "learning_rate": 0.0002, + "loss": 0.6956, + "step": 55580 + }, + { + "epoch": 3.992100538599641, + "grad_norm": 1.0065871477127075, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 55590 + }, + { + "epoch": 3.992818671454219, + "grad_norm": 0.8160259127616882, + "learning_rate": 0.0002, + "loss": 0.6775, + "step": 55600 + }, + { + "epoch": 3.993536804308797, + "grad_norm": 0.8678009510040283, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 55610 + }, + { + "epoch": 3.994254937163375, + "grad_norm": 0.863465428352356, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 55620 + }, + { + "epoch": 3.994973070017953, + "grad_norm": 0.9242135286331177, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 55630 + }, + { + "epoch": 3.9956912028725315, + "grad_norm": 1.0285470485687256, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 55640 + }, + { + "epoch": 3.9964093357271095, + "grad_norm": 0.8953320384025574, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 55650 + }, + { + "epoch": 3.9971274685816875, + "grad_norm": 0.915892481803894, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 55660 + }, + { + "epoch": 3.9978456014362656, + "grad_norm": 0.8235118985176086, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 55670 + }, + { + "epoch": 3.998563734290844, + "grad_norm": 1.0178656578063965, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 55680 + }, + { + "epoch": 3.999281867145422, + "grad_norm": 0.9926803708076477, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 55690 + }, + { + "epoch": 4.0, + "grad_norm": 0.9213629961013794, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 55700 + }, + { + "epoch": 4.0, + "eval_loss": 1.1152480840682983, + "eval_runtime": 55.2237, + "eval_samples_per_second": 13.273, + "eval_steps_per_second": 1.666, + "step": 55700 + }, + { + "epoch": 4.000718132854578, + "grad_norm": 1.0820496082305908, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 55710 + }, + { + "epoch": 4.001436265709156, + "grad_norm": 0.9036441445350647, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 55720 + }, + { + "epoch": 4.002154398563734, + "grad_norm": 1.102754831314087, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 55730 + }, + { + "epoch": 4.002872531418312, + "grad_norm": 0.98259437084198, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 55740 + }, + { + "epoch": 4.003590664272891, + "grad_norm": 1.1935845613479614, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 55750 + }, + { + "epoch": 4.004308797127469, + "grad_norm": 0.9925830960273743, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 55760 + }, + { + "epoch": 4.005026929982047, + "grad_norm": 1.075087070465088, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 55770 + }, + { + "epoch": 4.005745062836625, + "grad_norm": 0.8746396899223328, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 55780 + }, + { + "epoch": 4.006463195691203, + "grad_norm": 0.7635995745658875, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 55790 + }, + { + "epoch": 4.007181328545781, + "grad_norm": 0.9064885377883911, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 55800 + }, + { + "epoch": 4.007899461400359, + "grad_norm": 1.018478274345398, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 55810 + }, + { + "epoch": 4.008617594254937, + "grad_norm": 0.9797589778900146, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 55820 + }, + { + "epoch": 4.009335727109515, + "grad_norm": 0.7867457866668701, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 55830 + }, + { + "epoch": 4.010053859964093, + "grad_norm": 0.9998070597648621, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 55840 + }, + { + "epoch": 4.010771992818672, + "grad_norm": 0.8656311631202698, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 55850 + }, + { + "epoch": 4.01149012567325, + "grad_norm": 0.945469081401825, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 55860 + }, + { + "epoch": 4.012208258527828, + "grad_norm": 0.8809926509857178, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 55870 + }, + { + "epoch": 4.012926391382406, + "grad_norm": 0.8047897219657898, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 55880 + }, + { + "epoch": 4.013644524236984, + "grad_norm": 1.0563900470733643, + "learning_rate": 0.0002, + "loss": 0.5322, + "step": 55890 + }, + { + "epoch": 4.014362657091562, + "grad_norm": 0.8578300476074219, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 55900 + }, + { + "epoch": 4.01508078994614, + "grad_norm": 1.0304765701293945, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 55910 + }, + { + "epoch": 4.015798922800718, + "grad_norm": 0.8087666034698486, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 55920 + }, + { + "epoch": 4.016517055655296, + "grad_norm": 1.0192348957061768, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 55930 + }, + { + "epoch": 4.017235188509875, + "grad_norm": 1.061194658279419, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 55940 + }, + { + "epoch": 4.017953321364453, + "grad_norm": 0.93668133020401, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 55950 + }, + { + "epoch": 4.018671454219031, + "grad_norm": 1.1569286584854126, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 55960 + }, + { + "epoch": 4.019389587073609, + "grad_norm": 0.9853817224502563, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 55970 + }, + { + "epoch": 4.020107719928187, + "grad_norm": 0.851109504699707, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 55980 + }, + { + "epoch": 4.020825852782765, + "grad_norm": 1.053525447845459, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 55990 + }, + { + "epoch": 4.021543985637343, + "grad_norm": 0.8307225704193115, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 56000 + }, + { + "epoch": 4.022262118491921, + "grad_norm": 1.2741150856018066, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 56010 + }, + { + "epoch": 4.022980251346499, + "grad_norm": 0.9708344340324402, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 56020 + }, + { + "epoch": 4.023698384201078, + "grad_norm": 1.265034556388855, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 56030 + }, + { + "epoch": 4.024416517055656, + "grad_norm": 0.9364367723464966, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 56040 + }, + { + "epoch": 4.025134649910234, + "grad_norm": 0.8643592000007629, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 56050 + }, + { + "epoch": 4.025852782764812, + "grad_norm": 0.9742133021354675, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 56060 + }, + { + "epoch": 4.02657091561939, + "grad_norm": 1.1793473958969116, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 56070 + }, + { + "epoch": 4.027289048473968, + "grad_norm": 0.9641149044036865, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 56080 + }, + { + "epoch": 4.028007181328546, + "grad_norm": 0.9426136016845703, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 56090 + }, + { + "epoch": 4.028725314183124, + "grad_norm": 0.9211869835853577, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 56100 + }, + { + "epoch": 4.029443447037702, + "grad_norm": 1.1576565504074097, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56110 + }, + { + "epoch": 4.03016157989228, + "grad_norm": 1.0014013051986694, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 56120 + }, + { + "epoch": 4.0308797127468585, + "grad_norm": 0.9307010769844055, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56130 + }, + { + "epoch": 4.0315978456014365, + "grad_norm": 0.8290148377418518, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 56140 + }, + { + "epoch": 4.0323159784560145, + "grad_norm": 1.0648446083068848, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 56150 + }, + { + "epoch": 4.0330341113105925, + "grad_norm": 1.1545547246932983, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 56160 + }, + { + "epoch": 4.0337522441651705, + "grad_norm": 0.9643545150756836, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 56170 + }, + { + "epoch": 4.0344703770197485, + "grad_norm": 0.8913900256156921, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56180 + }, + { + "epoch": 4.0351885098743265, + "grad_norm": 0.9445754289627075, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 56190 + }, + { + "epoch": 4.0359066427289045, + "grad_norm": 0.9353124499320984, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 56200 + }, + { + "epoch": 4.0366247755834825, + "grad_norm": 1.1780431270599365, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 56210 + }, + { + "epoch": 4.037342908438061, + "grad_norm": 0.9208880662918091, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 56220 + }, + { + "epoch": 4.038061041292639, + "grad_norm": 0.9475517272949219, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 56230 + }, + { + "epoch": 4.038779174147217, + "grad_norm": 0.7478583455085754, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 56240 + }, + { + "epoch": 4.039497307001795, + "grad_norm": 1.0026403665542603, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 56250 + }, + { + "epoch": 4.040215439856373, + "grad_norm": 0.9664973020553589, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 56260 + }, + { + "epoch": 4.040933572710951, + "grad_norm": 1.0655616521835327, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 56270 + }, + { + "epoch": 4.041651705565529, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 56280 + }, + { + "epoch": 4.042369838420107, + "grad_norm": 0.7982191443443298, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 56290 + }, + { + "epoch": 4.043087971274685, + "grad_norm": 0.8304495215415955, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 56300 + }, + { + "epoch": 4.043806104129264, + "grad_norm": 0.95123291015625, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56310 + }, + { + "epoch": 4.044524236983842, + "grad_norm": 0.9504102468490601, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 56320 + }, + { + "epoch": 4.04524236983842, + "grad_norm": 0.7432710528373718, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 56330 + }, + { + "epoch": 4.045960502692998, + "grad_norm": 0.9327874183654785, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 56340 + }, + { + "epoch": 4.046678635547576, + "grad_norm": 0.9161670804023743, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 56350 + }, + { + "epoch": 4.047396768402154, + "grad_norm": 0.9371771812438965, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 56360 + }, + { + "epoch": 4.048114901256732, + "grad_norm": 1.0332437753677368, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 56370 + }, + { + "epoch": 4.04883303411131, + "grad_norm": 0.7346320748329163, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 56380 + }, + { + "epoch": 4.049551166965888, + "grad_norm": 0.8247857689857483, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 56390 + }, + { + "epoch": 4.050269299820466, + "grad_norm": 0.925325334072113, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 56400 + }, + { + "epoch": 4.050987432675045, + "grad_norm": 0.7344088554382324, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 56410 + }, + { + "epoch": 4.051705565529623, + "grad_norm": 0.9204918146133423, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 56420 + }, + { + "epoch": 4.052423698384201, + "grad_norm": 0.8273472785949707, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 56430 + }, + { + "epoch": 4.053141831238779, + "grad_norm": 0.9524998068809509, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 56440 + }, + { + "epoch": 4.053859964093357, + "grad_norm": 0.9168205857276917, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 56450 + }, + { + "epoch": 4.054578096947935, + "grad_norm": 0.9634994864463806, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 56460 + }, + { + "epoch": 4.055296229802513, + "grad_norm": 1.2027593851089478, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 56470 + }, + { + "epoch": 4.056014362657091, + "grad_norm": 1.2347805500030518, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 56480 + }, + { + "epoch": 4.056732495511669, + "grad_norm": 0.8621458411216736, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56490 + }, + { + "epoch": 4.057450628366248, + "grad_norm": 0.9194608330726624, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 56500 + }, + { + "epoch": 4.058168761220826, + "grad_norm": 1.0153663158416748, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 56510 + }, + { + "epoch": 4.058886894075404, + "grad_norm": 0.9170986413955688, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 56520 + }, + { + "epoch": 4.059605026929982, + "grad_norm": 1.033057689666748, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 56530 + }, + { + "epoch": 4.06032315978456, + "grad_norm": 1.0125197172164917, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 56540 + }, + { + "epoch": 4.061041292639138, + "grad_norm": 0.9429898262023926, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 56550 + }, + { + "epoch": 4.061759425493716, + "grad_norm": 0.9242179989814758, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 56560 + }, + { + "epoch": 4.062477558348294, + "grad_norm": 0.9365091323852539, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 56570 + }, + { + "epoch": 4.063195691202872, + "grad_norm": 0.9148455858230591, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 56580 + }, + { + "epoch": 4.063913824057451, + "grad_norm": 0.8546709418296814, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 56590 + }, + { + "epoch": 4.064631956912029, + "grad_norm": 0.9743902087211609, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 56600 + }, + { + "epoch": 4.065350089766607, + "grad_norm": 1.0599974393844604, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 56610 + }, + { + "epoch": 4.066068222621185, + "grad_norm": 0.9677841067314148, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 56620 + }, + { + "epoch": 4.066786355475763, + "grad_norm": 0.8892754316329956, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 56630 + }, + { + "epoch": 4.067504488330341, + "grad_norm": 0.8837814331054688, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 56640 + }, + { + "epoch": 4.068222621184919, + "grad_norm": 0.9284095764160156, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 56650 + }, + { + "epoch": 4.068940754039497, + "grad_norm": 1.0163567066192627, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 56660 + }, + { + "epoch": 4.069658886894075, + "grad_norm": 0.8713456988334656, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 56670 + }, + { + "epoch": 4.070377019748653, + "grad_norm": 0.8356686234474182, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 56680 + }, + { + "epoch": 4.071095152603232, + "grad_norm": 0.8998766541481018, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 56690 + }, + { + "epoch": 4.07181328545781, + "grad_norm": 1.0441967248916626, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 56700 + }, + { + "epoch": 4.072531418312388, + "grad_norm": 0.9313125610351562, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 56710 + }, + { + "epoch": 4.073249551166966, + "grad_norm": 0.9912964701652527, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 56720 + }, + { + "epoch": 4.073967684021544, + "grad_norm": 0.9048459529876709, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 56730 + }, + { + "epoch": 4.074685816876122, + "grad_norm": 1.0248944759368896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 56740 + }, + { + "epoch": 4.0754039497307, + "grad_norm": 1.4526786804199219, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 56750 + }, + { + "epoch": 4.076122082585278, + "grad_norm": 0.9813178181648254, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 56760 + }, + { + "epoch": 4.076840215439856, + "grad_norm": 1.0686813592910767, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 56770 + }, + { + "epoch": 4.077558348294435, + "grad_norm": 1.1093482971191406, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 56780 + }, + { + "epoch": 4.078276481149013, + "grad_norm": 0.9377819895744324, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 56790 + }, + { + "epoch": 4.078994614003591, + "grad_norm": 0.8043649196624756, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 56800 + }, + { + "epoch": 4.079712746858169, + "grad_norm": 0.7995415925979614, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 56810 + }, + { + "epoch": 4.080430879712747, + "grad_norm": 1.0076148509979248, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 56820 + }, + { + "epoch": 4.081149012567325, + "grad_norm": 0.8192076683044434, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 56830 + }, + { + "epoch": 4.081867145421903, + "grad_norm": 0.9226266145706177, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 56840 + }, + { + "epoch": 4.082585278276481, + "grad_norm": 0.8877972960472107, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 56850 + }, + { + "epoch": 4.083303411131059, + "grad_norm": 0.9578937888145447, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 56860 + }, + { + "epoch": 4.084021543985638, + "grad_norm": 0.8929167985916138, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 56870 + }, + { + "epoch": 4.084739676840216, + "grad_norm": 1.0015977621078491, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 56880 + }, + { + "epoch": 4.085457809694794, + "grad_norm": 0.9768750667572021, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 56890 + }, + { + "epoch": 4.086175942549372, + "grad_norm": 1.0834569931030273, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 56900 + }, + { + "epoch": 4.08689407540395, + "grad_norm": 0.8761230707168579, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 56910 + }, + { + "epoch": 4.087612208258528, + "grad_norm": 1.027064323425293, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 56920 + }, + { + "epoch": 4.088330341113106, + "grad_norm": 1.130336880683899, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 56930 + }, + { + "epoch": 4.089048473967684, + "grad_norm": 0.8157579898834229, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 56940 + }, + { + "epoch": 4.089766606822262, + "grad_norm": 1.071175217628479, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 56950 + }, + { + "epoch": 4.09048473967684, + "grad_norm": 0.9534492492675781, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 56960 + }, + { + "epoch": 4.091202872531419, + "grad_norm": 0.9584037661552429, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 56970 + }, + { + "epoch": 4.091921005385997, + "grad_norm": 1.1513131856918335, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 56980 + }, + { + "epoch": 4.092639138240575, + "grad_norm": 1.0167666673660278, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 56990 + }, + { + "epoch": 4.093357271095153, + "grad_norm": 1.0630987882614136, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57000 + }, + { + "epoch": 4.094075403949731, + "grad_norm": 1.0326893329620361, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 57010 + }, + { + "epoch": 4.094793536804309, + "grad_norm": 0.9701678156852722, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 57020 + }, + { + "epoch": 4.095511669658887, + "grad_norm": 0.839935302734375, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57030 + }, + { + "epoch": 4.096229802513465, + "grad_norm": 0.8995838761329651, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 57040 + }, + { + "epoch": 4.096947935368043, + "grad_norm": 0.8039916157722473, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 57050 + }, + { + "epoch": 4.097666068222622, + "grad_norm": 1.126122236251831, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 57060 + }, + { + "epoch": 4.0983842010772, + "grad_norm": 0.8749837875366211, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 57070 + }, + { + "epoch": 4.099102333931778, + "grad_norm": 0.8630341291427612, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 57080 + }, + { + "epoch": 4.099820466786356, + "grad_norm": 0.8889496922492981, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 57090 + }, + { + "epoch": 4.100538599640934, + "grad_norm": 0.9050310254096985, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 57100 + }, + { + "epoch": 4.101256732495512, + "grad_norm": 0.943072497844696, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 57110 + }, + { + "epoch": 4.10197486535009, + "grad_norm": 0.9031552672386169, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 57120 + }, + { + "epoch": 4.102692998204668, + "grad_norm": 0.939862847328186, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 57130 + }, + { + "epoch": 4.103411131059246, + "grad_norm": 0.8080634474754333, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 57140 + }, + { + "epoch": 4.1041292639138245, + "grad_norm": 0.9181693196296692, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 57150 + }, + { + "epoch": 4.1048473967684025, + "grad_norm": 0.9609217643737793, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 57160 + }, + { + "epoch": 4.1055655296229805, + "grad_norm": 1.1246516704559326, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 57170 + }, + { + "epoch": 4.1062836624775585, + "grad_norm": 1.0616880655288696, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 57180 + }, + { + "epoch": 4.1070017953321365, + "grad_norm": 0.9954505562782288, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 57190 + }, + { + "epoch": 4.1077199281867145, + "grad_norm": 1.0602279901504517, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 57200 + }, + { + "epoch": 4.1084380610412925, + "grad_norm": 0.8984764814376831, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 57210 + }, + { + "epoch": 4.1091561938958705, + "grad_norm": 0.845167875289917, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 57220 + }, + { + "epoch": 4.1098743267504485, + "grad_norm": 0.7901500463485718, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 57230 + }, + { + "epoch": 4.1105924596050265, + "grad_norm": 1.0462526082992554, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 57240 + }, + { + "epoch": 4.111310592459605, + "grad_norm": 0.9098827838897705, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 57250 + }, + { + "epoch": 4.112028725314183, + "grad_norm": 0.9234077334403992, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 57260 + }, + { + "epoch": 4.112746858168761, + "grad_norm": 1.0033560991287231, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 57270 + }, + { + "epoch": 4.113464991023339, + "grad_norm": 1.0620051622390747, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 57280 + }, + { + "epoch": 4.114183123877917, + "grad_norm": 0.8679345846176147, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 57290 + }, + { + "epoch": 4.114901256732495, + "grad_norm": 0.7557345628738403, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 57300 + }, + { + "epoch": 4.115619389587073, + "grad_norm": 0.8970935344696045, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 57310 + }, + { + "epoch": 4.116337522441651, + "grad_norm": 1.0779842138290405, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 57320 + }, + { + "epoch": 4.117055655296229, + "grad_norm": 1.2036106586456299, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 57330 + }, + { + "epoch": 4.117773788150808, + "grad_norm": 0.8337953686714172, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 57340 + }, + { + "epoch": 4.118491921005386, + "grad_norm": 0.9850410223007202, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 57350 + }, + { + "epoch": 4.119210053859964, + "grad_norm": 0.8028770685195923, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 57360 + }, + { + "epoch": 4.119928186714542, + "grad_norm": 0.8693217039108276, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 57370 + }, + { + "epoch": 4.12064631956912, + "grad_norm": 0.8795534372329712, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 57380 + }, + { + "epoch": 4.121364452423698, + "grad_norm": 1.0081543922424316, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 57390 + }, + { + "epoch": 4.122082585278276, + "grad_norm": 0.8776742219924927, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 57400 + }, + { + "epoch": 4.122800718132854, + "grad_norm": 0.8247824311256409, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 57410 + }, + { + "epoch": 4.123518850987432, + "grad_norm": 1.1346335411071777, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 57420 + }, + { + "epoch": 4.124236983842011, + "grad_norm": 1.0671089887619019, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 57430 + }, + { + "epoch": 4.124955116696589, + "grad_norm": 0.8548333048820496, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 57440 + }, + { + "epoch": 4.125673249551167, + "grad_norm": 1.0221573114395142, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 57450 + }, + { + "epoch": 4.126391382405745, + "grad_norm": 0.9746617674827576, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 57460 + }, + { + "epoch": 4.127109515260323, + "grad_norm": 0.8104965090751648, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 57470 + }, + { + "epoch": 4.127827648114901, + "grad_norm": 1.0401487350463867, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 57480 + }, + { + "epoch": 4.128545780969479, + "grad_norm": 0.8828882575035095, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57490 + }, + { + "epoch": 4.129263913824057, + "grad_norm": 1.0121098756790161, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 57500 + }, + { + "epoch": 4.129982046678635, + "grad_norm": 0.8789737820625305, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 57510 + }, + { + "epoch": 4.130700179533213, + "grad_norm": 1.0386744737625122, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 57520 + }, + { + "epoch": 4.131418312387792, + "grad_norm": 1.0092610120773315, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 57530 + }, + { + "epoch": 4.13213644524237, + "grad_norm": 0.8706282377243042, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 57540 + }, + { + "epoch": 4.132854578096948, + "grad_norm": 0.9270507097244263, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 57550 + }, + { + "epoch": 4.133572710951526, + "grad_norm": 1.0303068161010742, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 57560 + }, + { + "epoch": 4.134290843806104, + "grad_norm": 1.1169062852859497, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 57570 + }, + { + "epoch": 4.135008976660682, + "grad_norm": 0.8530599474906921, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 57580 + }, + { + "epoch": 4.13572710951526, + "grad_norm": 1.1395039558410645, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 57590 + }, + { + "epoch": 4.136445242369838, + "grad_norm": 0.8944115042686462, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 57600 + }, + { + "epoch": 4.137163375224416, + "grad_norm": 1.137966275215149, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 57610 + }, + { + "epoch": 4.137881508078995, + "grad_norm": 0.8244962692260742, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 57620 + }, + { + "epoch": 4.138599640933573, + "grad_norm": 1.1935817003250122, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 57630 + }, + { + "epoch": 4.139317773788151, + "grad_norm": 0.9774235486984253, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 57640 + }, + { + "epoch": 4.140035906642729, + "grad_norm": 1.066219449043274, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 57650 + }, + { + "epoch": 4.140754039497307, + "grad_norm": 0.8631396293640137, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 57660 + }, + { + "epoch": 4.141472172351885, + "grad_norm": 0.888410747051239, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 57670 + }, + { + "epoch": 4.142190305206463, + "grad_norm": 1.002642035484314, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 57680 + }, + { + "epoch": 4.142908438061041, + "grad_norm": 1.0092825889587402, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 57690 + }, + { + "epoch": 4.143626570915619, + "grad_norm": 0.9126971364021301, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 57700 + }, + { + "epoch": 4.144344703770198, + "grad_norm": 1.0303562879562378, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 57710 + }, + { + "epoch": 4.145062836624776, + "grad_norm": 1.1230897903442383, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 57720 + }, + { + "epoch": 4.145780969479354, + "grad_norm": 1.0494099855422974, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57730 + }, + { + "epoch": 4.146499102333932, + "grad_norm": 0.9555442333221436, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 57740 + }, + { + "epoch": 4.14721723518851, + "grad_norm": 0.8255124092102051, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 57750 + }, + { + "epoch": 4.147935368043088, + "grad_norm": 1.097853660583496, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 57760 + }, + { + "epoch": 4.148653500897666, + "grad_norm": 1.0272663831710815, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 57770 + }, + { + "epoch": 4.149371633752244, + "grad_norm": 1.022571086883545, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 57780 + }, + { + "epoch": 4.150089766606822, + "grad_norm": 0.964543342590332, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 57790 + }, + { + "epoch": 4.1508078994614, + "grad_norm": 0.9251219034194946, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 57800 + }, + { + "epoch": 4.151526032315979, + "grad_norm": 1.081840991973877, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 57810 + }, + { + "epoch": 4.152244165170557, + "grad_norm": 0.8989445567131042, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 57820 + }, + { + "epoch": 4.152962298025135, + "grad_norm": 0.903629720211029, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 57830 + }, + { + "epoch": 4.153680430879713, + "grad_norm": 0.8985397219657898, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 57840 + }, + { + "epoch": 4.154398563734291, + "grad_norm": 1.047778844833374, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 57850 + }, + { + "epoch": 4.155116696588869, + "grad_norm": 0.9804165363311768, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 57860 + }, + { + "epoch": 4.155834829443447, + "grad_norm": 1.187309980392456, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 57870 + }, + { + "epoch": 4.156552962298025, + "grad_norm": 0.9854836463928223, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 57880 + }, + { + "epoch": 4.157271095152603, + "grad_norm": 0.8494308590888977, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 57890 + }, + { + "epoch": 4.157989228007182, + "grad_norm": 0.9359684586524963, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 57900 + }, + { + "epoch": 4.15870736086176, + "grad_norm": 0.8971988558769226, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 57910 + }, + { + "epoch": 4.159425493716338, + "grad_norm": 0.8848021030426025, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 57920 + }, + { + "epoch": 4.160143626570916, + "grad_norm": 0.982877790927887, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 57930 + }, + { + "epoch": 4.160861759425494, + "grad_norm": 0.8668819069862366, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 57940 + }, + { + "epoch": 4.161579892280072, + "grad_norm": 1.06569504737854, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 57950 + }, + { + "epoch": 4.16229802513465, + "grad_norm": 1.165740728378296, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 57960 + }, + { + "epoch": 4.163016157989228, + "grad_norm": 1.0534512996673584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 57970 + }, + { + "epoch": 4.163734290843806, + "grad_norm": 0.8785330653190613, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 57980 + }, + { + "epoch": 4.164452423698384, + "grad_norm": 1.1244874000549316, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 57990 + }, + { + "epoch": 4.165170556552963, + "grad_norm": 0.8839399218559265, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 58000 + }, + { + "epoch": 4.165888689407541, + "grad_norm": 1.0603798627853394, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 58010 + }, + { + "epoch": 4.166606822262119, + "grad_norm": 0.9737853407859802, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 58020 + }, + { + "epoch": 4.167324955116697, + "grad_norm": 1.0650558471679688, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 58030 + }, + { + "epoch": 4.168043087971275, + "grad_norm": 0.7528959512710571, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 58040 + }, + { + "epoch": 4.168761220825853, + "grad_norm": 0.9286156892776489, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 58050 + }, + { + "epoch": 4.169479353680431, + "grad_norm": 1.0225880146026611, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 58060 + }, + { + "epoch": 4.170197486535009, + "grad_norm": 0.9990654587745667, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58070 + }, + { + "epoch": 4.170915619389587, + "grad_norm": 1.052057147026062, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 58080 + }, + { + "epoch": 4.1716337522441655, + "grad_norm": 0.7366801500320435, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 58090 + }, + { + "epoch": 4.1723518850987436, + "grad_norm": 1.0943711996078491, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 58100 + }, + { + "epoch": 4.1730700179533216, + "grad_norm": 1.1297656297683716, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 58110 + }, + { + "epoch": 4.1737881508078996, + "grad_norm": 0.7861461639404297, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 58120 + }, + { + "epoch": 4.174506283662478, + "grad_norm": 0.8643335103988647, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 58130 + }, + { + "epoch": 4.175224416517056, + "grad_norm": 0.957288384437561, + "learning_rate": 0.0002, + "loss": 0.6103, + "step": 58140 + }, + { + "epoch": 4.175942549371634, + "grad_norm": 0.9175366759300232, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 58150 + }, + { + "epoch": 4.176660682226212, + "grad_norm": 1.129935622215271, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 58160 + }, + { + "epoch": 4.17737881508079, + "grad_norm": 0.9683087468147278, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 58170 + }, + { + "epoch": 4.1780969479353685, + "grad_norm": 1.045171856880188, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 58180 + }, + { + "epoch": 4.1788150807899465, + "grad_norm": 0.9858742952346802, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 58190 + }, + { + "epoch": 4.1795332136445245, + "grad_norm": 0.8513413071632385, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 58200 + }, + { + "epoch": 4.1802513464991025, + "grad_norm": 0.9584265947341919, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58210 + }, + { + "epoch": 4.1809694793536805, + "grad_norm": 0.8828920722007751, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 58220 + }, + { + "epoch": 4.1816876122082585, + "grad_norm": 0.9849961400032043, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 58230 + }, + { + "epoch": 4.1824057450628365, + "grad_norm": 1.0601637363433838, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 58240 + }, + { + "epoch": 4.1831238779174145, + "grad_norm": 1.2206604480743408, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 58250 + }, + { + "epoch": 4.1838420107719925, + "grad_norm": 1.1768009662628174, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 58260 + }, + { + "epoch": 4.184560143626571, + "grad_norm": 0.9521295428276062, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 58270 + }, + { + "epoch": 4.185278276481149, + "grad_norm": 0.892971932888031, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 58280 + }, + { + "epoch": 4.185996409335727, + "grad_norm": 0.8712016940116882, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 58290 + }, + { + "epoch": 4.186714542190305, + "grad_norm": 1.0190843343734741, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 58300 + }, + { + "epoch": 4.187432675044883, + "grad_norm": 1.0149270296096802, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 58310 + }, + { + "epoch": 4.188150807899461, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 58320 + }, + { + "epoch": 4.188868940754039, + "grad_norm": 0.7892335653305054, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 58330 + }, + { + "epoch": 4.189587073608617, + "grad_norm": 0.9792808890342712, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58340 + }, + { + "epoch": 4.190305206463195, + "grad_norm": 0.9946883320808411, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 58350 + }, + { + "epoch": 4.191023339317773, + "grad_norm": 1.0363789796829224, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 58360 + }, + { + "epoch": 4.191741472172352, + "grad_norm": 0.9285917282104492, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 58370 + }, + { + "epoch": 4.19245960502693, + "grad_norm": 0.9461679458618164, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 58380 + }, + { + "epoch": 4.193177737881508, + "grad_norm": 1.0344175100326538, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 58390 + }, + { + "epoch": 4.193895870736086, + "grad_norm": 0.9530242085456848, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 58400 + }, + { + "epoch": 4.194614003590664, + "grad_norm": 0.9171900749206543, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58410 + }, + { + "epoch": 4.195332136445242, + "grad_norm": 0.8094898462295532, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 58420 + }, + { + "epoch": 4.19605026929982, + "grad_norm": 0.921981930732727, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 58430 + }, + { + "epoch": 4.196768402154398, + "grad_norm": 0.9783532023429871, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 58440 + }, + { + "epoch": 4.197486535008976, + "grad_norm": 1.017805576324463, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 58450 + }, + { + "epoch": 4.198204667863555, + "grad_norm": 0.9244308471679688, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 58460 + }, + { + "epoch": 4.198922800718133, + "grad_norm": 0.9942585229873657, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 58470 + }, + { + "epoch": 4.199640933572711, + "grad_norm": 1.1045037508010864, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 58480 + }, + { + "epoch": 4.200359066427289, + "grad_norm": 0.9483149647712708, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58490 + }, + { + "epoch": 4.201077199281867, + "grad_norm": 1.0807271003723145, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 58500 + }, + { + "epoch": 4.201795332136445, + "grad_norm": 0.7697445750236511, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 58510 + }, + { + "epoch": 4.202513464991023, + "grad_norm": 1.0761178731918335, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 58520 + }, + { + "epoch": 4.203231597845601, + "grad_norm": 0.9992024898529053, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 58530 + }, + { + "epoch": 4.203949730700179, + "grad_norm": 0.8741498589515686, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 58540 + }, + { + "epoch": 4.204667863554757, + "grad_norm": 0.8557528853416443, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 58550 + }, + { + "epoch": 4.205385996409336, + "grad_norm": 0.8853630423545837, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 58560 + }, + { + "epoch": 4.206104129263914, + "grad_norm": 0.9858933687210083, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 58570 + }, + { + "epoch": 4.206822262118492, + "grad_norm": 1.104732871055603, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 58580 + }, + { + "epoch": 4.20754039497307, + "grad_norm": 0.9345462322235107, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 58590 + }, + { + "epoch": 4.208258527827648, + "grad_norm": 0.9620407819747925, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 58600 + }, + { + "epoch": 4.208976660682226, + "grad_norm": 0.8546963334083557, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 58610 + }, + { + "epoch": 4.209694793536804, + "grad_norm": 0.8125145435333252, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 58620 + }, + { + "epoch": 4.210412926391382, + "grad_norm": 0.8481138944625854, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 58630 + }, + { + "epoch": 4.21113105924596, + "grad_norm": 0.8884692788124084, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 58640 + }, + { + "epoch": 4.211849192100539, + "grad_norm": 1.09279465675354, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 58650 + }, + { + "epoch": 4.212567324955117, + "grad_norm": 0.9806583523750305, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 58660 + }, + { + "epoch": 4.213285457809695, + "grad_norm": 0.9510366916656494, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 58670 + }, + { + "epoch": 4.214003590664273, + "grad_norm": 0.7517459988594055, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 58680 + }, + { + "epoch": 4.214721723518851, + "grad_norm": 1.1134123802185059, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 58690 + }, + { + "epoch": 4.215439856373429, + "grad_norm": 0.8307328820228577, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 58700 + }, + { + "epoch": 4.216157989228007, + "grad_norm": 0.8211639523506165, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 58710 + }, + { + "epoch": 4.216876122082585, + "grad_norm": 1.0749584436416626, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 58720 + }, + { + "epoch": 4.217594254937163, + "grad_norm": 1.1394833326339722, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58730 + }, + { + "epoch": 4.218312387791742, + "grad_norm": 1.05130934715271, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 58740 + }, + { + "epoch": 4.21903052064632, + "grad_norm": 0.7949456572532654, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 58750 + }, + { + "epoch": 4.219748653500898, + "grad_norm": 0.906506359577179, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 58760 + }, + { + "epoch": 4.220466786355476, + "grad_norm": 0.8338989615440369, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 58770 + }, + { + "epoch": 4.221184919210054, + "grad_norm": 0.9325370788574219, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 58780 + }, + { + "epoch": 4.221903052064632, + "grad_norm": 1.0208096504211426, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 58790 + }, + { + "epoch": 4.22262118491921, + "grad_norm": 1.0075920820236206, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 58800 + }, + { + "epoch": 4.223339317773788, + "grad_norm": 0.9858701229095459, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 58810 + }, + { + "epoch": 4.224057450628366, + "grad_norm": 1.0010110139846802, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 58820 + }, + { + "epoch": 4.224775583482945, + "grad_norm": 0.9360540509223938, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 58830 + }, + { + "epoch": 4.225493716337523, + "grad_norm": 0.9021786451339722, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 58840 + }, + { + "epoch": 4.226211849192101, + "grad_norm": 1.1778476238250732, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 58850 + }, + { + "epoch": 4.226929982046679, + "grad_norm": 1.0061023235321045, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 58860 + }, + { + "epoch": 4.227648114901257, + "grad_norm": 0.8839752674102783, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58870 + }, + { + "epoch": 4.228366247755835, + "grad_norm": 1.0078870058059692, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 58880 + }, + { + "epoch": 4.229084380610413, + "grad_norm": 0.8926451206207275, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 58890 + }, + { + "epoch": 4.229802513464991, + "grad_norm": 1.4018772840499878, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 58900 + }, + { + "epoch": 4.230520646319569, + "grad_norm": 0.9911289215087891, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 58910 + }, + { + "epoch": 4.231238779174147, + "grad_norm": 0.9374576807022095, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 58920 + }, + { + "epoch": 4.231956912028726, + "grad_norm": 1.179650068283081, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 58930 + }, + { + "epoch": 4.232675044883304, + "grad_norm": 0.9434911012649536, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 58940 + }, + { + "epoch": 4.233393177737882, + "grad_norm": 1.0061911344528198, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 58950 + }, + { + "epoch": 4.23411131059246, + "grad_norm": 0.9663233757019043, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 58960 + }, + { + "epoch": 4.234829443447038, + "grad_norm": 0.8897581696510315, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 58970 + }, + { + "epoch": 4.235547576301616, + "grad_norm": 0.873281717300415, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 58980 + }, + { + "epoch": 4.236265709156194, + "grad_norm": 0.9146949052810669, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 58990 + }, + { + "epoch": 4.236983842010772, + "grad_norm": 0.9381195306777954, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 59000 + }, + { + "epoch": 4.23770197486535, + "grad_norm": 0.9700697064399719, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 59010 + }, + { + "epoch": 4.238420107719929, + "grad_norm": 0.9050154685974121, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 59020 + }, + { + "epoch": 4.239138240574507, + "grad_norm": 0.9901503324508667, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 59030 + }, + { + "epoch": 4.239856373429085, + "grad_norm": 0.9009594321250916, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 59040 + }, + { + "epoch": 4.240574506283663, + "grad_norm": 1.0924968719482422, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59050 + }, + { + "epoch": 4.241292639138241, + "grad_norm": 0.9939947724342346, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 59060 + }, + { + "epoch": 4.242010771992819, + "grad_norm": 1.0577857494354248, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 59070 + }, + { + "epoch": 4.242728904847397, + "grad_norm": 1.0836747884750366, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59080 + }, + { + "epoch": 4.243447037701975, + "grad_norm": 0.97043377161026, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 59090 + }, + { + "epoch": 4.244165170556553, + "grad_norm": 0.7711901664733887, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 59100 + }, + { + "epoch": 4.244883303411131, + "grad_norm": 1.0143170356750488, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 59110 + }, + { + "epoch": 4.2456014362657095, + "grad_norm": 0.9151925444602966, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 59120 + }, + { + "epoch": 4.2463195691202875, + "grad_norm": 0.9252700209617615, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 59130 + }, + { + "epoch": 4.2470377019748655, + "grad_norm": 0.8429408073425293, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 59140 + }, + { + "epoch": 4.2477558348294435, + "grad_norm": 0.9645987153053284, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 59150 + }, + { + "epoch": 4.2484739676840215, + "grad_norm": 0.9949791431427002, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 59160 + }, + { + "epoch": 4.2491921005385995, + "grad_norm": 0.9128350615501404, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 59170 + }, + { + "epoch": 4.2499102333931775, + "grad_norm": 0.7406911849975586, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 59180 + }, + { + "epoch": 4.2506283662477555, + "grad_norm": 1.0237419605255127, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 59190 + }, + { + "epoch": 4.2513464991023335, + "grad_norm": 0.805459201335907, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 59200 + }, + { + "epoch": 4.252064631956912, + "grad_norm": 0.8477254509925842, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 59210 + }, + { + "epoch": 4.25278276481149, + "grad_norm": 0.984023928642273, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 59220 + }, + { + "epoch": 4.253500897666068, + "grad_norm": 1.0667484998703003, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59230 + }, + { + "epoch": 4.254219030520646, + "grad_norm": 0.7192284464836121, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 59240 + }, + { + "epoch": 4.254937163375224, + "grad_norm": 0.9557451009750366, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59250 + }, + { + "epoch": 4.255655296229802, + "grad_norm": 0.9209784865379333, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59260 + }, + { + "epoch": 4.25637342908438, + "grad_norm": 0.9785363674163818, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 59270 + }, + { + "epoch": 4.257091561938958, + "grad_norm": 0.910214364528656, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 59280 + }, + { + "epoch": 4.257809694793536, + "grad_norm": 0.8945858478546143, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 59290 + }, + { + "epoch": 4.258527827648114, + "grad_norm": 1.0984420776367188, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 59300 + }, + { + "epoch": 4.259245960502693, + "grad_norm": 1.0256640911102295, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59310 + }, + { + "epoch": 4.259964093357271, + "grad_norm": 0.978397786617279, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 59320 + }, + { + "epoch": 4.260682226211849, + "grad_norm": 0.7587000727653503, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 59330 + }, + { + "epoch": 4.261400359066427, + "grad_norm": 0.9384620785713196, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 59340 + }, + { + "epoch": 4.262118491921005, + "grad_norm": 0.893992006778717, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 59350 + }, + { + "epoch": 4.262836624775583, + "grad_norm": 1.0231536626815796, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 59360 + }, + { + "epoch": 4.263554757630161, + "grad_norm": 0.9810128211975098, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 59370 + }, + { + "epoch": 4.264272890484739, + "grad_norm": 1.0868116617202759, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 59380 + }, + { + "epoch": 4.264991023339318, + "grad_norm": 1.1433676481246948, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 59390 + }, + { + "epoch": 4.265709156193896, + "grad_norm": 0.9836946725845337, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 59400 + }, + { + "epoch": 4.266427289048474, + "grad_norm": 0.9473603963851929, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 59410 + }, + { + "epoch": 4.267145421903052, + "grad_norm": 0.9066835641860962, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 59420 + }, + { + "epoch": 4.26786355475763, + "grad_norm": 1.0534718036651611, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 59430 + }, + { + "epoch": 4.268581687612208, + "grad_norm": 1.0392775535583496, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 59440 + }, + { + "epoch": 4.269299820466786, + "grad_norm": 1.011472463607788, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 59450 + }, + { + "epoch": 4.270017953321364, + "grad_norm": 1.0704147815704346, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 59460 + }, + { + "epoch": 4.270736086175942, + "grad_norm": 0.9349238872528076, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 59470 + }, + { + "epoch": 4.27145421903052, + "grad_norm": 0.8745087385177612, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 59480 + }, + { + "epoch": 4.272172351885099, + "grad_norm": 0.8823763728141785, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 59490 + }, + { + "epoch": 4.272890484739677, + "grad_norm": 1.110912799835205, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 59500 + }, + { + "epoch": 4.273608617594255, + "grad_norm": 1.0000925064086914, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 59510 + }, + { + "epoch": 4.274326750448833, + "grad_norm": 1.1578227281570435, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 59520 + }, + { + "epoch": 4.275044883303411, + "grad_norm": 0.875720202922821, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 59530 + }, + { + "epoch": 4.275763016157989, + "grad_norm": 0.9562238454818726, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 59540 + }, + { + "epoch": 4.276481149012567, + "grad_norm": 0.8384222388267517, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 59550 + }, + { + "epoch": 4.277199281867145, + "grad_norm": 1.2719428539276123, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 59560 + }, + { + "epoch": 4.277917414721723, + "grad_norm": 1.0656434297561646, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 59570 + }, + { + "epoch": 4.278635547576302, + "grad_norm": 1.0766716003417969, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 59580 + }, + { + "epoch": 4.27935368043088, + "grad_norm": 0.8892807960510254, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 59590 + }, + { + "epoch": 4.280071813285458, + "grad_norm": 0.8956300020217896, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 59600 + }, + { + "epoch": 4.280789946140036, + "grad_norm": 0.9562926888465881, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 59610 + }, + { + "epoch": 4.281508078994614, + "grad_norm": 1.009141445159912, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 59620 + }, + { + "epoch": 4.282226211849192, + "grad_norm": 1.0546064376831055, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 59630 + }, + { + "epoch": 4.28294434470377, + "grad_norm": 0.8831254243850708, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 59640 + }, + { + "epoch": 4.283662477558348, + "grad_norm": 0.9560053944587708, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 59650 + }, + { + "epoch": 4.284380610412926, + "grad_norm": 1.030339241027832, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 59660 + }, + { + "epoch": 4.285098743267504, + "grad_norm": 1.00662100315094, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 59670 + }, + { + "epoch": 4.285816876122083, + "grad_norm": 1.0759116411209106, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 59680 + }, + { + "epoch": 4.286535008976661, + "grad_norm": 0.9985393285751343, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 59690 + }, + { + "epoch": 4.287253141831239, + "grad_norm": 0.9044474959373474, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 59700 + }, + { + "epoch": 4.287971274685817, + "grad_norm": 1.1224442720413208, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 59710 + }, + { + "epoch": 4.288689407540395, + "grad_norm": 0.8436414003372192, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 59720 + }, + { + "epoch": 4.289407540394973, + "grad_norm": 1.0695041418075562, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 59730 + }, + { + "epoch": 4.290125673249551, + "grad_norm": 0.8809951543807983, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 59740 + }, + { + "epoch": 4.290843806104129, + "grad_norm": 1.0213792324066162, + "learning_rate": 0.0002, + "loss": 0.6508, + "step": 59750 + }, + { + "epoch": 4.291561938958707, + "grad_norm": 0.9660196900367737, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 59760 + }, + { + "epoch": 4.292280071813286, + "grad_norm": 0.8005787134170532, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 59770 + }, + { + "epoch": 4.292998204667864, + "grad_norm": 1.0016109943389893, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 59780 + }, + { + "epoch": 4.293716337522442, + "grad_norm": 0.9112903475761414, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 59790 + }, + { + "epoch": 4.29443447037702, + "grad_norm": 0.9999852180480957, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 59800 + }, + { + "epoch": 4.295152603231598, + "grad_norm": 0.9323953986167908, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 59810 + }, + { + "epoch": 4.295870736086176, + "grad_norm": 0.903037965297699, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 59820 + }, + { + "epoch": 4.296588868940754, + "grad_norm": 1.2462431192398071, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 59830 + }, + { + "epoch": 4.297307001795332, + "grad_norm": 1.2322230339050293, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 59840 + }, + { + "epoch": 4.29802513464991, + "grad_norm": 0.9584668278694153, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 59850 + }, + { + "epoch": 4.298743267504488, + "grad_norm": 0.9664767980575562, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 59860 + }, + { + "epoch": 4.299461400359067, + "grad_norm": 0.8860437273979187, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 59870 + }, + { + "epoch": 4.300179533213645, + "grad_norm": 1.0825127363204956, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 59880 + }, + { + "epoch": 4.300897666068223, + "grad_norm": 1.1312100887298584, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 59890 + }, + { + "epoch": 4.301615798922801, + "grad_norm": 0.8289751410484314, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 59900 + }, + { + "epoch": 4.302333931777379, + "grad_norm": 0.8990927934646606, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 59910 + }, + { + "epoch": 4.303052064631957, + "grad_norm": 0.9667525887489319, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 59920 + }, + { + "epoch": 4.303770197486535, + "grad_norm": 0.8656060695648193, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 59930 + }, + { + "epoch": 4.304488330341113, + "grad_norm": 0.8909396529197693, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 59940 + }, + { + "epoch": 4.305206463195692, + "grad_norm": 0.9533283114433289, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 59950 + }, + { + "epoch": 4.30592459605027, + "grad_norm": 0.9090739488601685, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 59960 + }, + { + "epoch": 4.306642728904848, + "grad_norm": 1.096656322479248, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 59970 + }, + { + "epoch": 4.307360861759426, + "grad_norm": 1.0392465591430664, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 59980 + }, + { + "epoch": 4.308078994614004, + "grad_norm": 0.8733913898468018, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 59990 + }, + { + "epoch": 4.308797127468582, + "grad_norm": 0.8287094235420227, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 60000 + }, + { + "epoch": 4.30951526032316, + "grad_norm": 0.9267017245292664, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 60010 + }, + { + "epoch": 4.310233393177738, + "grad_norm": 0.9969515800476074, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 60020 + }, + { + "epoch": 4.310951526032316, + "grad_norm": 1.0005015134811401, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 60030 + }, + { + "epoch": 4.311669658886894, + "grad_norm": 1.1215369701385498, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60040 + }, + { + "epoch": 4.312387791741473, + "grad_norm": 1.0434890985488892, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 60050 + }, + { + "epoch": 4.313105924596051, + "grad_norm": 0.967989981174469, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 60060 + }, + { + "epoch": 4.313824057450629, + "grad_norm": 1.007599115371704, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 60070 + }, + { + "epoch": 4.314542190305207, + "grad_norm": 0.9356340765953064, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 60080 + }, + { + "epoch": 4.315260323159785, + "grad_norm": 0.9566757678985596, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 60090 + }, + { + "epoch": 4.315978456014363, + "grad_norm": 1.1066830158233643, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 60100 + }, + { + "epoch": 4.316696588868941, + "grad_norm": 0.9895772933959961, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 60110 + }, + { + "epoch": 4.317414721723519, + "grad_norm": 1.07423734664917, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60120 + }, + { + "epoch": 4.318132854578097, + "grad_norm": 1.0777037143707275, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 60130 + }, + { + "epoch": 4.3188509874326755, + "grad_norm": 1.1475656032562256, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 60140 + }, + { + "epoch": 4.3195691202872535, + "grad_norm": 1.0705864429473877, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 60150 + }, + { + "epoch": 4.3202872531418315, + "grad_norm": 0.8676854968070984, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 60160 + }, + { + "epoch": 4.3210053859964095, + "grad_norm": 0.9488174319267273, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 60170 + }, + { + "epoch": 4.3217235188509875, + "grad_norm": 1.1171153783798218, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 60180 + }, + { + "epoch": 4.3224416517055655, + "grad_norm": 1.091435194015503, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 60190 + }, + { + "epoch": 4.3231597845601435, + "grad_norm": 0.880944013595581, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60200 + }, + { + "epoch": 4.3238779174147215, + "grad_norm": 0.8458809852600098, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 60210 + }, + { + "epoch": 4.3245960502692995, + "grad_norm": 0.7900225520133972, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 60220 + }, + { + "epoch": 4.3253141831238775, + "grad_norm": 0.966742753982544, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 60230 + }, + { + "epoch": 4.326032315978456, + "grad_norm": 0.8948110342025757, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 60240 + }, + { + "epoch": 4.326750448833034, + "grad_norm": 0.8598700165748596, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 60250 + }, + { + "epoch": 4.327468581687612, + "grad_norm": 1.127610206604004, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 60260 + }, + { + "epoch": 4.32818671454219, + "grad_norm": 0.8357340693473816, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 60270 + }, + { + "epoch": 4.328904847396768, + "grad_norm": 0.8771896362304688, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60280 + }, + { + "epoch": 4.329622980251346, + "grad_norm": 0.9202101826667786, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60290 + }, + { + "epoch": 4.330341113105924, + "grad_norm": 1.1427538394927979, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 60300 + }, + { + "epoch": 4.331059245960502, + "grad_norm": 0.8711863160133362, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 60310 + }, + { + "epoch": 4.33177737881508, + "grad_norm": 0.972723662853241, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60320 + }, + { + "epoch": 4.332495511669659, + "grad_norm": 1.1496877670288086, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 60330 + }, + { + "epoch": 4.333213644524237, + "grad_norm": 1.008581519126892, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 60340 + }, + { + "epoch": 4.333931777378815, + "grad_norm": 1.0802706480026245, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 60350 + }, + { + "epoch": 4.334649910233393, + "grad_norm": 0.8394291996955872, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 60360 + }, + { + "epoch": 4.335368043087971, + "grad_norm": 0.8355905413627625, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 60370 + }, + { + "epoch": 4.336086175942549, + "grad_norm": 0.9583960175514221, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 60380 + }, + { + "epoch": 4.336804308797127, + "grad_norm": 1.138934850692749, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 60390 + }, + { + "epoch": 4.337522441651705, + "grad_norm": 1.0334709882736206, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 60400 + }, + { + "epoch": 4.338240574506283, + "grad_norm": 0.729686439037323, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 60410 + }, + { + "epoch": 4.338958707360861, + "grad_norm": 0.8735929727554321, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 60420 + }, + { + "epoch": 4.33967684021544, + "grad_norm": 0.9617681503295898, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 60430 + }, + { + "epoch": 4.340394973070018, + "grad_norm": 0.9439655542373657, + "learning_rate": 0.0002, + "loss": 0.5865, + "step": 60440 + }, + { + "epoch": 4.341113105924596, + "grad_norm": 0.9275408387184143, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 60450 + }, + { + "epoch": 4.341831238779174, + "grad_norm": 1.0693308115005493, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 60460 + }, + { + "epoch": 4.342549371633752, + "grad_norm": 0.9234438538551331, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 60470 + }, + { + "epoch": 4.34326750448833, + "grad_norm": 1.1376168727874756, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 60480 + }, + { + "epoch": 4.343985637342908, + "grad_norm": 0.9218108654022217, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 60490 + }, + { + "epoch": 4.344703770197486, + "grad_norm": 1.1467362642288208, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 60500 + }, + { + "epoch": 4.345421903052064, + "grad_norm": 0.9459165930747986, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 60510 + }, + { + "epoch": 4.346140035906643, + "grad_norm": 0.9460827708244324, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 60520 + }, + { + "epoch": 4.346858168761221, + "grad_norm": 1.0845041275024414, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 60530 + }, + { + "epoch": 4.347576301615799, + "grad_norm": 1.082675576210022, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 60540 + }, + { + "epoch": 4.348294434470377, + "grad_norm": 0.8443698883056641, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 60550 + }, + { + "epoch": 4.349012567324955, + "grad_norm": 1.018393874168396, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 60560 + }, + { + "epoch": 4.349730700179533, + "grad_norm": 0.8796373009681702, + "learning_rate": 0.0002, + "loss": 0.6447, + "step": 60570 + }, + { + "epoch": 4.350448833034111, + "grad_norm": 1.097942590713501, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 60580 + }, + { + "epoch": 4.351166965888689, + "grad_norm": 0.8750485181808472, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 60590 + }, + { + "epoch": 4.351885098743267, + "grad_norm": 1.0339995622634888, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 60600 + }, + { + "epoch": 4.352603231597846, + "grad_norm": 0.9077731966972351, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 60610 + }, + { + "epoch": 4.353321364452424, + "grad_norm": 1.051321029663086, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 60620 + }, + { + "epoch": 4.354039497307002, + "grad_norm": 1.0018669366836548, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60630 + }, + { + "epoch": 4.35475763016158, + "grad_norm": 1.0349196195602417, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60640 + }, + { + "epoch": 4.355475763016158, + "grad_norm": 1.009589672088623, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 60650 + }, + { + "epoch": 4.356193895870736, + "grad_norm": 1.0463480949401855, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 60660 + }, + { + "epoch": 4.356912028725314, + "grad_norm": 0.9815132021903992, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 60670 + }, + { + "epoch": 4.357630161579892, + "grad_norm": 1.0977262258529663, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 60680 + }, + { + "epoch": 4.35834829443447, + "grad_norm": 0.8450005054473877, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 60690 + }, + { + "epoch": 4.359066427289049, + "grad_norm": 1.0959078073501587, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 60700 + }, + { + "epoch": 4.359784560143627, + "grad_norm": 0.9155098795890808, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 60710 + }, + { + "epoch": 4.360502692998205, + "grad_norm": 0.9267987012863159, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 60720 + }, + { + "epoch": 4.361220825852783, + "grad_norm": 1.177472472190857, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 60730 + }, + { + "epoch": 4.361938958707361, + "grad_norm": 0.8615312576293945, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 60740 + }, + { + "epoch": 4.362657091561939, + "grad_norm": 1.0939710140228271, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 60750 + }, + { + "epoch": 4.363375224416517, + "grad_norm": 1.0928049087524414, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 60760 + }, + { + "epoch": 4.364093357271095, + "grad_norm": 1.0796833038330078, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 60770 + }, + { + "epoch": 4.364811490125673, + "grad_norm": 0.9768339991569519, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 60780 + }, + { + "epoch": 4.365529622980251, + "grad_norm": 0.9082722067832947, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 60790 + }, + { + "epoch": 4.36624775583483, + "grad_norm": 0.9614832997322083, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 60800 + }, + { + "epoch": 4.366965888689408, + "grad_norm": 0.8874651789665222, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 60810 + }, + { + "epoch": 4.367684021543986, + "grad_norm": 0.8810178637504578, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 60820 + }, + { + "epoch": 4.368402154398564, + "grad_norm": 1.0893806219100952, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 60830 + }, + { + "epoch": 4.369120287253142, + "grad_norm": 0.9042278528213501, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 60840 + }, + { + "epoch": 4.36983842010772, + "grad_norm": 1.0832217931747437, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 60850 + }, + { + "epoch": 4.370556552962298, + "grad_norm": 0.9431114792823792, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 60860 + }, + { + "epoch": 4.371274685816876, + "grad_norm": 1.031553030014038, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 60870 + }, + { + "epoch": 4.371992818671454, + "grad_norm": 0.8702824711799622, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 60880 + }, + { + "epoch": 4.372710951526033, + "grad_norm": 1.1109199523925781, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 60890 + }, + { + "epoch": 4.373429084380611, + "grad_norm": 0.8369361162185669, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 60900 + }, + { + "epoch": 4.374147217235189, + "grad_norm": 0.988915205001831, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 60910 + }, + { + "epoch": 4.374865350089767, + "grad_norm": 0.9365919232368469, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 60920 + }, + { + "epoch": 4.375583482944345, + "grad_norm": 0.9789398908615112, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 60930 + }, + { + "epoch": 4.376301615798923, + "grad_norm": 0.8786931037902832, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 60940 + }, + { + "epoch": 4.377019748653501, + "grad_norm": 0.8891511559486389, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 60950 + }, + { + "epoch": 4.377737881508079, + "grad_norm": 0.9561707377433777, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 60960 + }, + { + "epoch": 4.378456014362657, + "grad_norm": 0.8674200177192688, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 60970 + }, + { + "epoch": 4.379174147217235, + "grad_norm": 0.9285916090011597, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 60980 + }, + { + "epoch": 4.379892280071814, + "grad_norm": 0.9185547232627869, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 60990 + }, + { + "epoch": 4.380610412926392, + "grad_norm": 1.081664800643921, + "learning_rate": 0.0002, + "loss": 0.6296, + "step": 61000 + }, + { + "epoch": 4.38132854578097, + "grad_norm": 1.0475854873657227, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 61010 + }, + { + "epoch": 4.382046678635548, + "grad_norm": 1.1519653797149658, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 61020 + }, + { + "epoch": 4.382764811490126, + "grad_norm": 0.8757607936859131, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 61030 + }, + { + "epoch": 4.383482944344704, + "grad_norm": 0.8707934021949768, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 61040 + }, + { + "epoch": 4.384201077199282, + "grad_norm": 1.1807516813278198, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 61050 + }, + { + "epoch": 4.38491921005386, + "grad_norm": 1.0674688816070557, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 61060 + }, + { + "epoch": 4.385637342908438, + "grad_norm": 0.9321209788322449, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 61070 + }, + { + "epoch": 4.3863554757630165, + "grad_norm": 1.0786446332931519, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 61080 + }, + { + "epoch": 4.3870736086175945, + "grad_norm": 0.9733907580375671, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 61090 + }, + { + "epoch": 4.3877917414721725, + "grad_norm": 0.9476010203361511, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 61100 + }, + { + "epoch": 4.3885098743267505, + "grad_norm": 1.1321563720703125, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 61110 + }, + { + "epoch": 4.3892280071813286, + "grad_norm": 0.9379117488861084, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 61120 + }, + { + "epoch": 4.3899461400359066, + "grad_norm": 0.8409728407859802, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 61130 + }, + { + "epoch": 4.3906642728904846, + "grad_norm": 0.8309189081192017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 61140 + }, + { + "epoch": 4.391382405745063, + "grad_norm": 0.8922196626663208, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61150 + }, + { + "epoch": 4.392100538599641, + "grad_norm": 0.8274614214897156, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 61160 + }, + { + "epoch": 4.392818671454219, + "grad_norm": 1.0928618907928467, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 61170 + }, + { + "epoch": 4.3935368043087974, + "grad_norm": 0.9771125316619873, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 61180 + }, + { + "epoch": 4.3942549371633755, + "grad_norm": 0.8844535946846008, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 61190 + }, + { + "epoch": 4.3949730700179535, + "grad_norm": 1.0498822927474976, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 61200 + }, + { + "epoch": 4.3956912028725315, + "grad_norm": 0.9882155060768127, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61210 + }, + { + "epoch": 4.3964093357271095, + "grad_norm": 1.090356707572937, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 61220 + }, + { + "epoch": 4.3971274685816875, + "grad_norm": 1.0908088684082031, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 61230 + }, + { + "epoch": 4.3978456014362655, + "grad_norm": 1.0013501644134521, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 61240 + }, + { + "epoch": 4.3985637342908435, + "grad_norm": 1.0916062593460083, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 61250 + }, + { + "epoch": 4.399281867145422, + "grad_norm": 1.0817667245864868, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 61260 + }, + { + "epoch": 4.4, + "grad_norm": 0.9745162129402161, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 61270 + }, + { + "epoch": 4.400718132854578, + "grad_norm": 1.0653400421142578, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 61280 + }, + { + "epoch": 4.401436265709156, + "grad_norm": 1.0082067251205444, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 61290 + }, + { + "epoch": 4.402154398563734, + "grad_norm": 0.7963659167289734, + "learning_rate": 0.0002, + "loss": 0.5719, + "step": 61300 + }, + { + "epoch": 4.402872531418312, + "grad_norm": 1.0428845882415771, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 61310 + }, + { + "epoch": 4.40359066427289, + "grad_norm": 0.9205707311630249, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61320 + }, + { + "epoch": 4.404308797127468, + "grad_norm": 1.0103533267974854, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 61330 + }, + { + "epoch": 4.405026929982046, + "grad_norm": 1.113547682762146, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 61340 + }, + { + "epoch": 4.405745062836624, + "grad_norm": 1.137488842010498, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 61350 + }, + { + "epoch": 4.406463195691203, + "grad_norm": 1.1284101009368896, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61360 + }, + { + "epoch": 4.407181328545781, + "grad_norm": 0.8010451197624207, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 61370 + }, + { + "epoch": 4.407899461400359, + "grad_norm": 0.8893977403640747, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 61380 + }, + { + "epoch": 4.408617594254937, + "grad_norm": 0.9098272323608398, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 61390 + }, + { + "epoch": 4.409335727109515, + "grad_norm": 1.0613329410552979, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 61400 + }, + { + "epoch": 4.410053859964093, + "grad_norm": 1.0070269107818604, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 61410 + }, + { + "epoch": 4.410771992818671, + "grad_norm": 0.8632227778434753, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 61420 + }, + { + "epoch": 4.411490125673249, + "grad_norm": 1.0183731317520142, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 61430 + }, + { + "epoch": 4.412208258527827, + "grad_norm": 0.9049941897392273, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 61440 + }, + { + "epoch": 4.412926391382406, + "grad_norm": 1.0184082984924316, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 61450 + }, + { + "epoch": 4.413644524236984, + "grad_norm": 0.9994277358055115, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 61460 + }, + { + "epoch": 4.414362657091562, + "grad_norm": 1.0112420320510864, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 61470 + }, + { + "epoch": 4.41508078994614, + "grad_norm": 0.9751759171485901, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 61480 + }, + { + "epoch": 4.415798922800718, + "grad_norm": 1.047135591506958, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 61490 + }, + { + "epoch": 4.416517055655296, + "grad_norm": 0.886282742023468, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 61500 + }, + { + "epoch": 4.417235188509874, + "grad_norm": 0.971964418888092, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 61510 + }, + { + "epoch": 4.417953321364452, + "grad_norm": 0.9603846073150635, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 61520 + }, + { + "epoch": 4.41867145421903, + "grad_norm": 1.060042142868042, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 61530 + }, + { + "epoch": 4.419389587073608, + "grad_norm": 1.1231369972229004, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61540 + }, + { + "epoch": 4.420107719928187, + "grad_norm": 0.8269591331481934, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 61550 + }, + { + "epoch": 4.420825852782765, + "grad_norm": 1.0341241359710693, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 61560 + }, + { + "epoch": 4.421543985637343, + "grad_norm": 0.7276636958122253, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 61570 + }, + { + "epoch": 4.422262118491921, + "grad_norm": 1.0663669109344482, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 61580 + }, + { + "epoch": 4.422980251346499, + "grad_norm": 0.9764387011528015, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 61590 + }, + { + "epoch": 4.423698384201077, + "grad_norm": 1.0953258275985718, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61600 + }, + { + "epoch": 4.424416517055655, + "grad_norm": 0.8877012729644775, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 61610 + }, + { + "epoch": 4.425134649910233, + "grad_norm": 0.8781440854072571, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 61620 + }, + { + "epoch": 4.425852782764811, + "grad_norm": 0.8333432674407959, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 61630 + }, + { + "epoch": 4.42657091561939, + "grad_norm": 0.9647989869117737, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 61640 + }, + { + "epoch": 4.427289048473968, + "grad_norm": 1.0801783800125122, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 61650 + }, + { + "epoch": 4.428007181328546, + "grad_norm": 0.8215882778167725, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 61660 + }, + { + "epoch": 4.428725314183124, + "grad_norm": 0.9853931665420532, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 61670 + }, + { + "epoch": 4.429443447037702, + "grad_norm": 0.8658010959625244, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 61680 + }, + { + "epoch": 4.43016157989228, + "grad_norm": 1.124064326286316, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 61690 + }, + { + "epoch": 4.430879712746858, + "grad_norm": 1.009340763092041, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 61700 + }, + { + "epoch": 4.431597845601436, + "grad_norm": 0.8705293536186218, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 61710 + }, + { + "epoch": 4.432315978456014, + "grad_norm": 1.1323511600494385, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 61720 + }, + { + "epoch": 4.433034111310592, + "grad_norm": 1.1203019618988037, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 61730 + }, + { + "epoch": 4.433752244165171, + "grad_norm": 1.1683770418167114, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 61740 + }, + { + "epoch": 4.434470377019749, + "grad_norm": 1.0735899209976196, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 61750 + }, + { + "epoch": 4.435188509874327, + "grad_norm": 1.142496109008789, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 61760 + }, + { + "epoch": 4.435906642728905, + "grad_norm": 1.1157732009887695, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 61770 + }, + { + "epoch": 4.436624775583483, + "grad_norm": 0.8845949172973633, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 61780 + }, + { + "epoch": 4.437342908438061, + "grad_norm": 1.1212759017944336, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 61790 + }, + { + "epoch": 4.438061041292639, + "grad_norm": 0.8832488656044006, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 61800 + }, + { + "epoch": 4.438779174147217, + "grad_norm": 0.9059590101242065, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 61810 + }, + { + "epoch": 4.439497307001796, + "grad_norm": 1.0625685453414917, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 61820 + }, + { + "epoch": 4.440215439856374, + "grad_norm": 0.9565598368644714, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 61830 + }, + { + "epoch": 4.440933572710952, + "grad_norm": 0.8975377082824707, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 61840 + }, + { + "epoch": 4.44165170556553, + "grad_norm": 1.0412718057632446, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 61850 + }, + { + "epoch": 4.442369838420108, + "grad_norm": 0.9923529624938965, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 61860 + }, + { + "epoch": 4.443087971274686, + "grad_norm": 1.3025734424591064, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 61870 + }, + { + "epoch": 4.443806104129264, + "grad_norm": 1.0031960010528564, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 61880 + }, + { + "epoch": 4.444524236983842, + "grad_norm": 1.0974701642990112, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 61890 + }, + { + "epoch": 4.44524236983842, + "grad_norm": 1.1044024229049683, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 61900 + }, + { + "epoch": 4.445960502692998, + "grad_norm": 1.0782772302627563, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 61910 + }, + { + "epoch": 4.446678635547577, + "grad_norm": 1.006304383277893, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 61920 + }, + { + "epoch": 4.447396768402155, + "grad_norm": 0.9258833527565002, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 61930 + }, + { + "epoch": 4.448114901256733, + "grad_norm": 0.9888426065444946, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 61940 + }, + { + "epoch": 4.448833034111311, + "grad_norm": 0.9592963457107544, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 61950 + }, + { + "epoch": 4.449551166965889, + "grad_norm": 1.0527986288070679, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 61960 + }, + { + "epoch": 4.450269299820467, + "grad_norm": 0.8613291382789612, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 61970 + }, + { + "epoch": 4.450987432675045, + "grad_norm": 1.1083767414093018, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 61980 + }, + { + "epoch": 4.451705565529623, + "grad_norm": 0.772679328918457, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 61990 + }, + { + "epoch": 4.452423698384201, + "grad_norm": 0.9052274227142334, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 62000 + }, + { + "epoch": 4.45314183123878, + "grad_norm": 1.129667043685913, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 62010 + }, + { + "epoch": 4.453859964093358, + "grad_norm": 0.9994529485702515, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 62020 + }, + { + "epoch": 4.454578096947936, + "grad_norm": 0.982155978679657, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 62030 + }, + { + "epoch": 4.455296229802514, + "grad_norm": 0.9139904975891113, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 62040 + }, + { + "epoch": 4.456014362657092, + "grad_norm": 1.0877810716629028, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 62050 + }, + { + "epoch": 4.45673249551167, + "grad_norm": 1.0535308122634888, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 62060 + }, + { + "epoch": 4.457450628366248, + "grad_norm": 1.0225313901901245, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62070 + }, + { + "epoch": 4.458168761220826, + "grad_norm": 0.8443132042884827, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 62080 + }, + { + "epoch": 4.458886894075404, + "grad_norm": 1.0426654815673828, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 62090 + }, + { + "epoch": 4.459605026929982, + "grad_norm": 1.1110700368881226, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 62100 + }, + { + "epoch": 4.4603231597845605, + "grad_norm": 1.0200893878936768, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62110 + }, + { + "epoch": 4.4610412926391385, + "grad_norm": 0.9102830290794373, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 62120 + }, + { + "epoch": 4.4617594254937165, + "grad_norm": 1.1395094394683838, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 62130 + }, + { + "epoch": 4.4624775583482945, + "grad_norm": 1.1202316284179688, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 62140 + }, + { + "epoch": 4.4631956912028725, + "grad_norm": 1.142580509185791, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 62150 + }, + { + "epoch": 4.4639138240574505, + "grad_norm": 0.9843677878379822, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 62160 + }, + { + "epoch": 4.4646319569120285, + "grad_norm": 1.0351676940917969, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 62170 + }, + { + "epoch": 4.4653500897666065, + "grad_norm": 0.9365093111991882, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 62180 + }, + { + "epoch": 4.4660682226211845, + "grad_norm": 1.041193962097168, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 62190 + }, + { + "epoch": 4.466786355475763, + "grad_norm": 0.9686329960823059, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 62200 + }, + { + "epoch": 4.467504488330341, + "grad_norm": 1.028622031211853, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 62210 + }, + { + "epoch": 4.468222621184919, + "grad_norm": 0.9717516899108887, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 62220 + }, + { + "epoch": 4.468940754039497, + "grad_norm": 1.0467450618743896, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 62230 + }, + { + "epoch": 4.469658886894075, + "grad_norm": 0.943717896938324, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 62240 + }, + { + "epoch": 4.470377019748653, + "grad_norm": 0.909429132938385, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 62250 + }, + { + "epoch": 4.471095152603231, + "grad_norm": 1.0294792652130127, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 62260 + }, + { + "epoch": 4.471813285457809, + "grad_norm": 1.1044281721115112, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 62270 + }, + { + "epoch": 4.472531418312387, + "grad_norm": 1.1555784940719604, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 62280 + }, + { + "epoch": 4.473249551166965, + "grad_norm": 0.9441297650337219, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 62290 + }, + { + "epoch": 4.473967684021544, + "grad_norm": 0.9164380431175232, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 62300 + }, + { + "epoch": 4.474685816876122, + "grad_norm": 1.1139159202575684, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 62310 + }, + { + "epoch": 4.4754039497307, + "grad_norm": 1.0201882123947144, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 62320 + }, + { + "epoch": 4.476122082585278, + "grad_norm": 1.1471681594848633, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 62330 + }, + { + "epoch": 4.476840215439856, + "grad_norm": 1.0333549976348877, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 62340 + }, + { + "epoch": 4.477558348294434, + "grad_norm": 0.8929767608642578, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 62350 + }, + { + "epoch": 4.478276481149012, + "grad_norm": 0.9465752840042114, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 62360 + }, + { + "epoch": 4.47899461400359, + "grad_norm": 1.2155033349990845, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 62370 + }, + { + "epoch": 4.479712746858169, + "grad_norm": 0.7181217074394226, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 62380 + }, + { + "epoch": 4.480430879712747, + "grad_norm": 1.0052744150161743, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 62390 + }, + { + "epoch": 4.481149012567325, + "grad_norm": 0.8522219061851501, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 62400 + }, + { + "epoch": 4.481867145421903, + "grad_norm": 0.8844723105430603, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 62410 + }, + { + "epoch": 4.482585278276481, + "grad_norm": 0.9542465209960938, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 62420 + }, + { + "epoch": 4.483303411131059, + "grad_norm": 0.8963674306869507, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 62430 + }, + { + "epoch": 4.484021543985637, + "grad_norm": 0.8105363845825195, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 62440 + }, + { + "epoch": 4.484739676840215, + "grad_norm": 0.9618421196937561, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 62450 + }, + { + "epoch": 4.485457809694793, + "grad_norm": 1.1931076049804688, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 62460 + }, + { + "epoch": 4.486175942549371, + "grad_norm": 0.7406999468803406, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 62470 + }, + { + "epoch": 4.48689407540395, + "grad_norm": 0.7698216438293457, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 62480 + }, + { + "epoch": 4.487612208258528, + "grad_norm": 0.862271249294281, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 62490 + }, + { + "epoch": 4.488330341113106, + "grad_norm": 1.0025171041488647, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 62500 + }, + { + "epoch": 4.489048473967684, + "grad_norm": 0.8474493622779846, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 62510 + }, + { + "epoch": 4.489766606822262, + "grad_norm": 0.8965697884559631, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 62520 + }, + { + "epoch": 4.49048473967684, + "grad_norm": 1.1276488304138184, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 62530 + }, + { + "epoch": 4.491202872531418, + "grad_norm": 1.0253537893295288, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 62540 + }, + { + "epoch": 4.491921005385996, + "grad_norm": 1.1750596761703491, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 62550 + }, + { + "epoch": 4.492639138240574, + "grad_norm": 0.9951794147491455, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 62560 + }, + { + "epoch": 4.493357271095153, + "grad_norm": 1.2510017156600952, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 62570 + }, + { + "epoch": 4.494075403949731, + "grad_norm": 1.4066375494003296, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 62580 + }, + { + "epoch": 4.494793536804309, + "grad_norm": 0.988175094127655, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 62590 + }, + { + "epoch": 4.495511669658887, + "grad_norm": 1.2049115896224976, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 62600 + }, + { + "epoch": 4.496229802513465, + "grad_norm": 0.962464451789856, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 62610 + }, + { + "epoch": 4.496947935368043, + "grad_norm": 0.9324793815612793, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 62620 + }, + { + "epoch": 4.497666068222621, + "grad_norm": 0.9174214005470276, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 62630 + }, + { + "epoch": 4.498384201077199, + "grad_norm": 0.9729902148246765, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 62640 + }, + { + "epoch": 4.499102333931777, + "grad_norm": 1.0190484523773193, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 62650 + }, + { + "epoch": 4.499820466786355, + "grad_norm": 1.1473679542541504, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 62660 + }, + { + "epoch": 4.500538599640934, + "grad_norm": 1.0160558223724365, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 62670 + }, + { + "epoch": 4.501256732495512, + "grad_norm": 0.8083887100219727, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 62680 + }, + { + "epoch": 4.50197486535009, + "grad_norm": 0.941933274269104, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 62690 + }, + { + "epoch": 4.502692998204668, + "grad_norm": 0.9962822794914246, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 62700 + }, + { + "epoch": 4.503411131059246, + "grad_norm": 0.8993943333625793, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 62710 + }, + { + "epoch": 4.504129263913824, + "grad_norm": 0.9438319206237793, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 62720 + }, + { + "epoch": 4.504847396768402, + "grad_norm": 0.7951892018318176, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 62730 + }, + { + "epoch": 4.50556552962298, + "grad_norm": 0.8875413537025452, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 62740 + }, + { + "epoch": 4.506283662477558, + "grad_norm": 0.993819534778595, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 62750 + }, + { + "epoch": 4.507001795332137, + "grad_norm": 0.9177559018135071, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 62760 + }, + { + "epoch": 4.507719928186715, + "grad_norm": 0.8632771968841553, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 62770 + }, + { + "epoch": 4.508438061041293, + "grad_norm": 0.943778395652771, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 62780 + }, + { + "epoch": 4.509156193895871, + "grad_norm": 0.8754997849464417, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 62790 + }, + { + "epoch": 4.509874326750449, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 62800 + }, + { + "epoch": 4.510592459605027, + "grad_norm": 1.1156457662582397, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 62810 + }, + { + "epoch": 4.511310592459605, + "grad_norm": 0.9178887009620667, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 62820 + }, + { + "epoch": 4.512028725314183, + "grad_norm": 0.9520689249038696, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 62830 + }, + { + "epoch": 4.512746858168761, + "grad_norm": 0.8880525231361389, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 62840 + }, + { + "epoch": 4.513464991023339, + "grad_norm": 0.9541497826576233, + "learning_rate": 0.0002, + "loss": 0.6895, + "step": 62850 + }, + { + "epoch": 4.514183123877918, + "grad_norm": 1.003766417503357, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 62860 + }, + { + "epoch": 4.514901256732496, + "grad_norm": 0.8844705820083618, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 62870 + }, + { + "epoch": 4.515619389587074, + "grad_norm": 1.1870828866958618, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 62880 + }, + { + "epoch": 4.516337522441652, + "grad_norm": 0.863487184047699, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 62890 + }, + { + "epoch": 4.51705565529623, + "grad_norm": 0.997770369052887, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 62900 + }, + { + "epoch": 4.517773788150808, + "grad_norm": 0.9708612561225891, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 62910 + }, + { + "epoch": 4.518491921005386, + "grad_norm": 1.1381206512451172, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 62920 + }, + { + "epoch": 4.519210053859964, + "grad_norm": 1.0386693477630615, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 62930 + }, + { + "epoch": 4.519928186714543, + "grad_norm": 1.1711705923080444, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 62940 + }, + { + "epoch": 4.520646319569121, + "grad_norm": 0.8727447390556335, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 62950 + }, + { + "epoch": 4.521364452423699, + "grad_norm": 0.9215193390846252, + "learning_rate": 0.0002, + "loss": 0.6298, + "step": 62960 + }, + { + "epoch": 4.522082585278277, + "grad_norm": 1.005467176437378, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 62970 + }, + { + "epoch": 4.522800718132855, + "grad_norm": 0.8761187791824341, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 62980 + }, + { + "epoch": 4.523518850987433, + "grad_norm": 0.957848310470581, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 62990 + }, + { + "epoch": 4.524236983842011, + "grad_norm": 0.8634148836135864, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 63000 + }, + { + "epoch": 4.524955116696589, + "grad_norm": 0.9557477235794067, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 63010 + }, + { + "epoch": 4.525673249551167, + "grad_norm": 1.017720341682434, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 63020 + }, + { + "epoch": 4.526391382405745, + "grad_norm": 1.0281825065612793, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 63030 + }, + { + "epoch": 4.527109515260323, + "grad_norm": 1.253974437713623, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63040 + }, + { + "epoch": 4.527827648114902, + "grad_norm": 0.8489068150520325, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 63050 + }, + { + "epoch": 4.52854578096948, + "grad_norm": 0.9681686162948608, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 63060 + }, + { + "epoch": 4.529263913824058, + "grad_norm": 1.10277259349823, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 63070 + }, + { + "epoch": 4.529982046678636, + "grad_norm": 0.9469163417816162, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 63080 + }, + { + "epoch": 4.530700179533214, + "grad_norm": 1.1228134632110596, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 63090 + }, + { + "epoch": 4.531418312387792, + "grad_norm": 0.9673212170600891, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 63100 + }, + { + "epoch": 4.53213644524237, + "grad_norm": 1.0221107006072998, + "learning_rate": 0.0002, + "loss": 0.713, + "step": 63110 + }, + { + "epoch": 4.532854578096948, + "grad_norm": 0.826372504234314, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 63120 + }, + { + "epoch": 4.5335727109515265, + "grad_norm": 1.1805331707000732, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 63130 + }, + { + "epoch": 4.5342908438061045, + "grad_norm": 0.9645666480064392, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63140 + }, + { + "epoch": 4.5350089766606825, + "grad_norm": 1.0838309526443481, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 63150 + }, + { + "epoch": 4.5357271095152605, + "grad_norm": 1.061414361000061, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 63160 + }, + { + "epoch": 4.5364452423698385, + "grad_norm": 0.841961145401001, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 63170 + }, + { + "epoch": 4.5371633752244165, + "grad_norm": 1.1220186948776245, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 63180 + }, + { + "epoch": 4.5378815080789945, + "grad_norm": 1.036441445350647, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 63190 + }, + { + "epoch": 4.5385996409335725, + "grad_norm": 0.9089716076850891, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 63200 + }, + { + "epoch": 4.5393177737881505, + "grad_norm": 0.8699982762336731, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 63210 + }, + { + "epoch": 4.5400359066427285, + "grad_norm": 0.8489565253257751, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 63220 + }, + { + "epoch": 4.540754039497307, + "grad_norm": 0.7778416275978088, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 63230 + }, + { + "epoch": 4.541472172351885, + "grad_norm": 1.0625852346420288, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 63240 + }, + { + "epoch": 4.542190305206463, + "grad_norm": 0.8515732884407043, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 63250 + }, + { + "epoch": 4.542908438061041, + "grad_norm": 0.7679561376571655, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 63260 + }, + { + "epoch": 4.543626570915619, + "grad_norm": 0.7358446717262268, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 63270 + }, + { + "epoch": 4.544344703770197, + "grad_norm": 1.0866128206253052, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63280 + }, + { + "epoch": 4.545062836624775, + "grad_norm": 1.0870225429534912, + "learning_rate": 0.0002, + "loss": 0.6622, + "step": 63290 + }, + { + "epoch": 4.545780969479353, + "grad_norm": 0.951095461845398, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 63300 + }, + { + "epoch": 4.546499102333931, + "grad_norm": 1.0914306640625, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 63310 + }, + { + "epoch": 4.54721723518851, + "grad_norm": 0.8676106333732605, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 63320 + }, + { + "epoch": 4.547935368043088, + "grad_norm": 1.0129096508026123, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 63330 + }, + { + "epoch": 4.548653500897666, + "grad_norm": 0.8710526823997498, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 63340 + }, + { + "epoch": 4.549371633752244, + "grad_norm": 0.7014815807342529, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 63350 + }, + { + "epoch": 4.550089766606822, + "grad_norm": 1.1546777486801147, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 63360 + }, + { + "epoch": 4.5508078994614, + "grad_norm": 0.7464957237243652, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 63370 + }, + { + "epoch": 4.551526032315978, + "grad_norm": 0.9976209998130798, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 63380 + }, + { + "epoch": 4.552244165170556, + "grad_norm": 0.9543681740760803, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 63390 + }, + { + "epoch": 4.552962298025134, + "grad_norm": 1.1498578786849976, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 63400 + }, + { + "epoch": 4.553680430879712, + "grad_norm": 1.0162293910980225, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 63410 + }, + { + "epoch": 4.554398563734291, + "grad_norm": 0.9015304446220398, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 63420 + }, + { + "epoch": 4.555116696588869, + "grad_norm": 1.1639831066131592, + "learning_rate": 0.0002, + "loss": 0.6257, + "step": 63430 + }, + { + "epoch": 4.555834829443447, + "grad_norm": 0.9494703412055969, + "learning_rate": 0.0002, + "loss": 0.6763, + "step": 63440 + }, + { + "epoch": 4.556552962298025, + "grad_norm": 1.0555956363677979, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 63450 + }, + { + "epoch": 4.557271095152603, + "grad_norm": 0.8513827919960022, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 63460 + }, + { + "epoch": 4.557989228007181, + "grad_norm": 1.0614275932312012, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 63470 + }, + { + "epoch": 4.558707360861759, + "grad_norm": 0.8341137766838074, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 63480 + }, + { + "epoch": 4.559425493716337, + "grad_norm": 1.2136222124099731, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 63490 + }, + { + "epoch": 4.560143626570916, + "grad_norm": 0.8806019425392151, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 63500 + }, + { + "epoch": 4.560861759425494, + "grad_norm": 1.2548854351043701, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 63510 + }, + { + "epoch": 4.561579892280072, + "grad_norm": 1.0162668228149414, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 63520 + }, + { + "epoch": 4.56229802513465, + "grad_norm": 1.0487624406814575, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 63530 + }, + { + "epoch": 4.563016157989228, + "grad_norm": 1.2505502700805664, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 63540 + }, + { + "epoch": 4.563734290843806, + "grad_norm": 0.9930511713027954, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 63550 + }, + { + "epoch": 4.564452423698384, + "grad_norm": 0.8132568001747131, + "learning_rate": 0.0002, + "loss": 0.6695, + "step": 63560 + }, + { + "epoch": 4.565170556552962, + "grad_norm": 1.0129177570343018, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 63570 + }, + { + "epoch": 4.56588868940754, + "grad_norm": 0.9011693596839905, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 63580 + }, + { + "epoch": 4.566606822262118, + "grad_norm": 0.9161545634269714, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 63590 + }, + { + "epoch": 4.567324955116696, + "grad_norm": 0.8852348327636719, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 63600 + }, + { + "epoch": 4.568043087971275, + "grad_norm": 0.8579391837120056, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63610 + }, + { + "epoch": 4.568761220825853, + "grad_norm": 0.9271050095558167, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 63620 + }, + { + "epoch": 4.569479353680431, + "grad_norm": 0.9881834983825684, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 63630 + }, + { + "epoch": 4.570197486535009, + "grad_norm": 1.0255686044692993, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 63640 + }, + { + "epoch": 4.570915619389587, + "grad_norm": 0.8758876919746399, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 63650 + }, + { + "epoch": 4.571633752244165, + "grad_norm": 1.0134185552597046, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 63660 + }, + { + "epoch": 4.572351885098743, + "grad_norm": 0.8535705208778381, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 63670 + }, + { + "epoch": 4.573070017953321, + "grad_norm": 0.9614834785461426, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 63680 + }, + { + "epoch": 4.5737881508079, + "grad_norm": 0.9004243612289429, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 63690 + }, + { + "epoch": 4.574506283662478, + "grad_norm": 0.9563080072402954, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 63700 + }, + { + "epoch": 4.575224416517056, + "grad_norm": 1.024857521057129, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 63710 + }, + { + "epoch": 4.575942549371634, + "grad_norm": 0.9345638155937195, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 63720 + }, + { + "epoch": 4.576660682226212, + "grad_norm": 1.27083158493042, + "learning_rate": 0.0002, + "loss": 0.6814, + "step": 63730 + }, + { + "epoch": 4.57737881508079, + "grad_norm": 1.0866559743881226, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 63740 + }, + { + "epoch": 4.578096947935368, + "grad_norm": 0.9253925681114197, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 63750 + }, + { + "epoch": 4.578815080789946, + "grad_norm": 0.8127399682998657, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63760 + }, + { + "epoch": 4.579533213644524, + "grad_norm": 1.0453993082046509, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 63770 + }, + { + "epoch": 4.580251346499102, + "grad_norm": 1.2227544784545898, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 63780 + }, + { + "epoch": 4.580969479353681, + "grad_norm": 1.0207865238189697, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 63790 + }, + { + "epoch": 4.581687612208259, + "grad_norm": 1.030447244644165, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 63800 + }, + { + "epoch": 4.582405745062837, + "grad_norm": 1.0855677127838135, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 63810 + }, + { + "epoch": 4.583123877917415, + "grad_norm": 0.9572556018829346, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 63820 + }, + { + "epoch": 4.583842010771993, + "grad_norm": 0.9061040282249451, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 63830 + }, + { + "epoch": 4.584560143626571, + "grad_norm": 0.9267677068710327, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63840 + }, + { + "epoch": 4.585278276481149, + "grad_norm": 1.070076823234558, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 63850 + }, + { + "epoch": 4.585996409335727, + "grad_norm": 1.045881748199463, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 63860 + }, + { + "epoch": 4.586714542190305, + "grad_norm": 0.9190576672554016, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 63870 + }, + { + "epoch": 4.587432675044884, + "grad_norm": 0.9263932704925537, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 63880 + }, + { + "epoch": 4.588150807899462, + "grad_norm": 1.0217589139938354, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 63890 + }, + { + "epoch": 4.58886894075404, + "grad_norm": 0.9200088381767273, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 63900 + }, + { + "epoch": 4.589587073608618, + "grad_norm": 0.9877251386642456, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 63910 + }, + { + "epoch": 4.590305206463196, + "grad_norm": 1.0059093236923218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 63920 + }, + { + "epoch": 4.591023339317774, + "grad_norm": 1.2618095874786377, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 63930 + }, + { + "epoch": 4.591741472172352, + "grad_norm": 1.1779268980026245, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 63940 + }, + { + "epoch": 4.59245960502693, + "grad_norm": 1.2339502573013306, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 63950 + }, + { + "epoch": 4.593177737881508, + "grad_norm": 0.7488788366317749, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 63960 + }, + { + "epoch": 4.593895870736086, + "grad_norm": 0.8366380929946899, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 63970 + }, + { + "epoch": 4.594614003590665, + "grad_norm": 1.0292677879333496, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 63980 + }, + { + "epoch": 4.595332136445243, + "grad_norm": 0.7938551306724548, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 63990 + }, + { + "epoch": 4.596050269299821, + "grad_norm": 0.7958516478538513, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64000 + }, + { + "epoch": 4.596768402154399, + "grad_norm": 0.9613908529281616, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 64010 + }, + { + "epoch": 4.597486535008977, + "grad_norm": 1.0253773927688599, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 64020 + }, + { + "epoch": 4.598204667863555, + "grad_norm": 1.0560888051986694, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 64030 + }, + { + "epoch": 4.598922800718133, + "grad_norm": 1.1093556880950928, + "learning_rate": 0.0002, + "loss": 0.6681, + "step": 64040 + }, + { + "epoch": 4.599640933572711, + "grad_norm": 0.8492098450660706, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 64050 + }, + { + "epoch": 4.6003590664272895, + "grad_norm": 1.0070436000823975, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64060 + }, + { + "epoch": 4.6010771992818675, + "grad_norm": 0.9774282574653625, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 64070 + }, + { + "epoch": 4.6017953321364455, + "grad_norm": 1.0744960308074951, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 64080 + }, + { + "epoch": 4.6025134649910235, + "grad_norm": 1.0101491212844849, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 64090 + }, + { + "epoch": 4.6032315978456015, + "grad_norm": 1.2306591272354126, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 64100 + }, + { + "epoch": 4.6039497307001795, + "grad_norm": 0.9187033176422119, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 64110 + }, + { + "epoch": 4.6046678635547575, + "grad_norm": 0.9178676605224609, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 64120 + }, + { + "epoch": 4.6053859964093355, + "grad_norm": 1.006374716758728, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64130 + }, + { + "epoch": 4.6061041292639135, + "grad_norm": 1.0774449110031128, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 64140 + }, + { + "epoch": 4.6068222621184916, + "grad_norm": 1.0360658168792725, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64150 + }, + { + "epoch": 4.6075403949730696, + "grad_norm": 1.1061090230941772, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 64160 + }, + { + "epoch": 4.608258527827648, + "grad_norm": 1.0320971012115479, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 64170 + }, + { + "epoch": 4.6089766606822264, + "grad_norm": 0.8596988916397095, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 64180 + }, + { + "epoch": 4.6096947935368044, + "grad_norm": 1.1665741205215454, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 64190 + }, + { + "epoch": 4.6104129263913824, + "grad_norm": 0.857207715511322, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64200 + }, + { + "epoch": 4.6111310592459605, + "grad_norm": 1.0088987350463867, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 64210 + }, + { + "epoch": 4.6118491921005385, + "grad_norm": 1.0985605716705322, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 64220 + }, + { + "epoch": 4.6125673249551165, + "grad_norm": 0.9504913687705994, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 64230 + }, + { + "epoch": 4.6132854578096945, + "grad_norm": 0.8415018916130066, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 64240 + }, + { + "epoch": 4.614003590664273, + "grad_norm": 0.9857034087181091, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 64250 + }, + { + "epoch": 4.614721723518851, + "grad_norm": 1.0164235830307007, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 64260 + }, + { + "epoch": 4.615439856373429, + "grad_norm": 0.949481725692749, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 64270 + }, + { + "epoch": 4.616157989228007, + "grad_norm": 0.9526455998420715, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 64280 + }, + { + "epoch": 4.616876122082585, + "grad_norm": 1.1121242046356201, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 64290 + }, + { + "epoch": 4.617594254937163, + "grad_norm": 0.9598871469497681, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 64300 + }, + { + "epoch": 4.618312387791741, + "grad_norm": 1.0406304597854614, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 64310 + }, + { + "epoch": 4.619030520646319, + "grad_norm": 1.1816964149475098, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 64320 + }, + { + "epoch": 4.619748653500897, + "grad_norm": 0.9818326830863953, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 64330 + }, + { + "epoch": 4.620466786355475, + "grad_norm": 0.952017605304718, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 64340 + }, + { + "epoch": 4.621184919210053, + "grad_norm": 1.1263453960418701, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 64350 + }, + { + "epoch": 4.621903052064632, + "grad_norm": 1.1158473491668701, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 64360 + }, + { + "epoch": 4.62262118491921, + "grad_norm": 0.9056766033172607, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 64370 + }, + { + "epoch": 4.623339317773788, + "grad_norm": 0.8113203048706055, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 64380 + }, + { + "epoch": 4.624057450628366, + "grad_norm": 0.8646712899208069, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 64390 + }, + { + "epoch": 4.624775583482944, + "grad_norm": 1.0064425468444824, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64400 + }, + { + "epoch": 4.625493716337522, + "grad_norm": 0.9867565631866455, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 64410 + }, + { + "epoch": 4.6262118491921, + "grad_norm": 1.018764615058899, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 64420 + }, + { + "epoch": 4.626929982046678, + "grad_norm": 1.0607863664627075, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 64430 + }, + { + "epoch": 4.627648114901257, + "grad_norm": 1.012825846672058, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 64440 + }, + { + "epoch": 4.628366247755835, + "grad_norm": 0.8441653847694397, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 64450 + }, + { + "epoch": 4.629084380610413, + "grad_norm": 0.9819194674491882, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 64460 + }, + { + "epoch": 4.629802513464991, + "grad_norm": 0.925519585609436, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 64470 + }, + { + "epoch": 4.630520646319569, + "grad_norm": 0.9409030079841614, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 64480 + }, + { + "epoch": 4.631238779174147, + "grad_norm": 1.148024559020996, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 64490 + }, + { + "epoch": 4.631956912028725, + "grad_norm": 0.8225533962249756, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 64500 + }, + { + "epoch": 4.632675044883303, + "grad_norm": 0.8806734681129456, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 64510 + }, + { + "epoch": 4.633393177737881, + "grad_norm": 0.9656694531440735, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 64520 + }, + { + "epoch": 4.634111310592459, + "grad_norm": 0.9977783560752869, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 64530 + }, + { + "epoch": 4.634829443447038, + "grad_norm": 0.9259420037269592, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 64540 + }, + { + "epoch": 4.635547576301616, + "grad_norm": 1.0215885639190674, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 64550 + }, + { + "epoch": 4.636265709156194, + "grad_norm": 1.1082557439804077, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 64560 + }, + { + "epoch": 4.636983842010772, + "grad_norm": 1.1183207035064697, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 64570 + }, + { + "epoch": 4.63770197486535, + "grad_norm": 0.9914339184761047, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 64580 + }, + { + "epoch": 4.638420107719928, + "grad_norm": 0.8065831661224365, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 64590 + }, + { + "epoch": 4.639138240574506, + "grad_norm": 1.1546721458435059, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 64600 + }, + { + "epoch": 4.639856373429084, + "grad_norm": 1.0395900011062622, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 64610 + }, + { + "epoch": 4.640574506283663, + "grad_norm": 0.9957455992698669, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 64620 + }, + { + "epoch": 4.641292639138241, + "grad_norm": 1.069557785987854, + "learning_rate": 0.0002, + "loss": 0.6653, + "step": 64630 + }, + { + "epoch": 4.642010771992819, + "grad_norm": 1.005236268043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 64640 + }, + { + "epoch": 4.642728904847397, + "grad_norm": 1.0216304063796997, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 64650 + }, + { + "epoch": 4.643447037701975, + "grad_norm": 0.8567317128181458, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 64660 + }, + { + "epoch": 4.644165170556553, + "grad_norm": 1.0386067628860474, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 64670 + }, + { + "epoch": 4.644883303411131, + "grad_norm": 0.9566055536270142, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 64680 + }, + { + "epoch": 4.645601436265709, + "grad_norm": 1.0990564823150635, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 64690 + }, + { + "epoch": 4.646319569120287, + "grad_norm": 0.9962695240974426, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 64700 + }, + { + "epoch": 4.647037701974865, + "grad_norm": 0.9041377305984497, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 64710 + }, + { + "epoch": 4.647755834829443, + "grad_norm": 0.8611233234405518, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 64720 + }, + { + "epoch": 4.648473967684022, + "grad_norm": 1.1569812297821045, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 64730 + }, + { + "epoch": 4.6491921005386, + "grad_norm": 0.7946197390556335, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 64740 + }, + { + "epoch": 4.649910233393178, + "grad_norm": 0.9612061381340027, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 64750 + }, + { + "epoch": 4.650628366247756, + "grad_norm": 0.9669303297996521, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 64760 + }, + { + "epoch": 4.651346499102334, + "grad_norm": 0.8117775321006775, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 64770 + }, + { + "epoch": 4.652064631956912, + "grad_norm": 1.2326241731643677, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 64780 + }, + { + "epoch": 4.65278276481149, + "grad_norm": 0.7494568228721619, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 64790 + }, + { + "epoch": 4.653500897666068, + "grad_norm": 0.8145379424095154, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 64800 + }, + { + "epoch": 4.654219030520647, + "grad_norm": 1.0139610767364502, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 64810 + }, + { + "epoch": 4.654937163375225, + "grad_norm": 0.9887115359306335, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 64820 + }, + { + "epoch": 4.655655296229803, + "grad_norm": 0.9565147161483765, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 64830 + }, + { + "epoch": 4.656373429084381, + "grad_norm": 0.9022467136383057, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 64840 + }, + { + "epoch": 4.657091561938959, + "grad_norm": 1.075003981590271, + "learning_rate": 0.0002, + "loss": 0.6395, + "step": 64850 + }, + { + "epoch": 4.657809694793537, + "grad_norm": 0.8705733418464661, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 64860 + }, + { + "epoch": 4.658527827648115, + "grad_norm": 1.0826832056045532, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 64870 + }, + { + "epoch": 4.659245960502693, + "grad_norm": 1.1056268215179443, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 64880 + }, + { + "epoch": 4.659964093357271, + "grad_norm": 0.8664149641990662, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 64890 + }, + { + "epoch": 4.660682226211849, + "grad_norm": 0.9487230181694031, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 64900 + }, + { + "epoch": 4.661400359066427, + "grad_norm": 1.0357837677001953, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 64910 + }, + { + "epoch": 4.662118491921006, + "grad_norm": 0.8620632290840149, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 64920 + }, + { + "epoch": 4.662836624775584, + "grad_norm": 1.108986735343933, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 64930 + }, + { + "epoch": 4.663554757630162, + "grad_norm": 0.8017674684524536, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 64940 + }, + { + "epoch": 4.66427289048474, + "grad_norm": 0.882347583770752, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 64950 + }, + { + "epoch": 4.664991023339318, + "grad_norm": 0.9466867446899414, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 64960 + }, + { + "epoch": 4.665709156193896, + "grad_norm": 1.1823636293411255, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 64970 + }, + { + "epoch": 4.666427289048474, + "grad_norm": 0.9535016417503357, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 64980 + }, + { + "epoch": 4.667145421903052, + "grad_norm": 0.9456726312637329, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 64990 + }, + { + "epoch": 4.667863554757631, + "grad_norm": 0.7761920690536499, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 65000 + }, + { + "epoch": 4.668581687612209, + "grad_norm": 1.060357689857483, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 65010 + }, + { + "epoch": 4.669299820466787, + "grad_norm": 0.9083862900733948, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 65020 + }, + { + "epoch": 4.670017953321365, + "grad_norm": 0.8745762705802917, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 65030 + }, + { + "epoch": 4.670736086175943, + "grad_norm": 0.8715422749519348, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 65040 + }, + { + "epoch": 4.671454219030521, + "grad_norm": 0.9407707452774048, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 65050 + }, + { + "epoch": 4.672172351885099, + "grad_norm": 0.8998945355415344, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 65060 + }, + { + "epoch": 4.672890484739677, + "grad_norm": 0.9147891998291016, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 65070 + }, + { + "epoch": 4.673608617594255, + "grad_norm": 1.116614580154419, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 65080 + }, + { + "epoch": 4.674326750448833, + "grad_norm": 1.0764213800430298, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 65090 + }, + { + "epoch": 4.6750448833034115, + "grad_norm": 0.9115945100784302, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 65100 + }, + { + "epoch": 4.6757630161579895, + "grad_norm": 1.001251459121704, + "learning_rate": 0.0002, + "loss": 0.6506, + "step": 65110 + }, + { + "epoch": 4.6764811490125675, + "grad_norm": 1.0330020189285278, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65120 + }, + { + "epoch": 4.6771992818671455, + "grad_norm": 0.9083197116851807, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 65130 + }, + { + "epoch": 4.6779174147217235, + "grad_norm": 0.9298770427703857, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 65140 + }, + { + "epoch": 4.6786355475763015, + "grad_norm": 1.0009549856185913, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 65150 + }, + { + "epoch": 4.6793536804308795, + "grad_norm": 0.951389729976654, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 65160 + }, + { + "epoch": 4.6800718132854575, + "grad_norm": 1.151870608329773, + "learning_rate": 0.0002, + "loss": 0.6282, + "step": 65170 + }, + { + "epoch": 4.680789946140036, + "grad_norm": 1.0074727535247803, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 65180 + }, + { + "epoch": 4.681508078994614, + "grad_norm": 1.0490152835845947, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 65190 + }, + { + "epoch": 4.682226211849192, + "grad_norm": 0.8967363834381104, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 65200 + }, + { + "epoch": 4.68294434470377, + "grad_norm": 1.2314889430999756, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 65210 + }, + { + "epoch": 4.683662477558348, + "grad_norm": 0.7764074802398682, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 65220 + }, + { + "epoch": 4.684380610412926, + "grad_norm": 1.0587822198867798, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 65230 + }, + { + "epoch": 4.685098743267504, + "grad_norm": 0.916114091873169, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 65240 + }, + { + "epoch": 4.685816876122082, + "grad_norm": 0.9117472767829895, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 65250 + }, + { + "epoch": 4.68653500897666, + "grad_norm": 0.8369293212890625, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 65260 + }, + { + "epoch": 4.687253141831238, + "grad_norm": 0.9700121879577637, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 65270 + }, + { + "epoch": 4.687971274685816, + "grad_norm": 1.0008411407470703, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 65280 + }, + { + "epoch": 4.688689407540395, + "grad_norm": 0.9339549541473389, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 65290 + }, + { + "epoch": 4.689407540394973, + "grad_norm": 0.956701934337616, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 65300 + }, + { + "epoch": 4.690125673249551, + "grad_norm": 1.2042720317840576, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65310 + }, + { + "epoch": 4.690843806104129, + "grad_norm": 0.8679144382476807, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 65320 + }, + { + "epoch": 4.691561938958707, + "grad_norm": 1.2320687770843506, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 65330 + }, + { + "epoch": 4.692280071813285, + "grad_norm": 0.8397238850593567, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 65340 + }, + { + "epoch": 4.692998204667863, + "grad_norm": 0.7850362658500671, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 65350 + }, + { + "epoch": 4.693716337522441, + "grad_norm": 0.9281290173530579, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 65360 + }, + { + "epoch": 4.69443447037702, + "grad_norm": 1.1506335735321045, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 65370 + }, + { + "epoch": 4.695152603231598, + "grad_norm": 1.0910584926605225, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 65380 + }, + { + "epoch": 4.695870736086176, + "grad_norm": 0.8937386274337769, + "learning_rate": 0.0002, + "loss": 0.66, + "step": 65390 + }, + { + "epoch": 4.696588868940754, + "grad_norm": 1.0163888931274414, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 65400 + }, + { + "epoch": 4.697307001795332, + "grad_norm": 1.0290007591247559, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 65410 + }, + { + "epoch": 4.69802513464991, + "grad_norm": 0.9046576023101807, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 65420 + }, + { + "epoch": 4.698743267504488, + "grad_norm": 1.0030237436294556, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 65430 + }, + { + "epoch": 4.699461400359066, + "grad_norm": 0.8196740746498108, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65440 + }, + { + "epoch": 4.700179533213644, + "grad_norm": 0.9036651849746704, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 65450 + }, + { + "epoch": 4.700897666068222, + "grad_norm": 1.2080141305923462, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 65460 + }, + { + "epoch": 4.7016157989228, + "grad_norm": 0.8743635416030884, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 65470 + }, + { + "epoch": 4.702333931777379, + "grad_norm": 0.9566192030906677, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 65480 + }, + { + "epoch": 4.703052064631957, + "grad_norm": 1.0505144596099854, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 65490 + }, + { + "epoch": 4.703770197486535, + "grad_norm": 0.8797298073768616, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 65500 + }, + { + "epoch": 4.704488330341113, + "grad_norm": 0.9970770478248596, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 65510 + }, + { + "epoch": 4.705206463195691, + "grad_norm": 1.1743851900100708, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 65520 + }, + { + "epoch": 4.705924596050269, + "grad_norm": 0.9534381031990051, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 65530 + }, + { + "epoch": 4.706642728904847, + "grad_norm": 0.9735581278800964, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 65540 + }, + { + "epoch": 4.707360861759425, + "grad_norm": 1.185352087020874, + "learning_rate": 0.0002, + "loss": 0.6217, + "step": 65550 + }, + { + "epoch": 4.708078994614004, + "grad_norm": 0.9383901357650757, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65560 + }, + { + "epoch": 4.708797127468582, + "grad_norm": 1.0194662809371948, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 65570 + }, + { + "epoch": 4.70951526032316, + "grad_norm": 0.8448300361633301, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 65580 + }, + { + "epoch": 4.710233393177738, + "grad_norm": 1.1930629014968872, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 65590 + }, + { + "epoch": 4.710951526032316, + "grad_norm": 1.0038636922836304, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 65600 + }, + { + "epoch": 4.711669658886894, + "grad_norm": 0.8206564784049988, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 65610 + }, + { + "epoch": 4.712387791741472, + "grad_norm": 1.0984861850738525, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 65620 + }, + { + "epoch": 4.71310592459605, + "grad_norm": 1.2891547679901123, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 65630 + }, + { + "epoch": 4.713824057450628, + "grad_norm": 0.927062451839447, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 65640 + }, + { + "epoch": 4.714542190305206, + "grad_norm": 0.8647334575653076, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 65650 + }, + { + "epoch": 4.715260323159785, + "grad_norm": 1.1017670631408691, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65660 + }, + { + "epoch": 4.715978456014363, + "grad_norm": 0.9589072465896606, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 65670 + }, + { + "epoch": 4.716696588868941, + "grad_norm": 0.9496776461601257, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 65680 + }, + { + "epoch": 4.717414721723519, + "grad_norm": 0.9266180396080017, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65690 + }, + { + "epoch": 4.718132854578097, + "grad_norm": 0.8699696063995361, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 65700 + }, + { + "epoch": 4.718850987432675, + "grad_norm": 1.0444015264511108, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 65710 + }, + { + "epoch": 4.719569120287253, + "grad_norm": 1.0100741386413574, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 65720 + }, + { + "epoch": 4.720287253141831, + "grad_norm": 1.1442630290985107, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 65730 + }, + { + "epoch": 4.721005385996409, + "grad_norm": 0.8937877416610718, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 65740 + }, + { + "epoch": 4.721723518850988, + "grad_norm": 1.0718764066696167, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 65750 + }, + { + "epoch": 4.722441651705566, + "grad_norm": 0.8838587999343872, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 65760 + }, + { + "epoch": 4.723159784560144, + "grad_norm": 1.1247940063476562, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 65770 + }, + { + "epoch": 4.723877917414722, + "grad_norm": 0.9491105675697327, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 65780 + }, + { + "epoch": 4.7245960502693, + "grad_norm": 1.0896921157836914, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 65790 + }, + { + "epoch": 4.725314183123878, + "grad_norm": 1.0097380876541138, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 65800 + }, + { + "epoch": 4.726032315978456, + "grad_norm": 0.911763608455658, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 65810 + }, + { + "epoch": 4.726750448833034, + "grad_norm": 1.1295124292373657, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 65820 + }, + { + "epoch": 4.727468581687612, + "grad_norm": 0.7637538313865662, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 65830 + }, + { + "epoch": 4.72818671454219, + "grad_norm": 0.9255306720733643, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 65840 + }, + { + "epoch": 4.728904847396769, + "grad_norm": 0.9847530126571655, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 65850 + }, + { + "epoch": 4.729622980251347, + "grad_norm": 0.9036182761192322, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 65860 + }, + { + "epoch": 4.730341113105925, + "grad_norm": 0.8284199833869934, + "learning_rate": 0.0002, + "loss": 0.6374, + "step": 65870 + }, + { + "epoch": 4.731059245960503, + "grad_norm": 1.0142838954925537, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 65880 + }, + { + "epoch": 4.731777378815081, + "grad_norm": 0.9389033913612366, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 65890 + }, + { + "epoch": 4.732495511669659, + "grad_norm": 0.8870056867599487, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 65900 + }, + { + "epoch": 4.733213644524237, + "grad_norm": 1.1211678981781006, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 65910 + }, + { + "epoch": 4.733931777378815, + "grad_norm": 0.7796614170074463, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 65920 + }, + { + "epoch": 4.734649910233394, + "grad_norm": 1.0360451936721802, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 65930 + }, + { + "epoch": 4.735368043087972, + "grad_norm": 0.8383482098579407, + "learning_rate": 0.0002, + "loss": 0.68, + "step": 65940 + }, + { + "epoch": 4.73608617594255, + "grad_norm": 0.7985122799873352, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 65950 + }, + { + "epoch": 4.736804308797128, + "grad_norm": 1.0314199924468994, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 65960 + }, + { + "epoch": 4.737522441651706, + "grad_norm": 0.9279016852378845, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 65970 + }, + { + "epoch": 4.738240574506284, + "grad_norm": 1.1046063899993896, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 65980 + }, + { + "epoch": 4.738958707360862, + "grad_norm": 0.9075793623924255, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 65990 + }, + { + "epoch": 4.73967684021544, + "grad_norm": 1.0945355892181396, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 66000 + }, + { + "epoch": 4.740394973070018, + "grad_norm": 0.8885519504547119, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 66010 + }, + { + "epoch": 4.741113105924596, + "grad_norm": 0.9312083125114441, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 66020 + }, + { + "epoch": 4.741831238779174, + "grad_norm": 1.1574538946151733, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 66030 + }, + { + "epoch": 4.742549371633753, + "grad_norm": 0.9346209168434143, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 66040 + }, + { + "epoch": 4.743267504488331, + "grad_norm": 0.8935149312019348, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66050 + }, + { + "epoch": 4.743985637342909, + "grad_norm": 0.8958369493484497, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 66060 + }, + { + "epoch": 4.744703770197487, + "grad_norm": 0.9383506774902344, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 66070 + }, + { + "epoch": 4.745421903052065, + "grad_norm": 0.9868947863578796, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 66080 + }, + { + "epoch": 4.746140035906643, + "grad_norm": 1.3417645692825317, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 66090 + }, + { + "epoch": 4.746858168761221, + "grad_norm": 1.070693850517273, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 66100 + }, + { + "epoch": 4.747576301615799, + "grad_norm": 0.8841570019721985, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 66110 + }, + { + "epoch": 4.7482944344703775, + "grad_norm": 0.7963120341300964, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 66120 + }, + { + "epoch": 4.7490125673249555, + "grad_norm": 0.8145691156387329, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 66130 + }, + { + "epoch": 4.7497307001795335, + "grad_norm": 0.9074729681015015, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 66140 + }, + { + "epoch": 4.7504488330341115, + "grad_norm": 0.9129886627197266, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 66150 + }, + { + "epoch": 4.7511669658886895, + "grad_norm": 0.91527259349823, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 66160 + }, + { + "epoch": 4.7518850987432675, + "grad_norm": 0.9569419622421265, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 66170 + }, + { + "epoch": 4.7526032315978455, + "grad_norm": 0.8777104616165161, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 66180 + }, + { + "epoch": 4.7533213644524235, + "grad_norm": 0.9673085808753967, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 66190 + }, + { + "epoch": 4.7540394973070015, + "grad_norm": 1.0683966875076294, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 66200 + }, + { + "epoch": 4.7547576301615795, + "grad_norm": 1.1591907739639282, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 66210 + }, + { + "epoch": 4.755475763016158, + "grad_norm": 1.1973309516906738, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 66220 + }, + { + "epoch": 4.756193895870736, + "grad_norm": 0.8472012281417847, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 66230 + }, + { + "epoch": 4.756912028725314, + "grad_norm": 0.9896261692047119, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 66240 + }, + { + "epoch": 4.757630161579892, + "grad_norm": 0.8498432040214539, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 66250 + }, + { + "epoch": 4.75834829443447, + "grad_norm": 0.9624166488647461, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 66260 + }, + { + "epoch": 4.759066427289048, + "grad_norm": 1.0951786041259766, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 66270 + }, + { + "epoch": 4.759784560143626, + "grad_norm": 0.9863157868385315, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 66280 + }, + { + "epoch": 4.760502692998204, + "grad_norm": 1.0062068700790405, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 66290 + }, + { + "epoch": 4.761220825852782, + "grad_norm": 0.8075495958328247, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 66300 + }, + { + "epoch": 4.761938958707361, + "grad_norm": 0.9617878198623657, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66310 + }, + { + "epoch": 4.762657091561939, + "grad_norm": 1.097091555595398, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 66320 + }, + { + "epoch": 4.763375224416517, + "grad_norm": 1.2713453769683838, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 66330 + }, + { + "epoch": 4.764093357271095, + "grad_norm": 0.9473448991775513, + "learning_rate": 0.0002, + "loss": 0.6726, + "step": 66340 + }, + { + "epoch": 4.764811490125673, + "grad_norm": 1.0176854133605957, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 66350 + }, + { + "epoch": 4.765529622980251, + "grad_norm": 1.0486242771148682, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 66360 + }, + { + "epoch": 4.766247755834829, + "grad_norm": 1.249985694885254, + "learning_rate": 0.0002, + "loss": 0.6875, + "step": 66370 + }, + { + "epoch": 4.766965888689407, + "grad_norm": 1.283875584602356, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 66380 + }, + { + "epoch": 4.767684021543985, + "grad_norm": 1.0009022951126099, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 66390 + }, + { + "epoch": 4.768402154398563, + "grad_norm": 0.9718021750450134, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 66400 + }, + { + "epoch": 4.769120287253142, + "grad_norm": 1.0865732431411743, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 66410 + }, + { + "epoch": 4.76983842010772, + "grad_norm": 0.9273189306259155, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 66420 + }, + { + "epoch": 4.770556552962298, + "grad_norm": 1.067535638809204, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 66430 + }, + { + "epoch": 4.771274685816876, + "grad_norm": 1.0551011562347412, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 66440 + }, + { + "epoch": 4.771992818671454, + "grad_norm": 1.0336146354675293, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66450 + }, + { + "epoch": 4.772710951526032, + "grad_norm": 0.8738380670547485, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 66460 + }, + { + "epoch": 4.77342908438061, + "grad_norm": 1.1048321723937988, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66470 + }, + { + "epoch": 4.774147217235188, + "grad_norm": 0.8471167683601379, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 66480 + }, + { + "epoch": 4.774865350089767, + "grad_norm": 1.2527031898498535, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 66490 + }, + { + "epoch": 4.775583482944345, + "grad_norm": 1.0056052207946777, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 66500 + }, + { + "epoch": 4.776301615798923, + "grad_norm": 1.142456293106079, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 66510 + }, + { + "epoch": 4.777019748653501, + "grad_norm": 1.1813132762908936, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 66520 + }, + { + "epoch": 4.777737881508079, + "grad_norm": 0.8683654069900513, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 66530 + }, + { + "epoch": 4.778456014362657, + "grad_norm": 1.0577980279922485, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 66540 + }, + { + "epoch": 4.779174147217235, + "grad_norm": 1.077438473701477, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66550 + }, + { + "epoch": 4.779892280071813, + "grad_norm": 1.0107938051223755, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 66560 + }, + { + "epoch": 4.780610412926391, + "grad_norm": 0.8071168065071106, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 66570 + }, + { + "epoch": 4.781328545780969, + "grad_norm": 0.8887564539909363, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 66580 + }, + { + "epoch": 4.782046678635547, + "grad_norm": 0.9823092222213745, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 66590 + }, + { + "epoch": 4.782764811490126, + "grad_norm": 0.9026784300804138, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 66600 + }, + { + "epoch": 4.783482944344704, + "grad_norm": 0.8912792205810547, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 66610 + }, + { + "epoch": 4.784201077199282, + "grad_norm": 1.0955979824066162, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 66620 + }, + { + "epoch": 4.78491921005386, + "grad_norm": 0.8614793419837952, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 66630 + }, + { + "epoch": 4.785637342908438, + "grad_norm": 0.7247269153594971, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 66640 + }, + { + "epoch": 4.786355475763016, + "grad_norm": 0.9685400724411011, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 66650 + }, + { + "epoch": 4.787073608617594, + "grad_norm": 0.9219905734062195, + "learning_rate": 0.0002, + "loss": 0.6419, + "step": 66660 + }, + { + "epoch": 4.787791741472172, + "grad_norm": 0.9217489361763, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 66670 + }, + { + "epoch": 4.788509874326751, + "grad_norm": 1.13791823387146, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 66680 + }, + { + "epoch": 4.789228007181329, + "grad_norm": 0.857542872428894, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 66690 + }, + { + "epoch": 4.789946140035907, + "grad_norm": 0.9886694550514221, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 66700 + }, + { + "epoch": 4.790664272890485, + "grad_norm": 0.987952470779419, + "learning_rate": 0.0002, + "loss": 0.6436, + "step": 66710 + }, + { + "epoch": 4.791382405745063, + "grad_norm": 1.051612377166748, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 66720 + }, + { + "epoch": 4.792100538599641, + "grad_norm": 0.9816454648971558, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 66730 + }, + { + "epoch": 4.792818671454219, + "grad_norm": 1.0953829288482666, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 66740 + }, + { + "epoch": 4.793536804308797, + "grad_norm": 0.8720369935035706, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 66750 + }, + { + "epoch": 4.794254937163375, + "grad_norm": 0.8910234570503235, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 66760 + }, + { + "epoch": 4.794973070017953, + "grad_norm": 0.8300510048866272, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 66770 + }, + { + "epoch": 4.795691202872531, + "grad_norm": 0.9380533695220947, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 66780 + }, + { + "epoch": 4.79640933572711, + "grad_norm": 0.8361864686012268, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 66790 + }, + { + "epoch": 4.797127468581688, + "grad_norm": 1.051262617111206, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 66800 + }, + { + "epoch": 4.797845601436266, + "grad_norm": 1.1324400901794434, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 66810 + }, + { + "epoch": 4.798563734290844, + "grad_norm": 0.853903591632843, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 66820 + }, + { + "epoch": 4.799281867145422, + "grad_norm": 0.9949867725372314, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 66830 + }, + { + "epoch": 4.8, + "grad_norm": 0.9204033017158508, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 66840 + }, + { + "epoch": 4.800718132854578, + "grad_norm": 0.7461584806442261, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 66850 + }, + { + "epoch": 4.801436265709156, + "grad_norm": 1.1019874811172485, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 66860 + }, + { + "epoch": 4.802154398563735, + "grad_norm": 1.1695797443389893, + "learning_rate": 0.0002, + "loss": 0.6514, + "step": 66870 + }, + { + "epoch": 4.802872531418313, + "grad_norm": 1.0902758836746216, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 66880 + }, + { + "epoch": 4.803590664272891, + "grad_norm": 0.8778618574142456, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 66890 + }, + { + "epoch": 4.804308797127469, + "grad_norm": 0.905505359172821, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 66900 + }, + { + "epoch": 4.805026929982047, + "grad_norm": 1.0802056789398193, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 66910 + }, + { + "epoch": 4.805745062836625, + "grad_norm": 0.7899449467658997, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 66920 + }, + { + "epoch": 4.806463195691203, + "grad_norm": 1.1938519477844238, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66930 + }, + { + "epoch": 4.807181328545781, + "grad_norm": 1.0213780403137207, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 66940 + }, + { + "epoch": 4.807899461400359, + "grad_norm": 0.9925506711006165, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 66950 + }, + { + "epoch": 4.808617594254937, + "grad_norm": 1.0174424648284912, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 66960 + }, + { + "epoch": 4.809335727109516, + "grad_norm": 1.0515072345733643, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 66970 + }, + { + "epoch": 4.810053859964094, + "grad_norm": 1.0161492824554443, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 66980 + }, + { + "epoch": 4.810771992818672, + "grad_norm": 0.8421840071678162, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 66990 + }, + { + "epoch": 4.81149012567325, + "grad_norm": 1.0493539571762085, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 67000 + }, + { + "epoch": 4.812208258527828, + "grad_norm": 1.1133309602737427, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 67010 + }, + { + "epoch": 4.812926391382406, + "grad_norm": 0.924017071723938, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 67020 + }, + { + "epoch": 4.813644524236984, + "grad_norm": 1.0568689107894897, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 67030 + }, + { + "epoch": 4.814362657091562, + "grad_norm": 0.989414632320404, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 67040 + }, + { + "epoch": 4.8150807899461405, + "grad_norm": 0.9256827235221863, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 67050 + }, + { + "epoch": 4.8157989228007185, + "grad_norm": 0.9538901448249817, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 67060 + }, + { + "epoch": 4.8165170556552965, + "grad_norm": 1.0373849868774414, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 67070 + }, + { + "epoch": 4.8172351885098745, + "grad_norm": 1.0019729137420654, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 67080 + }, + { + "epoch": 4.8179533213644525, + "grad_norm": 0.9930381178855896, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 67090 + }, + { + "epoch": 4.8186714542190305, + "grad_norm": 1.0008453130722046, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 67100 + }, + { + "epoch": 4.8193895870736085, + "grad_norm": 1.0153851509094238, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 67110 + }, + { + "epoch": 4.8201077199281865, + "grad_norm": 1.0193161964416504, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 67120 + }, + { + "epoch": 4.8208258527827645, + "grad_norm": 1.0204501152038574, + "learning_rate": 0.0002, + "loss": 0.6602, + "step": 67130 + }, + { + "epoch": 4.8215439856373425, + "grad_norm": 0.9097670316696167, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 67140 + }, + { + "epoch": 4.8222621184919205, + "grad_norm": 0.9288716912269592, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 67150 + }, + { + "epoch": 4.822980251346499, + "grad_norm": 0.9975850582122803, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 67160 + }, + { + "epoch": 4.823698384201077, + "grad_norm": 0.8502511382102966, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 67170 + }, + { + "epoch": 4.824416517055655, + "grad_norm": 1.0129257440567017, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67180 + }, + { + "epoch": 4.825134649910233, + "grad_norm": 1.0009492635726929, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 67190 + }, + { + "epoch": 4.825852782764811, + "grad_norm": 0.9273321032524109, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 67200 + }, + { + "epoch": 4.8265709156193894, + "grad_norm": 1.0438604354858398, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 67210 + }, + { + "epoch": 4.8272890484739674, + "grad_norm": 1.119573712348938, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 67220 + }, + { + "epoch": 4.8280071813285454, + "grad_norm": 0.9607422351837158, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 67230 + }, + { + "epoch": 4.828725314183124, + "grad_norm": 0.9614062905311584, + "learning_rate": 0.0002, + "loss": 0.6259, + "step": 67240 + }, + { + "epoch": 4.829443447037702, + "grad_norm": 1.1017652750015259, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 67250 + }, + { + "epoch": 4.83016157989228, + "grad_norm": 1.0521706342697144, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 67260 + }, + { + "epoch": 4.830879712746858, + "grad_norm": 0.7685959339141846, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 67270 + }, + { + "epoch": 4.831597845601436, + "grad_norm": 0.7894896268844604, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 67280 + }, + { + "epoch": 4.832315978456014, + "grad_norm": 1.0882996320724487, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 67290 + }, + { + "epoch": 4.833034111310592, + "grad_norm": 0.9215409755706787, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 67300 + }, + { + "epoch": 4.83375224416517, + "grad_norm": 0.8660635352134705, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 67310 + }, + { + "epoch": 4.834470377019748, + "grad_norm": 0.980879008769989, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 67320 + }, + { + "epoch": 4.835188509874326, + "grad_norm": 1.0356814861297607, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 67330 + }, + { + "epoch": 4.835906642728904, + "grad_norm": 1.0265507698059082, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 67340 + }, + { + "epoch": 4.836624775583483, + "grad_norm": 1.0659137964248657, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 67350 + }, + { + "epoch": 4.837342908438061, + "grad_norm": 0.9485231637954712, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 67360 + }, + { + "epoch": 4.838061041292639, + "grad_norm": 1.0950140953063965, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 67370 + }, + { + "epoch": 4.838779174147217, + "grad_norm": 0.8907382488250732, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 67380 + }, + { + "epoch": 4.839497307001795, + "grad_norm": 0.9777120351791382, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 67390 + }, + { + "epoch": 4.840215439856373, + "grad_norm": 0.8482252955436707, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 67400 + }, + { + "epoch": 4.840933572710951, + "grad_norm": 0.8505899906158447, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 67410 + }, + { + "epoch": 4.841651705565529, + "grad_norm": 0.8574482798576355, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 67420 + }, + { + "epoch": 4.842369838420108, + "grad_norm": 1.092310905456543, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 67430 + }, + { + "epoch": 4.843087971274686, + "grad_norm": 0.9418560266494751, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 67440 + }, + { + "epoch": 4.843806104129264, + "grad_norm": 1.1310782432556152, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 67450 + }, + { + "epoch": 4.844524236983842, + "grad_norm": 0.9993671774864197, + "learning_rate": 0.0002, + "loss": 0.664, + "step": 67460 + }, + { + "epoch": 4.84524236983842, + "grad_norm": 0.8322528600692749, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 67470 + }, + { + "epoch": 4.845960502692998, + "grad_norm": 0.8488435745239258, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 67480 + }, + { + "epoch": 4.846678635547576, + "grad_norm": 0.8070611357688904, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 67490 + }, + { + "epoch": 4.847396768402154, + "grad_norm": 0.8200163245201111, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 67500 + }, + { + "epoch": 4.848114901256732, + "grad_norm": 0.91901034116745, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 67510 + }, + { + "epoch": 4.84883303411131, + "grad_norm": 1.0938435792922974, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67520 + }, + { + "epoch": 4.849551166965889, + "grad_norm": 0.7926174402236938, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 67530 + }, + { + "epoch": 4.850269299820467, + "grad_norm": 0.9914385676383972, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 67540 + }, + { + "epoch": 4.850987432675045, + "grad_norm": 1.033065915107727, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 67550 + }, + { + "epoch": 4.851705565529623, + "grad_norm": 0.9700239300727844, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 67560 + }, + { + "epoch": 4.852423698384201, + "grad_norm": 0.8550103902816772, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 67570 + }, + { + "epoch": 4.853141831238779, + "grad_norm": 1.0009654760360718, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 67580 + }, + { + "epoch": 4.853859964093357, + "grad_norm": 1.0766186714172363, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 67590 + }, + { + "epoch": 4.854578096947935, + "grad_norm": 0.9512220621109009, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 67600 + }, + { + "epoch": 4.855296229802514, + "grad_norm": 0.8434456586837769, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 67610 + }, + { + "epoch": 4.856014362657092, + "grad_norm": 1.0276665687561035, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 67620 + }, + { + "epoch": 4.85673249551167, + "grad_norm": 0.9758516550064087, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 67630 + }, + { + "epoch": 4.857450628366248, + "grad_norm": 0.8988076448440552, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 67640 + }, + { + "epoch": 4.858168761220826, + "grad_norm": 1.0038257837295532, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 67650 + }, + { + "epoch": 4.858886894075404, + "grad_norm": 0.9973093867301941, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 67660 + }, + { + "epoch": 4.859605026929982, + "grad_norm": 0.9754974246025085, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 67670 + }, + { + "epoch": 4.86032315978456, + "grad_norm": 1.1829560995101929, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 67680 + }, + { + "epoch": 4.861041292639138, + "grad_norm": 1.1077659130096436, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 67690 + }, + { + "epoch": 4.861759425493716, + "grad_norm": 0.9862872958183289, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 67700 + }, + { + "epoch": 4.862477558348294, + "grad_norm": 0.9826052188873291, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 67710 + }, + { + "epoch": 4.863195691202873, + "grad_norm": 0.940082848072052, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 67720 + }, + { + "epoch": 4.863913824057451, + "grad_norm": 0.895434558391571, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 67730 + }, + { + "epoch": 4.864631956912029, + "grad_norm": 1.1194682121276855, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 67740 + }, + { + "epoch": 4.865350089766607, + "grad_norm": 0.9984544515609741, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 67750 + }, + { + "epoch": 4.866068222621185, + "grad_norm": 1.049224615097046, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 67760 + }, + { + "epoch": 4.866786355475763, + "grad_norm": 1.009515643119812, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 67770 + }, + { + "epoch": 4.867504488330341, + "grad_norm": 1.0336902141571045, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 67780 + }, + { + "epoch": 4.868222621184919, + "grad_norm": 0.9310635924339294, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 67790 + }, + { + "epoch": 4.868940754039498, + "grad_norm": 0.934882640838623, + "learning_rate": 0.0002, + "loss": 0.7267, + "step": 67800 + }, + { + "epoch": 4.869658886894076, + "grad_norm": 0.8663495779037476, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 67810 + }, + { + "epoch": 4.870377019748654, + "grad_norm": 1.0085018873214722, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 67820 + }, + { + "epoch": 4.871095152603232, + "grad_norm": 0.896507978439331, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 67830 + }, + { + "epoch": 4.87181328545781, + "grad_norm": 0.925809919834137, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 67840 + }, + { + "epoch": 4.872531418312388, + "grad_norm": 0.8044029474258423, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 67850 + }, + { + "epoch": 4.873249551166966, + "grad_norm": 1.0026800632476807, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 67860 + }, + { + "epoch": 4.873967684021544, + "grad_norm": 0.9577589631080627, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 67870 + }, + { + "epoch": 4.874685816876122, + "grad_norm": 0.8225193619728088, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 67880 + }, + { + "epoch": 4.8754039497307, + "grad_norm": 1.0019139051437378, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 67890 + }, + { + "epoch": 4.876122082585278, + "grad_norm": 0.9282827377319336, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 67900 + }, + { + "epoch": 4.876840215439857, + "grad_norm": 0.8204836249351501, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 67910 + }, + { + "epoch": 4.877558348294435, + "grad_norm": 0.907356321811676, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 67920 + }, + { + "epoch": 4.878276481149013, + "grad_norm": 1.12422776222229, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 67930 + }, + { + "epoch": 4.878994614003591, + "grad_norm": 0.8230205178260803, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 67940 + }, + { + "epoch": 4.879712746858169, + "grad_norm": 1.1588479280471802, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 67950 + }, + { + "epoch": 4.880430879712747, + "grad_norm": 1.1064553260803223, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 67960 + }, + { + "epoch": 4.881149012567325, + "grad_norm": 0.9311534762382507, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 67970 + }, + { + "epoch": 4.881867145421903, + "grad_norm": 0.7575639486312866, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 67980 + }, + { + "epoch": 4.882585278276482, + "grad_norm": 0.9201191067695618, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 67990 + }, + { + "epoch": 4.88330341113106, + "grad_norm": 0.8487658500671387, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 68000 + }, + { + "epoch": 4.884021543985638, + "grad_norm": 0.9645208716392517, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 68010 + }, + { + "epoch": 4.884739676840216, + "grad_norm": 0.8594469428062439, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 68020 + }, + { + "epoch": 4.885457809694794, + "grad_norm": 0.9518412947654724, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 68030 + }, + { + "epoch": 4.886175942549372, + "grad_norm": 1.0934258699417114, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 68040 + }, + { + "epoch": 4.88689407540395, + "grad_norm": 0.988761842250824, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 68050 + }, + { + "epoch": 4.887612208258528, + "grad_norm": 0.7572013735771179, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 68060 + }, + { + "epoch": 4.888330341113106, + "grad_norm": 0.8801929950714111, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 68070 + }, + { + "epoch": 4.889048473967684, + "grad_norm": 1.0080658197402954, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 68080 + }, + { + "epoch": 4.8897666068222625, + "grad_norm": 0.9588785171508789, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 68090 + }, + { + "epoch": 4.8904847396768405, + "grad_norm": 1.0994032621383667, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 68100 + }, + { + "epoch": 4.8912028725314185, + "grad_norm": 0.9851962924003601, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68110 + }, + { + "epoch": 4.8919210053859965, + "grad_norm": 0.9566116333007812, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 68120 + }, + { + "epoch": 4.8926391382405745, + "grad_norm": 0.8708083033561707, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 68130 + }, + { + "epoch": 4.8933572710951525, + "grad_norm": 1.2182754278182983, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 68140 + }, + { + "epoch": 4.8940754039497305, + "grad_norm": 1.047988772392273, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 68150 + }, + { + "epoch": 4.8947935368043085, + "grad_norm": 0.8665831685066223, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68160 + }, + { + "epoch": 4.8955116696588865, + "grad_norm": 0.9313908219337463, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 68170 + }, + { + "epoch": 4.896229802513465, + "grad_norm": 0.9568582773208618, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 68180 + }, + { + "epoch": 4.896947935368043, + "grad_norm": 1.0427594184875488, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 68190 + }, + { + "epoch": 4.897666068222621, + "grad_norm": 0.9132021069526672, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 68200 + }, + { + "epoch": 4.898384201077199, + "grad_norm": 0.9597318768501282, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 68210 + }, + { + "epoch": 4.899102333931777, + "grad_norm": 1.0736947059631348, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 68220 + }, + { + "epoch": 4.899820466786355, + "grad_norm": 0.9318404793739319, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 68230 + }, + { + "epoch": 4.900538599640933, + "grad_norm": 0.8594326972961426, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 68240 + }, + { + "epoch": 4.901256732495511, + "grad_norm": 1.1437443494796753, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 68250 + }, + { + "epoch": 4.901974865350089, + "grad_norm": 1.1599408388137817, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 68260 + }, + { + "epoch": 4.902692998204667, + "grad_norm": 1.160628080368042, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 68270 + }, + { + "epoch": 4.903411131059246, + "grad_norm": 1.0147801637649536, + "learning_rate": 0.0002, + "loss": 0.613, + "step": 68280 + }, + { + "epoch": 4.904129263913824, + "grad_norm": 0.8622691631317139, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 68290 + }, + { + "epoch": 4.904847396768402, + "grad_norm": 0.7179980874061584, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 68300 + }, + { + "epoch": 4.90556552962298, + "grad_norm": 1.1705092191696167, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 68310 + }, + { + "epoch": 4.906283662477558, + "grad_norm": 1.1687676906585693, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 68320 + }, + { + "epoch": 4.907001795332136, + "grad_norm": 1.1621531248092651, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 68330 + }, + { + "epoch": 4.907719928186714, + "grad_norm": 1.0241422653198242, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 68340 + }, + { + "epoch": 4.908438061041292, + "grad_norm": 0.943354070186615, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 68350 + }, + { + "epoch": 4.909156193895871, + "grad_norm": 0.8091703653335571, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 68360 + }, + { + "epoch": 4.909874326750449, + "grad_norm": 0.8871228694915771, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 68370 + }, + { + "epoch": 4.910592459605027, + "grad_norm": 1.0951069593429565, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 68380 + }, + { + "epoch": 4.911310592459605, + "grad_norm": 1.1355193853378296, + "learning_rate": 0.0002, + "loss": 0.6407, + "step": 68390 + }, + { + "epoch": 4.912028725314183, + "grad_norm": 1.0741122961044312, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 68400 + }, + { + "epoch": 4.912746858168761, + "grad_norm": 0.9285269975662231, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 68410 + }, + { + "epoch": 4.913464991023339, + "grad_norm": 1.080695390701294, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 68420 + }, + { + "epoch": 4.914183123877917, + "grad_norm": 0.921331524848938, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 68430 + }, + { + "epoch": 4.914901256732495, + "grad_norm": 0.9763174057006836, + "learning_rate": 0.0002, + "loss": 0.701, + "step": 68440 + }, + { + "epoch": 4.915619389587073, + "grad_norm": 1.1133354902267456, + "learning_rate": 0.0002, + "loss": 0.6429, + "step": 68450 + }, + { + "epoch": 4.916337522441651, + "grad_norm": 0.8373502492904663, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 68460 + }, + { + "epoch": 4.91705565529623, + "grad_norm": 0.9192346334457397, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 68470 + }, + { + "epoch": 4.917773788150808, + "grad_norm": 1.0724657773971558, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 68480 + }, + { + "epoch": 4.918491921005386, + "grad_norm": 0.9209843873977661, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 68490 + }, + { + "epoch": 4.919210053859964, + "grad_norm": 0.9201577305793762, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 68500 + }, + { + "epoch": 4.919928186714542, + "grad_norm": 0.8086138963699341, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 68510 + }, + { + "epoch": 4.92064631956912, + "grad_norm": 1.0917785167694092, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 68520 + }, + { + "epoch": 4.921364452423698, + "grad_norm": 0.9287897944450378, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 68530 + }, + { + "epoch": 4.922082585278276, + "grad_norm": 0.9830158948898315, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 68540 + }, + { + "epoch": 4.922800718132855, + "grad_norm": 0.8674678802490234, + "learning_rate": 0.0002, + "loss": 0.6583, + "step": 68550 + }, + { + "epoch": 4.923518850987433, + "grad_norm": 0.7996176481246948, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 68560 + }, + { + "epoch": 4.924236983842011, + "grad_norm": 1.1284033060073853, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 68570 + }, + { + "epoch": 4.924955116696589, + "grad_norm": 0.894339919090271, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 68580 + }, + { + "epoch": 4.925673249551167, + "grad_norm": 1.1140280961990356, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68590 + }, + { + "epoch": 4.926391382405745, + "grad_norm": 0.9048344492912292, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 68600 + }, + { + "epoch": 4.927109515260323, + "grad_norm": 0.9380471706390381, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 68610 + }, + { + "epoch": 4.927827648114901, + "grad_norm": 0.8598429560661316, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 68620 + }, + { + "epoch": 4.928545780969479, + "grad_norm": 1.0813355445861816, + "learning_rate": 0.0002, + "loss": 0.6486, + "step": 68630 + }, + { + "epoch": 4.929263913824057, + "grad_norm": 0.979053795337677, + "learning_rate": 0.0002, + "loss": 0.6367, + "step": 68640 + }, + { + "epoch": 4.929982046678636, + "grad_norm": 0.8194574117660522, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 68650 + }, + { + "epoch": 4.930700179533214, + "grad_norm": 0.8593540787696838, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 68660 + }, + { + "epoch": 4.931418312387792, + "grad_norm": 1.0134016275405884, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 68670 + }, + { + "epoch": 4.93213644524237, + "grad_norm": 1.060586929321289, + "learning_rate": 0.0002, + "loss": 0.6221, + "step": 68680 + }, + { + "epoch": 4.932854578096948, + "grad_norm": 0.84132319688797, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 68690 + }, + { + "epoch": 4.933572710951526, + "grad_norm": 1.0767526626586914, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 68700 + }, + { + "epoch": 4.934290843806104, + "grad_norm": 0.8858519792556763, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 68710 + }, + { + "epoch": 4.935008976660682, + "grad_norm": 1.194031000137329, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 68720 + }, + { + "epoch": 4.93572710951526, + "grad_norm": 0.8270226120948792, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 68730 + }, + { + "epoch": 4.936445242369839, + "grad_norm": 1.0385973453521729, + "learning_rate": 0.0002, + "loss": 0.6538, + "step": 68740 + }, + { + "epoch": 4.937163375224417, + "grad_norm": 0.9062243700027466, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 68750 + }, + { + "epoch": 4.937881508078995, + "grad_norm": 1.0526955127716064, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 68760 + }, + { + "epoch": 4.938599640933573, + "grad_norm": 0.930604100227356, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 68770 + }, + { + "epoch": 4.939317773788151, + "grad_norm": 0.9635265469551086, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 68780 + }, + { + "epoch": 4.940035906642729, + "grad_norm": 0.9825171232223511, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 68790 + }, + { + "epoch": 4.940754039497307, + "grad_norm": 0.9621182680130005, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 68800 + }, + { + "epoch": 4.941472172351885, + "grad_norm": 0.9655307531356812, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 68810 + }, + { + "epoch": 4.942190305206463, + "grad_norm": 1.2948180437088013, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 68820 + }, + { + "epoch": 4.942908438061041, + "grad_norm": 0.9206728339195251, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 68830 + }, + { + "epoch": 4.94362657091562, + "grad_norm": 1.0235631465911865, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 68840 + }, + { + "epoch": 4.944344703770198, + "grad_norm": 1.0542538166046143, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 68850 + }, + { + "epoch": 4.945062836624776, + "grad_norm": 0.9787087440490723, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 68860 + }, + { + "epoch": 4.945780969479354, + "grad_norm": 0.9527219533920288, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 68870 + }, + { + "epoch": 4.946499102333932, + "grad_norm": 1.1525826454162598, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 68880 + }, + { + "epoch": 4.94721723518851, + "grad_norm": 0.8610072731971741, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 68890 + }, + { + "epoch": 4.947935368043088, + "grad_norm": 1.1403616666793823, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 68900 + }, + { + "epoch": 4.948653500897666, + "grad_norm": 1.10334312915802, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 68910 + }, + { + "epoch": 4.949371633752245, + "grad_norm": 0.8633760809898376, + "learning_rate": 0.0002, + "loss": 0.6123, + "step": 68920 + }, + { + "epoch": 4.950089766606823, + "grad_norm": 1.1291080713272095, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 68930 + }, + { + "epoch": 4.950807899461401, + "grad_norm": 1.0176939964294434, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 68940 + }, + { + "epoch": 4.951526032315979, + "grad_norm": 0.9207960963249207, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 68950 + }, + { + "epoch": 4.952244165170557, + "grad_norm": 0.9815934300422668, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 68960 + }, + { + "epoch": 4.952962298025135, + "grad_norm": 0.9725701808929443, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 68970 + }, + { + "epoch": 4.953680430879713, + "grad_norm": 0.844926655292511, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 68980 + }, + { + "epoch": 4.954398563734291, + "grad_norm": 0.9898511171340942, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 68990 + }, + { + "epoch": 4.955116696588869, + "grad_norm": 1.1311410665512085, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 69000 + }, + { + "epoch": 4.955834829443447, + "grad_norm": 1.218610405921936, + "learning_rate": 0.0002, + "loss": 0.6525, + "step": 69010 + }, + { + "epoch": 4.956552962298025, + "grad_norm": 1.1536420583724976, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 69020 + }, + { + "epoch": 4.957271095152604, + "grad_norm": 1.1857786178588867, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 69030 + }, + { + "epoch": 4.957989228007182, + "grad_norm": 0.9969246983528137, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 69040 + }, + { + "epoch": 4.95870736086176, + "grad_norm": 1.138635277748108, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 69050 + }, + { + "epoch": 4.959425493716338, + "grad_norm": 1.110474705696106, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 69060 + }, + { + "epoch": 4.960143626570916, + "grad_norm": 1.0366318225860596, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 69070 + }, + { + "epoch": 4.960861759425494, + "grad_norm": 0.6927996277809143, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 69080 + }, + { + "epoch": 4.961579892280072, + "grad_norm": 1.0368026494979858, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 69090 + }, + { + "epoch": 4.96229802513465, + "grad_norm": 1.0638312101364136, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 69100 + }, + { + "epoch": 4.9630161579892285, + "grad_norm": 1.0372415781021118, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 69110 + }, + { + "epoch": 4.9637342908438065, + "grad_norm": 0.8257387280464172, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69120 + }, + { + "epoch": 4.9644524236983845, + "grad_norm": 1.0046974420547485, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 69130 + }, + { + "epoch": 4.9651705565529625, + "grad_norm": 1.0139652490615845, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69140 + }, + { + "epoch": 4.9658886894075405, + "grad_norm": 1.0214691162109375, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 69150 + }, + { + "epoch": 4.9666068222621185, + "grad_norm": 1.1042424440383911, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 69160 + }, + { + "epoch": 4.9673249551166965, + "grad_norm": 0.8749067783355713, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 69170 + }, + { + "epoch": 4.9680430879712745, + "grad_norm": 0.9894024133682251, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 69180 + }, + { + "epoch": 4.9687612208258525, + "grad_norm": 1.0218034982681274, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 69190 + }, + { + "epoch": 4.9694793536804305, + "grad_norm": 0.9782929420471191, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 69200 + }, + { + "epoch": 4.9701974865350085, + "grad_norm": 0.9373409748077393, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 69210 + }, + { + "epoch": 4.970915619389587, + "grad_norm": 1.0329546928405762, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 69220 + }, + { + "epoch": 4.971633752244165, + "grad_norm": 0.9746108055114746, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 69230 + }, + { + "epoch": 4.972351885098743, + "grad_norm": 0.9202073216438293, + "learning_rate": 0.0002, + "loss": 0.6342, + "step": 69240 + }, + { + "epoch": 4.973070017953321, + "grad_norm": 1.078032374382019, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 69250 + }, + { + "epoch": 4.973788150807899, + "grad_norm": 0.8860024809837341, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 69260 + }, + { + "epoch": 4.974506283662477, + "grad_norm": 0.915212094783783, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 69270 + }, + { + "epoch": 4.975224416517055, + "grad_norm": 1.1192166805267334, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 69280 + }, + { + "epoch": 4.975942549371633, + "grad_norm": 0.8387445211410522, + "learning_rate": 0.0002, + "loss": 0.6347, + "step": 69290 + }, + { + "epoch": 4.976660682226212, + "grad_norm": 1.1210044622421265, + "learning_rate": 0.0002, + "loss": 0.6392, + "step": 69300 + }, + { + "epoch": 4.97737881508079, + "grad_norm": 1.0051207542419434, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 69310 + }, + { + "epoch": 4.978096947935368, + "grad_norm": 0.9248682856559753, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 69320 + }, + { + "epoch": 4.978815080789946, + "grad_norm": 0.8265128135681152, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 69330 + }, + { + "epoch": 4.979533213644524, + "grad_norm": 0.9432681798934937, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 69340 + }, + { + "epoch": 4.980251346499102, + "grad_norm": 1.0135977268218994, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 69350 + }, + { + "epoch": 4.98096947935368, + "grad_norm": 0.9857245683670044, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 69360 + }, + { + "epoch": 4.981687612208258, + "grad_norm": 0.9215952157974243, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 69370 + }, + { + "epoch": 4.982405745062836, + "grad_norm": 1.1518077850341797, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 69380 + }, + { + "epoch": 4.983123877917414, + "grad_norm": 0.8836095929145813, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 69390 + }, + { + "epoch": 4.983842010771993, + "grad_norm": 0.8082528710365295, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 69400 + }, + { + "epoch": 4.984560143626571, + "grad_norm": 0.9295604825019836, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 69410 + }, + { + "epoch": 4.985278276481149, + "grad_norm": 1.002057433128357, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 69420 + }, + { + "epoch": 4.985996409335727, + "grad_norm": 0.8127216100692749, + "learning_rate": 0.0002, + "loss": 0.6275, + "step": 69430 + }, + { + "epoch": 4.986714542190305, + "grad_norm": 1.058138370513916, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 69440 + }, + { + "epoch": 4.987432675044883, + "grad_norm": 0.8451166749000549, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 69450 + }, + { + "epoch": 4.988150807899461, + "grad_norm": 0.9687268137931824, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 69460 + }, + { + "epoch": 4.988868940754039, + "grad_norm": 1.0342036485671997, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 69470 + }, + { + "epoch": 4.989587073608618, + "grad_norm": 0.9042398929595947, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 69480 + }, + { + "epoch": 4.990305206463196, + "grad_norm": 1.0575438737869263, + "learning_rate": 0.0002, + "loss": 0.6193, + "step": 69490 + }, + { + "epoch": 4.991023339317774, + "grad_norm": 0.9364935159683228, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 69500 + }, + { + "epoch": 4.991741472172352, + "grad_norm": 1.0327378511428833, + "learning_rate": 0.0002, + "loss": 0.6532, + "step": 69510 + }, + { + "epoch": 4.99245960502693, + "grad_norm": 0.815592885017395, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 69520 + }, + { + "epoch": 4.993177737881508, + "grad_norm": 1.0813369750976562, + "learning_rate": 0.0002, + "loss": 0.6776, + "step": 69530 + }, + { + "epoch": 4.993895870736086, + "grad_norm": 1.0277023315429688, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 69540 + }, + { + "epoch": 4.994614003590664, + "grad_norm": 1.0291162729263306, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 69550 + }, + { + "epoch": 4.995332136445242, + "grad_norm": 0.8435685634613037, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 69560 + }, + { + "epoch": 4.99605026929982, + "grad_norm": 1.1972291469573975, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 69570 + }, + { + "epoch": 4.996768402154398, + "grad_norm": 0.8114907741546631, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 69580 + }, + { + "epoch": 4.997486535008977, + "grad_norm": 0.8296133875846863, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 69590 + }, + { + "epoch": 4.998204667863555, + "grad_norm": 1.1728706359863281, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 69600 + }, + { + "epoch": 4.998922800718133, + "grad_norm": 0.9586578607559204, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 69610 + }, + { + "epoch": 4.999640933572711, + "grad_norm": 0.9725151062011719, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 69620 + }, + { + "epoch": 5.0, + "eval_loss": 1.133581519126892, + "eval_runtime": 55.2151, + "eval_samples_per_second": 13.275, + "eval_steps_per_second": 1.666, + "step": 69625 + }, + { + "epoch": 5.000359066427289, + "grad_norm": 0.9312055706977844, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 69630 + }, + { + "epoch": 5.001077199281867, + "grad_norm": 1.0534896850585938, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 69640 + }, + { + "epoch": 5.001795332136445, + "grad_norm": 0.8891698718070984, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 69650 + }, + { + "epoch": 5.002513464991023, + "grad_norm": 0.7791097164154053, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 69660 + }, + { + "epoch": 5.003231597845601, + "grad_norm": 1.2891173362731934, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 69670 + }, + { + "epoch": 5.00394973070018, + "grad_norm": 0.7909513711929321, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 69680 + }, + { + "epoch": 5.004667863554758, + "grad_norm": 0.988648533821106, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 69690 + }, + { + "epoch": 5.005385996409336, + "grad_norm": 0.9669296741485596, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 69700 + }, + { + "epoch": 5.006104129263914, + "grad_norm": 1.2393349409103394, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 69710 + }, + { + "epoch": 5.006822262118492, + "grad_norm": 1.2420750856399536, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 69720 + }, + { + "epoch": 5.00754039497307, + "grad_norm": 1.1698096990585327, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 69730 + }, + { + "epoch": 5.008258527827648, + "grad_norm": 1.2228301763534546, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 69740 + }, + { + "epoch": 5.008976660682226, + "grad_norm": 0.9350621104240417, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 69750 + }, + { + "epoch": 5.009694793536804, + "grad_norm": 0.9828507304191589, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 69760 + }, + { + "epoch": 5.010412926391383, + "grad_norm": 0.9372149109840393, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 69770 + }, + { + "epoch": 5.011131059245961, + "grad_norm": 0.8098477125167847, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 69780 + }, + { + "epoch": 5.011849192100539, + "grad_norm": 1.0418338775634766, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69790 + }, + { + "epoch": 5.012567324955117, + "grad_norm": 1.0175801515579224, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 69800 + }, + { + "epoch": 5.013285457809695, + "grad_norm": 1.2128081321716309, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 69810 + }, + { + "epoch": 5.014003590664273, + "grad_norm": 1.001805067062378, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69820 + }, + { + "epoch": 5.014721723518851, + "grad_norm": 0.8957470059394836, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 69830 + }, + { + "epoch": 5.015439856373429, + "grad_norm": 0.9344548583030701, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 69840 + }, + { + "epoch": 5.016157989228007, + "grad_norm": 0.8545927405357361, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 69850 + }, + { + "epoch": 5.016876122082586, + "grad_norm": 1.3907777070999146, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 69860 + }, + { + "epoch": 5.017594254937164, + "grad_norm": 0.8112093806266785, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 69870 + }, + { + "epoch": 5.018312387791742, + "grad_norm": 1.0151532888412476, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 69880 + }, + { + "epoch": 5.01903052064632, + "grad_norm": 1.249021053314209, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 69890 + }, + { + "epoch": 5.019748653500898, + "grad_norm": 0.9310314059257507, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 69900 + }, + { + "epoch": 5.020466786355476, + "grad_norm": 0.9444572925567627, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 69910 + }, + { + "epoch": 5.021184919210054, + "grad_norm": 1.0952081680297852, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 69920 + }, + { + "epoch": 5.021903052064632, + "grad_norm": 1.2106375694274902, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 69930 + }, + { + "epoch": 5.02262118491921, + "grad_norm": 1.0179580450057983, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 69940 + }, + { + "epoch": 5.023339317773788, + "grad_norm": 1.0865367650985718, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 69950 + }, + { + "epoch": 5.024057450628367, + "grad_norm": 1.0965075492858887, + "learning_rate": 0.0002, + "loss": 0.6011, + "step": 69960 + }, + { + "epoch": 5.024775583482945, + "grad_norm": 0.8879445791244507, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 69970 + }, + { + "epoch": 5.025493716337523, + "grad_norm": 1.2588363885879517, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 69980 + }, + { + "epoch": 5.026211849192101, + "grad_norm": 0.935705304145813, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 69990 + }, + { + "epoch": 5.026929982046679, + "grad_norm": 1.072012186050415, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 70000 + }, + { + "epoch": 5.027648114901257, + "grad_norm": 1.286438226699829, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 70010 + }, + { + "epoch": 5.028366247755835, + "grad_norm": 1.1165392398834229, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 70020 + }, + { + "epoch": 5.029084380610413, + "grad_norm": 0.7998424172401428, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 70030 + }, + { + "epoch": 5.029802513464991, + "grad_norm": 1.5669852495193481, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 70040 + }, + { + "epoch": 5.0305206463195695, + "grad_norm": 0.9780290722846985, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70050 + }, + { + "epoch": 5.0312387791741475, + "grad_norm": 0.9837628602981567, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 70060 + }, + { + "epoch": 5.0319569120287255, + "grad_norm": 0.9558916091918945, + "learning_rate": 0.0002, + "loss": 0.5369, + "step": 70070 + }, + { + "epoch": 5.0326750448833035, + "grad_norm": 0.8893155455589294, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 70080 + }, + { + "epoch": 5.0333931777378815, + "grad_norm": 1.1403675079345703, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 70090 + }, + { + "epoch": 5.0341113105924595, + "grad_norm": 1.0453649759292603, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 70100 + }, + { + "epoch": 5.0348294434470375, + "grad_norm": 0.8127498030662537, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 70110 + }, + { + "epoch": 5.0355475763016155, + "grad_norm": 0.9344680309295654, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70120 + }, + { + "epoch": 5.0362657091561935, + "grad_norm": 1.0302079916000366, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 70130 + }, + { + "epoch": 5.036983842010772, + "grad_norm": 1.0549713373184204, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 70140 + }, + { + "epoch": 5.03770197486535, + "grad_norm": 0.8916767835617065, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 70150 + }, + { + "epoch": 5.038420107719928, + "grad_norm": 0.9799798130989075, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 70160 + }, + { + "epoch": 5.039138240574506, + "grad_norm": 1.15560781955719, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 70170 + }, + { + "epoch": 5.039856373429084, + "grad_norm": 1.0577017068862915, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 70180 + }, + { + "epoch": 5.040574506283662, + "grad_norm": 1.027990698814392, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 70190 + }, + { + "epoch": 5.04129263913824, + "grad_norm": 1.0818232297897339, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 70200 + }, + { + "epoch": 5.042010771992818, + "grad_norm": 1.0287196636199951, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 70210 + }, + { + "epoch": 5.042728904847396, + "grad_norm": 1.1569273471832275, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 70220 + }, + { + "epoch": 5.0434470377019744, + "grad_norm": 1.0485484600067139, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70230 + }, + { + "epoch": 5.044165170556553, + "grad_norm": 0.9244540333747864, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 70240 + }, + { + "epoch": 5.044883303411131, + "grad_norm": 0.9576422572135925, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 70250 + }, + { + "epoch": 5.045601436265709, + "grad_norm": 0.8719421625137329, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 70260 + }, + { + "epoch": 5.046319569120287, + "grad_norm": 0.8685409426689148, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 70270 + }, + { + "epoch": 5.047037701974865, + "grad_norm": 1.2735247611999512, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 70280 + }, + { + "epoch": 5.047755834829443, + "grad_norm": 0.9082128405570984, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 70290 + }, + { + "epoch": 5.048473967684021, + "grad_norm": 1.0626471042633057, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 70300 + }, + { + "epoch": 5.049192100538599, + "grad_norm": 1.1463991403579712, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 70310 + }, + { + "epoch": 5.049910233393177, + "grad_norm": 0.8825355172157288, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 70320 + }, + { + "epoch": 5.050628366247756, + "grad_norm": 1.0549408197402954, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 70330 + }, + { + "epoch": 5.051346499102334, + "grad_norm": 1.3740944862365723, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 70340 + }, + { + "epoch": 5.052064631956912, + "grad_norm": 1.4197895526885986, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 70350 + }, + { + "epoch": 5.05278276481149, + "grad_norm": 1.1764925718307495, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 70360 + }, + { + "epoch": 5.053500897666068, + "grad_norm": 1.0443403720855713, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 70370 + }, + { + "epoch": 5.054219030520646, + "grad_norm": 1.1807527542114258, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 70380 + }, + { + "epoch": 5.054937163375224, + "grad_norm": 1.4032433032989502, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 70390 + }, + { + "epoch": 5.055655296229802, + "grad_norm": 0.9815662503242493, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 70400 + }, + { + "epoch": 5.05637342908438, + "grad_norm": 0.9368446469306946, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 70410 + }, + { + "epoch": 5.057091561938959, + "grad_norm": 1.1156736612319946, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 70420 + }, + { + "epoch": 5.057809694793537, + "grad_norm": 1.01651132106781, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 70430 + }, + { + "epoch": 5.058527827648115, + "grad_norm": 0.9906342029571533, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 70440 + }, + { + "epoch": 5.059245960502693, + "grad_norm": 0.8666667938232422, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 70450 + }, + { + "epoch": 5.059964093357271, + "grad_norm": 1.0508924722671509, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 70460 + }, + { + "epoch": 5.060682226211849, + "grad_norm": 1.2472858428955078, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 70470 + }, + { + "epoch": 5.061400359066427, + "grad_norm": 1.019073724746704, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 70480 + }, + { + "epoch": 5.062118491921005, + "grad_norm": 0.9745403528213501, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 70490 + }, + { + "epoch": 5.062836624775583, + "grad_norm": 1.121208906173706, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 70500 + }, + { + "epoch": 5.063554757630161, + "grad_norm": 1.0535147190093994, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 70510 + }, + { + "epoch": 5.06427289048474, + "grad_norm": 1.0368950366973877, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 70520 + }, + { + "epoch": 5.064991023339318, + "grad_norm": 0.948964536190033, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 70530 + }, + { + "epoch": 5.065709156193896, + "grad_norm": 1.0289826393127441, + "learning_rate": 0.0002, + "loss": 0.5254, + "step": 70540 + }, + { + "epoch": 5.066427289048474, + "grad_norm": 1.118374228477478, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 70550 + }, + { + "epoch": 5.067145421903052, + "grad_norm": 0.8712816834449768, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 70560 + }, + { + "epoch": 5.06786355475763, + "grad_norm": 0.9057969450950623, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 70570 + }, + { + "epoch": 5.068581687612208, + "grad_norm": 0.9292685985565186, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 70580 + }, + { + "epoch": 5.069299820466786, + "grad_norm": 0.9159911274909973, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70590 + }, + { + "epoch": 5.070017953321364, + "grad_norm": 0.973848819732666, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 70600 + }, + { + "epoch": 5.070736086175943, + "grad_norm": 0.7892279028892517, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 70610 + }, + { + "epoch": 5.071454219030521, + "grad_norm": 0.9943311214447021, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 70620 + }, + { + "epoch": 5.072172351885099, + "grad_norm": 1.1457926034927368, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 70630 + }, + { + "epoch": 5.072890484739677, + "grad_norm": 0.9307738542556763, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 70640 + }, + { + "epoch": 5.073608617594255, + "grad_norm": 1.0899816751480103, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 70650 + }, + { + "epoch": 5.074326750448833, + "grad_norm": 0.8357672691345215, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 70660 + }, + { + "epoch": 5.075044883303411, + "grad_norm": 0.8889468312263489, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 70670 + }, + { + "epoch": 5.075763016157989, + "grad_norm": 0.9152118563652039, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 70680 + }, + { + "epoch": 5.076481149012567, + "grad_norm": 1.106160044670105, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 70690 + }, + { + "epoch": 5.077199281867145, + "grad_norm": 0.8519207835197449, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 70700 + }, + { + "epoch": 5.077917414721724, + "grad_norm": 0.9754986763000488, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 70710 + }, + { + "epoch": 5.078635547576302, + "grad_norm": 1.167883276939392, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 70720 + }, + { + "epoch": 5.07935368043088, + "grad_norm": 0.987622082233429, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70730 + }, + { + "epoch": 5.080071813285458, + "grad_norm": 1.0008184909820557, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 70740 + }, + { + "epoch": 5.080789946140036, + "grad_norm": 0.6318819522857666, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 70750 + }, + { + "epoch": 5.081508078994614, + "grad_norm": 0.984886884689331, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 70760 + }, + { + "epoch": 5.082226211849192, + "grad_norm": 1.0583622455596924, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 70770 + }, + { + "epoch": 5.08294434470377, + "grad_norm": 0.9730119705200195, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 70780 + }, + { + "epoch": 5.083662477558348, + "grad_norm": 1.0201330184936523, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 70790 + }, + { + "epoch": 5.084380610412927, + "grad_norm": 1.0479248762130737, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 70800 + }, + { + "epoch": 5.085098743267505, + "grad_norm": 0.9185113906860352, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 70810 + }, + { + "epoch": 5.085816876122083, + "grad_norm": 0.9326799511909485, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 70820 + }, + { + "epoch": 5.086535008976661, + "grad_norm": 0.958739697933197, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 70830 + }, + { + "epoch": 5.087253141831239, + "grad_norm": 0.9643770456314087, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 70840 + }, + { + "epoch": 5.087971274685817, + "grad_norm": 0.8650234341621399, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 70850 + }, + { + "epoch": 5.088689407540395, + "grad_norm": 0.9354105591773987, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 70860 + }, + { + "epoch": 5.089407540394973, + "grad_norm": 0.8736345171928406, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 70870 + }, + { + "epoch": 5.090125673249551, + "grad_norm": 0.9172632098197937, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 70880 + }, + { + "epoch": 5.09084380610413, + "grad_norm": 0.9495565295219421, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 70890 + }, + { + "epoch": 5.091561938958708, + "grad_norm": 1.0328829288482666, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 70900 + }, + { + "epoch": 5.092280071813286, + "grad_norm": 0.9335703253746033, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 70910 + }, + { + "epoch": 5.092998204667864, + "grad_norm": 1.0919437408447266, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 70920 + }, + { + "epoch": 5.093716337522442, + "grad_norm": 1.03340744972229, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 70930 + }, + { + "epoch": 5.09443447037702, + "grad_norm": 1.0501604080200195, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 70940 + }, + { + "epoch": 5.095152603231598, + "grad_norm": 0.9442012310028076, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 70950 + }, + { + "epoch": 5.095870736086176, + "grad_norm": 1.2592464685440063, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 70960 + }, + { + "epoch": 5.096588868940754, + "grad_norm": 1.0961427688598633, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 70970 + }, + { + "epoch": 5.097307001795333, + "grad_norm": 1.0472424030303955, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 70980 + }, + { + "epoch": 5.098025134649911, + "grad_norm": 0.9489352107048035, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 70990 + }, + { + "epoch": 5.098743267504489, + "grad_norm": 1.0499446392059326, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 71000 + }, + { + "epoch": 5.099461400359067, + "grad_norm": 1.013005018234253, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 71010 + }, + { + "epoch": 5.100179533213645, + "grad_norm": 0.9594261050224304, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 71020 + }, + { + "epoch": 5.100897666068223, + "grad_norm": 1.2016123533248901, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 71030 + }, + { + "epoch": 5.101615798922801, + "grad_norm": 1.0389765501022339, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 71040 + }, + { + "epoch": 5.102333931777379, + "grad_norm": 1.053534746170044, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 71050 + }, + { + "epoch": 5.103052064631957, + "grad_norm": 1.1379448175430298, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 71060 + }, + { + "epoch": 5.103770197486535, + "grad_norm": 0.8796491622924805, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 71070 + }, + { + "epoch": 5.1044883303411135, + "grad_norm": 1.0591254234313965, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 71080 + }, + { + "epoch": 5.1052064631956915, + "grad_norm": 0.9622171521186829, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71090 + }, + { + "epoch": 5.1059245960502695, + "grad_norm": 0.9173060059547424, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 71100 + }, + { + "epoch": 5.1066427289048475, + "grad_norm": 0.8363444805145264, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 71110 + }, + { + "epoch": 5.1073608617594255, + "grad_norm": 1.1006172895431519, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 71120 + }, + { + "epoch": 5.1080789946140035, + "grad_norm": 1.0720574855804443, + "learning_rate": 0.0002, + "loss": 0.5753, + "step": 71130 + }, + { + "epoch": 5.1087971274685815, + "grad_norm": 1.0560680627822876, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 71140 + }, + { + "epoch": 5.1095152603231595, + "grad_norm": 0.8485415577888489, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 71150 + }, + { + "epoch": 5.1102333931777375, + "grad_norm": 1.109383225440979, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 71160 + }, + { + "epoch": 5.110951526032316, + "grad_norm": 0.9296035766601562, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 71170 + }, + { + "epoch": 5.111669658886894, + "grad_norm": 1.2855182886123657, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 71180 + }, + { + "epoch": 5.112387791741472, + "grad_norm": 1.0313524007797241, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 71190 + }, + { + "epoch": 5.11310592459605, + "grad_norm": 1.0436697006225586, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 71200 + }, + { + "epoch": 5.113824057450628, + "grad_norm": 0.901333212852478, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 71210 + }, + { + "epoch": 5.114542190305206, + "grad_norm": 1.2170051336288452, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 71220 + }, + { + "epoch": 5.115260323159784, + "grad_norm": 0.8850961327552795, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71230 + }, + { + "epoch": 5.115978456014362, + "grad_norm": 1.0147113800048828, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 71240 + }, + { + "epoch": 5.11669658886894, + "grad_norm": 1.0043506622314453, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 71250 + }, + { + "epoch": 5.117414721723518, + "grad_norm": 0.9887113571166992, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 71260 + }, + { + "epoch": 5.118132854578097, + "grad_norm": 1.1013392210006714, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 71270 + }, + { + "epoch": 5.118850987432675, + "grad_norm": 0.9213799238204956, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71280 + }, + { + "epoch": 5.119569120287253, + "grad_norm": 1.047400712966919, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 71290 + }, + { + "epoch": 5.120287253141831, + "grad_norm": 1.030534029006958, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71300 + }, + { + "epoch": 5.121005385996409, + "grad_norm": 0.9464976191520691, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 71310 + }, + { + "epoch": 5.121723518850987, + "grad_norm": 0.8610315918922424, + "learning_rate": 0.0002, + "loss": 0.5707, + "step": 71320 + }, + { + "epoch": 5.122441651705565, + "grad_norm": 1.0824426412582397, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71330 + }, + { + "epoch": 5.123159784560143, + "grad_norm": 0.9382733106613159, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 71340 + }, + { + "epoch": 5.123877917414721, + "grad_norm": 0.9364684224128723, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 71350 + }, + { + "epoch": 5.1245960502693, + "grad_norm": 0.9583013653755188, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 71360 + }, + { + "epoch": 5.125314183123878, + "grad_norm": 1.287533164024353, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 71370 + }, + { + "epoch": 5.126032315978456, + "grad_norm": 1.5031169652938843, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 71380 + }, + { + "epoch": 5.126750448833034, + "grad_norm": 0.9891406297683716, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 71390 + }, + { + "epoch": 5.127468581687612, + "grad_norm": 1.1851537227630615, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 71400 + }, + { + "epoch": 5.12818671454219, + "grad_norm": 0.9869971871376038, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 71410 + }, + { + "epoch": 5.128904847396768, + "grad_norm": 0.961662769317627, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 71420 + }, + { + "epoch": 5.129622980251346, + "grad_norm": 1.1036419868469238, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 71430 + }, + { + "epoch": 5.130341113105924, + "grad_norm": 1.175361156463623, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 71440 + }, + { + "epoch": 5.131059245960503, + "grad_norm": 0.9801875948905945, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 71450 + }, + { + "epoch": 5.131777378815081, + "grad_norm": 0.9424611330032349, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 71460 + }, + { + "epoch": 5.132495511669659, + "grad_norm": 1.11662757396698, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 71470 + }, + { + "epoch": 5.133213644524237, + "grad_norm": 0.9969366192817688, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 71480 + }, + { + "epoch": 5.133931777378815, + "grad_norm": 1.278640866279602, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 71490 + }, + { + "epoch": 5.134649910233393, + "grad_norm": 1.1090457439422607, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 71500 + }, + { + "epoch": 5.135368043087971, + "grad_norm": 1.01808500289917, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 71510 + }, + { + "epoch": 5.136086175942549, + "grad_norm": 1.029135823249817, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 71520 + }, + { + "epoch": 5.136804308797127, + "grad_norm": 1.1207175254821777, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 71530 + }, + { + "epoch": 5.137522441651706, + "grad_norm": 1.0327218770980835, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 71540 + }, + { + "epoch": 5.138240574506284, + "grad_norm": 1.042490839958191, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 71550 + }, + { + "epoch": 5.138958707360862, + "grad_norm": 1.1800413131713867, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 71560 + }, + { + "epoch": 5.13967684021544, + "grad_norm": 1.0748766660690308, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 71570 + }, + { + "epoch": 5.140394973070018, + "grad_norm": 0.9983090758323669, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 71580 + }, + { + "epoch": 5.141113105924596, + "grad_norm": 1.30636727809906, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 71590 + }, + { + "epoch": 5.141831238779174, + "grad_norm": 0.9960222840309143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 71600 + }, + { + "epoch": 5.142549371633752, + "grad_norm": 1.237027645111084, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 71610 + }, + { + "epoch": 5.14326750448833, + "grad_norm": 1.0913307666778564, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 71620 + }, + { + "epoch": 5.143985637342908, + "grad_norm": 0.940657913684845, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 71630 + }, + { + "epoch": 5.144703770197487, + "grad_norm": 1.093796730041504, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 71640 + }, + { + "epoch": 5.145421903052065, + "grad_norm": 0.9703856110572815, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 71650 + }, + { + "epoch": 5.146140035906643, + "grad_norm": 0.9874776005744934, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71660 + }, + { + "epoch": 5.146858168761221, + "grad_norm": 0.9723859429359436, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 71670 + }, + { + "epoch": 5.147576301615799, + "grad_norm": 0.997107207775116, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 71680 + }, + { + "epoch": 5.148294434470377, + "grad_norm": 1.0261175632476807, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 71690 + }, + { + "epoch": 5.149012567324955, + "grad_norm": 0.9093905687332153, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 71700 + }, + { + "epoch": 5.149730700179533, + "grad_norm": 0.9909888505935669, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 71710 + }, + { + "epoch": 5.150448833034111, + "grad_norm": 0.9111971259117126, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 71720 + }, + { + "epoch": 5.15116696588869, + "grad_norm": 0.9319643974304199, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 71730 + }, + { + "epoch": 5.151885098743268, + "grad_norm": 1.0744104385375977, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 71740 + }, + { + "epoch": 5.152603231597846, + "grad_norm": 1.1555477380752563, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 71750 + }, + { + "epoch": 5.153321364452424, + "grad_norm": 0.9809171557426453, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 71760 + }, + { + "epoch": 5.154039497307002, + "grad_norm": 0.7937686443328857, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71770 + }, + { + "epoch": 5.15475763016158, + "grad_norm": 1.1925430297851562, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 71780 + }, + { + "epoch": 5.155475763016158, + "grad_norm": 1.077412486076355, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 71790 + }, + { + "epoch": 5.156193895870736, + "grad_norm": 0.7992808222770691, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 71800 + }, + { + "epoch": 5.156912028725314, + "grad_norm": 1.0938535928726196, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 71810 + }, + { + "epoch": 5.157630161579892, + "grad_norm": 0.9458112120628357, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 71820 + }, + { + "epoch": 5.158348294434471, + "grad_norm": 0.984940230846405, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 71830 + }, + { + "epoch": 5.159066427289049, + "grad_norm": 0.9242565035820007, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 71840 + }, + { + "epoch": 5.159784560143627, + "grad_norm": 0.8386720418930054, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 71850 + }, + { + "epoch": 5.160502692998205, + "grad_norm": 0.9627357721328735, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 71860 + }, + { + "epoch": 5.161220825852783, + "grad_norm": 1.0118762254714966, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 71870 + }, + { + "epoch": 5.161938958707361, + "grad_norm": 1.1552608013153076, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 71880 + }, + { + "epoch": 5.162657091561939, + "grad_norm": 1.0910389423370361, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 71890 + }, + { + "epoch": 5.163375224416517, + "grad_norm": 1.046639084815979, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 71900 + }, + { + "epoch": 5.164093357271095, + "grad_norm": 1.0087649822235107, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 71910 + }, + { + "epoch": 5.164811490125674, + "grad_norm": 0.9418644309043884, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 71920 + }, + { + "epoch": 5.165529622980252, + "grad_norm": 1.1213915348052979, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 71930 + }, + { + "epoch": 5.16624775583483, + "grad_norm": 1.043786644935608, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 71940 + }, + { + "epoch": 5.166965888689408, + "grad_norm": 1.2150449752807617, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 71950 + }, + { + "epoch": 5.167684021543986, + "grad_norm": 1.1214520931243896, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 71960 + }, + { + "epoch": 5.168402154398564, + "grad_norm": 0.9235218167304993, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 71970 + }, + { + "epoch": 5.169120287253142, + "grad_norm": 0.8736480474472046, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 71980 + }, + { + "epoch": 5.16983842010772, + "grad_norm": 0.8723195195198059, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 71990 + }, + { + "epoch": 5.170556552962298, + "grad_norm": 1.0873022079467773, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 72000 + }, + { + "epoch": 5.1712746858168765, + "grad_norm": 0.9196295142173767, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72010 + }, + { + "epoch": 5.1719928186714546, + "grad_norm": 0.9244471192359924, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 72020 + }, + { + "epoch": 5.1727109515260326, + "grad_norm": 1.0555505752563477, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 72030 + }, + { + "epoch": 5.1734290843806106, + "grad_norm": 1.1527929306030273, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 72040 + }, + { + "epoch": 5.174147217235189, + "grad_norm": 0.9069058895111084, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 72050 + }, + { + "epoch": 5.174865350089767, + "grad_norm": 1.1047141551971436, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 72060 + }, + { + "epoch": 5.175583482944345, + "grad_norm": 0.9805511832237244, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 72070 + }, + { + "epoch": 5.176301615798923, + "grad_norm": 1.1636970043182373, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 72080 + }, + { + "epoch": 5.177019748653501, + "grad_norm": 1.0193538665771484, + "learning_rate": 0.0002, + "loss": 0.6424, + "step": 72090 + }, + { + "epoch": 5.177737881508079, + "grad_norm": 0.8850618600845337, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 72100 + }, + { + "epoch": 5.1784560143626575, + "grad_norm": 1.042271614074707, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 72110 + }, + { + "epoch": 5.1791741472172355, + "grad_norm": 1.1405227184295654, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72120 + }, + { + "epoch": 5.1798922800718135, + "grad_norm": 1.0013195276260376, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 72130 + }, + { + "epoch": 5.1806104129263915, + "grad_norm": 1.0474903583526611, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 72140 + }, + { + "epoch": 5.1813285457809695, + "grad_norm": 1.0384612083435059, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 72150 + }, + { + "epoch": 5.1820466786355475, + "grad_norm": 1.145086646080017, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 72160 + }, + { + "epoch": 5.1827648114901255, + "grad_norm": 1.0845173597335815, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 72170 + }, + { + "epoch": 5.1834829443447035, + "grad_norm": 0.9870346188545227, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72180 + }, + { + "epoch": 5.1842010771992815, + "grad_norm": 1.1098768711090088, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 72190 + }, + { + "epoch": 5.18491921005386, + "grad_norm": 0.9397785067558289, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 72200 + }, + { + "epoch": 5.185637342908438, + "grad_norm": 1.0817532539367676, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 72210 + }, + { + "epoch": 5.186355475763016, + "grad_norm": 1.0027309656143188, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 72220 + }, + { + "epoch": 5.187073608617594, + "grad_norm": 0.8262016773223877, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 72230 + }, + { + "epoch": 5.187791741472172, + "grad_norm": 0.9968137741088867, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 72240 + }, + { + "epoch": 5.18850987432675, + "grad_norm": 0.9072695970535278, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 72250 + }, + { + "epoch": 5.189228007181328, + "grad_norm": 1.0388357639312744, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 72260 + }, + { + "epoch": 5.189946140035906, + "grad_norm": 0.8883537650108337, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72270 + }, + { + "epoch": 5.190664272890484, + "grad_norm": 1.0161921977996826, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 72280 + }, + { + "epoch": 5.191382405745063, + "grad_norm": 0.964936375617981, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 72290 + }, + { + "epoch": 5.192100538599641, + "grad_norm": 0.9728496670722961, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 72300 + }, + { + "epoch": 5.192818671454219, + "grad_norm": 1.2411649227142334, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 72310 + }, + { + "epoch": 5.193536804308797, + "grad_norm": 0.9430946111679077, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 72320 + }, + { + "epoch": 5.194254937163375, + "grad_norm": 1.1522886753082275, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 72330 + }, + { + "epoch": 5.194973070017953, + "grad_norm": 1.0727189779281616, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 72340 + }, + { + "epoch": 5.195691202872531, + "grad_norm": 1.2506077289581299, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 72350 + }, + { + "epoch": 5.196409335727109, + "grad_norm": 1.0949938297271729, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 72360 + }, + { + "epoch": 5.197127468581687, + "grad_norm": 1.191125750541687, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 72370 + }, + { + "epoch": 5.197845601436265, + "grad_norm": 1.1154223680496216, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 72380 + }, + { + "epoch": 5.198563734290844, + "grad_norm": 0.9623886942863464, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 72390 + }, + { + "epoch": 5.199281867145422, + "grad_norm": 0.9432680010795593, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 72400 + }, + { + "epoch": 5.2, + "grad_norm": 1.035905122756958, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 72410 + }, + { + "epoch": 5.200718132854578, + "grad_norm": 0.9044913053512573, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 72420 + }, + { + "epoch": 5.201436265709156, + "grad_norm": 1.082187533378601, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 72430 + }, + { + "epoch": 5.202154398563734, + "grad_norm": 0.9368400573730469, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 72440 + }, + { + "epoch": 5.202872531418312, + "grad_norm": 1.1515194177627563, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 72450 + }, + { + "epoch": 5.20359066427289, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 72460 + }, + { + "epoch": 5.204308797127468, + "grad_norm": 1.0885688066482544, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 72470 + }, + { + "epoch": 5.205026929982047, + "grad_norm": 0.8189428448677063, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 72480 + }, + { + "epoch": 5.205745062836625, + "grad_norm": 1.0145429372787476, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 72490 + }, + { + "epoch": 5.206463195691203, + "grad_norm": 1.132490634918213, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 72500 + }, + { + "epoch": 5.207181328545781, + "grad_norm": 0.8866808414459229, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 72510 + }, + { + "epoch": 5.207899461400359, + "grad_norm": 0.9681518077850342, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 72520 + }, + { + "epoch": 5.208617594254937, + "grad_norm": 0.9992330074310303, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 72530 + }, + { + "epoch": 5.209335727109515, + "grad_norm": 1.0767436027526855, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 72540 + }, + { + "epoch": 5.210053859964093, + "grad_norm": 1.1362388134002686, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 72550 + }, + { + "epoch": 5.210771992818671, + "grad_norm": 0.9741758704185486, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 72560 + }, + { + "epoch": 5.211490125673249, + "grad_norm": 0.8216298818588257, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 72570 + }, + { + "epoch": 5.212208258527828, + "grad_norm": 0.7500724792480469, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 72580 + }, + { + "epoch": 5.212926391382406, + "grad_norm": 0.9152594804763794, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 72590 + }, + { + "epoch": 5.213644524236984, + "grad_norm": 1.014940857887268, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 72600 + }, + { + "epoch": 5.214362657091562, + "grad_norm": 0.9333099722862244, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 72610 + }, + { + "epoch": 5.21508078994614, + "grad_norm": 0.7940610647201538, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 72620 + }, + { + "epoch": 5.215798922800718, + "grad_norm": 1.0365521907806396, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 72630 + }, + { + "epoch": 5.216517055655296, + "grad_norm": 1.37727952003479, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 72640 + }, + { + "epoch": 5.217235188509874, + "grad_norm": 1.2019168138504028, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 72650 + }, + { + "epoch": 5.217953321364452, + "grad_norm": 1.1696226596832275, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 72660 + }, + { + "epoch": 5.218671454219031, + "grad_norm": 0.9608798623085022, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 72670 + }, + { + "epoch": 5.219389587073609, + "grad_norm": 0.9139777421951294, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 72680 + }, + { + "epoch": 5.220107719928187, + "grad_norm": 0.9937016367912292, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 72690 + }, + { + "epoch": 5.220825852782765, + "grad_norm": 1.2787059545516968, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 72700 + }, + { + "epoch": 5.221543985637343, + "grad_norm": 1.0757197141647339, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 72710 + }, + { + "epoch": 5.222262118491921, + "grad_norm": 0.8053579926490784, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 72720 + }, + { + "epoch": 5.222980251346499, + "grad_norm": 1.0239759683609009, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 72730 + }, + { + "epoch": 5.223698384201077, + "grad_norm": 0.9972975850105286, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 72740 + }, + { + "epoch": 5.224416517055655, + "grad_norm": 1.0504519939422607, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 72750 + }, + { + "epoch": 5.225134649910234, + "grad_norm": 1.1793010234832764, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 72760 + }, + { + "epoch": 5.225852782764812, + "grad_norm": 1.1098815202713013, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 72770 + }, + { + "epoch": 5.22657091561939, + "grad_norm": 1.1078516244888306, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 72780 + }, + { + "epoch": 5.227289048473968, + "grad_norm": 0.8684433698654175, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 72790 + }, + { + "epoch": 5.228007181328546, + "grad_norm": 1.159390926361084, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 72800 + }, + { + "epoch": 5.228725314183124, + "grad_norm": 1.0468506813049316, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 72810 + }, + { + "epoch": 5.229443447037702, + "grad_norm": 0.8684625029563904, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 72820 + }, + { + "epoch": 5.23016157989228, + "grad_norm": 1.0117321014404297, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 72830 + }, + { + "epoch": 5.230879712746858, + "grad_norm": 1.0513219833374023, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 72840 + }, + { + "epoch": 5.231597845601437, + "grad_norm": 1.0659555196762085, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 72850 + }, + { + "epoch": 5.232315978456015, + "grad_norm": 0.7726831436157227, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 72860 + }, + { + "epoch": 5.233034111310593, + "grad_norm": 1.0346935987472534, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 72870 + }, + { + "epoch": 5.233752244165171, + "grad_norm": 0.9112410545349121, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 72880 + }, + { + "epoch": 5.234470377019749, + "grad_norm": 1.2933332920074463, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 72890 + }, + { + "epoch": 5.235188509874327, + "grad_norm": 0.9740806221961975, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 72900 + }, + { + "epoch": 5.235906642728905, + "grad_norm": 0.8041712641716003, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 72910 + }, + { + "epoch": 5.236624775583483, + "grad_norm": 0.9510180950164795, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 72920 + }, + { + "epoch": 5.237342908438061, + "grad_norm": 0.9103419780731201, + "learning_rate": 0.0002, + "loss": 0.6312, + "step": 72930 + }, + { + "epoch": 5.238061041292639, + "grad_norm": 0.8317763805389404, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 72940 + }, + { + "epoch": 5.238779174147218, + "grad_norm": 1.0269867181777954, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 72950 + }, + { + "epoch": 5.239497307001796, + "grad_norm": 1.0599713325500488, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 72960 + }, + { + "epoch": 5.240215439856374, + "grad_norm": 0.9341228008270264, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 72970 + }, + { + "epoch": 5.240933572710952, + "grad_norm": 1.1216323375701904, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 72980 + }, + { + "epoch": 5.24165170556553, + "grad_norm": 0.9396152496337891, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 72990 + }, + { + "epoch": 5.242369838420108, + "grad_norm": 1.1474549770355225, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 73000 + }, + { + "epoch": 5.243087971274686, + "grad_norm": 1.2160102128982544, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 73010 + }, + { + "epoch": 5.243806104129264, + "grad_norm": 1.0755409002304077, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 73020 + }, + { + "epoch": 5.244524236983842, + "grad_norm": 1.0645225048065186, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73030 + }, + { + "epoch": 5.2452423698384205, + "grad_norm": 1.1155469417572021, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 73040 + }, + { + "epoch": 5.2459605026929985, + "grad_norm": 1.1631708145141602, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 73050 + }, + { + "epoch": 5.2466786355475765, + "grad_norm": 0.8747480511665344, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 73060 + }, + { + "epoch": 5.2473967684021545, + "grad_norm": 0.9174497723579407, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 73070 + }, + { + "epoch": 5.2481149012567325, + "grad_norm": 1.334018349647522, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 73080 + }, + { + "epoch": 5.2488330341113105, + "grad_norm": 1.0842393636703491, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73090 + }, + { + "epoch": 5.2495511669658885, + "grad_norm": 1.0531692504882812, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 73100 + }, + { + "epoch": 5.2502692998204665, + "grad_norm": 0.9069980978965759, + "learning_rate": 0.0002, + "loss": 0.5912, + "step": 73110 + }, + { + "epoch": 5.2509874326750445, + "grad_norm": 1.1319832801818848, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 73120 + }, + { + "epoch": 5.2517055655296225, + "grad_norm": 1.0468456745147705, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 73130 + }, + { + "epoch": 5.252423698384201, + "grad_norm": 1.1752768754959106, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 73140 + }, + { + "epoch": 5.253141831238779, + "grad_norm": 1.0697909593582153, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 73150 + }, + { + "epoch": 5.253859964093357, + "grad_norm": 1.1179429292678833, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 73160 + }, + { + "epoch": 5.254578096947935, + "grad_norm": 0.9088113903999329, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 73170 + }, + { + "epoch": 5.255296229802513, + "grad_norm": 0.8814208507537842, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 73180 + }, + { + "epoch": 5.256014362657091, + "grad_norm": 1.026688814163208, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 73190 + }, + { + "epoch": 5.256732495511669, + "grad_norm": 0.9974902868270874, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 73200 + }, + { + "epoch": 5.257450628366247, + "grad_norm": 0.948743999004364, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 73210 + }, + { + "epoch": 5.258168761220825, + "grad_norm": 0.9069591164588928, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 73220 + }, + { + "epoch": 5.258886894075404, + "grad_norm": 1.0574030876159668, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 73230 + }, + { + "epoch": 5.259605026929982, + "grad_norm": 0.9299649596214294, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 73240 + }, + { + "epoch": 5.26032315978456, + "grad_norm": 0.9888820648193359, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 73250 + }, + { + "epoch": 5.261041292639138, + "grad_norm": 1.0164920091629028, + "learning_rate": 0.0002, + "loss": 0.5993, + "step": 73260 + }, + { + "epoch": 5.261759425493716, + "grad_norm": 0.933210551738739, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 73270 + }, + { + "epoch": 5.262477558348294, + "grad_norm": 1.1754034757614136, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 73280 + }, + { + "epoch": 5.263195691202872, + "grad_norm": 1.1599570512771606, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 73290 + }, + { + "epoch": 5.26391382405745, + "grad_norm": 1.0497905015945435, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 73300 + }, + { + "epoch": 5.264631956912028, + "grad_norm": 1.3603366613388062, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 73310 + }, + { + "epoch": 5.265350089766607, + "grad_norm": 1.0283215045928955, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 73320 + }, + { + "epoch": 5.266068222621185, + "grad_norm": 1.1043906211853027, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 73330 + }, + { + "epoch": 5.266786355475763, + "grad_norm": 0.9386111497879028, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 73340 + }, + { + "epoch": 5.267504488330341, + "grad_norm": 1.3586112260818481, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73350 + }, + { + "epoch": 5.268222621184919, + "grad_norm": 1.034179449081421, + "learning_rate": 0.0002, + "loss": 0.6213, + "step": 73360 + }, + { + "epoch": 5.268940754039497, + "grad_norm": 0.9645284414291382, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 73370 + }, + { + "epoch": 5.269658886894075, + "grad_norm": 1.1078046560287476, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 73380 + }, + { + "epoch": 5.270377019748653, + "grad_norm": 0.9737151265144348, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 73390 + }, + { + "epoch": 5.271095152603231, + "grad_norm": 1.1911388635635376, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 73400 + }, + { + "epoch": 5.27181328545781, + "grad_norm": 0.9089180827140808, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73410 + }, + { + "epoch": 5.272531418312388, + "grad_norm": 1.094515085220337, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 73420 + }, + { + "epoch": 5.273249551166966, + "grad_norm": 1.2531700134277344, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 73430 + }, + { + "epoch": 5.273967684021544, + "grad_norm": 0.9279667139053345, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 73440 + }, + { + "epoch": 5.274685816876122, + "grad_norm": 0.9872317314147949, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 73450 + }, + { + "epoch": 5.2754039497307, + "grad_norm": 1.0645262002944946, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 73460 + }, + { + "epoch": 5.276122082585278, + "grad_norm": 0.9505489468574524, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 73470 + }, + { + "epoch": 5.276840215439856, + "grad_norm": 1.0444035530090332, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 73480 + }, + { + "epoch": 5.277558348294434, + "grad_norm": 1.1813455820083618, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 73490 + }, + { + "epoch": 5.278276481149012, + "grad_norm": 0.782117486000061, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 73500 + }, + { + "epoch": 5.278994614003591, + "grad_norm": 0.8837172389030457, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 73510 + }, + { + "epoch": 5.279712746858169, + "grad_norm": 0.8320443630218506, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 73520 + }, + { + "epoch": 5.280430879712747, + "grad_norm": 1.111466407775879, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 73530 + }, + { + "epoch": 5.281149012567325, + "grad_norm": 1.0448017120361328, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 73540 + }, + { + "epoch": 5.281867145421903, + "grad_norm": 1.2046639919281006, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 73550 + }, + { + "epoch": 5.282585278276481, + "grad_norm": 1.084886074066162, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73560 + }, + { + "epoch": 5.283303411131059, + "grad_norm": 0.8321937918663025, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 73570 + }, + { + "epoch": 5.284021543985637, + "grad_norm": 1.172440767288208, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 73580 + }, + { + "epoch": 5.284739676840215, + "grad_norm": 0.937133252620697, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 73590 + }, + { + "epoch": 5.285457809694794, + "grad_norm": 1.0996583700180054, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 73600 + }, + { + "epoch": 5.286175942549372, + "grad_norm": 1.2459958791732788, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 73610 + }, + { + "epoch": 5.28689407540395, + "grad_norm": 0.8362332582473755, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73620 + }, + { + "epoch": 5.287612208258528, + "grad_norm": 0.9784061312675476, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 73630 + }, + { + "epoch": 5.288330341113106, + "grad_norm": 1.087041974067688, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 73640 + }, + { + "epoch": 5.289048473967684, + "grad_norm": 0.8641281723976135, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 73650 + }, + { + "epoch": 5.289766606822262, + "grad_norm": 1.030386209487915, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 73660 + }, + { + "epoch": 5.29048473967684, + "grad_norm": 1.0551509857177734, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 73670 + }, + { + "epoch": 5.291202872531418, + "grad_norm": 0.9969013333320618, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 73680 + }, + { + "epoch": 5.291921005385996, + "grad_norm": 0.9566490054130554, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 73690 + }, + { + "epoch": 5.292639138240575, + "grad_norm": 1.1376742124557495, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 73700 + }, + { + "epoch": 5.293357271095153, + "grad_norm": 1.0127843618392944, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 73710 + }, + { + "epoch": 5.294075403949731, + "grad_norm": 0.9500759243965149, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 73720 + }, + { + "epoch": 5.294793536804309, + "grad_norm": 0.9597342610359192, + "learning_rate": 0.0002, + "loss": 0.6251, + "step": 73730 + }, + { + "epoch": 5.295511669658887, + "grad_norm": 1.0982595682144165, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 73740 + }, + { + "epoch": 5.296229802513465, + "grad_norm": 0.9007689952850342, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 73750 + }, + { + "epoch": 5.296947935368043, + "grad_norm": 0.9329614639282227, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 73760 + }, + { + "epoch": 5.297666068222621, + "grad_norm": 1.235142469406128, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 73770 + }, + { + "epoch": 5.298384201077199, + "grad_norm": 1.0875943899154663, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73780 + }, + { + "epoch": 5.299102333931778, + "grad_norm": 1.0499054193496704, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 73790 + }, + { + "epoch": 5.299820466786356, + "grad_norm": 1.117954969406128, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 73800 + }, + { + "epoch": 5.300538599640934, + "grad_norm": 0.800291121006012, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 73810 + }, + { + "epoch": 5.301256732495512, + "grad_norm": 1.1461842060089111, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 73820 + }, + { + "epoch": 5.30197486535009, + "grad_norm": 1.0084760189056396, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 73830 + }, + { + "epoch": 5.302692998204668, + "grad_norm": 1.1249386072158813, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 73840 + }, + { + "epoch": 5.303411131059246, + "grad_norm": 1.0846004486083984, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 73850 + }, + { + "epoch": 5.304129263913824, + "grad_norm": 1.1557925939559937, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 73860 + }, + { + "epoch": 5.304847396768402, + "grad_norm": 1.2287988662719727, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 73870 + }, + { + "epoch": 5.30556552962298, + "grad_norm": 0.9618542194366455, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 73880 + }, + { + "epoch": 5.306283662477559, + "grad_norm": 0.9429472088813782, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 73890 + }, + { + "epoch": 5.307001795332137, + "grad_norm": 0.9032631516456604, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 73900 + }, + { + "epoch": 5.307719928186715, + "grad_norm": 1.0008580684661865, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 73910 + }, + { + "epoch": 5.308438061041293, + "grad_norm": 0.9795624017715454, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 73920 + }, + { + "epoch": 5.309156193895871, + "grad_norm": 1.1194090843200684, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 73930 + }, + { + "epoch": 5.309874326750449, + "grad_norm": 1.1057528257369995, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 73940 + }, + { + "epoch": 5.310592459605027, + "grad_norm": 0.7807615995407104, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 73950 + }, + { + "epoch": 5.311310592459605, + "grad_norm": 0.9465593099594116, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 73960 + }, + { + "epoch": 5.312028725314184, + "grad_norm": 1.104210615158081, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 73970 + }, + { + "epoch": 5.312746858168762, + "grad_norm": 1.0452964305877686, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 73980 + }, + { + "epoch": 5.31346499102334, + "grad_norm": 1.0314992666244507, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 73990 + }, + { + "epoch": 5.314183123877918, + "grad_norm": 0.9187130928039551, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 74000 + }, + { + "epoch": 5.314901256732496, + "grad_norm": 0.8660678267478943, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 74010 + }, + { + "epoch": 5.315619389587074, + "grad_norm": 0.9470953345298767, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 74020 + }, + { + "epoch": 5.316337522441652, + "grad_norm": 1.0028631687164307, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 74030 + }, + { + "epoch": 5.31705565529623, + "grad_norm": 1.0237356424331665, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 74040 + }, + { + "epoch": 5.317773788150808, + "grad_norm": 1.0299798250198364, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 74050 + }, + { + "epoch": 5.318491921005386, + "grad_norm": 1.0326799154281616, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 74060 + }, + { + "epoch": 5.3192100538599645, + "grad_norm": 1.156346082687378, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 74070 + }, + { + "epoch": 5.3199281867145425, + "grad_norm": 1.1542664766311646, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 74080 + }, + { + "epoch": 5.3206463195691205, + "grad_norm": 1.0503013134002686, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 74090 + }, + { + "epoch": 5.3213644524236985, + "grad_norm": 1.1088979244232178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 74100 + }, + { + "epoch": 5.3220825852782765, + "grad_norm": 0.9314014911651611, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 74110 + }, + { + "epoch": 5.3228007181328545, + "grad_norm": 1.0813525915145874, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 74120 + }, + { + "epoch": 5.3235188509874325, + "grad_norm": 0.7824062705039978, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 74130 + }, + { + "epoch": 5.3242369838420105, + "grad_norm": 1.0552699565887451, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 74140 + }, + { + "epoch": 5.3249551166965885, + "grad_norm": 1.0916554927825928, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74150 + }, + { + "epoch": 5.325673249551167, + "grad_norm": 1.205618143081665, + "learning_rate": 0.0002, + "loss": 0.6128, + "step": 74160 + }, + { + "epoch": 5.326391382405745, + "grad_norm": 1.2551230192184448, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 74170 + }, + { + "epoch": 5.327109515260323, + "grad_norm": 0.7715005278587341, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 74180 + }, + { + "epoch": 5.327827648114901, + "grad_norm": 1.1059352159500122, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 74190 + }, + { + "epoch": 5.328545780969479, + "grad_norm": 0.9441812634468079, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 74200 + }, + { + "epoch": 5.329263913824057, + "grad_norm": 1.0012084245681763, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 74210 + }, + { + "epoch": 5.329982046678635, + "grad_norm": 0.8594073057174683, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 74220 + }, + { + "epoch": 5.330700179533213, + "grad_norm": 0.8931775093078613, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 74230 + }, + { + "epoch": 5.331418312387791, + "grad_norm": 0.967250406742096, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 74240 + }, + { + "epoch": 5.332136445242369, + "grad_norm": 0.9776269793510437, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74250 + }, + { + "epoch": 5.332854578096948, + "grad_norm": 0.9393186569213867, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 74260 + }, + { + "epoch": 5.333572710951526, + "grad_norm": 1.0081093311309814, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 74270 + }, + { + "epoch": 5.334290843806104, + "grad_norm": 0.9002147316932678, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 74280 + }, + { + "epoch": 5.335008976660682, + "grad_norm": 0.9237701296806335, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 74290 + }, + { + "epoch": 5.33572710951526, + "grad_norm": 1.070694923400879, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 74300 + }, + { + "epoch": 5.336445242369838, + "grad_norm": 1.0134668350219727, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 74310 + }, + { + "epoch": 5.337163375224416, + "grad_norm": 1.0903294086456299, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 74320 + }, + { + "epoch": 5.337881508078994, + "grad_norm": 0.9000239372253418, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 74330 + }, + { + "epoch": 5.338599640933572, + "grad_norm": 1.0584321022033691, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 74340 + }, + { + "epoch": 5.339317773788151, + "grad_norm": 1.046420931816101, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 74350 + }, + { + "epoch": 5.340035906642729, + "grad_norm": 0.8862320184707642, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 74360 + }, + { + "epoch": 5.340754039497307, + "grad_norm": 0.8197309970855713, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 74370 + }, + { + "epoch": 5.341472172351885, + "grad_norm": 0.9539661407470703, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 74380 + }, + { + "epoch": 5.342190305206463, + "grad_norm": 1.481026530265808, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 74390 + }, + { + "epoch": 5.342908438061041, + "grad_norm": 1.0685169696807861, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 74400 + }, + { + "epoch": 5.343626570915619, + "grad_norm": 1.1468359231948853, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 74410 + }, + { + "epoch": 5.344344703770197, + "grad_norm": 0.9982373714447021, + "learning_rate": 0.0002, + "loss": 0.556, + "step": 74420 + }, + { + "epoch": 5.345062836624775, + "grad_norm": 0.9273471236228943, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 74430 + }, + { + "epoch": 5.345780969479353, + "grad_norm": 1.058828592300415, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 74440 + }, + { + "epoch": 5.346499102333932, + "grad_norm": 1.0442006587982178, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 74450 + }, + { + "epoch": 5.34721723518851, + "grad_norm": 1.0955053567886353, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 74460 + }, + { + "epoch": 5.347935368043088, + "grad_norm": 0.9326002597808838, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74470 + }, + { + "epoch": 5.348653500897666, + "grad_norm": 0.9496979117393494, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 74480 + }, + { + "epoch": 5.349371633752244, + "grad_norm": 1.1995937824249268, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 74490 + }, + { + "epoch": 5.350089766606822, + "grad_norm": 0.8761899471282959, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 74500 + }, + { + "epoch": 5.3508078994614, + "grad_norm": 1.2390170097351074, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 74510 + }, + { + "epoch": 5.351526032315978, + "grad_norm": 0.9101138114929199, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 74520 + }, + { + "epoch": 5.352244165170557, + "grad_norm": 0.925466001033783, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 74530 + }, + { + "epoch": 5.352962298025135, + "grad_norm": 0.9483969807624817, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 74540 + }, + { + "epoch": 5.353680430879713, + "grad_norm": 1.0530859231948853, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 74550 + }, + { + "epoch": 5.354398563734291, + "grad_norm": 1.209647536277771, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 74560 + }, + { + "epoch": 5.355116696588869, + "grad_norm": 0.9849331378936768, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 74570 + }, + { + "epoch": 5.355834829443447, + "grad_norm": 1.0822848081588745, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 74580 + }, + { + "epoch": 5.356552962298025, + "grad_norm": 1.1460528373718262, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 74590 + }, + { + "epoch": 5.357271095152603, + "grad_norm": 0.9509134292602539, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 74600 + }, + { + "epoch": 5.357989228007181, + "grad_norm": 0.9884999394416809, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 74610 + }, + { + "epoch": 5.358707360861759, + "grad_norm": 0.9619579911231995, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 74620 + }, + { + "epoch": 5.359425493716338, + "grad_norm": 0.8596125245094299, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 74630 + }, + { + "epoch": 5.360143626570916, + "grad_norm": 1.16913640499115, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 74640 + }, + { + "epoch": 5.360861759425494, + "grad_norm": 0.99276202917099, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 74650 + }, + { + "epoch": 5.361579892280072, + "grad_norm": 1.1293696165084839, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74660 + }, + { + "epoch": 5.36229802513465, + "grad_norm": 1.187947154045105, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 74670 + }, + { + "epoch": 5.363016157989228, + "grad_norm": 0.8637247681617737, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 74680 + }, + { + "epoch": 5.363734290843806, + "grad_norm": 1.1049476861953735, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 74690 + }, + { + "epoch": 5.364452423698384, + "grad_norm": 1.1736515760421753, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 74700 + }, + { + "epoch": 5.365170556552962, + "grad_norm": 1.0203301906585693, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 74710 + }, + { + "epoch": 5.365888689407541, + "grad_norm": 1.15559720993042, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 74720 + }, + { + "epoch": 5.366606822262119, + "grad_norm": 1.2008144855499268, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 74730 + }, + { + "epoch": 5.367324955116697, + "grad_norm": 1.0385756492614746, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 74740 + }, + { + "epoch": 5.368043087971275, + "grad_norm": 0.8964240550994873, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 74750 + }, + { + "epoch": 5.368761220825853, + "grad_norm": 0.9824761748313904, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 74760 + }, + { + "epoch": 5.369479353680431, + "grad_norm": 0.8815994262695312, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 74770 + }, + { + "epoch": 5.370197486535009, + "grad_norm": 0.9729493856430054, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 74780 + }, + { + "epoch": 5.370915619389587, + "grad_norm": 1.1032123565673828, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 74790 + }, + { + "epoch": 5.371633752244165, + "grad_norm": 1.039591908454895, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 74800 + }, + { + "epoch": 5.372351885098743, + "grad_norm": 0.9741610884666443, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 74810 + }, + { + "epoch": 5.373070017953322, + "grad_norm": 0.9789814949035645, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 74820 + }, + { + "epoch": 5.3737881508079, + "grad_norm": 1.0777033567428589, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 74830 + }, + { + "epoch": 5.374506283662478, + "grad_norm": 0.9058641195297241, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 74840 + }, + { + "epoch": 5.375224416517056, + "grad_norm": 1.2161815166473389, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 74850 + }, + { + "epoch": 5.375942549371634, + "grad_norm": 1.1079481840133667, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 74860 + }, + { + "epoch": 5.376660682226212, + "grad_norm": 0.9494470357894897, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 74870 + }, + { + "epoch": 5.37737881508079, + "grad_norm": 1.0116358995437622, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 74880 + }, + { + "epoch": 5.378096947935368, + "grad_norm": 0.9382423162460327, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 74890 + }, + { + "epoch": 5.378815080789946, + "grad_norm": 1.036151647567749, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74900 + }, + { + "epoch": 5.379533213644525, + "grad_norm": 0.9436623454093933, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 74910 + }, + { + "epoch": 5.380251346499103, + "grad_norm": 1.0149152278900146, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 74920 + }, + { + "epoch": 5.380969479353681, + "grad_norm": 1.1645641326904297, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 74930 + }, + { + "epoch": 5.381687612208259, + "grad_norm": 1.002287745475769, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 74940 + }, + { + "epoch": 5.382405745062837, + "grad_norm": 1.1176437139511108, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 74950 + }, + { + "epoch": 5.383123877917415, + "grad_norm": 0.9210802912712097, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 74960 + }, + { + "epoch": 5.383842010771993, + "grad_norm": 1.1873447895050049, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 74970 + }, + { + "epoch": 5.384560143626571, + "grad_norm": 0.8372976779937744, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 74980 + }, + { + "epoch": 5.385278276481149, + "grad_norm": 0.9220532178878784, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 74990 + }, + { + "epoch": 5.385996409335727, + "grad_norm": 0.9196901917457581, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 75000 + }, + { + "epoch": 5.3867145421903055, + "grad_norm": 0.9325235486030579, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 75010 + }, + { + "epoch": 5.3874326750448835, + "grad_norm": 1.0902531147003174, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 75020 + }, + { + "epoch": 5.3881508078994615, + "grad_norm": 1.049468755722046, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 75030 + }, + { + "epoch": 5.3888689407540395, + "grad_norm": 0.9372574687004089, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 75040 + }, + { + "epoch": 5.3895870736086176, + "grad_norm": 0.9013437628746033, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 75050 + }, + { + "epoch": 5.3903052064631956, + "grad_norm": 1.2111071348190308, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 75060 + }, + { + "epoch": 5.3910233393177736, + "grad_norm": 1.0006011724472046, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 75070 + }, + { + "epoch": 5.391741472172352, + "grad_norm": 0.9180546402931213, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 75080 + }, + { + "epoch": 5.3924596050269304, + "grad_norm": 1.096113920211792, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 75090 + }, + { + "epoch": 5.3931777378815084, + "grad_norm": 0.9041603207588196, + "learning_rate": 0.0002, + "loss": 0.5416, + "step": 75100 + }, + { + "epoch": 5.3938958707360865, + "grad_norm": 0.9675783514976501, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 75110 + }, + { + "epoch": 5.3946140035906645, + "grad_norm": 1.0952513217926025, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 75120 + }, + { + "epoch": 5.3953321364452425, + "grad_norm": 1.0166294574737549, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75130 + }, + { + "epoch": 5.3960502692998205, + "grad_norm": 1.0892874002456665, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 75140 + }, + { + "epoch": 5.3967684021543985, + "grad_norm": 0.9894046187400818, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75150 + }, + { + "epoch": 5.3974865350089765, + "grad_norm": 0.9991754293441772, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 75160 + }, + { + "epoch": 5.3982046678635545, + "grad_norm": 1.1027519702911377, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 75170 + }, + { + "epoch": 5.3989228007181325, + "grad_norm": 1.0579880475997925, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 75180 + }, + { + "epoch": 5.399640933572711, + "grad_norm": 1.1149101257324219, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 75190 + }, + { + "epoch": 5.400359066427289, + "grad_norm": 0.8802945017814636, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 75200 + }, + { + "epoch": 5.401077199281867, + "grad_norm": 0.9168137907981873, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 75210 + }, + { + "epoch": 5.401795332136445, + "grad_norm": 1.232630968093872, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 75220 + }, + { + "epoch": 5.402513464991023, + "grad_norm": 1.1038591861724854, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 75230 + }, + { + "epoch": 5.403231597845601, + "grad_norm": 0.8985993266105652, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 75240 + }, + { + "epoch": 5.403949730700179, + "grad_norm": 1.1096316576004028, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 75250 + }, + { + "epoch": 5.404667863554757, + "grad_norm": 0.8516051173210144, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 75260 + }, + { + "epoch": 5.405385996409335, + "grad_norm": 0.9967356324195862, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 75270 + }, + { + "epoch": 5.406104129263914, + "grad_norm": 1.0092874765396118, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 75280 + }, + { + "epoch": 5.406822262118492, + "grad_norm": 1.049838662147522, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 75290 + }, + { + "epoch": 5.40754039497307, + "grad_norm": 1.1491070985794067, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 75300 + }, + { + "epoch": 5.408258527827648, + "grad_norm": 0.9348118901252747, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 75310 + }, + { + "epoch": 5.408976660682226, + "grad_norm": 1.1226147413253784, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 75320 + }, + { + "epoch": 5.409694793536804, + "grad_norm": 0.9042587876319885, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 75330 + }, + { + "epoch": 5.410412926391382, + "grad_norm": 1.1212877035140991, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 75340 + }, + { + "epoch": 5.41113105924596, + "grad_norm": 0.9805570840835571, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 75350 + }, + { + "epoch": 5.411849192100538, + "grad_norm": 0.9803917407989502, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 75360 + }, + { + "epoch": 5.412567324955116, + "grad_norm": 1.2139064073562622, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75370 + }, + { + "epoch": 5.413285457809695, + "grad_norm": 0.9510865211486816, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 75380 + }, + { + "epoch": 5.414003590664273, + "grad_norm": 1.0752202272415161, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 75390 + }, + { + "epoch": 5.414721723518851, + "grad_norm": 1.1144053936004639, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 75400 + }, + { + "epoch": 5.415439856373429, + "grad_norm": 1.128998875617981, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 75410 + }, + { + "epoch": 5.416157989228007, + "grad_norm": 1.2901849746704102, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 75420 + }, + { + "epoch": 5.416876122082585, + "grad_norm": 1.2822786569595337, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 75430 + }, + { + "epoch": 5.417594254937163, + "grad_norm": 0.8724783658981323, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 75440 + }, + { + "epoch": 5.418312387791741, + "grad_norm": 1.1321152448654175, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 75450 + }, + { + "epoch": 5.419030520646319, + "grad_norm": 1.1211779117584229, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 75460 + }, + { + "epoch": 5.419748653500898, + "grad_norm": 1.0542290210723877, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 75470 + }, + { + "epoch": 5.420466786355476, + "grad_norm": 0.9432206153869629, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 75480 + }, + { + "epoch": 5.421184919210054, + "grad_norm": 1.2051608562469482, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 75490 + }, + { + "epoch": 5.421903052064632, + "grad_norm": 1.188256859779358, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 75500 + }, + { + "epoch": 5.42262118491921, + "grad_norm": 1.2768784761428833, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 75510 + }, + { + "epoch": 5.423339317773788, + "grad_norm": 0.8228567242622375, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75520 + }, + { + "epoch": 5.424057450628366, + "grad_norm": 1.235684871673584, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 75530 + }, + { + "epoch": 5.424775583482944, + "grad_norm": 0.8361109495162964, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75540 + }, + { + "epoch": 5.425493716337522, + "grad_norm": 1.0450727939605713, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 75550 + }, + { + "epoch": 5.4262118491921, + "grad_norm": 0.9942979216575623, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 75560 + }, + { + "epoch": 5.426929982046679, + "grad_norm": 0.8162592053413391, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 75570 + }, + { + "epoch": 5.427648114901257, + "grad_norm": 0.9193033576011658, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 75580 + }, + { + "epoch": 5.428366247755835, + "grad_norm": 1.095130443572998, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75590 + }, + { + "epoch": 5.429084380610413, + "grad_norm": 1.1752824783325195, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 75600 + }, + { + "epoch": 5.429802513464991, + "grad_norm": 1.2007960081100464, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 75610 + }, + { + "epoch": 5.430520646319569, + "grad_norm": 0.997347354888916, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 75620 + }, + { + "epoch": 5.431238779174147, + "grad_norm": 1.3878827095031738, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 75630 + }, + { + "epoch": 5.431956912028725, + "grad_norm": 1.1839812994003296, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 75640 + }, + { + "epoch": 5.432675044883303, + "grad_norm": 0.9912546873092651, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 75650 + }, + { + "epoch": 5.433393177737882, + "grad_norm": 0.9305517673492432, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 75660 + }, + { + "epoch": 5.43411131059246, + "grad_norm": 1.0036604404449463, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 75670 + }, + { + "epoch": 5.434829443447038, + "grad_norm": 1.2500226497650146, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 75680 + }, + { + "epoch": 5.435547576301616, + "grad_norm": 0.9476167559623718, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 75690 + }, + { + "epoch": 5.436265709156194, + "grad_norm": 0.9769760370254517, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 75700 + }, + { + "epoch": 5.436983842010772, + "grad_norm": 1.1001025438308716, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 75710 + }, + { + "epoch": 5.43770197486535, + "grad_norm": 1.1783069372177124, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 75720 + }, + { + "epoch": 5.438420107719928, + "grad_norm": 0.887438952922821, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 75730 + }, + { + "epoch": 5.439138240574506, + "grad_norm": 0.9631154537200928, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 75740 + }, + { + "epoch": 5.439856373429085, + "grad_norm": 1.0824158191680908, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75750 + }, + { + "epoch": 5.440574506283663, + "grad_norm": 1.0108296871185303, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 75760 + }, + { + "epoch": 5.441292639138241, + "grad_norm": 1.1728253364562988, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 75770 + }, + { + "epoch": 5.442010771992819, + "grad_norm": 1.0904773473739624, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 75780 + }, + { + "epoch": 5.442728904847397, + "grad_norm": 0.8982957601547241, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 75790 + }, + { + "epoch": 5.443447037701975, + "grad_norm": 1.0233404636383057, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 75800 + }, + { + "epoch": 5.444165170556553, + "grad_norm": 1.0092064142227173, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 75810 + }, + { + "epoch": 5.444883303411131, + "grad_norm": 1.2747842073440552, + "learning_rate": 0.0002, + "loss": 0.5673, + "step": 75820 + }, + { + "epoch": 5.445601436265709, + "grad_norm": 1.0365403890609741, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 75830 + }, + { + "epoch": 5.446319569120288, + "grad_norm": 1.0413976907730103, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 75840 + }, + { + "epoch": 5.447037701974866, + "grad_norm": 0.8858456015586853, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 75850 + }, + { + "epoch": 5.447755834829444, + "grad_norm": 0.9823445677757263, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 75860 + }, + { + "epoch": 5.448473967684022, + "grad_norm": 0.8515284061431885, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 75870 + }, + { + "epoch": 5.4491921005386, + "grad_norm": 1.130850911140442, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 75880 + }, + { + "epoch": 5.449910233393178, + "grad_norm": 0.984725832939148, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 75890 + }, + { + "epoch": 5.450628366247756, + "grad_norm": 1.1701595783233643, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 75900 + }, + { + "epoch": 5.451346499102334, + "grad_norm": 0.8988107442855835, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 75910 + }, + { + "epoch": 5.452064631956912, + "grad_norm": 0.9909947514533997, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 75920 + }, + { + "epoch": 5.45278276481149, + "grad_norm": 0.8861672282218933, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 75930 + }, + { + "epoch": 5.453500897666069, + "grad_norm": 0.9513981938362122, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 75940 + }, + { + "epoch": 5.454219030520647, + "grad_norm": 1.0320760011672974, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 75950 + }, + { + "epoch": 5.454937163375225, + "grad_norm": 0.9830206632614136, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 75960 + }, + { + "epoch": 5.455655296229803, + "grad_norm": 0.9816349148750305, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 75970 + }, + { + "epoch": 5.456373429084381, + "grad_norm": 0.9741218090057373, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 75980 + }, + { + "epoch": 5.457091561938959, + "grad_norm": 1.1291148662567139, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 75990 + }, + { + "epoch": 5.457809694793537, + "grad_norm": 0.9770109057426453, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 76000 + }, + { + "epoch": 5.458527827648115, + "grad_norm": 1.0204377174377441, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76010 + }, + { + "epoch": 5.459245960502693, + "grad_norm": 1.0453336238861084, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76020 + }, + { + "epoch": 5.4599640933572715, + "grad_norm": 1.1595505475997925, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 76030 + }, + { + "epoch": 5.4606822262118495, + "grad_norm": 1.1686701774597168, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 76040 + }, + { + "epoch": 5.4614003590664275, + "grad_norm": 1.14364755153656, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 76050 + }, + { + "epoch": 5.4621184919210055, + "grad_norm": 0.9742125868797302, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 76060 + }, + { + "epoch": 5.4628366247755835, + "grad_norm": 0.8235608339309692, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 76070 + }, + { + "epoch": 5.4635547576301615, + "grad_norm": 0.9801425337791443, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 76080 + }, + { + "epoch": 5.4642728904847395, + "grad_norm": 0.9001221060752869, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 76090 + }, + { + "epoch": 5.4649910233393175, + "grad_norm": 0.9292157888412476, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 76100 + }, + { + "epoch": 5.4657091561938955, + "grad_norm": 1.0024322271347046, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76110 + }, + { + "epoch": 5.4664272890484735, + "grad_norm": 0.8057159781455994, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 76120 + }, + { + "epoch": 5.467145421903052, + "grad_norm": 1.0617927312850952, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 76130 + }, + { + "epoch": 5.46786355475763, + "grad_norm": 1.003967046737671, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 76140 + }, + { + "epoch": 5.468581687612208, + "grad_norm": 0.903408944606781, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76150 + }, + { + "epoch": 5.469299820466786, + "grad_norm": 0.8173895478248596, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 76160 + }, + { + "epoch": 5.470017953321364, + "grad_norm": 1.0187482833862305, + "learning_rate": 0.0002, + "loss": 0.5526, + "step": 76170 + }, + { + "epoch": 5.470736086175942, + "grad_norm": 1.0418041944503784, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 76180 + }, + { + "epoch": 5.47145421903052, + "grad_norm": 0.9768357872962952, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 76190 + }, + { + "epoch": 5.472172351885098, + "grad_norm": 1.0834382772445679, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 76200 + }, + { + "epoch": 5.472890484739676, + "grad_norm": 0.8447439670562744, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 76210 + }, + { + "epoch": 5.473608617594255, + "grad_norm": 0.9379050135612488, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76220 + }, + { + "epoch": 5.474326750448833, + "grad_norm": 1.0395485162734985, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 76230 + }, + { + "epoch": 5.475044883303411, + "grad_norm": 1.2082624435424805, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 76240 + }, + { + "epoch": 5.475763016157989, + "grad_norm": 1.0714443922042847, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 76250 + }, + { + "epoch": 5.476481149012567, + "grad_norm": 0.945319414138794, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 76260 + }, + { + "epoch": 5.477199281867145, + "grad_norm": 1.1415241956710815, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 76270 + }, + { + "epoch": 5.477917414721723, + "grad_norm": 0.9221673011779785, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 76280 + }, + { + "epoch": 5.478635547576301, + "grad_norm": 1.0118398666381836, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 76290 + }, + { + "epoch": 5.479353680430879, + "grad_norm": 1.396807312965393, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 76300 + }, + { + "epoch": 5.480071813285457, + "grad_norm": 1.0437991619110107, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 76310 + }, + { + "epoch": 5.480789946140036, + "grad_norm": 1.5910401344299316, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 76320 + }, + { + "epoch": 5.481508078994614, + "grad_norm": 0.9262010455131531, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 76330 + }, + { + "epoch": 5.482226211849192, + "grad_norm": 1.2534247636795044, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 76340 + }, + { + "epoch": 5.48294434470377, + "grad_norm": 1.186294674873352, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 76350 + }, + { + "epoch": 5.483662477558348, + "grad_norm": 0.9822857975959778, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 76360 + }, + { + "epoch": 5.484380610412926, + "grad_norm": 1.0006381273269653, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 76370 + }, + { + "epoch": 5.485098743267504, + "grad_norm": 0.8960304260253906, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 76380 + }, + { + "epoch": 5.485816876122082, + "grad_norm": 0.7309539914131165, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 76390 + }, + { + "epoch": 5.486535008976661, + "grad_norm": 0.9747139811515808, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 76400 + }, + { + "epoch": 5.487253141831239, + "grad_norm": 0.9586864113807678, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76410 + }, + { + "epoch": 5.487971274685817, + "grad_norm": 1.0815327167510986, + "learning_rate": 0.0002, + "loss": 0.6236, + "step": 76420 + }, + { + "epoch": 5.488689407540395, + "grad_norm": 1.1324117183685303, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 76430 + }, + { + "epoch": 5.489407540394973, + "grad_norm": 0.8575648069381714, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 76440 + }, + { + "epoch": 5.490125673249551, + "grad_norm": 0.9821682572364807, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 76450 + }, + { + "epoch": 5.490843806104129, + "grad_norm": 1.1611464023590088, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 76460 + }, + { + "epoch": 5.491561938958707, + "grad_norm": 1.0340297222137451, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 76470 + }, + { + "epoch": 5.492280071813285, + "grad_norm": 1.0116628408432007, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 76480 + }, + { + "epoch": 5.492998204667863, + "grad_norm": 0.9619752764701843, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 76490 + }, + { + "epoch": 5.493716337522442, + "grad_norm": 0.9924456477165222, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 76500 + }, + { + "epoch": 5.49443447037702, + "grad_norm": 0.9449224472045898, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 76510 + }, + { + "epoch": 5.495152603231598, + "grad_norm": 0.9075009822845459, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 76520 + }, + { + "epoch": 5.495870736086176, + "grad_norm": 1.3078763484954834, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 76530 + }, + { + "epoch": 5.496588868940754, + "grad_norm": 1.3162729740142822, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 76540 + }, + { + "epoch": 5.497307001795332, + "grad_norm": 1.144333839416504, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 76550 + }, + { + "epoch": 5.49802513464991, + "grad_norm": 0.9332208633422852, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 76560 + }, + { + "epoch": 5.498743267504488, + "grad_norm": 0.9660165309906006, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76570 + }, + { + "epoch": 5.499461400359066, + "grad_norm": 1.0954749584197998, + "learning_rate": 0.0002, + "loss": 0.6023, + "step": 76580 + }, + { + "epoch": 5.500179533213645, + "grad_norm": 1.0537810325622559, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 76590 + }, + { + "epoch": 5.500897666068223, + "grad_norm": 0.9944321513175964, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 76600 + }, + { + "epoch": 5.501615798922801, + "grad_norm": 1.094462513923645, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 76610 + }, + { + "epoch": 5.502333931777379, + "grad_norm": 1.0246481895446777, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 76620 + }, + { + "epoch": 5.503052064631957, + "grad_norm": 0.9705453515052795, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 76630 + }, + { + "epoch": 5.503770197486535, + "grad_norm": 1.5252249240875244, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 76640 + }, + { + "epoch": 5.504488330341113, + "grad_norm": 0.8469606637954712, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 76650 + }, + { + "epoch": 5.505206463195691, + "grad_norm": 1.1882504224777222, + "learning_rate": 0.0002, + "loss": 0.6125, + "step": 76660 + }, + { + "epoch": 5.505924596050269, + "grad_norm": 0.8447994589805603, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 76670 + }, + { + "epoch": 5.506642728904847, + "grad_norm": 0.9340696930885315, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 76680 + }, + { + "epoch": 5.507360861759426, + "grad_norm": 0.9622383713722229, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 76690 + }, + { + "epoch": 5.508078994614004, + "grad_norm": 1.1516523361206055, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 76700 + }, + { + "epoch": 5.508797127468582, + "grad_norm": 1.207190990447998, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 76710 + }, + { + "epoch": 5.50951526032316, + "grad_norm": 1.1244179010391235, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 76720 + }, + { + "epoch": 5.510233393177738, + "grad_norm": 1.052288293838501, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 76730 + }, + { + "epoch": 5.510951526032316, + "grad_norm": 0.9571291208267212, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 76740 + }, + { + "epoch": 5.511669658886894, + "grad_norm": 0.9449458122253418, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 76750 + }, + { + "epoch": 5.512387791741472, + "grad_norm": 1.0140511989593506, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 76760 + }, + { + "epoch": 5.513105924596051, + "grad_norm": 1.057715654373169, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 76770 + }, + { + "epoch": 5.513824057450629, + "grad_norm": 0.930642306804657, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 76780 + }, + { + "epoch": 5.514542190305207, + "grad_norm": 1.1213828325271606, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 76790 + }, + { + "epoch": 5.515260323159785, + "grad_norm": 0.9147387742996216, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 76800 + }, + { + "epoch": 5.515978456014363, + "grad_norm": 1.1786983013153076, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 76810 + }, + { + "epoch": 5.516696588868941, + "grad_norm": 1.1022626161575317, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 76820 + }, + { + "epoch": 5.517414721723519, + "grad_norm": 1.0389000177383423, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 76830 + }, + { + "epoch": 5.518132854578097, + "grad_norm": 1.0750621557235718, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 76840 + }, + { + "epoch": 5.518850987432675, + "grad_norm": 1.0372626781463623, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 76850 + }, + { + "epoch": 5.519569120287253, + "grad_norm": 1.0989108085632324, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 76860 + }, + { + "epoch": 5.520287253141831, + "grad_norm": 1.030346155166626, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 76870 + }, + { + "epoch": 5.52100538599641, + "grad_norm": 1.1362419128417969, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 76880 + }, + { + "epoch": 5.521723518850988, + "grad_norm": 0.9110873937606812, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 76890 + }, + { + "epoch": 5.522441651705566, + "grad_norm": 1.0214358568191528, + "learning_rate": 0.0002, + "loss": 0.6161, + "step": 76900 + }, + { + "epoch": 5.523159784560144, + "grad_norm": 1.3764830827713013, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 76910 + }, + { + "epoch": 5.523877917414722, + "grad_norm": 1.0396335124969482, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 76920 + }, + { + "epoch": 5.5245960502693, + "grad_norm": 1.1942898035049438, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 76930 + }, + { + "epoch": 5.525314183123878, + "grad_norm": 0.8795760869979858, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 76940 + }, + { + "epoch": 5.526032315978456, + "grad_norm": 1.1081048250198364, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 76950 + }, + { + "epoch": 5.526750448833035, + "grad_norm": 0.9652274250984192, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 76960 + }, + { + "epoch": 5.527468581687613, + "grad_norm": 0.96559739112854, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 76970 + }, + { + "epoch": 5.528186714542191, + "grad_norm": 1.0416076183319092, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 76980 + }, + { + "epoch": 5.528904847396769, + "grad_norm": 0.9854229092597961, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 76990 + }, + { + "epoch": 5.529622980251347, + "grad_norm": 1.0515462160110474, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 77000 + }, + { + "epoch": 5.530341113105925, + "grad_norm": 1.0287327766418457, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 77010 + }, + { + "epoch": 5.531059245960503, + "grad_norm": 0.9579883217811584, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 77020 + }, + { + "epoch": 5.531777378815081, + "grad_norm": 1.0365805625915527, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 77030 + }, + { + "epoch": 5.532495511669659, + "grad_norm": 1.1600725650787354, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 77040 + }, + { + "epoch": 5.533213644524237, + "grad_norm": 0.8598031401634216, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 77050 + }, + { + "epoch": 5.533931777378815, + "grad_norm": 0.8884791731834412, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 77060 + }, + { + "epoch": 5.5346499102333935, + "grad_norm": 0.900223433971405, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 77070 + }, + { + "epoch": 5.5353680430879715, + "grad_norm": 1.0212652683258057, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 77080 + }, + { + "epoch": 5.5360861759425495, + "grad_norm": 1.0924701690673828, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 77090 + }, + { + "epoch": 5.5368043087971275, + "grad_norm": 1.1955485343933105, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 77100 + }, + { + "epoch": 5.5375224416517055, + "grad_norm": 1.2157706022262573, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 77110 + }, + { + "epoch": 5.5382405745062835, + "grad_norm": 1.1118255853652954, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 77120 + }, + { + "epoch": 5.5389587073608615, + "grad_norm": 1.0146820545196533, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 77130 + }, + { + "epoch": 5.5396768402154395, + "grad_norm": 1.0876632928848267, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 77140 + }, + { + "epoch": 5.540394973070018, + "grad_norm": 0.7914495468139648, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77150 + }, + { + "epoch": 5.541113105924596, + "grad_norm": 1.0584027767181396, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 77160 + }, + { + "epoch": 5.541831238779174, + "grad_norm": 0.9816845059394836, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 77170 + }, + { + "epoch": 5.542549371633752, + "grad_norm": 1.219076156616211, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 77180 + }, + { + "epoch": 5.54326750448833, + "grad_norm": 0.9526635408401489, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 77190 + }, + { + "epoch": 5.543985637342908, + "grad_norm": 0.8437230587005615, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 77200 + }, + { + "epoch": 5.544703770197486, + "grad_norm": 0.9670451283454895, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 77210 + }, + { + "epoch": 5.545421903052064, + "grad_norm": 1.015687346458435, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77220 + }, + { + "epoch": 5.546140035906642, + "grad_norm": 0.8280553817749023, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 77230 + }, + { + "epoch": 5.54685816876122, + "grad_norm": 1.1320816278457642, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 77240 + }, + { + "epoch": 5.547576301615799, + "grad_norm": 1.3338711261749268, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 77250 + }, + { + "epoch": 5.548294434470377, + "grad_norm": 0.9553194642066956, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 77260 + }, + { + "epoch": 5.549012567324955, + "grad_norm": 1.0604912042617798, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 77270 + }, + { + "epoch": 5.549730700179533, + "grad_norm": 1.1037590503692627, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 77280 + }, + { + "epoch": 5.550448833034111, + "grad_norm": 1.166212558746338, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 77290 + }, + { + "epoch": 5.551166965888689, + "grad_norm": 1.0189802646636963, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 77300 + }, + { + "epoch": 5.551885098743267, + "grad_norm": 0.9592387080192566, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 77310 + }, + { + "epoch": 5.552603231597845, + "grad_norm": 0.9533785581588745, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 77320 + }, + { + "epoch": 5.553321364452424, + "grad_norm": 0.9666807055473328, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 77330 + }, + { + "epoch": 5.554039497307002, + "grad_norm": 0.8827478289604187, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 77340 + }, + { + "epoch": 5.55475763016158, + "grad_norm": 0.9574757814407349, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 77350 + }, + { + "epoch": 5.555475763016158, + "grad_norm": 1.14597487449646, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 77360 + }, + { + "epoch": 5.556193895870736, + "grad_norm": 1.009392499923706, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77370 + }, + { + "epoch": 5.556912028725314, + "grad_norm": 1.115757942199707, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 77380 + }, + { + "epoch": 5.557630161579892, + "grad_norm": 0.9907452464103699, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 77390 + }, + { + "epoch": 5.55834829443447, + "grad_norm": 1.0667012929916382, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 77400 + }, + { + "epoch": 5.559066427289048, + "grad_norm": 0.9301251173019409, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 77410 + }, + { + "epoch": 5.559784560143626, + "grad_norm": 1.090384602546692, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 77420 + }, + { + "epoch": 5.560502692998204, + "grad_norm": 0.8073469996452332, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 77430 + }, + { + "epoch": 5.561220825852783, + "grad_norm": 1.1003652811050415, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 77440 + }, + { + "epoch": 5.561938958707361, + "grad_norm": 0.9493791460990906, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 77450 + }, + { + "epoch": 5.562657091561939, + "grad_norm": 0.925388514995575, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 77460 + }, + { + "epoch": 5.563375224416517, + "grad_norm": 1.0946427583694458, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 77470 + }, + { + "epoch": 5.564093357271095, + "grad_norm": 0.9791404008865356, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 77480 + }, + { + "epoch": 5.564811490125673, + "grad_norm": 1.0534733533859253, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 77490 + }, + { + "epoch": 5.565529622980251, + "grad_norm": 0.9351776242256165, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 77500 + }, + { + "epoch": 5.566247755834829, + "grad_norm": 1.004448413848877, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 77510 + }, + { + "epoch": 5.566965888689408, + "grad_norm": 1.0199403762817383, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 77520 + }, + { + "epoch": 5.567684021543986, + "grad_norm": 1.0693204402923584, + "learning_rate": 0.0002, + "loss": 0.6358, + "step": 77530 + }, + { + "epoch": 5.568402154398564, + "grad_norm": 1.0635178089141846, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 77540 + }, + { + "epoch": 5.569120287253142, + "grad_norm": 1.1154648065567017, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 77550 + }, + { + "epoch": 5.56983842010772, + "grad_norm": 0.999116837978363, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 77560 + }, + { + "epoch": 5.570556552962298, + "grad_norm": 0.9967397451400757, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 77570 + }, + { + "epoch": 5.571274685816876, + "grad_norm": 0.9684699773788452, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 77580 + }, + { + "epoch": 5.571992818671454, + "grad_norm": 1.027213454246521, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 77590 + }, + { + "epoch": 5.572710951526032, + "grad_norm": 1.0571194887161255, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 77600 + }, + { + "epoch": 5.57342908438061, + "grad_norm": 1.2010499238967896, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 77610 + }, + { + "epoch": 5.574147217235188, + "grad_norm": 1.1033680438995361, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 77620 + }, + { + "epoch": 5.574865350089767, + "grad_norm": 0.9394578337669373, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 77630 + }, + { + "epoch": 5.575583482944345, + "grad_norm": 1.379382610321045, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 77640 + }, + { + "epoch": 5.576301615798923, + "grad_norm": 0.9787197709083557, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 77650 + }, + { + "epoch": 5.577019748653501, + "grad_norm": 0.9680284261703491, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 77660 + }, + { + "epoch": 5.577737881508079, + "grad_norm": 1.0449682474136353, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 77670 + }, + { + "epoch": 5.578456014362657, + "grad_norm": 1.1243085861206055, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 77680 + }, + { + "epoch": 5.579174147217235, + "grad_norm": 0.9228966236114502, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 77690 + }, + { + "epoch": 5.579892280071813, + "grad_norm": 1.1349890232086182, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 77700 + }, + { + "epoch": 5.580610412926392, + "grad_norm": 1.2248499393463135, + "learning_rate": 0.0002, + "loss": 0.6272, + "step": 77710 + }, + { + "epoch": 5.58132854578097, + "grad_norm": 1.0066324472427368, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 77720 + }, + { + "epoch": 5.582046678635548, + "grad_norm": 1.2642878293991089, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 77730 + }, + { + "epoch": 5.582764811490126, + "grad_norm": 1.031591534614563, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 77740 + }, + { + "epoch": 5.583482944344704, + "grad_norm": 1.0925929546356201, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 77750 + }, + { + "epoch": 5.584201077199282, + "grad_norm": 1.0567110776901245, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 77760 + }, + { + "epoch": 5.58491921005386, + "grad_norm": 1.246246099472046, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 77770 + }, + { + "epoch": 5.585637342908438, + "grad_norm": 1.2467739582061768, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 77780 + }, + { + "epoch": 5.586355475763016, + "grad_norm": 1.2695211172103882, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 77790 + }, + { + "epoch": 5.587073608617594, + "grad_norm": 1.0498571395874023, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 77800 + }, + { + "epoch": 5.587791741472173, + "grad_norm": 1.0078339576721191, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 77810 + }, + { + "epoch": 5.588509874326751, + "grad_norm": 1.108199954032898, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 77820 + }, + { + "epoch": 5.589228007181329, + "grad_norm": 1.0577641725540161, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 77830 + }, + { + "epoch": 5.589946140035907, + "grad_norm": 1.2169439792633057, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 77840 + }, + { + "epoch": 5.590664272890485, + "grad_norm": 0.8310868740081787, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 77850 + }, + { + "epoch": 5.591382405745063, + "grad_norm": 0.9794082045555115, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 77860 + }, + { + "epoch": 5.592100538599641, + "grad_norm": 0.8867404460906982, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 77870 + }, + { + "epoch": 5.592818671454219, + "grad_norm": 0.9204208254814148, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 77880 + }, + { + "epoch": 5.593536804308797, + "grad_norm": 0.9801714420318604, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 77890 + }, + { + "epoch": 5.594254937163376, + "grad_norm": 0.9383925199508667, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 77900 + }, + { + "epoch": 5.594973070017954, + "grad_norm": 0.9124664068222046, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 77910 + }, + { + "epoch": 5.595691202872532, + "grad_norm": 0.9618783593177795, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 77920 + }, + { + "epoch": 5.59640933572711, + "grad_norm": 0.9575216770172119, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 77930 + }, + { + "epoch": 5.597127468581688, + "grad_norm": 1.1223464012145996, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 77940 + }, + { + "epoch": 5.597845601436266, + "grad_norm": 0.9947475790977478, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 77950 + }, + { + "epoch": 5.598563734290844, + "grad_norm": 1.141959309577942, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 77960 + }, + { + "epoch": 5.599281867145422, + "grad_norm": 1.095525860786438, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 77970 + }, + { + "epoch": 5.6, + "grad_norm": 0.9396624565124512, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 77980 + }, + { + "epoch": 5.600718132854578, + "grad_norm": 0.8162274956703186, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 77990 + }, + { + "epoch": 5.6014362657091565, + "grad_norm": 1.0130535364151, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 78000 + }, + { + "epoch": 5.6021543985637345, + "grad_norm": 1.0016634464263916, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 78010 + }, + { + "epoch": 5.6028725314183125, + "grad_norm": 0.8936169743537903, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 78020 + }, + { + "epoch": 5.6035906642728905, + "grad_norm": 1.169625163078308, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78030 + }, + { + "epoch": 5.6043087971274685, + "grad_norm": 0.8896323442459106, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 78040 + }, + { + "epoch": 5.6050269299820465, + "grad_norm": 1.0939475297927856, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 78050 + }, + { + "epoch": 5.6057450628366245, + "grad_norm": 1.0880711078643799, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78060 + }, + { + "epoch": 5.6064631956912026, + "grad_norm": 1.1426655054092407, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 78070 + }, + { + "epoch": 5.607181328545781, + "grad_norm": 1.118586540222168, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 78080 + }, + { + "epoch": 5.607899461400359, + "grad_norm": 0.8784464597702026, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 78090 + }, + { + "epoch": 5.608617594254937, + "grad_norm": 1.137229561805725, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 78100 + }, + { + "epoch": 5.6093357271095154, + "grad_norm": 1.1041932106018066, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 78110 + }, + { + "epoch": 5.6100538599640934, + "grad_norm": 1.0170503854751587, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 78120 + }, + { + "epoch": 5.6107719928186714, + "grad_norm": 1.298754334449768, + "learning_rate": 0.0002, + "loss": 0.6376, + "step": 78130 + }, + { + "epoch": 5.6114901256732495, + "grad_norm": 0.9344905018806458, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 78140 + }, + { + "epoch": 5.6122082585278275, + "grad_norm": 0.9467785954475403, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 78150 + }, + { + "epoch": 5.6129263913824055, + "grad_norm": 1.0617443323135376, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 78160 + }, + { + "epoch": 5.6136445242369835, + "grad_norm": 0.9017760753631592, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 78170 + }, + { + "epoch": 5.6143626570915615, + "grad_norm": 1.152601957321167, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 78180 + }, + { + "epoch": 5.61508078994614, + "grad_norm": 0.9889463186264038, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78190 + }, + { + "epoch": 5.615798922800718, + "grad_norm": 1.0367393493652344, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 78200 + }, + { + "epoch": 5.616517055655296, + "grad_norm": 0.8466457724571228, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 78210 + }, + { + "epoch": 5.617235188509874, + "grad_norm": 0.936083197593689, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 78220 + }, + { + "epoch": 5.617953321364452, + "grad_norm": 1.018784999847412, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 78230 + }, + { + "epoch": 5.61867145421903, + "grad_norm": 0.8527804017066956, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 78240 + }, + { + "epoch": 5.619389587073608, + "grad_norm": 1.1873106956481934, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78250 + }, + { + "epoch": 5.620107719928186, + "grad_norm": 0.9401728510856628, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 78260 + }, + { + "epoch": 5.620825852782765, + "grad_norm": 1.0801159143447876, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 78270 + }, + { + "epoch": 5.621543985637343, + "grad_norm": 1.0053739547729492, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 78280 + }, + { + "epoch": 5.622262118491921, + "grad_norm": 0.8599331378936768, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 78290 + }, + { + "epoch": 5.622980251346499, + "grad_norm": 2.3157296180725098, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 78300 + }, + { + "epoch": 5.623698384201077, + "grad_norm": 1.0027490854263306, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 78310 + }, + { + "epoch": 5.624416517055655, + "grad_norm": 0.996688961982727, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 78320 + }, + { + "epoch": 5.625134649910233, + "grad_norm": 1.0462113618850708, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 78330 + }, + { + "epoch": 5.625852782764811, + "grad_norm": 0.8750988245010376, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 78340 + }, + { + "epoch": 5.626570915619389, + "grad_norm": 0.8078145384788513, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 78350 + }, + { + "epoch": 5.627289048473967, + "grad_norm": 0.9047532081604004, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 78360 + }, + { + "epoch": 5.628007181328546, + "grad_norm": 0.9784479737281799, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 78370 + }, + { + "epoch": 5.628725314183124, + "grad_norm": 0.9529541730880737, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 78380 + }, + { + "epoch": 5.629443447037702, + "grad_norm": 0.8264740109443665, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 78390 + }, + { + "epoch": 5.63016157989228, + "grad_norm": 1.049724817276001, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 78400 + }, + { + "epoch": 5.630879712746858, + "grad_norm": 0.9866746068000793, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 78410 + }, + { + "epoch": 5.631597845601436, + "grad_norm": 0.897155225276947, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 78420 + }, + { + "epoch": 5.632315978456014, + "grad_norm": 1.225464940071106, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 78430 + }, + { + "epoch": 5.633034111310592, + "grad_norm": 0.8793753981590271, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 78440 + }, + { + "epoch": 5.63375224416517, + "grad_norm": 1.082482099533081, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 78450 + }, + { + "epoch": 5.634470377019749, + "grad_norm": 1.054064393043518, + "learning_rate": 0.0002, + "loss": 0.6546, + "step": 78460 + }, + { + "epoch": 5.635188509874327, + "grad_norm": 1.0032247304916382, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 78470 + }, + { + "epoch": 5.635906642728905, + "grad_norm": 0.8544651865959167, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 78480 + }, + { + "epoch": 5.636624775583483, + "grad_norm": 0.9475075602531433, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 78490 + }, + { + "epoch": 5.637342908438061, + "grad_norm": 1.0814138650894165, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 78500 + }, + { + "epoch": 5.638061041292639, + "grad_norm": 1.0813153982162476, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 78510 + }, + { + "epoch": 5.638779174147217, + "grad_norm": 1.0225616693496704, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 78520 + }, + { + "epoch": 5.639497307001795, + "grad_norm": 1.0777465105056763, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 78530 + }, + { + "epoch": 5.640215439856373, + "grad_norm": 1.156148910522461, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 78540 + }, + { + "epoch": 5.640933572710951, + "grad_norm": 1.0147465467453003, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 78550 + }, + { + "epoch": 5.64165170556553, + "grad_norm": 0.9606683850288391, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 78560 + }, + { + "epoch": 5.642369838420108, + "grad_norm": 0.9478723406791687, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 78570 + }, + { + "epoch": 5.643087971274686, + "grad_norm": 1.0653880834579468, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 78580 + }, + { + "epoch": 5.643806104129264, + "grad_norm": 1.7519923448562622, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 78590 + }, + { + "epoch": 5.644524236983842, + "grad_norm": 1.0567299127578735, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 78600 + }, + { + "epoch": 5.64524236983842, + "grad_norm": 0.8980287909507751, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 78610 + }, + { + "epoch": 5.645960502692998, + "grad_norm": 0.8792264461517334, + "learning_rate": 0.0002, + "loss": 0.6319, + "step": 78620 + }, + { + "epoch": 5.646678635547576, + "grad_norm": 1.2306275367736816, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 78630 + }, + { + "epoch": 5.647396768402155, + "grad_norm": 0.8259932398796082, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 78640 + }, + { + "epoch": 5.648114901256733, + "grad_norm": 0.9605076313018799, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 78650 + }, + { + "epoch": 5.648833034111311, + "grad_norm": 0.9967419505119324, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 78660 + }, + { + "epoch": 5.649551166965889, + "grad_norm": 0.9774024486541748, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 78670 + }, + { + "epoch": 5.650269299820467, + "grad_norm": 0.9838066697120667, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 78680 + }, + { + "epoch": 5.650987432675045, + "grad_norm": 1.1617798805236816, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 78690 + }, + { + "epoch": 5.651705565529623, + "grad_norm": 1.075006365776062, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 78700 + }, + { + "epoch": 5.652423698384201, + "grad_norm": 0.8859893679618835, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 78710 + }, + { + "epoch": 5.653141831238779, + "grad_norm": 1.0774717330932617, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 78720 + }, + { + "epoch": 5.653859964093357, + "grad_norm": 1.147273302078247, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 78730 + }, + { + "epoch": 5.654578096947935, + "grad_norm": 1.1403213739395142, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 78740 + }, + { + "epoch": 5.655296229802514, + "grad_norm": 0.9115353226661682, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 78750 + }, + { + "epoch": 5.656014362657092, + "grad_norm": 0.9303002953529358, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 78760 + }, + { + "epoch": 5.65673249551167, + "grad_norm": 0.9324957728385925, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 78770 + }, + { + "epoch": 5.657450628366248, + "grad_norm": 0.9688063859939575, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 78780 + }, + { + "epoch": 5.658168761220826, + "grad_norm": 0.9019638299942017, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 78790 + }, + { + "epoch": 5.658886894075404, + "grad_norm": 0.8236798048019409, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 78800 + }, + { + "epoch": 5.659605026929982, + "grad_norm": 1.2702386379241943, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78810 + }, + { + "epoch": 5.66032315978456, + "grad_norm": 1.041077971458435, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 78820 + }, + { + "epoch": 5.661041292639139, + "grad_norm": 0.9028838276863098, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 78830 + }, + { + "epoch": 5.661759425493717, + "grad_norm": 0.9874144196510315, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 78840 + }, + { + "epoch": 5.662477558348295, + "grad_norm": 0.9633761048316956, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 78850 + }, + { + "epoch": 5.663195691202873, + "grad_norm": 0.9069564342498779, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 78860 + }, + { + "epoch": 5.663913824057451, + "grad_norm": 0.9560621976852417, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 78870 + }, + { + "epoch": 5.664631956912029, + "grad_norm": 0.9941161870956421, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 78880 + }, + { + "epoch": 5.665350089766607, + "grad_norm": 0.920407235622406, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 78890 + }, + { + "epoch": 5.666068222621185, + "grad_norm": 0.9909250140190125, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 78900 + }, + { + "epoch": 5.666786355475763, + "grad_norm": 0.9528568983078003, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 78910 + }, + { + "epoch": 5.667504488330341, + "grad_norm": 1.041440725326538, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 78920 + }, + { + "epoch": 5.66822262118492, + "grad_norm": 1.0072191953659058, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 78930 + }, + { + "epoch": 5.668940754039498, + "grad_norm": 1.0740574598312378, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 78940 + }, + { + "epoch": 5.669658886894076, + "grad_norm": 0.9168822169303894, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 78950 + }, + { + "epoch": 5.670377019748654, + "grad_norm": 1.1818004846572876, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 78960 + }, + { + "epoch": 5.671095152603232, + "grad_norm": 1.1925201416015625, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 78970 + }, + { + "epoch": 5.67181328545781, + "grad_norm": 0.879940390586853, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 78980 + }, + { + "epoch": 5.672531418312388, + "grad_norm": 1.0998331308364868, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 78990 + }, + { + "epoch": 5.673249551166966, + "grad_norm": 1.076637625694275, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 79000 + }, + { + "epoch": 5.673967684021544, + "grad_norm": 1.076864242553711, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 79010 + }, + { + "epoch": 5.6746858168761225, + "grad_norm": 1.0206586122512817, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 79020 + }, + { + "epoch": 5.6754039497307005, + "grad_norm": 0.8242515325546265, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79030 + }, + { + "epoch": 5.6761220825852785, + "grad_norm": 1.1180634498596191, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 79040 + }, + { + "epoch": 5.6768402154398565, + "grad_norm": 1.0155152082443237, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 79050 + }, + { + "epoch": 5.6775583482944345, + "grad_norm": 1.0445241928100586, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 79060 + }, + { + "epoch": 5.6782764811490125, + "grad_norm": 0.9851725697517395, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 79070 + }, + { + "epoch": 5.6789946140035905, + "grad_norm": 0.9979640245437622, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79080 + }, + { + "epoch": 5.6797127468581685, + "grad_norm": 1.0398952960968018, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 79090 + }, + { + "epoch": 5.6804308797127465, + "grad_norm": 1.094164252281189, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 79100 + }, + { + "epoch": 5.6811490125673245, + "grad_norm": 0.9546816945075989, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 79110 + }, + { + "epoch": 5.681867145421903, + "grad_norm": 1.1635938882827759, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 79120 + }, + { + "epoch": 5.682585278276481, + "grad_norm": 1.0260306596755981, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 79130 + }, + { + "epoch": 5.683303411131059, + "grad_norm": 0.9900122284889221, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 79140 + }, + { + "epoch": 5.684021543985637, + "grad_norm": 1.049688458442688, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 79150 + }, + { + "epoch": 5.684739676840215, + "grad_norm": 1.124272108078003, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 79160 + }, + { + "epoch": 5.685457809694793, + "grad_norm": 1.1109849214553833, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 79170 + }, + { + "epoch": 5.686175942549371, + "grad_norm": 0.739007830619812, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 79180 + }, + { + "epoch": 5.686894075403949, + "grad_norm": 1.2063007354736328, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79190 + }, + { + "epoch": 5.687612208258528, + "grad_norm": 1.223317265510559, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 79200 + }, + { + "epoch": 5.688330341113106, + "grad_norm": 0.8042855858802795, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 79210 + }, + { + "epoch": 5.689048473967684, + "grad_norm": 0.9294175505638123, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 79220 + }, + { + "epoch": 5.689766606822262, + "grad_norm": 0.978084146976471, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 79230 + }, + { + "epoch": 5.69048473967684, + "grad_norm": 0.9271620512008667, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79240 + }, + { + "epoch": 5.691202872531418, + "grad_norm": 1.158677339553833, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 79250 + }, + { + "epoch": 5.691921005385996, + "grad_norm": 0.9468576312065125, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 79260 + }, + { + "epoch": 5.692639138240574, + "grad_norm": 1.2025824785232544, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 79270 + }, + { + "epoch": 5.693357271095152, + "grad_norm": 1.0167860984802246, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 79280 + }, + { + "epoch": 5.69407540394973, + "grad_norm": 0.971199631690979, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 79290 + }, + { + "epoch": 5.694793536804308, + "grad_norm": 1.1757864952087402, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 79300 + }, + { + "epoch": 5.695511669658887, + "grad_norm": 1.0199662446975708, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 79310 + }, + { + "epoch": 5.696229802513465, + "grad_norm": 0.9662485122680664, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 79320 + }, + { + "epoch": 5.696947935368043, + "grad_norm": 0.9324414134025574, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 79330 + }, + { + "epoch": 5.697666068222621, + "grad_norm": 0.855752170085907, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 79340 + }, + { + "epoch": 5.698384201077199, + "grad_norm": 1.2723703384399414, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 79350 + }, + { + "epoch": 5.699102333931777, + "grad_norm": 1.0254011154174805, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 79360 + }, + { + "epoch": 5.699820466786355, + "grad_norm": 1.0958263874053955, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 79370 + }, + { + "epoch": 5.700538599640933, + "grad_norm": 1.0214145183563232, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 79380 + }, + { + "epoch": 5.701256732495512, + "grad_norm": 1.1087455749511719, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 79390 + }, + { + "epoch": 5.70197486535009, + "grad_norm": 0.8885074853897095, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 79400 + }, + { + "epoch": 5.702692998204668, + "grad_norm": 0.9854450821876526, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 79410 + }, + { + "epoch": 5.703411131059246, + "grad_norm": 0.858744204044342, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 79420 + }, + { + "epoch": 5.704129263913824, + "grad_norm": 0.9434788823127747, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 79430 + }, + { + "epoch": 5.704847396768402, + "grad_norm": 1.1388801336288452, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 79440 + }, + { + "epoch": 5.70556552962298, + "grad_norm": 1.0701899528503418, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 79450 + }, + { + "epoch": 5.706283662477558, + "grad_norm": 0.9147594571113586, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 79460 + }, + { + "epoch": 5.707001795332136, + "grad_norm": 1.055008053779602, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 79470 + }, + { + "epoch": 5.707719928186714, + "grad_norm": 0.7841609716415405, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 79480 + }, + { + "epoch": 5.708438061041292, + "grad_norm": 1.0334571599960327, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 79490 + }, + { + "epoch": 5.709156193895871, + "grad_norm": 1.2841367721557617, + "learning_rate": 0.0002, + "loss": 0.5924, + "step": 79500 + }, + { + "epoch": 5.709874326750449, + "grad_norm": 1.0296638011932373, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 79510 + }, + { + "epoch": 5.710592459605027, + "grad_norm": 0.9161922931671143, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 79520 + }, + { + "epoch": 5.711310592459605, + "grad_norm": 1.056856632232666, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 79530 + }, + { + "epoch": 5.712028725314183, + "grad_norm": 0.9919893145561218, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 79540 + }, + { + "epoch": 5.712746858168761, + "grad_norm": 1.1128891706466675, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 79550 + }, + { + "epoch": 5.713464991023339, + "grad_norm": 1.1171997785568237, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 79560 + }, + { + "epoch": 5.714183123877917, + "grad_norm": 0.9389346837997437, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 79570 + }, + { + "epoch": 5.714901256732496, + "grad_norm": 0.9869245886802673, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 79580 + }, + { + "epoch": 5.715619389587074, + "grad_norm": 0.9019966721534729, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 79590 + }, + { + "epoch": 5.716337522441652, + "grad_norm": 0.9791252017021179, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 79600 + }, + { + "epoch": 5.71705565529623, + "grad_norm": 1.0269849300384521, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 79610 + }, + { + "epoch": 5.717773788150808, + "grad_norm": 1.0340129137039185, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 79620 + }, + { + "epoch": 5.718491921005386, + "grad_norm": 0.9742604494094849, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 79630 + }, + { + "epoch": 5.719210053859964, + "grad_norm": 1.126868724822998, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 79640 + }, + { + "epoch": 5.719928186714542, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 79650 + }, + { + "epoch": 5.72064631956912, + "grad_norm": 0.8300277590751648, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 79660 + }, + { + "epoch": 5.721364452423698, + "grad_norm": 0.8482570052146912, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 79670 + }, + { + "epoch": 5.722082585278277, + "grad_norm": 1.0777807235717773, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 79680 + }, + { + "epoch": 5.722800718132855, + "grad_norm": 1.2682723999023438, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 79690 + }, + { + "epoch": 5.723518850987433, + "grad_norm": 0.8742772340774536, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 79700 + }, + { + "epoch": 5.724236983842011, + "grad_norm": 0.9218387603759766, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 79710 + }, + { + "epoch": 5.724955116696589, + "grad_norm": 0.8977975845336914, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 79720 + }, + { + "epoch": 5.725673249551167, + "grad_norm": 1.0873085260391235, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 79730 + }, + { + "epoch": 5.726391382405745, + "grad_norm": 0.9811807870864868, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 79740 + }, + { + "epoch": 5.727109515260323, + "grad_norm": 0.926764965057373, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 79750 + }, + { + "epoch": 5.727827648114902, + "grad_norm": 1.0103713274002075, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 79760 + }, + { + "epoch": 5.72854578096948, + "grad_norm": 1.1389189958572388, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 79770 + }, + { + "epoch": 5.729263913824058, + "grad_norm": 1.1654961109161377, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 79780 + }, + { + "epoch": 5.729982046678636, + "grad_norm": 0.7925996780395508, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 79790 + }, + { + "epoch": 5.730700179533214, + "grad_norm": 1.3329131603240967, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 79800 + }, + { + "epoch": 5.731418312387792, + "grad_norm": 1.158328890800476, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 79810 + }, + { + "epoch": 5.73213644524237, + "grad_norm": 0.9904412031173706, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 79820 + }, + { + "epoch": 5.732854578096948, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 79830 + }, + { + "epoch": 5.733572710951526, + "grad_norm": 1.0224473476409912, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 79840 + }, + { + "epoch": 5.734290843806104, + "grad_norm": 1.0482215881347656, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 79850 + }, + { + "epoch": 5.735008976660682, + "grad_norm": 0.9790018200874329, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 79860 + }, + { + "epoch": 5.735727109515261, + "grad_norm": 1.034548044204712, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 79870 + }, + { + "epoch": 5.736445242369839, + "grad_norm": 0.799286961555481, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 79880 + }, + { + "epoch": 5.737163375224417, + "grad_norm": 1.0119048357009888, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 79890 + }, + { + "epoch": 5.737881508078995, + "grad_norm": 0.9742264151573181, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 79900 + }, + { + "epoch": 5.738599640933573, + "grad_norm": 1.0408239364624023, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 79910 + }, + { + "epoch": 5.739317773788151, + "grad_norm": 0.9165748953819275, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 79920 + }, + { + "epoch": 5.740035906642729, + "grad_norm": 1.1859451532363892, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 79930 + }, + { + "epoch": 5.740754039497307, + "grad_norm": 0.8772084712982178, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 79940 + }, + { + "epoch": 5.741472172351886, + "grad_norm": 1.0123273134231567, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 79950 + }, + { + "epoch": 5.742190305206464, + "grad_norm": 1.1873936653137207, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 79960 + }, + { + "epoch": 5.742908438061042, + "grad_norm": 0.9065699577331543, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 79970 + }, + { + "epoch": 5.74362657091562, + "grad_norm": 1.1626464128494263, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 79980 + }, + { + "epoch": 5.744344703770198, + "grad_norm": 1.0311716794967651, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 79990 + }, + { + "epoch": 5.745062836624776, + "grad_norm": 1.0865558385849, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 80000 + }, + { + "epoch": 5.745780969479354, + "grad_norm": 1.0257176160812378, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 80010 + }, + { + "epoch": 5.746499102333932, + "grad_norm": 0.9805439710617065, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 80020 + }, + { + "epoch": 5.74721723518851, + "grad_norm": 0.9744977355003357, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 80030 + }, + { + "epoch": 5.747935368043088, + "grad_norm": 1.302816390991211, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 80040 + }, + { + "epoch": 5.748653500897666, + "grad_norm": 0.8866990208625793, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 80050 + }, + { + "epoch": 5.7493716337522445, + "grad_norm": 1.0133726596832275, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 80060 + }, + { + "epoch": 5.7500897666068225, + "grad_norm": 1.0043569803237915, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 80070 + }, + { + "epoch": 5.7508078994614005, + "grad_norm": 0.9100040197372437, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 80080 + }, + { + "epoch": 5.7515260323159785, + "grad_norm": 0.7994180917739868, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 80090 + }, + { + "epoch": 5.7522441651705565, + "grad_norm": 1.120188593864441, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 80100 + }, + { + "epoch": 5.7529622980251345, + "grad_norm": 0.9555420279502869, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 80110 + }, + { + "epoch": 5.7536804308797125, + "grad_norm": 1.0305951833724976, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 80120 + }, + { + "epoch": 5.7543985637342905, + "grad_norm": 0.9632731676101685, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 80130 + }, + { + "epoch": 5.755116696588869, + "grad_norm": 1.2654297351837158, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 80140 + }, + { + "epoch": 5.755834829443447, + "grad_norm": 1.027190089225769, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 80150 + }, + { + "epoch": 5.756552962298025, + "grad_norm": 0.9829175472259521, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80160 + }, + { + "epoch": 5.757271095152603, + "grad_norm": 1.083803653717041, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 80170 + }, + { + "epoch": 5.757989228007181, + "grad_norm": 0.9353913068771362, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 80180 + }, + { + "epoch": 5.758707360861759, + "grad_norm": 1.1824370622634888, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 80190 + }, + { + "epoch": 5.759425493716337, + "grad_norm": 1.0901048183441162, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 80200 + }, + { + "epoch": 5.760143626570915, + "grad_norm": 1.0389254093170166, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 80210 + }, + { + "epoch": 5.760861759425493, + "grad_norm": 0.9746400117874146, + "learning_rate": 0.0002, + "loss": 0.6085, + "step": 80220 + }, + { + "epoch": 5.761579892280071, + "grad_norm": 0.9319248795509338, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 80230 + }, + { + "epoch": 5.76229802513465, + "grad_norm": 1.152784824371338, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 80240 + }, + { + "epoch": 5.763016157989228, + "grad_norm": 0.9462733864784241, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 80250 + }, + { + "epoch": 5.763734290843806, + "grad_norm": 0.8884182572364807, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 80260 + }, + { + "epoch": 5.764452423698384, + "grad_norm": 0.8755964636802673, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80270 + }, + { + "epoch": 5.765170556552962, + "grad_norm": 0.8983452320098877, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 80280 + }, + { + "epoch": 5.76588868940754, + "grad_norm": 0.8565991520881653, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 80290 + }, + { + "epoch": 5.766606822262118, + "grad_norm": 1.0557159185409546, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 80300 + }, + { + "epoch": 5.767324955116696, + "grad_norm": 1.057214379310608, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 80310 + }, + { + "epoch": 5.768043087971275, + "grad_norm": 0.9852516055107117, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 80320 + }, + { + "epoch": 5.768761220825853, + "grad_norm": 1.0339698791503906, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 80330 + }, + { + "epoch": 5.769479353680431, + "grad_norm": 1.0056889057159424, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 80340 + }, + { + "epoch": 5.770197486535009, + "grad_norm": 1.0941663980484009, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 80350 + }, + { + "epoch": 5.770915619389587, + "grad_norm": 1.2145589590072632, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80360 + }, + { + "epoch": 5.771633752244165, + "grad_norm": 0.9609606862068176, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 80370 + }, + { + "epoch": 5.772351885098743, + "grad_norm": 0.8815773129463196, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 80380 + }, + { + "epoch": 5.773070017953321, + "grad_norm": 1.2630987167358398, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 80390 + }, + { + "epoch": 5.773788150807899, + "grad_norm": 1.0605450868606567, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 80400 + }, + { + "epoch": 5.774506283662477, + "grad_norm": 1.165069341659546, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 80410 + }, + { + "epoch": 5.775224416517055, + "grad_norm": 0.9038028717041016, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 80420 + }, + { + "epoch": 5.775942549371634, + "grad_norm": 1.0571858882904053, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 80430 + }, + { + "epoch": 5.776660682226212, + "grad_norm": 1.0388168096542358, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 80440 + }, + { + "epoch": 5.77737881508079, + "grad_norm": 1.0552119016647339, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 80450 + }, + { + "epoch": 5.778096947935368, + "grad_norm": 1.0610109567642212, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 80460 + }, + { + "epoch": 5.778815080789946, + "grad_norm": 0.9906430244445801, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 80470 + }, + { + "epoch": 5.779533213644524, + "grad_norm": 1.1511857509613037, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 80480 + }, + { + "epoch": 5.780251346499102, + "grad_norm": 1.2738412618637085, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 80490 + }, + { + "epoch": 5.78096947935368, + "grad_norm": 0.8945937752723694, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 80500 + }, + { + "epoch": 5.781687612208259, + "grad_norm": 1.1105149984359741, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 80510 + }, + { + "epoch": 5.782405745062837, + "grad_norm": 0.8432297110557556, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 80520 + }, + { + "epoch": 5.783123877917415, + "grad_norm": 0.9257984757423401, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 80530 + }, + { + "epoch": 5.783842010771993, + "grad_norm": 1.1708799600601196, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 80540 + }, + { + "epoch": 5.784560143626571, + "grad_norm": 0.9969521164894104, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 80550 + }, + { + "epoch": 5.785278276481149, + "grad_norm": 1.0361413955688477, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 80560 + }, + { + "epoch": 5.785996409335727, + "grad_norm": 0.9876393675804138, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 80570 + }, + { + "epoch": 5.786714542190305, + "grad_norm": 1.0356241464614868, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 80580 + }, + { + "epoch": 5.787432675044883, + "grad_norm": 1.178865671157837, + "learning_rate": 0.0002, + "loss": 0.5647, + "step": 80590 + }, + { + "epoch": 5.788150807899461, + "grad_norm": 0.8614338636398315, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 80600 + }, + { + "epoch": 5.788868940754039, + "grad_norm": 1.020734429359436, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 80610 + }, + { + "epoch": 5.789587073608618, + "grad_norm": 1.035951852798462, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 80620 + }, + { + "epoch": 5.790305206463196, + "grad_norm": 0.898637592792511, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 80630 + }, + { + "epoch": 5.791023339317774, + "grad_norm": 0.9803016781806946, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 80640 + }, + { + "epoch": 5.791741472172352, + "grad_norm": 1.2902555465698242, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 80650 + }, + { + "epoch": 5.79245960502693, + "grad_norm": 1.3364112377166748, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 80660 + }, + { + "epoch": 5.793177737881508, + "grad_norm": 0.8553985953330994, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 80670 + }, + { + "epoch": 5.793895870736086, + "grad_norm": 0.8211889863014221, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 80680 + }, + { + "epoch": 5.794614003590664, + "grad_norm": 0.9288306832313538, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 80690 + }, + { + "epoch": 5.795332136445243, + "grad_norm": 1.0716029405593872, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 80700 + }, + { + "epoch": 5.796050269299821, + "grad_norm": 0.9957329034805298, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 80710 + }, + { + "epoch": 5.796768402154399, + "grad_norm": 0.9691376090049744, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 80720 + }, + { + "epoch": 5.797486535008977, + "grad_norm": 1.0590804815292358, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 80730 + }, + { + "epoch": 5.798204667863555, + "grad_norm": 1.0408968925476074, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 80740 + }, + { + "epoch": 5.798922800718133, + "grad_norm": 1.0249526500701904, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 80750 + }, + { + "epoch": 5.799640933572711, + "grad_norm": 1.3658806085586548, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 80760 + }, + { + "epoch": 5.800359066427289, + "grad_norm": 0.9562603831291199, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 80770 + }, + { + "epoch": 5.801077199281867, + "grad_norm": 0.8790915012359619, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 80780 + }, + { + "epoch": 5.801795332136445, + "grad_norm": 0.8351004123687744, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 80790 + }, + { + "epoch": 5.802513464991024, + "grad_norm": 0.964562714099884, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 80800 + }, + { + "epoch": 5.803231597845602, + "grad_norm": 1.0873116254806519, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 80810 + }, + { + "epoch": 5.80394973070018, + "grad_norm": 0.9821216464042664, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 80820 + }, + { + "epoch": 5.804667863554758, + "grad_norm": 1.1158807277679443, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 80830 + }, + { + "epoch": 5.805385996409336, + "grad_norm": 1.0098856687545776, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 80840 + }, + { + "epoch": 5.806104129263914, + "grad_norm": 0.9628035426139832, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 80850 + }, + { + "epoch": 5.806822262118492, + "grad_norm": 1.133800983428955, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 80860 + }, + { + "epoch": 5.80754039497307, + "grad_norm": 0.9423992037773132, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 80870 + }, + { + "epoch": 5.808258527827648, + "grad_norm": 1.0758612155914307, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 80880 + }, + { + "epoch": 5.808976660682227, + "grad_norm": 1.232029914855957, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 80890 + }, + { + "epoch": 5.809694793536805, + "grad_norm": 1.1063108444213867, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 80900 + }, + { + "epoch": 5.810412926391383, + "grad_norm": 0.9759877920150757, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 80910 + }, + { + "epoch": 5.811131059245961, + "grad_norm": 0.9180193543434143, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 80920 + }, + { + "epoch": 5.811849192100539, + "grad_norm": 1.0818052291870117, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 80930 + }, + { + "epoch": 5.812567324955117, + "grad_norm": 0.998986542224884, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 80940 + }, + { + "epoch": 5.813285457809695, + "grad_norm": 1.1549060344696045, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 80950 + }, + { + "epoch": 5.814003590664273, + "grad_norm": 1.1900213956832886, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 80960 + }, + { + "epoch": 5.814721723518851, + "grad_norm": 0.8114368915557861, + "learning_rate": 0.0002, + "loss": 0.6249, + "step": 80970 + }, + { + "epoch": 5.815439856373429, + "grad_norm": 1.0296406745910645, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 80980 + }, + { + "epoch": 5.8161579892280075, + "grad_norm": 1.0466746091842651, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 80990 + }, + { + "epoch": 5.8168761220825855, + "grad_norm": 1.0524508953094482, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 81000 + }, + { + "epoch": 5.8175942549371635, + "grad_norm": 1.1588358879089355, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 81010 + }, + { + "epoch": 5.8183123877917415, + "grad_norm": 0.9378601908683777, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 81020 + }, + { + "epoch": 5.8190305206463195, + "grad_norm": 0.9486441612243652, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 81030 + }, + { + "epoch": 5.8197486535008975, + "grad_norm": 0.9805227518081665, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 81040 + }, + { + "epoch": 5.8204667863554755, + "grad_norm": 1.1627717018127441, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81050 + }, + { + "epoch": 5.8211849192100535, + "grad_norm": 1.0716841220855713, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 81060 + }, + { + "epoch": 5.821903052064632, + "grad_norm": 1.2398899793624878, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 81070 + }, + { + "epoch": 5.82262118491921, + "grad_norm": 1.0934730768203735, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 81080 + }, + { + "epoch": 5.823339317773788, + "grad_norm": 0.9701796174049377, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 81090 + }, + { + "epoch": 5.824057450628366, + "grad_norm": 1.0218969583511353, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 81100 + }, + { + "epoch": 5.824775583482944, + "grad_norm": 1.3066465854644775, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 81110 + }, + { + "epoch": 5.825493716337522, + "grad_norm": 1.1067441701889038, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 81120 + }, + { + "epoch": 5.8262118491921004, + "grad_norm": 0.9750344753265381, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81130 + }, + { + "epoch": 5.8269299820466784, + "grad_norm": 1.129191279411316, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 81140 + }, + { + "epoch": 5.8276481149012564, + "grad_norm": 1.05964195728302, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 81150 + }, + { + "epoch": 5.8283662477558345, + "grad_norm": 1.1094872951507568, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 81160 + }, + { + "epoch": 5.8290843806104125, + "grad_norm": 0.9163196086883545, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 81170 + }, + { + "epoch": 5.829802513464991, + "grad_norm": 1.0035687685012817, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 81180 + }, + { + "epoch": 5.830520646319569, + "grad_norm": 1.0353461503982544, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 81190 + }, + { + "epoch": 5.831238779174147, + "grad_norm": 1.0566555261611938, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81200 + }, + { + "epoch": 5.831956912028725, + "grad_norm": 1.2373290061950684, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 81210 + }, + { + "epoch": 5.832675044883303, + "grad_norm": 0.8818837404251099, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 81220 + }, + { + "epoch": 5.833393177737881, + "grad_norm": 1.1024713516235352, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 81230 + }, + { + "epoch": 5.834111310592459, + "grad_norm": 1.2478809356689453, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 81240 + }, + { + "epoch": 5.834829443447037, + "grad_norm": 0.8647364377975464, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 81250 + }, + { + "epoch": 5.835547576301616, + "grad_norm": 1.1106358766555786, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 81260 + }, + { + "epoch": 5.836265709156194, + "grad_norm": 0.9432938694953918, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 81270 + }, + { + "epoch": 5.836983842010772, + "grad_norm": 1.0283797979354858, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 81280 + }, + { + "epoch": 5.83770197486535, + "grad_norm": 1.158918857574463, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 81290 + }, + { + "epoch": 5.838420107719928, + "grad_norm": 0.9700069427490234, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 81300 + }, + { + "epoch": 5.839138240574506, + "grad_norm": 1.08310866355896, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 81310 + }, + { + "epoch": 5.839856373429084, + "grad_norm": 1.05460524559021, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 81320 + }, + { + "epoch": 5.840574506283662, + "grad_norm": 0.9849268794059753, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 81330 + }, + { + "epoch": 5.84129263913824, + "grad_norm": 0.888306736946106, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 81340 + }, + { + "epoch": 5.842010771992818, + "grad_norm": 1.0337001085281372, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81350 + }, + { + "epoch": 5.842728904847397, + "grad_norm": 1.0778567790985107, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 81360 + }, + { + "epoch": 5.843447037701975, + "grad_norm": 1.1484156847000122, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 81370 + }, + { + "epoch": 5.844165170556553, + "grad_norm": 1.0948245525360107, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 81380 + }, + { + "epoch": 5.844883303411131, + "grad_norm": 0.9363969564437866, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 81390 + }, + { + "epoch": 5.845601436265709, + "grad_norm": 1.0151013135910034, + "learning_rate": 0.0002, + "loss": 0.6336, + "step": 81400 + }, + { + "epoch": 5.846319569120287, + "grad_norm": 0.9925733804702759, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 81410 + }, + { + "epoch": 5.847037701974865, + "grad_norm": 1.0356744527816772, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 81420 + }, + { + "epoch": 5.847755834829443, + "grad_norm": 1.0633001327514648, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 81430 + }, + { + "epoch": 5.848473967684021, + "grad_norm": 0.9900460839271545, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 81440 + }, + { + "epoch": 5.8491921005386, + "grad_norm": 1.2677979469299316, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 81450 + }, + { + "epoch": 5.849910233393178, + "grad_norm": 0.8174138069152832, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 81460 + }, + { + "epoch": 5.850628366247756, + "grad_norm": 1.1986393928527832, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 81470 + }, + { + "epoch": 5.851346499102334, + "grad_norm": 1.1009358167648315, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 81480 + }, + { + "epoch": 5.852064631956912, + "grad_norm": 0.966446578502655, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81490 + }, + { + "epoch": 5.85278276481149, + "grad_norm": 0.9657767415046692, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 81500 + }, + { + "epoch": 5.853500897666068, + "grad_norm": 1.0480058193206787, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 81510 + }, + { + "epoch": 5.854219030520646, + "grad_norm": 1.2003830671310425, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 81520 + }, + { + "epoch": 5.854937163375224, + "grad_norm": 0.8683754205703735, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 81530 + }, + { + "epoch": 5.855655296229802, + "grad_norm": 1.0860967636108398, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 81540 + }, + { + "epoch": 5.856373429084381, + "grad_norm": 1.0415282249450684, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 81550 + }, + { + "epoch": 5.857091561938959, + "grad_norm": 0.9897454380989075, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 81560 + }, + { + "epoch": 5.857809694793537, + "grad_norm": 1.173884630203247, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 81570 + }, + { + "epoch": 5.858527827648115, + "grad_norm": 1.2426209449768066, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 81580 + }, + { + "epoch": 5.859245960502693, + "grad_norm": 0.9390465021133423, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 81590 + }, + { + "epoch": 5.859964093357271, + "grad_norm": 1.1387195587158203, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 81600 + }, + { + "epoch": 5.860682226211849, + "grad_norm": 0.9902143478393555, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 81610 + }, + { + "epoch": 5.861400359066427, + "grad_norm": 0.8328776359558105, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 81620 + }, + { + "epoch": 5.862118491921006, + "grad_norm": 0.9837837815284729, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 81630 + }, + { + "epoch": 5.862836624775584, + "grad_norm": 1.0013370513916016, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 81640 + }, + { + "epoch": 5.863554757630162, + "grad_norm": 0.9408028721809387, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 81650 + }, + { + "epoch": 5.86427289048474, + "grad_norm": 1.093140959739685, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 81660 + }, + { + "epoch": 5.864991023339318, + "grad_norm": 0.9554300904273987, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 81670 + }, + { + "epoch": 5.865709156193896, + "grad_norm": 1.1276485919952393, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 81680 + }, + { + "epoch": 5.866427289048474, + "grad_norm": 0.9628785252571106, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 81690 + }, + { + "epoch": 5.867145421903052, + "grad_norm": 0.9844689965248108, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 81700 + }, + { + "epoch": 5.86786355475763, + "grad_norm": 0.9679856896400452, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 81710 + }, + { + "epoch": 5.868581687612208, + "grad_norm": 1.0225571393966675, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 81720 + }, + { + "epoch": 5.869299820466786, + "grad_norm": 0.9330390691757202, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81730 + }, + { + "epoch": 5.870017953321365, + "grad_norm": 1.0584566593170166, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 81740 + }, + { + "epoch": 5.870736086175943, + "grad_norm": 0.781548023223877, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 81750 + }, + { + "epoch": 5.871454219030521, + "grad_norm": 0.8906106352806091, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 81760 + }, + { + "epoch": 5.872172351885099, + "grad_norm": 1.1402281522750854, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 81770 + }, + { + "epoch": 5.872890484739677, + "grad_norm": 0.9991076588630676, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 81780 + }, + { + "epoch": 5.873608617594255, + "grad_norm": 1.0120140314102173, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 81790 + }, + { + "epoch": 5.874326750448833, + "grad_norm": 0.8857715725898743, + "learning_rate": 0.0002, + "loss": 0.6114, + "step": 81800 + }, + { + "epoch": 5.875044883303411, + "grad_norm": 0.8531954288482666, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 81810 + }, + { + "epoch": 5.87576301615799, + "grad_norm": 1.1601015329360962, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 81820 + }, + { + "epoch": 5.876481149012568, + "grad_norm": 1.1435350179672241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 81830 + }, + { + "epoch": 5.877199281867146, + "grad_norm": 0.9526153802871704, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 81840 + }, + { + "epoch": 5.877917414721724, + "grad_norm": 1.06845223903656, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 81850 + }, + { + "epoch": 5.878635547576302, + "grad_norm": 0.9239344596862793, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 81860 + }, + { + "epoch": 5.87935368043088, + "grad_norm": 0.8632398247718811, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 81870 + }, + { + "epoch": 5.880071813285458, + "grad_norm": 0.9148443341255188, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 81880 + }, + { + "epoch": 5.880789946140036, + "grad_norm": 0.9910652041435242, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 81890 + }, + { + "epoch": 5.881508078994614, + "grad_norm": 0.8335179090499878, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 81900 + }, + { + "epoch": 5.882226211849192, + "grad_norm": 0.9921387434005737, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 81910 + }, + { + "epoch": 5.88294434470377, + "grad_norm": 1.0532517433166504, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 81920 + }, + { + "epoch": 5.883662477558349, + "grad_norm": 1.026400089263916, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 81930 + }, + { + "epoch": 5.884380610412927, + "grad_norm": 1.019195318222046, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 81940 + }, + { + "epoch": 5.885098743267505, + "grad_norm": 0.987238347530365, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 81950 + }, + { + "epoch": 5.885816876122083, + "grad_norm": 1.1714487075805664, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 81960 + }, + { + "epoch": 5.886535008976661, + "grad_norm": 1.0854483842849731, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 81970 + }, + { + "epoch": 5.887253141831239, + "grad_norm": 1.0678396224975586, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 81980 + }, + { + "epoch": 5.887971274685817, + "grad_norm": 1.1009471416473389, + "learning_rate": 0.0002, + "loss": 0.6061, + "step": 81990 + }, + { + "epoch": 5.888689407540395, + "grad_norm": 1.2056844234466553, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 82000 + }, + { + "epoch": 5.8894075403949735, + "grad_norm": 1.131302833557129, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 82010 + }, + { + "epoch": 5.8901256732495515, + "grad_norm": 1.4466036558151245, + "learning_rate": 0.0002, + "loss": 0.5822, + "step": 82020 + }, + { + "epoch": 5.8908438061041295, + "grad_norm": 1.051228404045105, + "learning_rate": 0.0002, + "loss": 0.6295, + "step": 82030 + }, + { + "epoch": 5.8915619389587075, + "grad_norm": 1.0010617971420288, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 82040 + }, + { + "epoch": 5.8922800718132855, + "grad_norm": 0.9095138311386108, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 82050 + }, + { + "epoch": 5.8929982046678635, + "grad_norm": 1.0237005949020386, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 82060 + }, + { + "epoch": 5.8937163375224415, + "grad_norm": 1.035122036933899, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 82070 + }, + { + "epoch": 5.8944344703770195, + "grad_norm": 1.0271964073181152, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82080 + }, + { + "epoch": 5.8951526032315975, + "grad_norm": 1.2044503688812256, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 82090 + }, + { + "epoch": 5.8958707360861755, + "grad_norm": 1.0275284051895142, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 82100 + }, + { + "epoch": 5.896588868940754, + "grad_norm": 0.9974840879440308, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 82110 + }, + { + "epoch": 5.897307001795332, + "grad_norm": 1.009968638420105, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 82120 + }, + { + "epoch": 5.89802513464991, + "grad_norm": 0.8396142721176147, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 82130 + }, + { + "epoch": 5.898743267504488, + "grad_norm": 1.002354621887207, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 82140 + }, + { + "epoch": 5.899461400359066, + "grad_norm": 0.9998893737792969, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82150 + }, + { + "epoch": 5.900179533213644, + "grad_norm": 1.1027010679244995, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 82160 + }, + { + "epoch": 5.900897666068222, + "grad_norm": 1.2028530836105347, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 82170 + }, + { + "epoch": 5.9016157989228, + "grad_norm": 1.0018759965896606, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 82180 + }, + { + "epoch": 5.902333931777379, + "grad_norm": 0.8911277055740356, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 82190 + }, + { + "epoch": 5.903052064631957, + "grad_norm": 1.0172009468078613, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 82200 + }, + { + "epoch": 5.903770197486535, + "grad_norm": 1.1664029359817505, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 82210 + }, + { + "epoch": 5.904488330341113, + "grad_norm": 1.0620089769363403, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 82220 + }, + { + "epoch": 5.905206463195691, + "grad_norm": 1.0756114721298218, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 82230 + }, + { + "epoch": 5.905924596050269, + "grad_norm": 1.1727497577667236, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 82240 + }, + { + "epoch": 5.906642728904847, + "grad_norm": 0.9833515882492065, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 82250 + }, + { + "epoch": 5.907360861759425, + "grad_norm": 0.9236368536949158, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 82260 + }, + { + "epoch": 5.908078994614003, + "grad_norm": 0.9773947596549988, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 82270 + }, + { + "epoch": 5.908797127468581, + "grad_norm": 1.1427783966064453, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 82280 + }, + { + "epoch": 5.909515260323159, + "grad_norm": 1.0215164422988892, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 82290 + }, + { + "epoch": 5.910233393177738, + "grad_norm": 1.1157845258712769, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 82300 + }, + { + "epoch": 5.910951526032316, + "grad_norm": 1.1490662097930908, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 82310 + }, + { + "epoch": 5.911669658886894, + "grad_norm": 0.7233976125717163, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 82320 + }, + { + "epoch": 5.912387791741472, + "grad_norm": 1.0053865909576416, + "learning_rate": 0.0002, + "loss": 0.6199, + "step": 82330 + }, + { + "epoch": 5.91310592459605, + "grad_norm": 0.9764766097068787, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 82340 + }, + { + "epoch": 5.913824057450628, + "grad_norm": 0.9492928385734558, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 82350 + }, + { + "epoch": 5.914542190305206, + "grad_norm": 0.9538891315460205, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82360 + }, + { + "epoch": 5.915260323159784, + "grad_norm": 1.2620314359664917, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 82370 + }, + { + "epoch": 5.915978456014363, + "grad_norm": 0.9913349151611328, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 82380 + }, + { + "epoch": 5.916696588868941, + "grad_norm": 0.9712074995040894, + "learning_rate": 0.0002, + "loss": 0.5877, + "step": 82390 + }, + { + "epoch": 5.917414721723519, + "grad_norm": 1.1554654836654663, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 82400 + }, + { + "epoch": 5.918132854578097, + "grad_norm": 1.1418904066085815, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 82410 + }, + { + "epoch": 5.918850987432675, + "grad_norm": 0.9405845999717712, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 82420 + }, + { + "epoch": 5.919569120287253, + "grad_norm": 1.0801819562911987, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 82430 + }, + { + "epoch": 5.920287253141831, + "grad_norm": 0.8643896579742432, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 82440 + }, + { + "epoch": 5.921005385996409, + "grad_norm": 1.106025218963623, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 82450 + }, + { + "epoch": 5.921723518850987, + "grad_norm": 1.0338234901428223, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 82460 + }, + { + "epoch": 5.922441651705565, + "grad_norm": 1.0648493766784668, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 82470 + }, + { + "epoch": 5.923159784560143, + "grad_norm": 1.1950433254241943, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 82480 + }, + { + "epoch": 5.923877917414722, + "grad_norm": 0.8730897903442383, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 82490 + }, + { + "epoch": 5.9245960502693, + "grad_norm": 1.2262312173843384, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 82500 + }, + { + "epoch": 5.925314183123878, + "grad_norm": 0.9526116251945496, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 82510 + }, + { + "epoch": 5.926032315978456, + "grad_norm": 1.0540224313735962, + "learning_rate": 0.0002, + "loss": 0.6372, + "step": 82520 + }, + { + "epoch": 5.926750448833034, + "grad_norm": 1.0537306070327759, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 82530 + }, + { + "epoch": 5.927468581687612, + "grad_norm": 1.134207844734192, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 82540 + }, + { + "epoch": 5.92818671454219, + "grad_norm": 0.9042250514030457, + "learning_rate": 0.0002, + "loss": 0.622, + "step": 82550 + }, + { + "epoch": 5.928904847396768, + "grad_norm": 1.0424834489822388, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82560 + }, + { + "epoch": 5.929622980251347, + "grad_norm": 1.1571602821350098, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 82570 + }, + { + "epoch": 5.930341113105925, + "grad_norm": 1.1033377647399902, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 82580 + }, + { + "epoch": 5.931059245960503, + "grad_norm": 0.9211772680282593, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 82590 + }, + { + "epoch": 5.931777378815081, + "grad_norm": 1.0566459894180298, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 82600 + }, + { + "epoch": 5.932495511669659, + "grad_norm": 1.1773834228515625, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 82610 + }, + { + "epoch": 5.933213644524237, + "grad_norm": 1.193396806716919, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82620 + }, + { + "epoch": 5.933931777378815, + "grad_norm": 1.1101785898208618, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 82630 + }, + { + "epoch": 5.934649910233393, + "grad_norm": 0.6988118886947632, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 82640 + }, + { + "epoch": 5.935368043087971, + "grad_norm": 0.9590985774993896, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 82650 + }, + { + "epoch": 5.936086175942549, + "grad_norm": 0.8512062430381775, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 82660 + }, + { + "epoch": 5.936804308797128, + "grad_norm": 1.0381710529327393, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 82670 + }, + { + "epoch": 5.937522441651706, + "grad_norm": 1.0816296339035034, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 82680 + }, + { + "epoch": 5.938240574506284, + "grad_norm": 1.0592364072799683, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 82690 + }, + { + "epoch": 5.938958707360862, + "grad_norm": 0.737452507019043, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 82700 + }, + { + "epoch": 5.93967684021544, + "grad_norm": 0.9019039869308472, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82710 + }, + { + "epoch": 5.940394973070018, + "grad_norm": 1.0049666166305542, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 82720 + }, + { + "epoch": 5.941113105924596, + "grad_norm": 1.0016309022903442, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 82730 + }, + { + "epoch": 5.941831238779174, + "grad_norm": 0.7967594861984253, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 82740 + }, + { + "epoch": 5.942549371633753, + "grad_norm": 0.8978520631790161, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 82750 + }, + { + "epoch": 5.943267504488331, + "grad_norm": 1.0101654529571533, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 82760 + }, + { + "epoch": 5.943985637342909, + "grad_norm": 1.1515586376190186, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 82770 + }, + { + "epoch": 5.944703770197487, + "grad_norm": 0.8666134476661682, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 82780 + }, + { + "epoch": 5.945421903052065, + "grad_norm": 1.1365231275558472, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 82790 + }, + { + "epoch": 5.946140035906643, + "grad_norm": 1.211229920387268, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 82800 + }, + { + "epoch": 5.946858168761221, + "grad_norm": 0.9900869727134705, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 82810 + }, + { + "epoch": 5.947576301615799, + "grad_norm": 0.9555928111076355, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 82820 + }, + { + "epoch": 5.948294434470377, + "grad_norm": 0.8468470573425293, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 82830 + }, + { + "epoch": 5.949012567324955, + "grad_norm": 1.0280319452285767, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 82840 + }, + { + "epoch": 5.949730700179533, + "grad_norm": 0.930145800113678, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 82850 + }, + { + "epoch": 5.950448833034112, + "grad_norm": 1.0677028894424438, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 82860 + }, + { + "epoch": 5.95116696588869, + "grad_norm": 1.2035255432128906, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 82870 + }, + { + "epoch": 5.951885098743268, + "grad_norm": 0.897537887096405, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 82880 + }, + { + "epoch": 5.952603231597846, + "grad_norm": 1.2858690023422241, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 82890 + }, + { + "epoch": 5.953321364452424, + "grad_norm": 1.0300413370132446, + "learning_rate": 0.0002, + "loss": 0.6111, + "step": 82900 + }, + { + "epoch": 5.954039497307002, + "grad_norm": 0.9873301982879639, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 82910 + }, + { + "epoch": 5.95475763016158, + "grad_norm": 1.0315600633621216, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 82920 + }, + { + "epoch": 5.955475763016158, + "grad_norm": 1.0631790161132812, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 82930 + }, + { + "epoch": 5.9561938958707366, + "grad_norm": 1.035544514656067, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 82940 + }, + { + "epoch": 5.956912028725315, + "grad_norm": 1.0162041187286377, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 82950 + }, + { + "epoch": 5.957630161579893, + "grad_norm": 0.7858892679214478, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 82960 + }, + { + "epoch": 5.958348294434471, + "grad_norm": 1.0359784364700317, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 82970 + }, + { + "epoch": 5.959066427289049, + "grad_norm": 1.057173252105713, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 82980 + }, + { + "epoch": 5.959784560143627, + "grad_norm": 1.1017464399337769, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 82990 + }, + { + "epoch": 5.960502692998205, + "grad_norm": 1.0688945055007935, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 83000 + }, + { + "epoch": 5.961220825852783, + "grad_norm": 1.048864483833313, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 83010 + }, + { + "epoch": 5.961938958707361, + "grad_norm": 1.057308316230774, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83020 + }, + { + "epoch": 5.962657091561939, + "grad_norm": 0.9014604687690735, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 83030 + }, + { + "epoch": 5.963375224416517, + "grad_norm": 0.9899709224700928, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 83040 + }, + { + "epoch": 5.9640933572710955, + "grad_norm": 1.0675519704818726, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 83050 + }, + { + "epoch": 5.9648114901256735, + "grad_norm": 0.9497889876365662, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 83060 + }, + { + "epoch": 5.9655296229802515, + "grad_norm": 0.9149549603462219, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 83070 + }, + { + "epoch": 5.9662477558348295, + "grad_norm": 1.329373836517334, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 83080 + }, + { + "epoch": 5.9669658886894075, + "grad_norm": 1.0731712579727173, + "learning_rate": 0.0002, + "loss": 0.6077, + "step": 83090 + }, + { + "epoch": 5.9676840215439855, + "grad_norm": 0.9498835802078247, + "learning_rate": 0.0002, + "loss": 0.6269, + "step": 83100 + }, + { + "epoch": 5.9684021543985635, + "grad_norm": 1.1222829818725586, + "learning_rate": 0.0002, + "loss": 0.6196, + "step": 83110 + }, + { + "epoch": 5.9691202872531415, + "grad_norm": 0.9923429489135742, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 83120 + }, + { + "epoch": 5.96983842010772, + "grad_norm": 0.9046645164489746, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 83130 + }, + { + "epoch": 5.970556552962298, + "grad_norm": 0.9259500503540039, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 83140 + }, + { + "epoch": 5.971274685816876, + "grad_norm": 1.0604174137115479, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 83150 + }, + { + "epoch": 5.971992818671454, + "grad_norm": 1.0391676425933838, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 83160 + }, + { + "epoch": 5.972710951526032, + "grad_norm": 0.8825796246528625, + "learning_rate": 0.0002, + "loss": 0.5861, + "step": 83170 + }, + { + "epoch": 5.97342908438061, + "grad_norm": 0.9687952399253845, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 83180 + }, + { + "epoch": 5.974147217235188, + "grad_norm": 0.9401392340660095, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 83190 + }, + { + "epoch": 5.974865350089766, + "grad_norm": 1.0526834726333618, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 83200 + }, + { + "epoch": 5.975583482944344, + "grad_norm": 1.1882060766220093, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 83210 + }, + { + "epoch": 5.976301615798922, + "grad_norm": 0.9182824492454529, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 83220 + }, + { + "epoch": 5.977019748653501, + "grad_norm": 1.344875454902649, + "learning_rate": 0.0002, + "loss": 0.6092, + "step": 83230 + }, + { + "epoch": 5.977737881508079, + "grad_norm": 1.3868434429168701, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 83240 + }, + { + "epoch": 5.978456014362657, + "grad_norm": 1.2702280282974243, + "learning_rate": 0.0002, + "loss": 0.6187, + "step": 83250 + }, + { + "epoch": 5.979174147217235, + "grad_norm": 0.9808234572410583, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 83260 + }, + { + "epoch": 5.979892280071813, + "grad_norm": 0.9225142598152161, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 83270 + }, + { + "epoch": 5.980610412926391, + "grad_norm": 1.1095874309539795, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 83280 + }, + { + "epoch": 5.981328545780969, + "grad_norm": 1.2650344371795654, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 83290 + }, + { + "epoch": 5.982046678635547, + "grad_norm": 0.8230084180831909, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 83300 + }, + { + "epoch": 5.982764811490125, + "grad_norm": 1.171427607536316, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 83310 + }, + { + "epoch": 5.983482944344704, + "grad_norm": 0.7458868026733398, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 83320 + }, + { + "epoch": 5.984201077199282, + "grad_norm": 0.9238616228103638, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 83330 + }, + { + "epoch": 5.98491921005386, + "grad_norm": 1.027495265007019, + "learning_rate": 0.0002, + "loss": 0.6316, + "step": 83340 + }, + { + "epoch": 5.985637342908438, + "grad_norm": 1.0694037675857544, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 83350 + }, + { + "epoch": 5.986355475763016, + "grad_norm": 0.9498767256736755, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 83360 + }, + { + "epoch": 5.987073608617594, + "grad_norm": 1.0524284839630127, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 83370 + }, + { + "epoch": 5.987791741472172, + "grad_norm": 1.07961905002594, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 83380 + }, + { + "epoch": 5.98850987432675, + "grad_norm": 1.1436965465545654, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83390 + }, + { + "epoch": 5.989228007181328, + "grad_norm": 1.2610782384872437, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 83400 + }, + { + "epoch": 5.989946140035906, + "grad_norm": 1.1105682849884033, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 83410 + }, + { + "epoch": 5.990664272890485, + "grad_norm": 0.9900349378585815, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 83420 + }, + { + "epoch": 5.991382405745063, + "grad_norm": 0.8766723275184631, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 83430 + }, + { + "epoch": 5.992100538599641, + "grad_norm": 0.9532597661018372, + "learning_rate": 0.0002, + "loss": 0.5944, + "step": 83440 + }, + { + "epoch": 5.992818671454219, + "grad_norm": 1.016831398010254, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 83450 + }, + { + "epoch": 5.993536804308797, + "grad_norm": 0.9884716272354126, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 83460 + }, + { + "epoch": 5.994254937163375, + "grad_norm": 0.9415417909622192, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 83470 + }, + { + "epoch": 5.994973070017953, + "grad_norm": 0.8629752397537231, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 83480 + }, + { + "epoch": 5.995691202872531, + "grad_norm": 1.061378002166748, + "learning_rate": 0.0002, + "loss": 0.5961, + "step": 83490 + }, + { + "epoch": 5.99640933572711, + "grad_norm": 0.907195508480072, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 83500 + }, + { + "epoch": 5.997127468581688, + "grad_norm": 1.023658037185669, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 83510 + }, + { + "epoch": 5.997845601436266, + "grad_norm": 0.9893278479576111, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 83520 + }, + { + "epoch": 5.998563734290844, + "grad_norm": 1.1909127235412598, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 83530 + }, + { + "epoch": 5.999281867145422, + "grad_norm": 1.1800892353057861, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 83540 + }, + { + "epoch": 6.0, + "grad_norm": 1.0822563171386719, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 83550 + }, + { + "epoch": 6.0, + "eval_loss": 1.1494214534759521, + "eval_runtime": 55.1809, + "eval_samples_per_second": 13.284, + "eval_steps_per_second": 1.667, + "step": 83550 + }, + { + "epoch": 6.000718132854578, + "grad_norm": 0.8760911226272583, + "learning_rate": 0.0002, + "loss": 0.529, + "step": 83560 + }, + { + "epoch": 6.001436265709156, + "grad_norm": 1.0037305355072021, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 83570 + }, + { + "epoch": 6.002154398563734, + "grad_norm": 1.0550320148468018, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 83580 + }, + { + "epoch": 6.002872531418312, + "grad_norm": 0.7841113805770874, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 83590 + }, + { + "epoch": 6.003590664272891, + "grad_norm": 1.1221094131469727, + "learning_rate": 0.0002, + "loss": 0.5332, + "step": 83600 + }, + { + "epoch": 6.004308797127469, + "grad_norm": 1.174143671989441, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 83610 + }, + { + "epoch": 6.005026929982047, + "grad_norm": 1.1316391229629517, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 83620 + }, + { + "epoch": 6.005745062836625, + "grad_norm": 0.9318140745162964, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 83630 + }, + { + "epoch": 6.006463195691203, + "grad_norm": 1.1589723825454712, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 83640 + }, + { + "epoch": 6.007181328545781, + "grad_norm": 0.7452214360237122, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 83650 + }, + { + "epoch": 6.007899461400359, + "grad_norm": 1.205767035484314, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 83660 + }, + { + "epoch": 6.008617594254937, + "grad_norm": 0.8741596341133118, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 83670 + }, + { + "epoch": 6.009335727109515, + "grad_norm": 1.152982234954834, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 83680 + }, + { + "epoch": 6.010053859964093, + "grad_norm": 1.2438874244689941, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 83690 + }, + { + "epoch": 6.010771992818672, + "grad_norm": 1.142795443534851, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 83700 + }, + { + "epoch": 6.01149012567325, + "grad_norm": 1.1999919414520264, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 83710 + }, + { + "epoch": 6.012208258527828, + "grad_norm": 1.1839698553085327, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 83720 + }, + { + "epoch": 6.012926391382406, + "grad_norm": 1.1131623983383179, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 83730 + }, + { + "epoch": 6.013644524236984, + "grad_norm": 0.8436203598976135, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 83740 + }, + { + "epoch": 6.014362657091562, + "grad_norm": 0.9938826560974121, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 83750 + }, + { + "epoch": 6.01508078994614, + "grad_norm": 1.1624900102615356, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 83760 + }, + { + "epoch": 6.015798922800718, + "grad_norm": 1.0212476253509521, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 83770 + }, + { + "epoch": 6.016517055655296, + "grad_norm": 0.8108501434326172, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 83780 + }, + { + "epoch": 6.017235188509875, + "grad_norm": 1.3106935024261475, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 83790 + }, + { + "epoch": 6.017953321364453, + "grad_norm": 1.3103147745132446, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 83800 + }, + { + "epoch": 6.018671454219031, + "grad_norm": 0.7501855492591858, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 83810 + }, + { + "epoch": 6.019389587073609, + "grad_norm": 0.9246482253074646, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 83820 + }, + { + "epoch": 6.020107719928187, + "grad_norm": 1.0305052995681763, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 83830 + }, + { + "epoch": 6.020825852782765, + "grad_norm": 1.0912569761276245, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 83840 + }, + { + "epoch": 6.021543985637343, + "grad_norm": 0.9320057034492493, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 83850 + }, + { + "epoch": 6.022262118491921, + "grad_norm": 1.160483479499817, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 83860 + }, + { + "epoch": 6.022980251346499, + "grad_norm": 1.0211237668991089, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 83870 + }, + { + "epoch": 6.023698384201078, + "grad_norm": 0.8101710081100464, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 83880 + }, + { + "epoch": 6.024416517055656, + "grad_norm": 1.0671406984329224, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 83890 + }, + { + "epoch": 6.025134649910234, + "grad_norm": 1.3084125518798828, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 83900 + }, + { + "epoch": 6.025852782764812, + "grad_norm": 1.0144813060760498, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 83910 + }, + { + "epoch": 6.02657091561939, + "grad_norm": 1.134848952293396, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 83920 + }, + { + "epoch": 6.027289048473968, + "grad_norm": 1.183115005493164, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 83930 + }, + { + "epoch": 6.028007181328546, + "grad_norm": 0.961912989616394, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 83940 + }, + { + "epoch": 6.028725314183124, + "grad_norm": 0.9033881425857544, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 83950 + }, + { + "epoch": 6.029443447037702, + "grad_norm": 1.0272901058197021, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 83960 + }, + { + "epoch": 6.03016157989228, + "grad_norm": 1.0007939338684082, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 83970 + }, + { + "epoch": 6.0308797127468585, + "grad_norm": 1.0941389799118042, + "learning_rate": 0.0002, + "loss": 0.5215, + "step": 83980 + }, + { + "epoch": 6.0315978456014365, + "grad_norm": 0.9068517088890076, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 83990 + }, + { + "epoch": 6.0323159784560145, + "grad_norm": 0.8636500835418701, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 84000 + }, + { + "epoch": 6.0330341113105925, + "grad_norm": 1.352675437927246, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 84010 + }, + { + "epoch": 6.0337522441651705, + "grad_norm": 1.0889637470245361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 84020 + }, + { + "epoch": 6.0344703770197485, + "grad_norm": 0.9063141345977783, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 84030 + }, + { + "epoch": 6.0351885098743265, + "grad_norm": 1.317254900932312, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 84040 + }, + { + "epoch": 6.0359066427289045, + "grad_norm": 1.1001603603363037, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 84050 + }, + { + "epoch": 6.0366247755834825, + "grad_norm": 0.8041839003562927, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 84060 + }, + { + "epoch": 6.037342908438061, + "grad_norm": 1.125082015991211, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 84070 + }, + { + "epoch": 6.038061041292639, + "grad_norm": 0.8926277160644531, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 84080 + }, + { + "epoch": 6.038779174147217, + "grad_norm": 1.0548304319381714, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 84090 + }, + { + "epoch": 6.039497307001795, + "grad_norm": 1.2299435138702393, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 84100 + }, + { + "epoch": 6.040215439856373, + "grad_norm": 0.7348281741142273, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 84110 + }, + { + "epoch": 6.040933572710951, + "grad_norm": 1.032209873199463, + "learning_rate": 0.0002, + "loss": 0.5598, + "step": 84120 + }, + { + "epoch": 6.041651705565529, + "grad_norm": 0.925134003162384, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 84130 + }, + { + "epoch": 6.042369838420107, + "grad_norm": 1.1078300476074219, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 84140 + }, + { + "epoch": 6.043087971274685, + "grad_norm": 0.9045702815055847, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 84150 + }, + { + "epoch": 6.043806104129264, + "grad_norm": 0.8836823105812073, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 84160 + }, + { + "epoch": 6.044524236983842, + "grad_norm": 0.8083572387695312, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 84170 + }, + { + "epoch": 6.04524236983842, + "grad_norm": 0.8744190335273743, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 84180 + }, + { + "epoch": 6.045960502692998, + "grad_norm": 1.1944562196731567, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 84190 + }, + { + "epoch": 6.046678635547576, + "grad_norm": 1.3782621622085571, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 84200 + }, + { + "epoch": 6.047396768402154, + "grad_norm": 1.2800641059875488, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 84210 + }, + { + "epoch": 6.048114901256732, + "grad_norm": 1.1035456657409668, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 84220 + }, + { + "epoch": 6.04883303411131, + "grad_norm": 1.243274211883545, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 84230 + }, + { + "epoch": 6.049551166965888, + "grad_norm": 0.8821795582771301, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 84240 + }, + { + "epoch": 6.050269299820466, + "grad_norm": 0.8730825185775757, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 84250 + }, + { + "epoch": 6.050987432675045, + "grad_norm": 0.9874304533004761, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 84260 + }, + { + "epoch": 6.051705565529623, + "grad_norm": 1.3245618343353271, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 84270 + }, + { + "epoch": 6.052423698384201, + "grad_norm": 1.04741370677948, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 84280 + }, + { + "epoch": 6.053141831238779, + "grad_norm": 1.1984949111938477, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 84290 + }, + { + "epoch": 6.053859964093357, + "grad_norm": 0.9603039622306824, + "learning_rate": 0.0002, + "loss": 0.5148, + "step": 84300 + }, + { + "epoch": 6.054578096947935, + "grad_norm": 1.178102731704712, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 84310 + }, + { + "epoch": 6.055296229802513, + "grad_norm": 1.135046124458313, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84320 + }, + { + "epoch": 6.056014362657091, + "grad_norm": 0.9682887196540833, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84330 + }, + { + "epoch": 6.056732495511669, + "grad_norm": 0.9676550030708313, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 84340 + }, + { + "epoch": 6.057450628366248, + "grad_norm": 1.0987977981567383, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 84350 + }, + { + "epoch": 6.058168761220826, + "grad_norm": 0.9808574914932251, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 84360 + }, + { + "epoch": 6.058886894075404, + "grad_norm": 1.0585200786590576, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 84370 + }, + { + "epoch": 6.059605026929982, + "grad_norm": 0.9592017531394958, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 84380 + }, + { + "epoch": 6.06032315978456, + "grad_norm": 0.9652285575866699, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 84390 + }, + { + "epoch": 6.061041292639138, + "grad_norm": 1.1223928928375244, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 84400 + }, + { + "epoch": 6.061759425493716, + "grad_norm": 1.0554455518722534, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 84410 + }, + { + "epoch": 6.062477558348294, + "grad_norm": 1.4566363096237183, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 84420 + }, + { + "epoch": 6.063195691202872, + "grad_norm": 1.0793368816375732, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 84430 + }, + { + "epoch": 6.063913824057451, + "grad_norm": 1.1032981872558594, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 84440 + }, + { + "epoch": 6.064631956912029, + "grad_norm": 1.0701037645339966, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 84450 + }, + { + "epoch": 6.065350089766607, + "grad_norm": 0.9359426498413086, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 84460 + }, + { + "epoch": 6.066068222621185, + "grad_norm": 1.0277773141860962, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 84470 + }, + { + "epoch": 6.066786355475763, + "grad_norm": 1.029319405555725, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 84480 + }, + { + "epoch": 6.067504488330341, + "grad_norm": 1.3563756942749023, + "learning_rate": 0.0002, + "loss": 0.4949, + "step": 84490 + }, + { + "epoch": 6.068222621184919, + "grad_norm": 0.9577816128730774, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 84500 + }, + { + "epoch": 6.068940754039497, + "grad_norm": 0.9856799840927124, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 84510 + }, + { + "epoch": 6.069658886894075, + "grad_norm": 1.3285183906555176, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 84520 + }, + { + "epoch": 6.070377019748653, + "grad_norm": 1.0407335758209229, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84530 + }, + { + "epoch": 6.071095152603232, + "grad_norm": 1.3125360012054443, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 84540 + }, + { + "epoch": 6.07181328545781, + "grad_norm": 1.0198888778686523, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 84550 + }, + { + "epoch": 6.072531418312388, + "grad_norm": 1.198135256767273, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 84560 + }, + { + "epoch": 6.073249551166966, + "grad_norm": 1.1547776460647583, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 84570 + }, + { + "epoch": 6.073967684021544, + "grad_norm": 1.1667766571044922, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 84580 + }, + { + "epoch": 6.074685816876122, + "grad_norm": 0.945159375667572, + "learning_rate": 0.0002, + "loss": 0.5465, + "step": 84590 + }, + { + "epoch": 6.0754039497307, + "grad_norm": 1.0362721681594849, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 84600 + }, + { + "epoch": 6.076122082585278, + "grad_norm": 1.1442973613739014, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 84610 + }, + { + "epoch": 6.076840215439856, + "grad_norm": 1.2077388763427734, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 84620 + }, + { + "epoch": 6.077558348294435, + "grad_norm": 1.1404398679733276, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 84630 + }, + { + "epoch": 6.078276481149013, + "grad_norm": 1.0291249752044678, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 84640 + }, + { + "epoch": 6.078994614003591, + "grad_norm": 1.2045460939407349, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 84650 + }, + { + "epoch": 6.079712746858169, + "grad_norm": 0.9492267966270447, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 84660 + }, + { + "epoch": 6.080430879712747, + "grad_norm": 0.9108620285987854, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 84670 + }, + { + "epoch": 6.081149012567325, + "grad_norm": 1.0403251647949219, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 84680 + }, + { + "epoch": 6.081867145421903, + "grad_norm": 0.8537648916244507, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 84690 + }, + { + "epoch": 6.082585278276481, + "grad_norm": 0.8450568914413452, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 84700 + }, + { + "epoch": 6.083303411131059, + "grad_norm": 0.9770439267158508, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 84710 + }, + { + "epoch": 6.084021543985638, + "grad_norm": 0.7480165958404541, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 84720 + }, + { + "epoch": 6.084739676840216, + "grad_norm": 1.0038665533065796, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 84730 + }, + { + "epoch": 6.085457809694794, + "grad_norm": 1.2631266117095947, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 84740 + }, + { + "epoch": 6.086175942549372, + "grad_norm": 1.0285290479660034, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 84750 + }, + { + "epoch": 6.08689407540395, + "grad_norm": 0.8775458335876465, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 84760 + }, + { + "epoch": 6.087612208258528, + "grad_norm": 1.105391263961792, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 84770 + }, + { + "epoch": 6.088330341113106, + "grad_norm": 0.9214589595794678, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 84780 + }, + { + "epoch": 6.089048473967684, + "grad_norm": 1.1920515298843384, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 84790 + }, + { + "epoch": 6.089766606822262, + "grad_norm": 1.0314369201660156, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 84800 + }, + { + "epoch": 6.09048473967684, + "grad_norm": 1.1323022842407227, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 84810 + }, + { + "epoch": 6.091202872531419, + "grad_norm": 0.9882907271385193, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84820 + }, + { + "epoch": 6.091921005385997, + "grad_norm": 0.9372309446334839, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 84830 + }, + { + "epoch": 6.092639138240575, + "grad_norm": 0.9904384016990662, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 84840 + }, + { + "epoch": 6.093357271095153, + "grad_norm": 1.1983239650726318, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 84850 + }, + { + "epoch": 6.094075403949731, + "grad_norm": 1.0157414674758911, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 84860 + }, + { + "epoch": 6.094793536804309, + "grad_norm": 1.1213963031768799, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 84870 + }, + { + "epoch": 6.095511669658887, + "grad_norm": 0.9863889813423157, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 84880 + }, + { + "epoch": 6.096229802513465, + "grad_norm": 1.2265585660934448, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 84890 + }, + { + "epoch": 6.096947935368043, + "grad_norm": 0.9000206589698792, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 84900 + }, + { + "epoch": 6.097666068222622, + "grad_norm": 0.9284350872039795, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 84910 + }, + { + "epoch": 6.0983842010772, + "grad_norm": 0.8180069923400879, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 84920 + }, + { + "epoch": 6.099102333931778, + "grad_norm": 1.0313721895217896, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 84930 + }, + { + "epoch": 6.099820466786356, + "grad_norm": 0.9959180355072021, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 84940 + }, + { + "epoch": 6.100538599640934, + "grad_norm": 1.1720712184906006, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 84950 + }, + { + "epoch": 6.101256732495512, + "grad_norm": 1.1033729314804077, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 84960 + }, + { + "epoch": 6.10197486535009, + "grad_norm": 1.2325657606124878, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 84970 + }, + { + "epoch": 6.102692998204668, + "grad_norm": 1.204935073852539, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 84980 + }, + { + "epoch": 6.103411131059246, + "grad_norm": 0.9543479084968567, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 84990 + }, + { + "epoch": 6.1041292639138245, + "grad_norm": 1.0036866664886475, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 85000 + }, + { + "epoch": 6.1048473967684025, + "grad_norm": 1.0862882137298584, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 85010 + }, + { + "epoch": 6.1055655296229805, + "grad_norm": 1.052764892578125, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 85020 + }, + { + "epoch": 6.1062836624775585, + "grad_norm": 1.1948769092559814, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 85030 + }, + { + "epoch": 6.1070017953321365, + "grad_norm": 1.0291588306427002, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 85040 + }, + { + "epoch": 6.1077199281867145, + "grad_norm": 1.2162322998046875, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 85050 + }, + { + "epoch": 6.1084380610412925, + "grad_norm": 1.2867375612258911, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 85060 + }, + { + "epoch": 6.1091561938958705, + "grad_norm": 0.9639427661895752, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 85070 + }, + { + "epoch": 6.1098743267504485, + "grad_norm": 1.0775039196014404, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 85080 + }, + { + "epoch": 6.1105924596050265, + "grad_norm": 1.0423188209533691, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 85090 + }, + { + "epoch": 6.111310592459605, + "grad_norm": 0.9388473033905029, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 85100 + }, + { + "epoch": 6.112028725314183, + "grad_norm": 1.0761773586273193, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 85110 + }, + { + "epoch": 6.112746858168761, + "grad_norm": 1.0886104106903076, + "learning_rate": 0.0002, + "loss": 0.5144, + "step": 85120 + }, + { + "epoch": 6.113464991023339, + "grad_norm": 0.8716141581535339, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 85130 + }, + { + "epoch": 6.114183123877917, + "grad_norm": 1.5060595273971558, + "learning_rate": 0.0002, + "loss": 0.5598, + "step": 85140 + }, + { + "epoch": 6.114901256732495, + "grad_norm": 1.2417129278182983, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 85150 + }, + { + "epoch": 6.115619389587073, + "grad_norm": 1.063604712486267, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 85160 + }, + { + "epoch": 6.116337522441651, + "grad_norm": 1.1341352462768555, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 85170 + }, + { + "epoch": 6.117055655296229, + "grad_norm": 1.011865258216858, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 85180 + }, + { + "epoch": 6.117773788150808, + "grad_norm": 1.0746972560882568, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 85190 + }, + { + "epoch": 6.118491921005386, + "grad_norm": 0.9522349238395691, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 85200 + }, + { + "epoch": 6.119210053859964, + "grad_norm": 1.091785192489624, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 85210 + }, + { + "epoch": 6.119928186714542, + "grad_norm": 1.1013420820236206, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 85220 + }, + { + "epoch": 6.12064631956912, + "grad_norm": 0.9477053880691528, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 85230 + }, + { + "epoch": 6.121364452423698, + "grad_norm": 1.1278045177459717, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 85240 + }, + { + "epoch": 6.122082585278276, + "grad_norm": 1.0343154668807983, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 85250 + }, + { + "epoch": 6.122800718132854, + "grad_norm": 0.9023236036300659, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 85260 + }, + { + "epoch": 6.123518850987432, + "grad_norm": 1.1085705757141113, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 85270 + }, + { + "epoch": 6.124236983842011, + "grad_norm": 1.2945729494094849, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 85280 + }, + { + "epoch": 6.124955116696589, + "grad_norm": 1.0367915630340576, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 85290 + }, + { + "epoch": 6.125673249551167, + "grad_norm": 0.9990636706352234, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 85300 + }, + { + "epoch": 6.126391382405745, + "grad_norm": 0.9737518429756165, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 85310 + }, + { + "epoch": 6.127109515260323, + "grad_norm": 1.0211181640625, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 85320 + }, + { + "epoch": 6.127827648114901, + "grad_norm": 0.9609670042991638, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 85330 + }, + { + "epoch": 6.128545780969479, + "grad_norm": 1.124629259109497, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 85340 + }, + { + "epoch": 6.129263913824057, + "grad_norm": 0.9436500072479248, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 85350 + }, + { + "epoch": 6.129982046678635, + "grad_norm": 1.3075382709503174, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 85360 + }, + { + "epoch": 6.130700179533213, + "grad_norm": 0.9185589551925659, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 85370 + }, + { + "epoch": 6.131418312387792, + "grad_norm": 1.1051443815231323, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 85380 + }, + { + "epoch": 6.13213644524237, + "grad_norm": 1.185263752937317, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 85390 + }, + { + "epoch": 6.132854578096948, + "grad_norm": 1.0959895849227905, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 85400 + }, + { + "epoch": 6.133572710951526, + "grad_norm": 0.9279834032058716, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 85410 + }, + { + "epoch": 6.134290843806104, + "grad_norm": 1.36788010597229, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 85420 + }, + { + "epoch": 6.135008976660682, + "grad_norm": 1.0156842470169067, + "learning_rate": 0.0002, + "loss": 0.5122, + "step": 85430 + }, + { + "epoch": 6.13572710951526, + "grad_norm": 0.9998385906219482, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 85440 + }, + { + "epoch": 6.136445242369838, + "grad_norm": 1.21120285987854, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 85450 + }, + { + "epoch": 6.137163375224416, + "grad_norm": 1.1198976039886475, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 85460 + }, + { + "epoch": 6.137881508078995, + "grad_norm": 0.8551197648048401, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 85470 + }, + { + "epoch": 6.138599640933573, + "grad_norm": 1.378423810005188, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 85480 + }, + { + "epoch": 6.139317773788151, + "grad_norm": 1.0602139234542847, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 85490 + }, + { + "epoch": 6.140035906642729, + "grad_norm": 0.9416277408599854, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 85500 + }, + { + "epoch": 6.140754039497307, + "grad_norm": 0.9356902241706848, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 85510 + }, + { + "epoch": 6.141472172351885, + "grad_norm": 1.1635851860046387, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 85520 + }, + { + "epoch": 6.142190305206463, + "grad_norm": 0.7880265712738037, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 85530 + }, + { + "epoch": 6.142908438061041, + "grad_norm": 1.0618375539779663, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 85540 + }, + { + "epoch": 6.143626570915619, + "grad_norm": 0.8438394665718079, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 85550 + }, + { + "epoch": 6.144344703770198, + "grad_norm": 1.0630128383636475, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 85560 + }, + { + "epoch": 6.145062836624776, + "grad_norm": 1.027308464050293, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 85570 + }, + { + "epoch": 6.145780969479354, + "grad_norm": 1.0832568407058716, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 85580 + }, + { + "epoch": 6.146499102333932, + "grad_norm": 0.9134858250617981, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 85590 + }, + { + "epoch": 6.14721723518851, + "grad_norm": 1.2738041877746582, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 85600 + }, + { + "epoch": 6.147935368043088, + "grad_norm": 0.9961518049240112, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 85610 + }, + { + "epoch": 6.148653500897666, + "grad_norm": 0.8851816654205322, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 85620 + }, + { + "epoch": 6.149371633752244, + "grad_norm": 0.96479731798172, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 85630 + }, + { + "epoch": 6.150089766606822, + "grad_norm": 0.903256893157959, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 85640 + }, + { + "epoch": 6.1508078994614, + "grad_norm": 1.065151333808899, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 85650 + }, + { + "epoch": 6.151526032315979, + "grad_norm": 0.9824285507202148, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 85660 + }, + { + "epoch": 6.152244165170557, + "grad_norm": 1.1620386838912964, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 85670 + }, + { + "epoch": 6.152962298025135, + "grad_norm": 1.134757161140442, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 85680 + }, + { + "epoch": 6.153680430879713, + "grad_norm": 1.165537714958191, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 85690 + }, + { + "epoch": 6.154398563734291, + "grad_norm": 0.9486454129219055, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 85700 + }, + { + "epoch": 6.155116696588869, + "grad_norm": 0.9379110932350159, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 85710 + }, + { + "epoch": 6.155834829443447, + "grad_norm": 1.0051493644714355, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 85720 + }, + { + "epoch": 6.156552962298025, + "grad_norm": 0.9311991333961487, + "learning_rate": 0.0002, + "loss": 0.5389, + "step": 85730 + }, + { + "epoch": 6.157271095152603, + "grad_norm": 1.2071181535720825, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 85740 + }, + { + "epoch": 6.157989228007182, + "grad_norm": 1.2609243392944336, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 85750 + }, + { + "epoch": 6.15870736086176, + "grad_norm": 1.0485966205596924, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 85760 + }, + { + "epoch": 6.159425493716338, + "grad_norm": 0.9949250817298889, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 85770 + }, + { + "epoch": 6.160143626570916, + "grad_norm": 0.8191118836402893, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 85780 + }, + { + "epoch": 6.160861759425494, + "grad_norm": 0.96427983045578, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 85790 + }, + { + "epoch": 6.161579892280072, + "grad_norm": 1.0336496829986572, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 85800 + }, + { + "epoch": 6.16229802513465, + "grad_norm": 1.0699222087860107, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 85810 + }, + { + "epoch": 6.163016157989228, + "grad_norm": 1.2340054512023926, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 85820 + }, + { + "epoch": 6.163734290843806, + "grad_norm": 0.981848955154419, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 85830 + }, + { + "epoch": 6.164452423698384, + "grad_norm": 1.2059850692749023, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 85840 + }, + { + "epoch": 6.165170556552963, + "grad_norm": 1.0239924192428589, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 85850 + }, + { + "epoch": 6.165888689407541, + "grad_norm": 0.8601624369621277, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 85860 + }, + { + "epoch": 6.166606822262119, + "grad_norm": 1.1900125741958618, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 85870 + }, + { + "epoch": 6.167324955116697, + "grad_norm": 0.9747354388237, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 85880 + }, + { + "epoch": 6.168043087971275, + "grad_norm": 1.1277778148651123, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 85890 + }, + { + "epoch": 6.168761220825853, + "grad_norm": 1.1270111799240112, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 85900 + }, + { + "epoch": 6.169479353680431, + "grad_norm": 1.1610701084136963, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 85910 + }, + { + "epoch": 6.170197486535009, + "grad_norm": 0.873607873916626, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 85920 + }, + { + "epoch": 6.170915619389587, + "grad_norm": 1.040145993232727, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 85930 + }, + { + "epoch": 6.1716337522441655, + "grad_norm": 1.0139122009277344, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 85940 + }, + { + "epoch": 6.1723518850987436, + "grad_norm": 1.0575451850891113, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 85950 + }, + { + "epoch": 6.1730700179533216, + "grad_norm": 1.100884199142456, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 85960 + }, + { + "epoch": 6.1737881508078996, + "grad_norm": 1.1741244792938232, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 85970 + }, + { + "epoch": 6.174506283662478, + "grad_norm": 0.9446555376052856, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 85980 + }, + { + "epoch": 6.175224416517056, + "grad_norm": 0.9297952055931091, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 85990 + }, + { + "epoch": 6.175942549371634, + "grad_norm": 1.196361780166626, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 86000 + }, + { + "epoch": 6.176660682226212, + "grad_norm": 1.0719913244247437, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 86010 + }, + { + "epoch": 6.17737881508079, + "grad_norm": 1.0942085981369019, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 86020 + }, + { + "epoch": 6.1780969479353685, + "grad_norm": 0.8989787697792053, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 86030 + }, + { + "epoch": 6.1788150807899465, + "grad_norm": 1.071344017982483, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 86040 + }, + { + "epoch": 6.1795332136445245, + "grad_norm": 0.9686782360076904, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 86050 + }, + { + "epoch": 6.1802513464991025, + "grad_norm": 1.0769884586334229, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 86060 + }, + { + "epoch": 6.1809694793536805, + "grad_norm": 0.9761241674423218, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 86070 + }, + { + "epoch": 6.1816876122082585, + "grad_norm": 1.0531808137893677, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 86080 + }, + { + "epoch": 6.1824057450628365, + "grad_norm": 1.0523570775985718, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 86090 + }, + { + "epoch": 6.1831238779174145, + "grad_norm": 1.2155946493148804, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 86100 + }, + { + "epoch": 6.1838420107719925, + "grad_norm": 1.1012920141220093, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 86110 + }, + { + "epoch": 6.184560143626571, + "grad_norm": 0.8764983415603638, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 86120 + }, + { + "epoch": 6.185278276481149, + "grad_norm": 0.950320303440094, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 86130 + }, + { + "epoch": 6.185996409335727, + "grad_norm": 1.1183594465255737, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 86140 + }, + { + "epoch": 6.186714542190305, + "grad_norm": 1.1919164657592773, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 86150 + }, + { + "epoch": 6.187432675044883, + "grad_norm": 1.1478904485702515, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 86160 + }, + { + "epoch": 6.188150807899461, + "grad_norm": 1.0764135122299194, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 86170 + }, + { + "epoch": 6.188868940754039, + "grad_norm": 1.195090889930725, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 86180 + }, + { + "epoch": 6.189587073608617, + "grad_norm": 1.089442253112793, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 86190 + }, + { + "epoch": 6.190305206463195, + "grad_norm": 0.9705546498298645, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 86200 + }, + { + "epoch": 6.191023339317773, + "grad_norm": 1.164642333984375, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 86210 + }, + { + "epoch": 6.191741472172352, + "grad_norm": 0.9551387429237366, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 86220 + }, + { + "epoch": 6.19245960502693, + "grad_norm": 1.0483227968215942, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 86230 + }, + { + "epoch": 6.193177737881508, + "grad_norm": 1.0068920850753784, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 86240 + }, + { + "epoch": 6.193895870736086, + "grad_norm": 1.142656683921814, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 86250 + }, + { + "epoch": 6.194614003590664, + "grad_norm": 1.1186467409133911, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 86260 + }, + { + "epoch": 6.195332136445242, + "grad_norm": 1.1664706468582153, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 86270 + }, + { + "epoch": 6.19605026929982, + "grad_norm": 1.2658511400222778, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 86280 + }, + { + "epoch": 6.196768402154398, + "grad_norm": 1.122759222984314, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 86290 + }, + { + "epoch": 6.197486535008976, + "grad_norm": 1.1611319780349731, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 86300 + }, + { + "epoch": 6.198204667863555, + "grad_norm": 1.0476176738739014, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 86310 + }, + { + "epoch": 6.198922800718133, + "grad_norm": 1.2284801006317139, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 86320 + }, + { + "epoch": 6.199640933572711, + "grad_norm": 1.1340757608413696, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 86330 + }, + { + "epoch": 6.200359066427289, + "grad_norm": 1.045088768005371, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 86340 + }, + { + "epoch": 6.201077199281867, + "grad_norm": 1.1200770139694214, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 86350 + }, + { + "epoch": 6.201795332136445, + "grad_norm": 1.1879554986953735, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 86360 + }, + { + "epoch": 6.202513464991023, + "grad_norm": 1.1146271228790283, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 86370 + }, + { + "epoch": 6.203231597845601, + "grad_norm": 0.8934822678565979, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 86380 + }, + { + "epoch": 6.203949730700179, + "grad_norm": 1.21973717212677, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 86390 + }, + { + "epoch": 6.204667863554757, + "grad_norm": 0.9424970746040344, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 86400 + }, + { + "epoch": 6.205385996409336, + "grad_norm": 1.0036219358444214, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 86410 + }, + { + "epoch": 6.206104129263914, + "grad_norm": 0.9319575428962708, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 86420 + }, + { + "epoch": 6.206822262118492, + "grad_norm": 1.0548789501190186, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 86430 + }, + { + "epoch": 6.20754039497307, + "grad_norm": 0.9361019730567932, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 86440 + }, + { + "epoch": 6.208258527827648, + "grad_norm": 0.9350554347038269, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 86450 + }, + { + "epoch": 6.208976660682226, + "grad_norm": 1.291595458984375, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 86460 + }, + { + "epoch": 6.209694793536804, + "grad_norm": 1.0414642095565796, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 86470 + }, + { + "epoch": 6.210412926391382, + "grad_norm": 1.1983444690704346, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 86480 + }, + { + "epoch": 6.21113105924596, + "grad_norm": 0.9444540739059448, + "learning_rate": 0.0002, + "loss": 0.493, + "step": 86490 + }, + { + "epoch": 6.211849192100539, + "grad_norm": 1.072526216506958, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 86500 + }, + { + "epoch": 6.212567324955117, + "grad_norm": 1.0109381675720215, + "learning_rate": 0.0002, + "loss": 0.5509, + "step": 86510 + }, + { + "epoch": 6.213285457809695, + "grad_norm": 1.1661816835403442, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 86520 + }, + { + "epoch": 6.214003590664273, + "grad_norm": 1.0434976816177368, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 86530 + }, + { + "epoch": 6.214721723518851, + "grad_norm": 1.1290796995162964, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 86540 + }, + { + "epoch": 6.215439856373429, + "grad_norm": 0.746512234210968, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 86550 + }, + { + "epoch": 6.216157989228007, + "grad_norm": 1.0346291065216064, + "learning_rate": 0.0002, + "loss": 0.5412, + "step": 86560 + }, + { + "epoch": 6.216876122082585, + "grad_norm": 1.2428497076034546, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 86570 + }, + { + "epoch": 6.217594254937163, + "grad_norm": 1.0040535926818848, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 86580 + }, + { + "epoch": 6.218312387791742, + "grad_norm": 0.9300616383552551, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 86590 + }, + { + "epoch": 6.21903052064632, + "grad_norm": 1.0006635189056396, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 86600 + }, + { + "epoch": 6.219748653500898, + "grad_norm": 1.1402281522750854, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 86610 + }, + { + "epoch": 6.220466786355476, + "grad_norm": 1.1543347835540771, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 86620 + }, + { + "epoch": 6.221184919210054, + "grad_norm": 1.1074384450912476, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 86630 + }, + { + "epoch": 6.221903052064632, + "grad_norm": 0.9032864570617676, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 86640 + }, + { + "epoch": 6.22262118491921, + "grad_norm": 1.094516396522522, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 86650 + }, + { + "epoch": 6.223339317773788, + "grad_norm": 1.2248685359954834, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 86660 + }, + { + "epoch": 6.224057450628366, + "grad_norm": 1.0211371183395386, + "learning_rate": 0.0002, + "loss": 0.5873, + "step": 86670 + }, + { + "epoch": 6.224775583482945, + "grad_norm": 1.0956611633300781, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 86680 + }, + { + "epoch": 6.225493716337523, + "grad_norm": 1.1494320631027222, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 86690 + }, + { + "epoch": 6.226211849192101, + "grad_norm": 0.968108594417572, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 86700 + }, + { + "epoch": 6.226929982046679, + "grad_norm": 1.376665711402893, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 86710 + }, + { + "epoch": 6.227648114901257, + "grad_norm": 1.2121574878692627, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 86720 + }, + { + "epoch": 6.228366247755835, + "grad_norm": 1.001272439956665, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 86730 + }, + { + "epoch": 6.229084380610413, + "grad_norm": 0.9023162722587585, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 86740 + }, + { + "epoch": 6.229802513464991, + "grad_norm": 1.2660632133483887, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 86750 + }, + { + "epoch": 6.230520646319569, + "grad_norm": 1.0549668073654175, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 86760 + }, + { + "epoch": 6.231238779174147, + "grad_norm": 1.0364645719528198, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 86770 + }, + { + "epoch": 6.231956912028726, + "grad_norm": 1.2197567224502563, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 86780 + }, + { + "epoch": 6.232675044883304, + "grad_norm": 0.8866947889328003, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 86790 + }, + { + "epoch": 6.233393177737882, + "grad_norm": 1.1795434951782227, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 86800 + }, + { + "epoch": 6.23411131059246, + "grad_norm": 1.0882378816604614, + "learning_rate": 0.0002, + "loss": 0.5309, + "step": 86810 + }, + { + "epoch": 6.234829443447038, + "grad_norm": 1.181888222694397, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 86820 + }, + { + "epoch": 6.235547576301616, + "grad_norm": 1.031209111213684, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 86830 + }, + { + "epoch": 6.236265709156194, + "grad_norm": 1.2889492511749268, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 86840 + }, + { + "epoch": 6.236983842010772, + "grad_norm": 0.874086856842041, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 86850 + }, + { + "epoch": 6.23770197486535, + "grad_norm": 1.1912312507629395, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 86860 + }, + { + "epoch": 6.238420107719929, + "grad_norm": 1.0963071584701538, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 86870 + }, + { + "epoch": 6.239138240574507, + "grad_norm": 1.028746485710144, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 86880 + }, + { + "epoch": 6.239856373429085, + "grad_norm": 1.0736430883407593, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 86890 + }, + { + "epoch": 6.240574506283663, + "grad_norm": 0.9559927582740784, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 86900 + }, + { + "epoch": 6.241292639138241, + "grad_norm": 0.9696667790412903, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 86910 + }, + { + "epoch": 6.242010771992819, + "grad_norm": 1.0710713863372803, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 86920 + }, + { + "epoch": 6.242728904847397, + "grad_norm": 1.0459970235824585, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 86930 + }, + { + "epoch": 6.243447037701975, + "grad_norm": 1.212083339691162, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 86940 + }, + { + "epoch": 6.244165170556553, + "grad_norm": 1.0369303226470947, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 86950 + }, + { + "epoch": 6.244883303411131, + "grad_norm": 1.180519700050354, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 86960 + }, + { + "epoch": 6.2456014362657095, + "grad_norm": 1.0670114755630493, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 86970 + }, + { + "epoch": 6.2463195691202875, + "grad_norm": 1.072209119796753, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 86980 + }, + { + "epoch": 6.2470377019748655, + "grad_norm": 0.9642090201377869, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 86990 + }, + { + "epoch": 6.2477558348294435, + "grad_norm": 1.077467918395996, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 87000 + }, + { + "epoch": 6.2484739676840215, + "grad_norm": 1.1081476211547852, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 87010 + }, + { + "epoch": 6.2491921005385995, + "grad_norm": 0.8815084099769592, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 87020 + }, + { + "epoch": 6.2499102333931775, + "grad_norm": 0.8562555313110352, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 87030 + }, + { + "epoch": 6.2506283662477555, + "grad_norm": 0.8729159235954285, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 87040 + }, + { + "epoch": 6.2513464991023335, + "grad_norm": 1.005082368850708, + "learning_rate": 0.0002, + "loss": 0.5179, + "step": 87050 + }, + { + "epoch": 6.252064631956912, + "grad_norm": 1.3991386890411377, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 87060 + }, + { + "epoch": 6.25278276481149, + "grad_norm": 1.090180516242981, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 87070 + }, + { + "epoch": 6.253500897666068, + "grad_norm": 1.08149254322052, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 87080 + }, + { + "epoch": 6.254219030520646, + "grad_norm": 1.1021103858947754, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 87090 + }, + { + "epoch": 6.254937163375224, + "grad_norm": 1.2393771409988403, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 87100 + }, + { + "epoch": 6.255655296229802, + "grad_norm": 0.9702037572860718, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 87110 + }, + { + "epoch": 6.25637342908438, + "grad_norm": 1.203088641166687, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 87120 + }, + { + "epoch": 6.257091561938958, + "grad_norm": 0.9722330570220947, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 87130 + }, + { + "epoch": 6.257809694793536, + "grad_norm": 0.9802384376525879, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 87140 + }, + { + "epoch": 6.258527827648114, + "grad_norm": 0.9991751909255981, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 87150 + }, + { + "epoch": 6.259245960502693, + "grad_norm": 1.1102324724197388, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 87160 + }, + { + "epoch": 6.259964093357271, + "grad_norm": 1.1357909440994263, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 87170 + }, + { + "epoch": 6.260682226211849, + "grad_norm": 1.1128548383712769, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 87180 + }, + { + "epoch": 6.261400359066427, + "grad_norm": 1.1135061979293823, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 87190 + }, + { + "epoch": 6.262118491921005, + "grad_norm": 0.9545563459396362, + "learning_rate": 0.0002, + "loss": 0.4923, + "step": 87200 + }, + { + "epoch": 6.262836624775583, + "grad_norm": 1.3011159896850586, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 87210 + }, + { + "epoch": 6.263554757630161, + "grad_norm": 1.217691421508789, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 87220 + }, + { + "epoch": 6.264272890484739, + "grad_norm": 0.9615218043327332, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 87230 + }, + { + "epoch": 6.264991023339318, + "grad_norm": 0.9935932159423828, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 87240 + }, + { + "epoch": 6.265709156193896, + "grad_norm": 1.01247239112854, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 87250 + }, + { + "epoch": 6.266427289048474, + "grad_norm": 1.1960358619689941, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 87260 + }, + { + "epoch": 6.267145421903052, + "grad_norm": 1.053942322731018, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 87270 + }, + { + "epoch": 6.26786355475763, + "grad_norm": 1.2450612783432007, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 87280 + }, + { + "epoch": 6.268581687612208, + "grad_norm": 0.7816058397293091, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 87290 + }, + { + "epoch": 6.269299820466786, + "grad_norm": 1.014817237854004, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 87300 + }, + { + "epoch": 6.270017953321364, + "grad_norm": 1.1871070861816406, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 87310 + }, + { + "epoch": 6.270736086175942, + "grad_norm": 1.0170562267303467, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 87320 + }, + { + "epoch": 6.27145421903052, + "grad_norm": 1.216288685798645, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 87330 + }, + { + "epoch": 6.272172351885099, + "grad_norm": 0.8846057653427124, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 87340 + }, + { + "epoch": 6.272890484739677, + "grad_norm": 1.181233286857605, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 87350 + }, + { + "epoch": 6.273608617594255, + "grad_norm": 1.0051873922348022, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 87360 + }, + { + "epoch": 6.274326750448833, + "grad_norm": 1.1179516315460205, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 87370 + }, + { + "epoch": 6.275044883303411, + "grad_norm": 1.0118002891540527, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 87380 + }, + { + "epoch": 6.275763016157989, + "grad_norm": 1.0948026180267334, + "learning_rate": 0.0002, + "loss": 0.5789, + "step": 87390 + }, + { + "epoch": 6.276481149012567, + "grad_norm": 1.0836515426635742, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 87400 + }, + { + "epoch": 6.277199281867145, + "grad_norm": 0.9548853039741516, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 87410 + }, + { + "epoch": 6.277917414721723, + "grad_norm": 1.2531564235687256, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 87420 + }, + { + "epoch": 6.278635547576302, + "grad_norm": 1.010250449180603, + "learning_rate": 0.0002, + "loss": 0.5651, + "step": 87430 + }, + { + "epoch": 6.27935368043088, + "grad_norm": 1.3306254148483276, + "learning_rate": 0.0002, + "loss": 0.6222, + "step": 87440 + }, + { + "epoch": 6.280071813285458, + "grad_norm": 0.9485062956809998, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 87450 + }, + { + "epoch": 6.280789946140036, + "grad_norm": 0.9938563704490662, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 87460 + }, + { + "epoch": 6.281508078994614, + "grad_norm": 1.1747362613677979, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 87470 + }, + { + "epoch": 6.282226211849192, + "grad_norm": 1.1712254285812378, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 87480 + }, + { + "epoch": 6.28294434470377, + "grad_norm": 1.1453865766525269, + "learning_rate": 0.0002, + "loss": 0.6165, + "step": 87490 + }, + { + "epoch": 6.283662477558348, + "grad_norm": 0.974902331829071, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 87500 + }, + { + "epoch": 6.284380610412926, + "grad_norm": 1.1181912422180176, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 87510 + }, + { + "epoch": 6.285098743267504, + "grad_norm": 1.047453761100769, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 87520 + }, + { + "epoch": 6.285816876122083, + "grad_norm": 1.185815453529358, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 87530 + }, + { + "epoch": 6.286535008976661, + "grad_norm": 1.1126786470413208, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 87540 + }, + { + "epoch": 6.287253141831239, + "grad_norm": 1.0931676626205444, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 87550 + }, + { + "epoch": 6.287971274685817, + "grad_norm": 0.9930597543716431, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 87560 + }, + { + "epoch": 6.288689407540395, + "grad_norm": 0.9909583926200867, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 87570 + }, + { + "epoch": 6.289407540394973, + "grad_norm": 1.3766822814941406, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 87580 + }, + { + "epoch": 6.290125673249551, + "grad_norm": 1.0137864351272583, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 87590 + }, + { + "epoch": 6.290843806104129, + "grad_norm": 0.8761594295501709, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 87600 + }, + { + "epoch": 6.291561938958707, + "grad_norm": 1.155881404876709, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 87610 + }, + { + "epoch": 6.292280071813286, + "grad_norm": 0.9972963333129883, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 87620 + }, + { + "epoch": 6.292998204667864, + "grad_norm": 1.195021152496338, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 87630 + }, + { + "epoch": 6.293716337522442, + "grad_norm": 0.9872829914093018, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 87640 + }, + { + "epoch": 6.29443447037702, + "grad_norm": 1.3643794059753418, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 87650 + }, + { + "epoch": 6.295152603231598, + "grad_norm": 0.9389668703079224, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 87660 + }, + { + "epoch": 6.295870736086176, + "grad_norm": 1.379319429397583, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 87670 + }, + { + "epoch": 6.296588868940754, + "grad_norm": 1.1253849267959595, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 87680 + }, + { + "epoch": 6.297307001795332, + "grad_norm": 1.2402328252792358, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 87690 + }, + { + "epoch": 6.29802513464991, + "grad_norm": 1.085004210472107, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 87700 + }, + { + "epoch": 6.298743267504488, + "grad_norm": 1.0939021110534668, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 87710 + }, + { + "epoch": 6.299461400359067, + "grad_norm": 1.0350301265716553, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 87720 + }, + { + "epoch": 6.300179533213645, + "grad_norm": 0.9862944483757019, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 87730 + }, + { + "epoch": 6.300897666068223, + "grad_norm": 0.990942656993866, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 87740 + }, + { + "epoch": 6.301615798922801, + "grad_norm": 0.9287887215614319, + "learning_rate": 0.0002, + "loss": 0.4843, + "step": 87750 + }, + { + "epoch": 6.302333931777379, + "grad_norm": 1.225714087486267, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 87760 + }, + { + "epoch": 6.303052064631957, + "grad_norm": 1.0181951522827148, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 87770 + }, + { + "epoch": 6.303770197486535, + "grad_norm": 0.9808282256126404, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 87780 + }, + { + "epoch": 6.304488330341113, + "grad_norm": 1.1413379907608032, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 87790 + }, + { + "epoch": 6.305206463195692, + "grad_norm": 1.1188091039657593, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 87800 + }, + { + "epoch": 6.30592459605027, + "grad_norm": 1.297154188156128, + "learning_rate": 0.0002, + "loss": 0.497, + "step": 87810 + }, + { + "epoch": 6.306642728904848, + "grad_norm": 1.0723271369934082, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 87820 + }, + { + "epoch": 6.307360861759426, + "grad_norm": 1.067265510559082, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 87830 + }, + { + "epoch": 6.308078994614004, + "grad_norm": 1.01328444480896, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 87840 + }, + { + "epoch": 6.308797127468582, + "grad_norm": 1.092671513557434, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 87850 + }, + { + "epoch": 6.30951526032316, + "grad_norm": 1.168721079826355, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 87860 + }, + { + "epoch": 6.310233393177738, + "grad_norm": 1.165495753288269, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 87870 + }, + { + "epoch": 6.310951526032316, + "grad_norm": 1.10816490650177, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 87880 + }, + { + "epoch": 6.311669658886894, + "grad_norm": 0.9667611718177795, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 87890 + }, + { + "epoch": 6.312387791741473, + "grad_norm": 1.22564697265625, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 87900 + }, + { + "epoch": 6.313105924596051, + "grad_norm": 1.1156506538391113, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 87910 + }, + { + "epoch": 6.313824057450629, + "grad_norm": 1.03804349899292, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 87920 + }, + { + "epoch": 6.314542190305207, + "grad_norm": 0.9424136281013489, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 87930 + }, + { + "epoch": 6.315260323159785, + "grad_norm": 1.2243257761001587, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 87940 + }, + { + "epoch": 6.315978456014363, + "grad_norm": 1.0930471420288086, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 87950 + }, + { + "epoch": 6.316696588868941, + "grad_norm": 1.096875548362732, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 87960 + }, + { + "epoch": 6.317414721723519, + "grad_norm": 1.0606242418289185, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 87970 + }, + { + "epoch": 6.318132854578097, + "grad_norm": 0.8657089471817017, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 87980 + }, + { + "epoch": 6.3188509874326755, + "grad_norm": 0.9751629829406738, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 87990 + }, + { + "epoch": 6.3195691202872535, + "grad_norm": 1.0751961469650269, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 88000 + }, + { + "epoch": 6.3202872531418315, + "grad_norm": 1.0679874420166016, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 88010 + }, + { + "epoch": 6.3210053859964095, + "grad_norm": 1.4102588891983032, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 88020 + }, + { + "epoch": 6.3217235188509875, + "grad_norm": 0.8747799396514893, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 88030 + }, + { + "epoch": 6.3224416517055655, + "grad_norm": 1.0866155624389648, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 88040 + }, + { + "epoch": 6.3231597845601435, + "grad_norm": 1.2255747318267822, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 88050 + }, + { + "epoch": 6.3238779174147215, + "grad_norm": 1.031588077545166, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 88060 + }, + { + "epoch": 6.3245960502692995, + "grad_norm": 1.1994154453277588, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 88070 + }, + { + "epoch": 6.3253141831238775, + "grad_norm": 0.9172461032867432, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88080 + }, + { + "epoch": 6.326032315978456, + "grad_norm": 0.8762667775154114, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 88090 + }, + { + "epoch": 6.326750448833034, + "grad_norm": 1.166225790977478, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 88100 + }, + { + "epoch": 6.327468581687612, + "grad_norm": 1.014858365058899, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 88110 + }, + { + "epoch": 6.32818671454219, + "grad_norm": 1.1080266237258911, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 88120 + }, + { + "epoch": 6.328904847396768, + "grad_norm": 0.9775443077087402, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 88130 + }, + { + "epoch": 6.329622980251346, + "grad_norm": 0.9032314419746399, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 88140 + }, + { + "epoch": 6.330341113105924, + "grad_norm": 1.0170091390609741, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 88150 + }, + { + "epoch": 6.331059245960502, + "grad_norm": 0.9412024617195129, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 88160 + }, + { + "epoch": 6.33177737881508, + "grad_norm": 0.9090259671211243, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 88170 + }, + { + "epoch": 6.332495511669659, + "grad_norm": 0.8896998167037964, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 88180 + }, + { + "epoch": 6.333213644524237, + "grad_norm": 1.1648571491241455, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 88190 + }, + { + "epoch": 6.333931777378815, + "grad_norm": 1.13261878490448, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 88200 + }, + { + "epoch": 6.334649910233393, + "grad_norm": 0.9561943411827087, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 88210 + }, + { + "epoch": 6.335368043087971, + "grad_norm": 1.3076379299163818, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 88220 + }, + { + "epoch": 6.336086175942549, + "grad_norm": 0.9788665175437927, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 88230 + }, + { + "epoch": 6.336804308797127, + "grad_norm": 1.2843645811080933, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 88240 + }, + { + "epoch": 6.337522441651705, + "grad_norm": 1.1531981229782104, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 88250 + }, + { + "epoch": 6.338240574506283, + "grad_norm": 1.1946183443069458, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 88260 + }, + { + "epoch": 6.338958707360861, + "grad_norm": 1.1190218925476074, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 88270 + }, + { + "epoch": 6.33967684021544, + "grad_norm": 1.0605140924453735, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 88280 + }, + { + "epoch": 6.340394973070018, + "grad_norm": 1.0237314701080322, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 88290 + }, + { + "epoch": 6.341113105924596, + "grad_norm": 1.1268457174301147, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 88300 + }, + { + "epoch": 6.341831238779174, + "grad_norm": 1.0750062465667725, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 88310 + }, + { + "epoch": 6.342549371633752, + "grad_norm": 1.2356536388397217, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 88320 + }, + { + "epoch": 6.34326750448833, + "grad_norm": 1.0375114679336548, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 88330 + }, + { + "epoch": 6.343985637342908, + "grad_norm": 1.063388705253601, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 88340 + }, + { + "epoch": 6.344703770197486, + "grad_norm": 0.9182760715484619, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 88350 + }, + { + "epoch": 6.345421903052064, + "grad_norm": 0.9787414073944092, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 88360 + }, + { + "epoch": 6.346140035906643, + "grad_norm": 1.295432448387146, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 88370 + }, + { + "epoch": 6.346858168761221, + "grad_norm": 0.9269146919250488, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 88380 + }, + { + "epoch": 6.347576301615799, + "grad_norm": 0.9076777696609497, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 88390 + }, + { + "epoch": 6.348294434470377, + "grad_norm": 1.1186468601226807, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 88400 + }, + { + "epoch": 6.349012567324955, + "grad_norm": 1.1021504402160645, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 88410 + }, + { + "epoch": 6.349730700179533, + "grad_norm": 1.2439358234405518, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 88420 + }, + { + "epoch": 6.350448833034111, + "grad_norm": 1.1228888034820557, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 88430 + }, + { + "epoch": 6.351166965888689, + "grad_norm": 1.226587176322937, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 88440 + }, + { + "epoch": 6.351885098743267, + "grad_norm": 1.2813525199890137, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 88450 + }, + { + "epoch": 6.352603231597846, + "grad_norm": 1.411405086517334, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 88460 + }, + { + "epoch": 6.353321364452424, + "grad_norm": 1.3659696578979492, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 88470 + }, + { + "epoch": 6.354039497307002, + "grad_norm": 1.1398485898971558, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 88480 + }, + { + "epoch": 6.35475763016158, + "grad_norm": 1.2088590860366821, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 88490 + }, + { + "epoch": 6.355475763016158, + "grad_norm": 0.9191108345985413, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 88500 + }, + { + "epoch": 6.356193895870736, + "grad_norm": 0.9855144619941711, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 88510 + }, + { + "epoch": 6.356912028725314, + "grad_norm": 1.0576577186584473, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 88520 + }, + { + "epoch": 6.357630161579892, + "grad_norm": 1.0213230848312378, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 88530 + }, + { + "epoch": 6.35834829443447, + "grad_norm": 1.2086849212646484, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 88540 + }, + { + "epoch": 6.359066427289049, + "grad_norm": 1.05294930934906, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 88550 + }, + { + "epoch": 6.359784560143627, + "grad_norm": 1.1798300743103027, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 88560 + }, + { + "epoch": 6.360502692998205, + "grad_norm": 1.088749885559082, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 88570 + }, + { + "epoch": 6.361220825852783, + "grad_norm": 1.0071386098861694, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 88580 + }, + { + "epoch": 6.361938958707361, + "grad_norm": 1.2080132961273193, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 88590 + }, + { + "epoch": 6.362657091561939, + "grad_norm": 0.9784366488456726, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 88600 + }, + { + "epoch": 6.363375224416517, + "grad_norm": 0.9475322961807251, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 88610 + }, + { + "epoch": 6.364093357271095, + "grad_norm": 0.8267584443092346, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 88620 + }, + { + "epoch": 6.364811490125673, + "grad_norm": 1.05606210231781, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 88630 + }, + { + "epoch": 6.365529622980251, + "grad_norm": 1.2059335708618164, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 88640 + }, + { + "epoch": 6.36624775583483, + "grad_norm": 1.1900845766067505, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 88650 + }, + { + "epoch": 6.366965888689408, + "grad_norm": 1.0271358489990234, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 88660 + }, + { + "epoch": 6.367684021543986, + "grad_norm": 1.1839162111282349, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 88670 + }, + { + "epoch": 6.368402154398564, + "grad_norm": 0.9042913317680359, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 88680 + }, + { + "epoch": 6.369120287253142, + "grad_norm": 1.079893946647644, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 88690 + }, + { + "epoch": 6.36983842010772, + "grad_norm": 1.0999629497528076, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 88700 + }, + { + "epoch": 6.370556552962298, + "grad_norm": 1.0618157386779785, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 88710 + }, + { + "epoch": 6.371274685816876, + "grad_norm": 0.9567645788192749, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 88720 + }, + { + "epoch": 6.371992818671454, + "grad_norm": 1.0342025756835938, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 88730 + }, + { + "epoch": 6.372710951526033, + "grad_norm": 1.0789190530776978, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 88740 + }, + { + "epoch": 6.373429084380611, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 88750 + }, + { + "epoch": 6.374147217235189, + "grad_norm": 0.9103280305862427, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 88760 + }, + { + "epoch": 6.374865350089767, + "grad_norm": 0.9856002330780029, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 88770 + }, + { + "epoch": 6.375583482944345, + "grad_norm": 1.1801226139068604, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 88780 + }, + { + "epoch": 6.376301615798923, + "grad_norm": 0.9876776933670044, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 88790 + }, + { + "epoch": 6.377019748653501, + "grad_norm": 1.0169886350631714, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 88800 + }, + { + "epoch": 6.377737881508079, + "grad_norm": 1.0118076801300049, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 88810 + }, + { + "epoch": 6.378456014362657, + "grad_norm": 1.0641456842422485, + "learning_rate": 0.0002, + "loss": 0.5205, + "step": 88820 + }, + { + "epoch": 6.379174147217235, + "grad_norm": 1.1138534545898438, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 88830 + }, + { + "epoch": 6.379892280071814, + "grad_norm": 1.1518962383270264, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 88840 + }, + { + "epoch": 6.380610412926392, + "grad_norm": 1.3662128448486328, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88850 + }, + { + "epoch": 6.38132854578097, + "grad_norm": 0.9544311761856079, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 88860 + }, + { + "epoch": 6.382046678635548, + "grad_norm": 0.9747556447982788, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 88870 + }, + { + "epoch": 6.382764811490126, + "grad_norm": 1.1651948690414429, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 88880 + }, + { + "epoch": 6.383482944344704, + "grad_norm": 1.4048396348953247, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 88890 + }, + { + "epoch": 6.384201077199282, + "grad_norm": 1.1144068241119385, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 88900 + }, + { + "epoch": 6.38491921005386, + "grad_norm": 1.2978034019470215, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 88910 + }, + { + "epoch": 6.385637342908438, + "grad_norm": 1.1776132583618164, + "learning_rate": 0.0002, + "loss": 0.5279, + "step": 88920 + }, + { + "epoch": 6.3863554757630165, + "grad_norm": 0.8849034905433655, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 88930 + }, + { + "epoch": 6.3870736086175945, + "grad_norm": 1.1207057237625122, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 88940 + }, + { + "epoch": 6.3877917414721725, + "grad_norm": 0.9364172220230103, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 88950 + }, + { + "epoch": 6.3885098743267505, + "grad_norm": 1.1731317043304443, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 88960 + }, + { + "epoch": 6.3892280071813286, + "grad_norm": 1.0411573648452759, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 88970 + }, + { + "epoch": 6.3899461400359066, + "grad_norm": 1.0817447900772095, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 88980 + }, + { + "epoch": 6.3906642728904846, + "grad_norm": 1.0037593841552734, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 88990 + }, + { + "epoch": 6.391382405745063, + "grad_norm": 1.1684437990188599, + "learning_rate": 0.0002, + "loss": 0.562, + "step": 89000 + }, + { + "epoch": 6.392100538599641, + "grad_norm": 1.0237388610839844, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 89010 + }, + { + "epoch": 6.392818671454219, + "grad_norm": 1.24791419506073, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 89020 + }, + { + "epoch": 6.3935368043087974, + "grad_norm": 0.842664897441864, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 89030 + }, + { + "epoch": 6.3942549371633755, + "grad_norm": 1.1692326068878174, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 89040 + }, + { + "epoch": 6.3949730700179535, + "grad_norm": 1.0786939859390259, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 89050 + }, + { + "epoch": 6.3956912028725315, + "grad_norm": 1.1315077543258667, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 89060 + }, + { + "epoch": 6.3964093357271095, + "grad_norm": 0.9949214458465576, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 89070 + }, + { + "epoch": 6.3971274685816875, + "grad_norm": 1.0302025079727173, + "learning_rate": 0.0002, + "loss": 0.5367, + "step": 89080 + }, + { + "epoch": 6.3978456014362655, + "grad_norm": 0.9664030075073242, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 89090 + }, + { + "epoch": 6.3985637342908435, + "grad_norm": 1.1251037120819092, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 89100 + }, + { + "epoch": 6.399281867145422, + "grad_norm": 1.1103272438049316, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 89110 + }, + { + "epoch": 6.4, + "grad_norm": 0.9192888736724854, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 89120 + }, + { + "epoch": 6.400718132854578, + "grad_norm": 1.027806043624878, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 89130 + }, + { + "epoch": 6.401436265709156, + "grad_norm": 1.1219452619552612, + "learning_rate": 0.0002, + "loss": 0.608, + "step": 89140 + }, + { + "epoch": 6.402154398563734, + "grad_norm": 1.1703979969024658, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 89150 + }, + { + "epoch": 6.402872531418312, + "grad_norm": 1.025874376296997, + "learning_rate": 0.0002, + "loss": 0.5251, + "step": 89160 + }, + { + "epoch": 6.40359066427289, + "grad_norm": 1.070225715637207, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 89170 + }, + { + "epoch": 6.404308797127468, + "grad_norm": 1.1915208101272583, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 89180 + }, + { + "epoch": 6.405026929982046, + "grad_norm": 1.1954079866409302, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 89190 + }, + { + "epoch": 6.405745062836624, + "grad_norm": 1.035910964012146, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 89200 + }, + { + "epoch": 6.406463195691203, + "grad_norm": 1.1363351345062256, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 89210 + }, + { + "epoch": 6.407181328545781, + "grad_norm": 1.2086843252182007, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 89220 + }, + { + "epoch": 6.407899461400359, + "grad_norm": 1.3492387533187866, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 89230 + }, + { + "epoch": 6.408617594254937, + "grad_norm": 0.8746330738067627, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 89240 + }, + { + "epoch": 6.409335727109515, + "grad_norm": 1.0165427923202515, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 89250 + }, + { + "epoch": 6.410053859964093, + "grad_norm": 1.0314675569534302, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 89260 + }, + { + "epoch": 6.410771992818671, + "grad_norm": 1.2128242254257202, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 89270 + }, + { + "epoch": 6.411490125673249, + "grad_norm": 0.9496060013771057, + "learning_rate": 0.0002, + "loss": 0.59, + "step": 89280 + }, + { + "epoch": 6.412208258527827, + "grad_norm": 1.1838264465332031, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 89290 + }, + { + "epoch": 6.412926391382406, + "grad_norm": 1.1700918674468994, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 89300 + }, + { + "epoch": 6.413644524236984, + "grad_norm": 1.2102051973342896, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 89310 + }, + { + "epoch": 6.414362657091562, + "grad_norm": 0.9485594630241394, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 89320 + }, + { + "epoch": 6.41508078994614, + "grad_norm": 1.041496753692627, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 89330 + }, + { + "epoch": 6.415798922800718, + "grad_norm": 1.0785019397735596, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 89340 + }, + { + "epoch": 6.416517055655296, + "grad_norm": 0.9527593851089478, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 89350 + }, + { + "epoch": 6.417235188509874, + "grad_norm": 0.9879035353660583, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89360 + }, + { + "epoch": 6.417953321364452, + "grad_norm": 0.9143751263618469, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 89370 + }, + { + "epoch": 6.41867145421903, + "grad_norm": 0.9145408272743225, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 89380 + }, + { + "epoch": 6.419389587073608, + "grad_norm": 1.0128624439239502, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 89390 + }, + { + "epoch": 6.420107719928187, + "grad_norm": 0.9454543590545654, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 89400 + }, + { + "epoch": 6.420825852782765, + "grad_norm": 1.0659215450286865, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 89410 + }, + { + "epoch": 6.421543985637343, + "grad_norm": 1.1622642278671265, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 89420 + }, + { + "epoch": 6.422262118491921, + "grad_norm": 0.9805575013160706, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 89430 + }, + { + "epoch": 6.422980251346499, + "grad_norm": 0.871903121471405, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89440 + }, + { + "epoch": 6.423698384201077, + "grad_norm": 0.992355227470398, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 89450 + }, + { + "epoch": 6.424416517055655, + "grad_norm": 1.4055765867233276, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 89460 + }, + { + "epoch": 6.425134649910233, + "grad_norm": 1.0447325706481934, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 89470 + }, + { + "epoch": 6.425852782764811, + "grad_norm": 1.1162594556808472, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 89480 + }, + { + "epoch": 6.42657091561939, + "grad_norm": 1.0767697095870972, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 89490 + }, + { + "epoch": 6.427289048473968, + "grad_norm": 1.2253819704055786, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 89500 + }, + { + "epoch": 6.428007181328546, + "grad_norm": 1.0623136758804321, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 89510 + }, + { + "epoch": 6.428725314183124, + "grad_norm": 1.3238742351531982, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 89520 + }, + { + "epoch": 6.429443447037702, + "grad_norm": 1.2376916408538818, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 89530 + }, + { + "epoch": 6.43016157989228, + "grad_norm": 1.197453260421753, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 89540 + }, + { + "epoch": 6.430879712746858, + "grad_norm": 1.0539700984954834, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89550 + }, + { + "epoch": 6.431597845601436, + "grad_norm": 1.0659761428833008, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 89560 + }, + { + "epoch": 6.432315978456014, + "grad_norm": 1.0186322927474976, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 89570 + }, + { + "epoch": 6.433034111310592, + "grad_norm": 1.232337474822998, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 89580 + }, + { + "epoch": 6.433752244165171, + "grad_norm": 1.1512500047683716, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 89590 + }, + { + "epoch": 6.434470377019749, + "grad_norm": 1.0068955421447754, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 89600 + }, + { + "epoch": 6.435188509874327, + "grad_norm": 1.1359424591064453, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 89610 + }, + { + "epoch": 6.435906642728905, + "grad_norm": 1.4369128942489624, + "learning_rate": 0.0002, + "loss": 0.553, + "step": 89620 + }, + { + "epoch": 6.436624775583483, + "grad_norm": 0.9382445216178894, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 89630 + }, + { + "epoch": 6.437342908438061, + "grad_norm": 0.8607977628707886, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 89640 + }, + { + "epoch": 6.438061041292639, + "grad_norm": 0.9498276114463806, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 89650 + }, + { + "epoch": 6.438779174147217, + "grad_norm": 1.4109948873519897, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 89660 + }, + { + "epoch": 6.439497307001796, + "grad_norm": 1.106134295463562, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 89670 + }, + { + "epoch": 6.440215439856374, + "grad_norm": 1.128963589668274, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 89680 + }, + { + "epoch": 6.440933572710952, + "grad_norm": 1.1370604038238525, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 89690 + }, + { + "epoch": 6.44165170556553, + "grad_norm": 1.380922794342041, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 89700 + }, + { + "epoch": 6.442369838420108, + "grad_norm": 0.9597383737564087, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 89710 + }, + { + "epoch": 6.443087971274686, + "grad_norm": 1.1491756439208984, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 89720 + }, + { + "epoch": 6.443806104129264, + "grad_norm": 1.1313573122024536, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 89730 + }, + { + "epoch": 6.444524236983842, + "grad_norm": 1.1081135272979736, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89740 + }, + { + "epoch": 6.44524236983842, + "grad_norm": 1.0297505855560303, + "learning_rate": 0.0002, + "loss": 0.5648, + "step": 89750 + }, + { + "epoch": 6.445960502692998, + "grad_norm": 1.0534520149230957, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 89760 + }, + { + "epoch": 6.446678635547577, + "grad_norm": 1.218485951423645, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 89770 + }, + { + "epoch": 6.447396768402155, + "grad_norm": 0.9336987137794495, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 89780 + }, + { + "epoch": 6.448114901256733, + "grad_norm": 0.9854478240013123, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 89790 + }, + { + "epoch": 6.448833034111311, + "grad_norm": 1.1036708354949951, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 89800 + }, + { + "epoch": 6.449551166965889, + "grad_norm": 1.2220509052276611, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 89810 + }, + { + "epoch": 6.450269299820467, + "grad_norm": 0.9955567121505737, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 89820 + }, + { + "epoch": 6.450987432675045, + "grad_norm": 1.0350912809371948, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 89830 + }, + { + "epoch": 6.451705565529623, + "grad_norm": 1.156080722808838, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 89840 + }, + { + "epoch": 6.452423698384201, + "grad_norm": 0.8922389149665833, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 89850 + }, + { + "epoch": 6.45314183123878, + "grad_norm": 0.9318913221359253, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 89860 + }, + { + "epoch": 6.453859964093358, + "grad_norm": 0.9420756101608276, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 89870 + }, + { + "epoch": 6.454578096947936, + "grad_norm": 1.0303646326065063, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89880 + }, + { + "epoch": 6.455296229802514, + "grad_norm": 1.070806860923767, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 89890 + }, + { + "epoch": 6.456014362657092, + "grad_norm": 0.9890686869621277, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 89900 + }, + { + "epoch": 6.45673249551167, + "grad_norm": 1.1254929304122925, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 89910 + }, + { + "epoch": 6.457450628366248, + "grad_norm": 1.0023183822631836, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 89920 + }, + { + "epoch": 6.458168761220826, + "grad_norm": 1.118721604347229, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 89930 + }, + { + "epoch": 6.458886894075404, + "grad_norm": 1.2170203924179077, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 89940 + }, + { + "epoch": 6.459605026929982, + "grad_norm": 1.0662257671356201, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 89950 + }, + { + "epoch": 6.4603231597845605, + "grad_norm": 0.8912546634674072, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 89960 + }, + { + "epoch": 6.4610412926391385, + "grad_norm": 1.0346225500106812, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 89970 + }, + { + "epoch": 6.4617594254937165, + "grad_norm": 1.239388346672058, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 89980 + }, + { + "epoch": 6.4624775583482945, + "grad_norm": 1.0100152492523193, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 89990 + }, + { + "epoch": 6.4631956912028725, + "grad_norm": 1.1496137380599976, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 90000 + }, + { + "epoch": 6.4639138240574505, + "grad_norm": 0.9652666449546814, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 90010 + }, + { + "epoch": 6.4646319569120285, + "grad_norm": 1.459730863571167, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 90020 + }, + { + "epoch": 6.4653500897666065, + "grad_norm": 0.9096665978431702, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 90030 + }, + { + "epoch": 6.4660682226211845, + "grad_norm": 1.1356233358383179, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 90040 + }, + { + "epoch": 6.466786355475763, + "grad_norm": 1.0192385911941528, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 90050 + }, + { + "epoch": 6.467504488330341, + "grad_norm": 0.9494831562042236, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 90060 + }, + { + "epoch": 6.468222621184919, + "grad_norm": 0.9784388542175293, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 90070 + }, + { + "epoch": 6.468940754039497, + "grad_norm": 1.0754846334457397, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 90080 + }, + { + "epoch": 6.469658886894075, + "grad_norm": 0.9019646644592285, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 90090 + }, + { + "epoch": 6.470377019748653, + "grad_norm": 1.1848793029785156, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 90100 + }, + { + "epoch": 6.471095152603231, + "grad_norm": 1.1312837600708008, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 90110 + }, + { + "epoch": 6.471813285457809, + "grad_norm": 0.9868128299713135, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 90120 + }, + { + "epoch": 6.472531418312387, + "grad_norm": 0.894279956817627, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 90130 + }, + { + "epoch": 6.473249551166965, + "grad_norm": 1.1206544637680054, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 90140 + }, + { + "epoch": 6.473967684021544, + "grad_norm": 1.048126220703125, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 90150 + }, + { + "epoch": 6.474685816876122, + "grad_norm": 0.9624786972999573, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 90160 + }, + { + "epoch": 6.4754039497307, + "grad_norm": 1.3301671743392944, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 90170 + }, + { + "epoch": 6.476122082585278, + "grad_norm": 1.1016923189163208, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 90180 + }, + { + "epoch": 6.476840215439856, + "grad_norm": 1.084158182144165, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 90190 + }, + { + "epoch": 6.477558348294434, + "grad_norm": 1.0704890489578247, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 90200 + }, + { + "epoch": 6.478276481149012, + "grad_norm": 1.0849730968475342, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 90210 + }, + { + "epoch": 6.47899461400359, + "grad_norm": 1.0671768188476562, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 90220 + }, + { + "epoch": 6.479712746858169, + "grad_norm": 1.1208873987197876, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 90230 + }, + { + "epoch": 6.480430879712747, + "grad_norm": 1.1958850622177124, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 90240 + }, + { + "epoch": 6.481149012567325, + "grad_norm": 1.2102761268615723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 90250 + }, + { + "epoch": 6.481867145421903, + "grad_norm": 1.0813510417938232, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 90260 + }, + { + "epoch": 6.482585278276481, + "grad_norm": 0.8553891777992249, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 90270 + }, + { + "epoch": 6.483303411131059, + "grad_norm": 1.0855463743209839, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 90280 + }, + { + "epoch": 6.484021543985637, + "grad_norm": 1.1179498434066772, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 90290 + }, + { + "epoch": 6.484739676840215, + "grad_norm": 1.1268035173416138, + "learning_rate": 0.0002, + "loss": 0.62, + "step": 90300 + }, + { + "epoch": 6.485457809694793, + "grad_norm": 1.0755188465118408, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 90310 + }, + { + "epoch": 6.486175942549371, + "grad_norm": 1.0469547510147095, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 90320 + }, + { + "epoch": 6.48689407540395, + "grad_norm": 0.8739270567893982, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 90330 + }, + { + "epoch": 6.487612208258528, + "grad_norm": 1.2452377080917358, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 90340 + }, + { + "epoch": 6.488330341113106, + "grad_norm": 1.1576505899429321, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 90350 + }, + { + "epoch": 6.489048473967684, + "grad_norm": 1.0247524976730347, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 90360 + }, + { + "epoch": 6.489766606822262, + "grad_norm": 1.1306205987930298, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 90370 + }, + { + "epoch": 6.49048473967684, + "grad_norm": 1.0545839071273804, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 90380 + }, + { + "epoch": 6.491202872531418, + "grad_norm": 1.281407117843628, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 90390 + }, + { + "epoch": 6.491921005385996, + "grad_norm": 1.2330801486968994, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 90400 + }, + { + "epoch": 6.492639138240574, + "grad_norm": 0.8966873288154602, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 90410 + }, + { + "epoch": 6.493357271095153, + "grad_norm": 0.9748067259788513, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 90420 + }, + { + "epoch": 6.494075403949731, + "grad_norm": 0.9285972118377686, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 90430 + }, + { + "epoch": 6.494793536804309, + "grad_norm": 1.123449444770813, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 90440 + }, + { + "epoch": 6.495511669658887, + "grad_norm": 1.4190359115600586, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 90450 + }, + { + "epoch": 6.496229802513465, + "grad_norm": 0.9877263307571411, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 90460 + }, + { + "epoch": 6.496947935368043, + "grad_norm": 0.9850174188613892, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 90470 + }, + { + "epoch": 6.497666068222621, + "grad_norm": 1.3609496355056763, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 90480 + }, + { + "epoch": 6.498384201077199, + "grad_norm": 0.8299460411071777, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 90490 + }, + { + "epoch": 6.499102333931777, + "grad_norm": 1.3359589576721191, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 90500 + }, + { + "epoch": 6.499820466786355, + "grad_norm": 1.1211248636245728, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 90510 + }, + { + "epoch": 6.500538599640934, + "grad_norm": 1.1070419549942017, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 90520 + }, + { + "epoch": 6.501256732495512, + "grad_norm": 1.1590572595596313, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 90530 + }, + { + "epoch": 6.50197486535009, + "grad_norm": 0.9865858554840088, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 90540 + }, + { + "epoch": 6.502692998204668, + "grad_norm": 0.9752925634384155, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 90550 + }, + { + "epoch": 6.503411131059246, + "grad_norm": 1.2411525249481201, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 90560 + }, + { + "epoch": 6.504129263913824, + "grad_norm": 1.1538971662521362, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 90570 + }, + { + "epoch": 6.504847396768402, + "grad_norm": 1.2818700075149536, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 90580 + }, + { + "epoch": 6.50556552962298, + "grad_norm": 1.2787950038909912, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 90590 + }, + { + "epoch": 6.506283662477558, + "grad_norm": 1.1357126235961914, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 90600 + }, + { + "epoch": 6.507001795332137, + "grad_norm": 1.0781097412109375, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 90610 + }, + { + "epoch": 6.507719928186715, + "grad_norm": 0.9754705429077148, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 90620 + }, + { + "epoch": 6.508438061041293, + "grad_norm": 1.018410563468933, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 90630 + }, + { + "epoch": 6.509156193895871, + "grad_norm": 1.0382000207901, + "learning_rate": 0.0002, + "loss": 0.562, + "step": 90640 + }, + { + "epoch": 6.509874326750449, + "grad_norm": 0.9059327840805054, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 90650 + }, + { + "epoch": 6.510592459605027, + "grad_norm": 1.2049181461334229, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 90660 + }, + { + "epoch": 6.511310592459605, + "grad_norm": 1.1005393266677856, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 90670 + }, + { + "epoch": 6.512028725314183, + "grad_norm": 1.0504072904586792, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 90680 + }, + { + "epoch": 6.512746858168761, + "grad_norm": 1.2491340637207031, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 90690 + }, + { + "epoch": 6.513464991023339, + "grad_norm": 0.9971826672554016, + "learning_rate": 0.0002, + "loss": 0.5851, + "step": 90700 + }, + { + "epoch": 6.514183123877918, + "grad_norm": 1.0228981971740723, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 90710 + }, + { + "epoch": 6.514901256732496, + "grad_norm": 1.1531293392181396, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 90720 + }, + { + "epoch": 6.515619389587074, + "grad_norm": 0.9401963949203491, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 90730 + }, + { + "epoch": 6.516337522441652, + "grad_norm": 1.3876653909683228, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 90740 + }, + { + "epoch": 6.51705565529623, + "grad_norm": 1.3111445903778076, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 90750 + }, + { + "epoch": 6.517773788150808, + "grad_norm": 0.8705055713653564, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 90760 + }, + { + "epoch": 6.518491921005386, + "grad_norm": 1.213295340538025, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 90770 + }, + { + "epoch": 6.519210053859964, + "grad_norm": 1.2075343132019043, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 90780 + }, + { + "epoch": 6.519928186714543, + "grad_norm": 0.9814115166664124, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 90790 + }, + { + "epoch": 6.520646319569121, + "grad_norm": 1.0937272310256958, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 90800 + }, + { + "epoch": 6.521364452423699, + "grad_norm": 1.0839916467666626, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 90810 + }, + { + "epoch": 6.522082585278277, + "grad_norm": 1.1918399333953857, + "learning_rate": 0.0002, + "loss": 0.6166, + "step": 90820 + }, + { + "epoch": 6.522800718132855, + "grad_norm": 1.1677868366241455, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 90830 + }, + { + "epoch": 6.523518850987433, + "grad_norm": 1.0840870141983032, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 90840 + }, + { + "epoch": 6.524236983842011, + "grad_norm": 1.10408353805542, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 90850 + }, + { + "epoch": 6.524955116696589, + "grad_norm": 1.056705355644226, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 90860 + }, + { + "epoch": 6.525673249551167, + "grad_norm": 1.0552406311035156, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 90870 + }, + { + "epoch": 6.526391382405745, + "grad_norm": 1.000816822052002, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 90880 + }, + { + "epoch": 6.527109515260323, + "grad_norm": 1.1465239524841309, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 90890 + }, + { + "epoch": 6.527827648114902, + "grad_norm": 0.9380449652671814, + "learning_rate": 0.0002, + "loss": 0.5652, + "step": 90900 + }, + { + "epoch": 6.52854578096948, + "grad_norm": 0.9572200179100037, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 90910 + }, + { + "epoch": 6.529263913824058, + "grad_norm": 1.0058002471923828, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 90920 + }, + { + "epoch": 6.529982046678636, + "grad_norm": 1.0932626724243164, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 90930 + }, + { + "epoch": 6.530700179533214, + "grad_norm": 0.9283126592636108, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 90940 + }, + { + "epoch": 6.531418312387792, + "grad_norm": 1.1347819566726685, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 90950 + }, + { + "epoch": 6.53213644524237, + "grad_norm": 1.4964616298675537, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 90960 + }, + { + "epoch": 6.532854578096948, + "grad_norm": 1.1725877523422241, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 90970 + }, + { + "epoch": 6.5335727109515265, + "grad_norm": 1.185640811920166, + "learning_rate": 0.0002, + "loss": 0.6339, + "step": 90980 + }, + { + "epoch": 6.5342908438061045, + "grad_norm": 1.0598312616348267, + "learning_rate": 0.0002, + "loss": 0.6021, + "step": 90990 + }, + { + "epoch": 6.5350089766606825, + "grad_norm": 1.389320731163025, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 91000 + }, + { + "epoch": 6.5357271095152605, + "grad_norm": 1.102960467338562, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 91010 + }, + { + "epoch": 6.5364452423698385, + "grad_norm": 1.2482284307479858, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 91020 + }, + { + "epoch": 6.5371633752244165, + "grad_norm": 1.213861346244812, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 91030 + }, + { + "epoch": 6.5378815080789945, + "grad_norm": 1.1872318983078003, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 91040 + }, + { + "epoch": 6.5385996409335725, + "grad_norm": 1.0767916440963745, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 91050 + }, + { + "epoch": 6.5393177737881505, + "grad_norm": 1.0610442161560059, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 91060 + }, + { + "epoch": 6.5400359066427285, + "grad_norm": 1.0161356925964355, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 91070 + }, + { + "epoch": 6.540754039497307, + "grad_norm": 1.373284101486206, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 91080 + }, + { + "epoch": 6.541472172351885, + "grad_norm": 1.1611387729644775, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 91090 + }, + { + "epoch": 6.542190305206463, + "grad_norm": 1.1980092525482178, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 91100 + }, + { + "epoch": 6.542908438061041, + "grad_norm": 1.1174312829971313, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 91110 + }, + { + "epoch": 6.543626570915619, + "grad_norm": 1.1376914978027344, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 91120 + }, + { + "epoch": 6.544344703770197, + "grad_norm": 1.0551620721817017, + "learning_rate": 0.0002, + "loss": 0.5549, + "step": 91130 + }, + { + "epoch": 6.545062836624775, + "grad_norm": 1.2839815616607666, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 91140 + }, + { + "epoch": 6.545780969479353, + "grad_norm": 0.7656933665275574, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 91150 + }, + { + "epoch": 6.546499102333931, + "grad_norm": 1.1079483032226562, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 91160 + }, + { + "epoch": 6.54721723518851, + "grad_norm": 1.4870734214782715, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 91170 + }, + { + "epoch": 6.547935368043088, + "grad_norm": 1.1784024238586426, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 91180 + }, + { + "epoch": 6.548653500897666, + "grad_norm": 1.3510793447494507, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 91190 + }, + { + "epoch": 6.549371633752244, + "grad_norm": 1.0237789154052734, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 91200 + }, + { + "epoch": 6.550089766606822, + "grad_norm": 1.0721405744552612, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 91210 + }, + { + "epoch": 6.5508078994614, + "grad_norm": 0.9794955253601074, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 91220 + }, + { + "epoch": 6.551526032315978, + "grad_norm": 1.1046847105026245, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 91230 + }, + { + "epoch": 6.552244165170556, + "grad_norm": 0.9706982374191284, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 91240 + }, + { + "epoch": 6.552962298025134, + "grad_norm": 0.9466179609298706, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 91250 + }, + { + "epoch": 6.553680430879712, + "grad_norm": 1.126806616783142, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 91260 + }, + { + "epoch": 6.554398563734291, + "grad_norm": 0.9713812470436096, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 91270 + }, + { + "epoch": 6.555116696588869, + "grad_norm": 0.8955506682395935, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 91280 + }, + { + "epoch": 6.555834829443447, + "grad_norm": 1.2066279649734497, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 91290 + }, + { + "epoch": 6.556552962298025, + "grad_norm": 0.957999587059021, + "learning_rate": 0.0002, + "loss": 0.5442, + "step": 91300 + }, + { + "epoch": 6.557271095152603, + "grad_norm": 1.253709077835083, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 91310 + }, + { + "epoch": 6.557989228007181, + "grad_norm": 1.0075397491455078, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 91320 + }, + { + "epoch": 6.558707360861759, + "grad_norm": 0.9356904029846191, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 91330 + }, + { + "epoch": 6.559425493716337, + "grad_norm": 1.1555782556533813, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 91340 + }, + { + "epoch": 6.560143626570916, + "grad_norm": 0.9786396026611328, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 91350 + }, + { + "epoch": 6.560861759425494, + "grad_norm": 1.156374454498291, + "learning_rate": 0.0002, + "loss": 0.5417, + "step": 91360 + }, + { + "epoch": 6.561579892280072, + "grad_norm": 1.0572668313980103, + "learning_rate": 0.0002, + "loss": 0.5836, + "step": 91370 + }, + { + "epoch": 6.56229802513465, + "grad_norm": 1.4248497486114502, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 91380 + }, + { + "epoch": 6.563016157989228, + "grad_norm": 1.1191383600234985, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 91390 + }, + { + "epoch": 6.563734290843806, + "grad_norm": 0.9622306227684021, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 91400 + }, + { + "epoch": 6.564452423698384, + "grad_norm": 1.3683338165283203, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 91410 + }, + { + "epoch": 6.565170556552962, + "grad_norm": 1.0363010168075562, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 91420 + }, + { + "epoch": 6.56588868940754, + "grad_norm": 1.2861888408660889, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 91430 + }, + { + "epoch": 6.566606822262118, + "grad_norm": 1.0330547094345093, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 91440 + }, + { + "epoch": 6.567324955116696, + "grad_norm": 1.044992446899414, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 91450 + }, + { + "epoch": 6.568043087971275, + "grad_norm": 1.0722706317901611, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 91460 + }, + { + "epoch": 6.568761220825853, + "grad_norm": 1.1327447891235352, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 91470 + }, + { + "epoch": 6.569479353680431, + "grad_norm": 1.2709840536117554, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 91480 + }, + { + "epoch": 6.570197486535009, + "grad_norm": 1.0964101552963257, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 91490 + }, + { + "epoch": 6.570915619389587, + "grad_norm": 0.9897898435592651, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 91500 + }, + { + "epoch": 6.571633752244165, + "grad_norm": 1.0143952369689941, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 91510 + }, + { + "epoch": 6.572351885098743, + "grad_norm": 0.923865020275116, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 91520 + }, + { + "epoch": 6.573070017953321, + "grad_norm": 1.144390344619751, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 91530 + }, + { + "epoch": 6.5737881508079, + "grad_norm": 1.0636180639266968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 91540 + }, + { + "epoch": 6.574506283662478, + "grad_norm": 1.0699774026870728, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 91550 + }, + { + "epoch": 6.575224416517056, + "grad_norm": 1.2139345407485962, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 91560 + }, + { + "epoch": 6.575942549371634, + "grad_norm": 1.4551644325256348, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 91570 + }, + { + "epoch": 6.576660682226212, + "grad_norm": 1.2388415336608887, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 91580 + }, + { + "epoch": 6.57737881508079, + "grad_norm": 0.9303404688835144, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 91590 + }, + { + "epoch": 6.578096947935368, + "grad_norm": 0.932905912399292, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 91600 + }, + { + "epoch": 6.578815080789946, + "grad_norm": 1.0726542472839355, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 91610 + }, + { + "epoch": 6.579533213644524, + "grad_norm": 1.138890266418457, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 91620 + }, + { + "epoch": 6.580251346499102, + "grad_norm": 1.087165355682373, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 91630 + }, + { + "epoch": 6.580969479353681, + "grad_norm": 1.0526753664016724, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 91640 + }, + { + "epoch": 6.581687612208259, + "grad_norm": 1.068217158317566, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 91650 + }, + { + "epoch": 6.582405745062837, + "grad_norm": 1.09737229347229, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 91660 + }, + { + "epoch": 6.583123877917415, + "grad_norm": 0.9466586112976074, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 91670 + }, + { + "epoch": 6.583842010771993, + "grad_norm": 1.2311620712280273, + "learning_rate": 0.0002, + "loss": 0.6083, + "step": 91680 + }, + { + "epoch": 6.584560143626571, + "grad_norm": 1.2385680675506592, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 91690 + }, + { + "epoch": 6.585278276481149, + "grad_norm": 0.947889506816864, + "learning_rate": 0.0002, + "loss": 0.6515, + "step": 91700 + }, + { + "epoch": 6.585996409335727, + "grad_norm": 0.9600529670715332, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 91710 + }, + { + "epoch": 6.586714542190305, + "grad_norm": 1.3595638275146484, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 91720 + }, + { + "epoch": 6.587432675044884, + "grad_norm": 1.0087260007858276, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 91730 + }, + { + "epoch": 6.588150807899462, + "grad_norm": 1.0008373260498047, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 91740 + }, + { + "epoch": 6.58886894075404, + "grad_norm": 1.0367980003356934, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 91750 + }, + { + "epoch": 6.589587073608618, + "grad_norm": 1.1934503316879272, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 91760 + }, + { + "epoch": 6.590305206463196, + "grad_norm": 1.0295839309692383, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 91770 + }, + { + "epoch": 6.591023339317774, + "grad_norm": 0.926913857460022, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 91780 + }, + { + "epoch": 6.591741472172352, + "grad_norm": 1.055837631225586, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 91790 + }, + { + "epoch": 6.59245960502693, + "grad_norm": 1.006401777267456, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 91800 + }, + { + "epoch": 6.593177737881508, + "grad_norm": 1.1368589401245117, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 91810 + }, + { + "epoch": 6.593895870736086, + "grad_norm": 0.8494837880134583, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 91820 + }, + { + "epoch": 6.594614003590665, + "grad_norm": 1.3219822645187378, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 91830 + }, + { + "epoch": 6.595332136445243, + "grad_norm": 1.0583800077438354, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 91840 + }, + { + "epoch": 6.596050269299821, + "grad_norm": 1.0579098463058472, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 91850 + }, + { + "epoch": 6.596768402154399, + "grad_norm": 1.0618008375167847, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 91860 + }, + { + "epoch": 6.597486535008977, + "grad_norm": 0.9425104260444641, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 91870 + }, + { + "epoch": 6.598204667863555, + "grad_norm": 0.9130632281303406, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 91880 + }, + { + "epoch": 6.598922800718133, + "grad_norm": 1.126438856124878, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 91890 + }, + { + "epoch": 6.599640933572711, + "grad_norm": 0.9135168194770813, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 91900 + }, + { + "epoch": 6.6003590664272895, + "grad_norm": 1.1640992164611816, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 91910 + }, + { + "epoch": 6.6010771992818675, + "grad_norm": 1.2641936540603638, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 91920 + }, + { + "epoch": 6.6017953321364455, + "grad_norm": 1.1252738237380981, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 91930 + }, + { + "epoch": 6.6025134649910235, + "grad_norm": 1.0307750701904297, + "learning_rate": 0.0002, + "loss": 0.5466, + "step": 91940 + }, + { + "epoch": 6.6032315978456015, + "grad_norm": 0.978972315788269, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 91950 + }, + { + "epoch": 6.6039497307001795, + "grad_norm": 1.1350890398025513, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 91960 + }, + { + "epoch": 6.6046678635547575, + "grad_norm": 0.9177488088607788, + "learning_rate": 0.0002, + "loss": 0.6263, + "step": 91970 + }, + { + "epoch": 6.6053859964093355, + "grad_norm": 1.0381031036376953, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 91980 + }, + { + "epoch": 6.6061041292639135, + "grad_norm": 1.1706395149230957, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 91990 + }, + { + "epoch": 6.6068222621184916, + "grad_norm": 1.1102650165557861, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 92000 + }, + { + "epoch": 6.6075403949730696, + "grad_norm": 0.9234306812286377, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 92010 + }, + { + "epoch": 6.608258527827648, + "grad_norm": 1.2014371156692505, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 92020 + }, + { + "epoch": 6.6089766606822264, + "grad_norm": 0.9392209053039551, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 92030 + }, + { + "epoch": 6.6096947935368044, + "grad_norm": 1.0882072448730469, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 92040 + }, + { + "epoch": 6.6104129263913824, + "grad_norm": 1.032155156135559, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 92050 + }, + { + "epoch": 6.6111310592459605, + "grad_norm": 0.913979172706604, + "learning_rate": 0.0002, + "loss": 0.5498, + "step": 92060 + }, + { + "epoch": 6.6118491921005385, + "grad_norm": 1.205101490020752, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 92070 + }, + { + "epoch": 6.6125673249551165, + "grad_norm": 1.0713984966278076, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 92080 + }, + { + "epoch": 6.6132854578096945, + "grad_norm": 0.9191082715988159, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 92090 + }, + { + "epoch": 6.614003590664273, + "grad_norm": 0.9553678631782532, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 92100 + }, + { + "epoch": 6.614721723518851, + "grad_norm": 1.333262324333191, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 92110 + }, + { + "epoch": 6.615439856373429, + "grad_norm": 1.030739426612854, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 92120 + }, + { + "epoch": 6.616157989228007, + "grad_norm": 0.8777900338172913, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 92130 + }, + { + "epoch": 6.616876122082585, + "grad_norm": 1.071578860282898, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 92140 + }, + { + "epoch": 6.617594254937163, + "grad_norm": 1.1931039094924927, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 92150 + }, + { + "epoch": 6.618312387791741, + "grad_norm": 1.2041425704956055, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 92160 + }, + { + "epoch": 6.619030520646319, + "grad_norm": 0.8523036241531372, + "learning_rate": 0.0002, + "loss": 0.5544, + "step": 92170 + }, + { + "epoch": 6.619748653500897, + "grad_norm": 1.1914807558059692, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 92180 + }, + { + "epoch": 6.620466786355475, + "grad_norm": 1.1336464881896973, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 92190 + }, + { + "epoch": 6.621184919210053, + "grad_norm": 1.2282923460006714, + "learning_rate": 0.0002, + "loss": 0.5566, + "step": 92200 + }, + { + "epoch": 6.621903052064632, + "grad_norm": 1.1887043714523315, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 92210 + }, + { + "epoch": 6.62262118491921, + "grad_norm": 0.9654178619384766, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 92220 + }, + { + "epoch": 6.623339317773788, + "grad_norm": 0.7957702875137329, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 92230 + }, + { + "epoch": 6.624057450628366, + "grad_norm": 0.8697461485862732, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 92240 + }, + { + "epoch": 6.624775583482944, + "grad_norm": 1.0392963886260986, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 92250 + }, + { + "epoch": 6.625493716337522, + "grad_norm": 1.1502392292022705, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 92260 + }, + { + "epoch": 6.6262118491921, + "grad_norm": 1.2818870544433594, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 92270 + }, + { + "epoch": 6.626929982046678, + "grad_norm": 0.8769828081130981, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 92280 + }, + { + "epoch": 6.627648114901257, + "grad_norm": 1.2273039817810059, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 92290 + }, + { + "epoch": 6.628366247755835, + "grad_norm": 0.8619378805160522, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 92300 + }, + { + "epoch": 6.629084380610413, + "grad_norm": 0.9501098990440369, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 92310 + }, + { + "epoch": 6.629802513464991, + "grad_norm": 1.0698163509368896, + "learning_rate": 0.0002, + "loss": 0.6012, + "step": 92320 + }, + { + "epoch": 6.630520646319569, + "grad_norm": 1.0689377784729004, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 92330 + }, + { + "epoch": 6.631238779174147, + "grad_norm": 1.2086275815963745, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 92340 + }, + { + "epoch": 6.631956912028725, + "grad_norm": 1.1256859302520752, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 92350 + }, + { + "epoch": 6.632675044883303, + "grad_norm": 0.9717738032341003, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 92360 + }, + { + "epoch": 6.633393177737881, + "grad_norm": 0.9784330725669861, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 92370 + }, + { + "epoch": 6.634111310592459, + "grad_norm": 1.2600007057189941, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 92380 + }, + { + "epoch": 6.634829443447038, + "grad_norm": 0.889910101890564, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 92390 + }, + { + "epoch": 6.635547576301616, + "grad_norm": 1.010524868965149, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 92400 + }, + { + "epoch": 6.636265709156194, + "grad_norm": 1.325664758682251, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 92410 + }, + { + "epoch": 6.636983842010772, + "grad_norm": 1.3910914659500122, + "learning_rate": 0.0002, + "loss": 0.6149, + "step": 92420 + }, + { + "epoch": 6.63770197486535, + "grad_norm": 0.8858863115310669, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 92430 + }, + { + "epoch": 6.638420107719928, + "grad_norm": 1.1841683387756348, + "learning_rate": 0.0002, + "loss": 0.6007, + "step": 92440 + }, + { + "epoch": 6.639138240574506, + "grad_norm": 1.2783559560775757, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 92450 + }, + { + "epoch": 6.639856373429084, + "grad_norm": 0.9154769778251648, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 92460 + }, + { + "epoch": 6.640574506283663, + "grad_norm": 1.003371000289917, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 92470 + }, + { + "epoch": 6.641292639138241, + "grad_norm": 0.9700522422790527, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 92480 + }, + { + "epoch": 6.642010771992819, + "grad_norm": 1.273629069328308, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 92490 + }, + { + "epoch": 6.642728904847397, + "grad_norm": 1.2746435403823853, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 92500 + }, + { + "epoch": 6.643447037701975, + "grad_norm": 1.0184870958328247, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 92510 + }, + { + "epoch": 6.644165170556553, + "grad_norm": 0.9988235831260681, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 92520 + }, + { + "epoch": 6.644883303411131, + "grad_norm": 1.075997233390808, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 92530 + }, + { + "epoch": 6.645601436265709, + "grad_norm": 1.180784821510315, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 92540 + }, + { + "epoch": 6.646319569120287, + "grad_norm": 1.0889579057693481, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 92550 + }, + { + "epoch": 6.647037701974865, + "grad_norm": 1.0069187879562378, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 92560 + }, + { + "epoch": 6.647755834829443, + "grad_norm": 1.110495686531067, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 92570 + }, + { + "epoch": 6.648473967684022, + "grad_norm": 1.0540684461593628, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 92580 + }, + { + "epoch": 6.6491921005386, + "grad_norm": 1.0917930603027344, + "learning_rate": 0.0002, + "loss": 0.5718, + "step": 92590 + }, + { + "epoch": 6.649910233393178, + "grad_norm": 1.225898027420044, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 92600 + }, + { + "epoch": 6.650628366247756, + "grad_norm": 0.9372484087944031, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 92610 + }, + { + "epoch": 6.651346499102334, + "grad_norm": 0.98685622215271, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 92620 + }, + { + "epoch": 6.652064631956912, + "grad_norm": 1.1148556470870972, + "learning_rate": 0.0002, + "loss": 0.6096, + "step": 92630 + }, + { + "epoch": 6.65278276481149, + "grad_norm": 1.1483707427978516, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 92640 + }, + { + "epoch": 6.653500897666068, + "grad_norm": 1.092708706855774, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 92650 + }, + { + "epoch": 6.654219030520647, + "grad_norm": 1.0641281604766846, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 92660 + }, + { + "epoch": 6.654937163375225, + "grad_norm": 0.9953374862670898, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 92670 + }, + { + "epoch": 6.655655296229803, + "grad_norm": 0.9792306423187256, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 92680 + }, + { + "epoch": 6.656373429084381, + "grad_norm": 1.1209690570831299, + "learning_rate": 0.0002, + "loss": 0.5945, + "step": 92690 + }, + { + "epoch": 6.657091561938959, + "grad_norm": 0.8281117677688599, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 92700 + }, + { + "epoch": 6.657809694793537, + "grad_norm": 0.9189280867576599, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 92710 + }, + { + "epoch": 6.658527827648115, + "grad_norm": 1.1859153509140015, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 92720 + }, + { + "epoch": 6.659245960502693, + "grad_norm": 0.9750476479530334, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 92730 + }, + { + "epoch": 6.659964093357271, + "grad_norm": 0.9973570704460144, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 92740 + }, + { + "epoch": 6.660682226211849, + "grad_norm": 1.0170378684997559, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 92750 + }, + { + "epoch": 6.661400359066427, + "grad_norm": 1.352283239364624, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 92760 + }, + { + "epoch": 6.662118491921006, + "grad_norm": 1.1020066738128662, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 92770 + }, + { + "epoch": 6.662836624775584, + "grad_norm": 1.0750092267990112, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 92780 + }, + { + "epoch": 6.663554757630162, + "grad_norm": 1.1006640195846558, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 92790 + }, + { + "epoch": 6.66427289048474, + "grad_norm": 1.2372384071350098, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 92800 + }, + { + "epoch": 6.664991023339318, + "grad_norm": 1.084846019744873, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 92810 + }, + { + "epoch": 6.665709156193896, + "grad_norm": 1.1738693714141846, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 92820 + }, + { + "epoch": 6.666427289048474, + "grad_norm": 1.159678339958191, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 92830 + }, + { + "epoch": 6.667145421903052, + "grad_norm": 0.9957766532897949, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 92840 + }, + { + "epoch": 6.667863554757631, + "grad_norm": 1.1403744220733643, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 92850 + }, + { + "epoch": 6.668581687612209, + "grad_norm": 1.0120519399642944, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 92860 + }, + { + "epoch": 6.669299820466787, + "grad_norm": 1.0876718759536743, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 92870 + }, + { + "epoch": 6.670017953321365, + "grad_norm": 1.175749659538269, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 92880 + }, + { + "epoch": 6.670736086175943, + "grad_norm": 0.9808473587036133, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 92890 + }, + { + "epoch": 6.671454219030521, + "grad_norm": 1.121573805809021, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 92900 + }, + { + "epoch": 6.672172351885099, + "grad_norm": 0.9749727249145508, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 92910 + }, + { + "epoch": 6.672890484739677, + "grad_norm": 1.0969820022583008, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 92920 + }, + { + "epoch": 6.673608617594255, + "grad_norm": 1.0777957439422607, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 92930 + }, + { + "epoch": 6.674326750448833, + "grad_norm": 1.2342437505722046, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 92940 + }, + { + "epoch": 6.6750448833034115, + "grad_norm": 1.18901789188385, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 92950 + }, + { + "epoch": 6.6757630161579895, + "grad_norm": 1.2212412357330322, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 92960 + }, + { + "epoch": 6.6764811490125675, + "grad_norm": 1.0007524490356445, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 92970 + }, + { + "epoch": 6.6771992818671455, + "grad_norm": 1.1012821197509766, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 92980 + }, + { + "epoch": 6.6779174147217235, + "grad_norm": 0.9446989893913269, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 92990 + }, + { + "epoch": 6.6786355475763015, + "grad_norm": 1.5307164192199707, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 93000 + }, + { + "epoch": 6.6793536804308795, + "grad_norm": 1.4290575981140137, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 93010 + }, + { + "epoch": 6.6800718132854575, + "grad_norm": 1.2367054224014282, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 93020 + }, + { + "epoch": 6.680789946140036, + "grad_norm": 0.874568521976471, + "learning_rate": 0.0002, + "loss": 0.5915, + "step": 93030 + }, + { + "epoch": 6.681508078994614, + "grad_norm": 1.152861475944519, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 93040 + }, + { + "epoch": 6.682226211849192, + "grad_norm": 0.9524891972541809, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 93050 + }, + { + "epoch": 6.68294434470377, + "grad_norm": 0.8084558844566345, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 93060 + }, + { + "epoch": 6.683662477558348, + "grad_norm": 1.1458806991577148, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 93070 + }, + { + "epoch": 6.684380610412926, + "grad_norm": 1.1427397727966309, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 93080 + }, + { + "epoch": 6.685098743267504, + "grad_norm": 1.1136237382888794, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 93090 + }, + { + "epoch": 6.685816876122082, + "grad_norm": 1.0270767211914062, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 93100 + }, + { + "epoch": 6.68653500897666, + "grad_norm": 0.9473410844802856, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 93110 + }, + { + "epoch": 6.687253141831238, + "grad_norm": 1.011011004447937, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 93120 + }, + { + "epoch": 6.687971274685816, + "grad_norm": 0.9286965131759644, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 93130 + }, + { + "epoch": 6.688689407540395, + "grad_norm": 1.226515293121338, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 93140 + }, + { + "epoch": 6.689407540394973, + "grad_norm": 0.9131909608840942, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 93150 + }, + { + "epoch": 6.690125673249551, + "grad_norm": 1.2111890316009521, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 93160 + }, + { + "epoch": 6.690843806104129, + "grad_norm": 0.9296384453773499, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 93170 + }, + { + "epoch": 6.691561938958707, + "grad_norm": 0.9636726975440979, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 93180 + }, + { + "epoch": 6.692280071813285, + "grad_norm": 1.0116214752197266, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 93190 + }, + { + "epoch": 6.692998204667863, + "grad_norm": 1.2671175003051758, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 93200 + }, + { + "epoch": 6.693716337522441, + "grad_norm": 1.0676039457321167, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 93210 + }, + { + "epoch": 6.69443447037702, + "grad_norm": 1.3277634382247925, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 93220 + }, + { + "epoch": 6.695152603231598, + "grad_norm": 0.9312936663627625, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 93230 + }, + { + "epoch": 6.695870736086176, + "grad_norm": 1.410414457321167, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 93240 + }, + { + "epoch": 6.696588868940754, + "grad_norm": 1.014519453048706, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 93250 + }, + { + "epoch": 6.697307001795332, + "grad_norm": 0.9211319088935852, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 93260 + }, + { + "epoch": 6.69802513464991, + "grad_norm": 1.1027755737304688, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 93270 + }, + { + "epoch": 6.698743267504488, + "grad_norm": 1.0538618564605713, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 93280 + }, + { + "epoch": 6.699461400359066, + "grad_norm": 1.159927248954773, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 93290 + }, + { + "epoch": 6.700179533213644, + "grad_norm": 1.1329137086868286, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 93300 + }, + { + "epoch": 6.700897666068222, + "grad_norm": 0.9797694683074951, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 93310 + }, + { + "epoch": 6.7016157989228, + "grad_norm": 1.0968587398529053, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 93320 + }, + { + "epoch": 6.702333931777379, + "grad_norm": 0.9620516896247864, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 93330 + }, + { + "epoch": 6.703052064631957, + "grad_norm": 1.048879623413086, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 93340 + }, + { + "epoch": 6.703770197486535, + "grad_norm": 1.086421012878418, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 93350 + }, + { + "epoch": 6.704488330341113, + "grad_norm": 1.1045429706573486, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 93360 + }, + { + "epoch": 6.705206463195691, + "grad_norm": 1.081629991531372, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 93370 + }, + { + "epoch": 6.705924596050269, + "grad_norm": 0.9947898387908936, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 93380 + }, + { + "epoch": 6.706642728904847, + "grad_norm": 0.8837184309959412, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 93390 + }, + { + "epoch": 6.707360861759425, + "grad_norm": 1.1838666200637817, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 93400 + }, + { + "epoch": 6.708078994614004, + "grad_norm": 0.9221062064170837, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 93410 + }, + { + "epoch": 6.708797127468582, + "grad_norm": 1.0049937963485718, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 93420 + }, + { + "epoch": 6.70951526032316, + "grad_norm": 0.8895014524459839, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 93430 + }, + { + "epoch": 6.710233393177738, + "grad_norm": 1.2572799921035767, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 93440 + }, + { + "epoch": 6.710951526032316, + "grad_norm": 1.082982063293457, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 93450 + }, + { + "epoch": 6.711669658886894, + "grad_norm": 1.1520570516586304, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 93460 + }, + { + "epoch": 6.712387791741472, + "grad_norm": 1.0604512691497803, + "learning_rate": 0.0002, + "loss": 0.6059, + "step": 93470 + }, + { + "epoch": 6.71310592459605, + "grad_norm": 0.9887481331825256, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 93480 + }, + { + "epoch": 6.713824057450628, + "grad_norm": 1.0163664817810059, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 93490 + }, + { + "epoch": 6.714542190305206, + "grad_norm": 1.187687873840332, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 93500 + }, + { + "epoch": 6.715260323159785, + "grad_norm": 0.8770190477371216, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 93510 + }, + { + "epoch": 6.715978456014363, + "grad_norm": 1.1552737951278687, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 93520 + }, + { + "epoch": 6.716696588868941, + "grad_norm": 1.168770432472229, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 93530 + }, + { + "epoch": 6.717414721723519, + "grad_norm": 1.1071383953094482, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 93540 + }, + { + "epoch": 6.718132854578097, + "grad_norm": 0.8549296259880066, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 93550 + }, + { + "epoch": 6.718850987432675, + "grad_norm": 1.1576329469680786, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 93560 + }, + { + "epoch": 6.719569120287253, + "grad_norm": 1.1610777378082275, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 93570 + }, + { + "epoch": 6.720287253141831, + "grad_norm": 1.0316133499145508, + "learning_rate": 0.0002, + "loss": 0.6055, + "step": 93580 + }, + { + "epoch": 6.721005385996409, + "grad_norm": 1.1048495769500732, + "learning_rate": 0.0002, + "loss": 0.5889, + "step": 93590 + }, + { + "epoch": 6.721723518850988, + "grad_norm": 1.1212984323501587, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 93600 + }, + { + "epoch": 6.722441651705566, + "grad_norm": 1.1465938091278076, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 93610 + }, + { + "epoch": 6.723159784560144, + "grad_norm": 0.8978183269500732, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 93620 + }, + { + "epoch": 6.723877917414722, + "grad_norm": 1.0475369691848755, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 93630 + }, + { + "epoch": 6.7245960502693, + "grad_norm": 1.0717675685882568, + "learning_rate": 0.0002, + "loss": 0.5565, + "step": 93640 + }, + { + "epoch": 6.725314183123878, + "grad_norm": 1.2429792881011963, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 93650 + }, + { + "epoch": 6.726032315978456, + "grad_norm": 1.0333678722381592, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 93660 + }, + { + "epoch": 6.726750448833034, + "grad_norm": 1.211590051651001, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 93670 + }, + { + "epoch": 6.727468581687612, + "grad_norm": 1.0022165775299072, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 93680 + }, + { + "epoch": 6.72818671454219, + "grad_norm": 1.0192183256149292, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 93690 + }, + { + "epoch": 6.728904847396769, + "grad_norm": 0.9370006322860718, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 93700 + }, + { + "epoch": 6.729622980251347, + "grad_norm": 0.7869033813476562, + "learning_rate": 0.0002, + "loss": 0.5796, + "step": 93710 + }, + { + "epoch": 6.730341113105925, + "grad_norm": 0.899703860282898, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 93720 + }, + { + "epoch": 6.731059245960503, + "grad_norm": 1.1216487884521484, + "learning_rate": 0.0002, + "loss": 0.623, + "step": 93730 + }, + { + "epoch": 6.731777378815081, + "grad_norm": 0.9117740988731384, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 93740 + }, + { + "epoch": 6.732495511669659, + "grad_norm": 1.070947289466858, + "learning_rate": 0.0002, + "loss": 0.6382, + "step": 93750 + }, + { + "epoch": 6.733213644524237, + "grad_norm": 1.0529371500015259, + "learning_rate": 0.0002, + "loss": 0.6014, + "step": 93760 + }, + { + "epoch": 6.733931777378815, + "grad_norm": 0.7950748801231384, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 93770 + }, + { + "epoch": 6.734649910233394, + "grad_norm": 1.0469520092010498, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 93780 + }, + { + "epoch": 6.735368043087972, + "grad_norm": 1.4734543561935425, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 93790 + }, + { + "epoch": 6.73608617594255, + "grad_norm": 0.8239574432373047, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 93800 + }, + { + "epoch": 6.736804308797128, + "grad_norm": 1.1228505373001099, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 93810 + }, + { + "epoch": 6.737522441651706, + "grad_norm": 1.0902183055877686, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 93820 + }, + { + "epoch": 6.738240574506284, + "grad_norm": 1.220467209815979, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 93830 + }, + { + "epoch": 6.738958707360862, + "grad_norm": 1.199582815170288, + "learning_rate": 0.0002, + "loss": 0.5963, + "step": 93840 + }, + { + "epoch": 6.73967684021544, + "grad_norm": 1.1008597612380981, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 93850 + }, + { + "epoch": 6.740394973070018, + "grad_norm": 0.8596068620681763, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 93860 + }, + { + "epoch": 6.741113105924596, + "grad_norm": 1.220947027206421, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 93870 + }, + { + "epoch": 6.741831238779174, + "grad_norm": 1.2840452194213867, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 93880 + }, + { + "epoch": 6.742549371633753, + "grad_norm": 1.1923094987869263, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 93890 + }, + { + "epoch": 6.743267504488331, + "grad_norm": 1.1287206411361694, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 93900 + }, + { + "epoch": 6.743985637342909, + "grad_norm": 0.9465082287788391, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 93910 + }, + { + "epoch": 6.744703770197487, + "grad_norm": 0.9888480305671692, + "learning_rate": 0.0002, + "loss": 0.5795, + "step": 93920 + }, + { + "epoch": 6.745421903052065, + "grad_norm": 1.1438485383987427, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 93930 + }, + { + "epoch": 6.746140035906643, + "grad_norm": 0.8203039169311523, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 93940 + }, + { + "epoch": 6.746858168761221, + "grad_norm": 1.217855453491211, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 93950 + }, + { + "epoch": 6.747576301615799, + "grad_norm": 1.245977520942688, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 93960 + }, + { + "epoch": 6.7482944344703775, + "grad_norm": 1.240097165107727, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 93970 + }, + { + "epoch": 6.7490125673249555, + "grad_norm": 0.9436663389205933, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 93980 + }, + { + "epoch": 6.7497307001795335, + "grad_norm": 0.9331963062286377, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 93990 + }, + { + "epoch": 6.7504488330341115, + "grad_norm": 0.9809562563896179, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 94000 + }, + { + "epoch": 6.7511669658886895, + "grad_norm": 1.1596009731292725, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 94010 + }, + { + "epoch": 6.7518850987432675, + "grad_norm": 1.082684874534607, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 94020 + }, + { + "epoch": 6.7526032315978455, + "grad_norm": 0.9931458234786987, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 94030 + }, + { + "epoch": 6.7533213644524235, + "grad_norm": 0.8717518448829651, + "learning_rate": 0.0002, + "loss": 0.5606, + "step": 94040 + }, + { + "epoch": 6.7540394973070015, + "grad_norm": 0.9379602074623108, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 94050 + }, + { + "epoch": 6.7547576301615795, + "grad_norm": 0.8819605708122253, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 94060 + }, + { + "epoch": 6.755475763016158, + "grad_norm": 1.111547589302063, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 94070 + }, + { + "epoch": 6.756193895870736, + "grad_norm": 1.0755881071090698, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 94080 + }, + { + "epoch": 6.756912028725314, + "grad_norm": 1.0734093189239502, + "learning_rate": 0.0002, + "loss": 0.5494, + "step": 94090 + }, + { + "epoch": 6.757630161579892, + "grad_norm": 1.0390300750732422, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 94100 + }, + { + "epoch": 6.75834829443447, + "grad_norm": 0.9557124972343445, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 94110 + }, + { + "epoch": 6.759066427289048, + "grad_norm": 1.0970680713653564, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 94120 + }, + { + "epoch": 6.759784560143626, + "grad_norm": 1.0715644359588623, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 94130 + }, + { + "epoch": 6.760502692998204, + "grad_norm": 1.1311662197113037, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 94140 + }, + { + "epoch": 6.761220825852782, + "grad_norm": 0.9891370534896851, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 94150 + }, + { + "epoch": 6.761938958707361, + "grad_norm": 0.9472686648368835, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 94160 + }, + { + "epoch": 6.762657091561939, + "grad_norm": 1.1044381856918335, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 94170 + }, + { + "epoch": 6.763375224416517, + "grad_norm": 1.2088780403137207, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 94180 + }, + { + "epoch": 6.764093357271095, + "grad_norm": 0.9210726618766785, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 94190 + }, + { + "epoch": 6.764811490125673, + "grad_norm": 1.0969771146774292, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 94200 + }, + { + "epoch": 6.765529622980251, + "grad_norm": 1.1030265092849731, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 94210 + }, + { + "epoch": 6.766247755834829, + "grad_norm": 0.9451745748519897, + "learning_rate": 0.0002, + "loss": 0.5973, + "step": 94220 + }, + { + "epoch": 6.766965888689407, + "grad_norm": 1.0216296911239624, + "learning_rate": 0.0002, + "loss": 0.616, + "step": 94230 + }, + { + "epoch": 6.767684021543985, + "grad_norm": 1.4021092653274536, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 94240 + }, + { + "epoch": 6.768402154398563, + "grad_norm": 1.2341269254684448, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 94250 + }, + { + "epoch": 6.769120287253142, + "grad_norm": 1.1086686849594116, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 94260 + }, + { + "epoch": 6.76983842010772, + "grad_norm": 0.8565682172775269, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 94270 + }, + { + "epoch": 6.770556552962298, + "grad_norm": 0.9314411878585815, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 94280 + }, + { + "epoch": 6.771274685816876, + "grad_norm": 1.0592315196990967, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 94290 + }, + { + "epoch": 6.771992818671454, + "grad_norm": 1.086379885673523, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 94300 + }, + { + "epoch": 6.772710951526032, + "grad_norm": 1.13401198387146, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 94310 + }, + { + "epoch": 6.77342908438061, + "grad_norm": 1.0137985944747925, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 94320 + }, + { + "epoch": 6.774147217235188, + "grad_norm": 1.0459709167480469, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 94330 + }, + { + "epoch": 6.774865350089767, + "grad_norm": 1.2213165760040283, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 94340 + }, + { + "epoch": 6.775583482944345, + "grad_norm": 1.099478006362915, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 94350 + }, + { + "epoch": 6.776301615798923, + "grad_norm": 1.124526858329773, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 94360 + }, + { + "epoch": 6.777019748653501, + "grad_norm": 1.0199998617172241, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 94370 + }, + { + "epoch": 6.777737881508079, + "grad_norm": 1.1849408149719238, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 94380 + }, + { + "epoch": 6.778456014362657, + "grad_norm": 1.2265552282333374, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 94390 + }, + { + "epoch": 6.779174147217235, + "grad_norm": 0.7576864361763, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 94400 + }, + { + "epoch": 6.779892280071813, + "grad_norm": 0.8172970414161682, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 94410 + }, + { + "epoch": 6.780610412926391, + "grad_norm": 1.1105220317840576, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 94420 + }, + { + "epoch": 6.781328545780969, + "grad_norm": 1.0542421340942383, + "learning_rate": 0.0002, + "loss": 0.5918, + "step": 94430 + }, + { + "epoch": 6.782046678635547, + "grad_norm": 1.0088121891021729, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 94440 + }, + { + "epoch": 6.782764811490126, + "grad_norm": 0.9872488379478455, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 94450 + }, + { + "epoch": 6.783482944344704, + "grad_norm": 1.2545148134231567, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 94460 + }, + { + "epoch": 6.784201077199282, + "grad_norm": 0.8847712278366089, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 94470 + }, + { + "epoch": 6.78491921005386, + "grad_norm": 0.7758765816688538, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 94480 + }, + { + "epoch": 6.785637342908438, + "grad_norm": 1.0454037189483643, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 94490 + }, + { + "epoch": 6.786355475763016, + "grad_norm": 1.1336725950241089, + "learning_rate": 0.0002, + "loss": 0.5943, + "step": 94500 + }, + { + "epoch": 6.787073608617594, + "grad_norm": 1.081356406211853, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 94510 + }, + { + "epoch": 6.787791741472172, + "grad_norm": 1.126288890838623, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 94520 + }, + { + "epoch": 6.788509874326751, + "grad_norm": 1.1156792640686035, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 94530 + }, + { + "epoch": 6.789228007181329, + "grad_norm": 1.0243451595306396, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 94540 + }, + { + "epoch": 6.789946140035907, + "grad_norm": 0.9778338670730591, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 94550 + }, + { + "epoch": 6.790664272890485, + "grad_norm": 0.9668094515800476, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 94560 + }, + { + "epoch": 6.791382405745063, + "grad_norm": 1.121848464012146, + "learning_rate": 0.0002, + "loss": 0.6285, + "step": 94570 + }, + { + "epoch": 6.792100538599641, + "grad_norm": 1.105825662612915, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 94580 + }, + { + "epoch": 6.792818671454219, + "grad_norm": 1.1236833333969116, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 94590 + }, + { + "epoch": 6.793536804308797, + "grad_norm": 1.0655126571655273, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 94600 + }, + { + "epoch": 6.794254937163375, + "grad_norm": 0.9249289631843567, + "learning_rate": 0.0002, + "loss": 0.5271, + "step": 94610 + }, + { + "epoch": 6.794973070017953, + "grad_norm": 1.0177690982818604, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 94620 + }, + { + "epoch": 6.795691202872531, + "grad_norm": 1.1961153745651245, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 94630 + }, + { + "epoch": 6.79640933572711, + "grad_norm": 1.0987505912780762, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 94640 + }, + { + "epoch": 6.797127468581688, + "grad_norm": 1.0165259838104248, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 94650 + }, + { + "epoch": 6.797845601436266, + "grad_norm": 1.1336601972579956, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 94660 + }, + { + "epoch": 6.798563734290844, + "grad_norm": 1.0786010026931763, + "learning_rate": 0.0002, + "loss": 0.6252, + "step": 94670 + }, + { + "epoch": 6.799281867145422, + "grad_norm": 1.2896602153778076, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 94680 + }, + { + "epoch": 6.8, + "grad_norm": 1.0934168100357056, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 94690 + }, + { + "epoch": 6.800718132854578, + "grad_norm": 1.1080414056777954, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 94700 + }, + { + "epoch": 6.801436265709156, + "grad_norm": 1.1141704320907593, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 94710 + }, + { + "epoch": 6.802154398563735, + "grad_norm": 0.9571144580841064, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 94720 + }, + { + "epoch": 6.802872531418313, + "grad_norm": 0.8907591700553894, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 94730 + }, + { + "epoch": 6.803590664272891, + "grad_norm": 1.0547759532928467, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 94740 + }, + { + "epoch": 6.804308797127469, + "grad_norm": 0.973573625087738, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 94750 + }, + { + "epoch": 6.805026929982047, + "grad_norm": 0.7889130711555481, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 94760 + }, + { + "epoch": 6.805745062836625, + "grad_norm": 0.9414647221565247, + "learning_rate": 0.0002, + "loss": 0.6004, + "step": 94770 + }, + { + "epoch": 6.806463195691203, + "grad_norm": 0.9452534317970276, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 94780 + }, + { + "epoch": 6.807181328545781, + "grad_norm": 1.2215145826339722, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 94790 + }, + { + "epoch": 6.807899461400359, + "grad_norm": 1.116302490234375, + "learning_rate": 0.0002, + "loss": 0.6045, + "step": 94800 + }, + { + "epoch": 6.808617594254937, + "grad_norm": 0.850916862487793, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 94810 + }, + { + "epoch": 6.809335727109516, + "grad_norm": 0.8699719905853271, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 94820 + }, + { + "epoch": 6.810053859964094, + "grad_norm": 1.0958143472671509, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 94830 + }, + { + "epoch": 6.810771992818672, + "grad_norm": 1.128580927848816, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 94840 + }, + { + "epoch": 6.81149012567325, + "grad_norm": 0.9490674138069153, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 94850 + }, + { + "epoch": 6.812208258527828, + "grad_norm": 0.9294022917747498, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 94860 + }, + { + "epoch": 6.812926391382406, + "grad_norm": 1.048378348350525, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 94870 + }, + { + "epoch": 6.813644524236984, + "grad_norm": 1.1972805261611938, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 94880 + }, + { + "epoch": 6.814362657091562, + "grad_norm": 0.7709503769874573, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 94890 + }, + { + "epoch": 6.8150807899461405, + "grad_norm": 1.0244873762130737, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 94900 + }, + { + "epoch": 6.8157989228007185, + "grad_norm": 1.0576984882354736, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 94910 + }, + { + "epoch": 6.8165170556552965, + "grad_norm": 1.3478775024414062, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 94920 + }, + { + "epoch": 6.8172351885098745, + "grad_norm": 0.982311487197876, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 94930 + }, + { + "epoch": 6.8179533213644525, + "grad_norm": 1.1846535205841064, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 94940 + }, + { + "epoch": 6.8186714542190305, + "grad_norm": 0.9255896210670471, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 94950 + }, + { + "epoch": 6.8193895870736085, + "grad_norm": 0.9418646693229675, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 94960 + }, + { + "epoch": 6.8201077199281865, + "grad_norm": 1.189335584640503, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 94970 + }, + { + "epoch": 6.8208258527827645, + "grad_norm": 1.1003406047821045, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 94980 + }, + { + "epoch": 6.8215439856373425, + "grad_norm": 0.9203724265098572, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 94990 + }, + { + "epoch": 6.8222621184919205, + "grad_norm": 1.093252182006836, + "learning_rate": 0.0002, + "loss": 0.5862, + "step": 95000 + }, + { + "epoch": 6.822980251346499, + "grad_norm": 1.2737812995910645, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 95010 + }, + { + "epoch": 6.823698384201077, + "grad_norm": 1.1859848499298096, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 95020 + }, + { + "epoch": 6.824416517055655, + "grad_norm": 0.9591164588928223, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 95030 + }, + { + "epoch": 6.825134649910233, + "grad_norm": 1.0144239664077759, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 95040 + }, + { + "epoch": 6.825852782764811, + "grad_norm": 1.2520356178283691, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 95050 + }, + { + "epoch": 6.8265709156193894, + "grad_norm": 1.003438115119934, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 95060 + }, + { + "epoch": 6.8272890484739674, + "grad_norm": 0.9512312412261963, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 95070 + }, + { + "epoch": 6.8280071813285454, + "grad_norm": 0.9984938502311707, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 95080 + }, + { + "epoch": 6.828725314183124, + "grad_norm": 0.9630827307701111, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 95090 + }, + { + "epoch": 6.829443447037702, + "grad_norm": 0.8859394192695618, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 95100 + }, + { + "epoch": 6.83016157989228, + "grad_norm": 0.9082155227661133, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 95110 + }, + { + "epoch": 6.830879712746858, + "grad_norm": 1.0707300901412964, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 95120 + }, + { + "epoch": 6.831597845601436, + "grad_norm": 1.2023502588272095, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 95130 + }, + { + "epoch": 6.832315978456014, + "grad_norm": 1.0189216136932373, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 95140 + }, + { + "epoch": 6.833034111310592, + "grad_norm": 1.1216851472854614, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 95150 + }, + { + "epoch": 6.83375224416517, + "grad_norm": 1.124589204788208, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 95160 + }, + { + "epoch": 6.834470377019748, + "grad_norm": 1.1183217763900757, + "learning_rate": 0.0002, + "loss": 0.5374, + "step": 95170 + }, + { + "epoch": 6.835188509874326, + "grad_norm": 1.0307188034057617, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 95180 + }, + { + "epoch": 6.835906642728904, + "grad_norm": 1.2438706159591675, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 95190 + }, + { + "epoch": 6.836624775583483, + "grad_norm": 1.117887258529663, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 95200 + }, + { + "epoch": 6.837342908438061, + "grad_norm": 0.8934445381164551, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 95210 + }, + { + "epoch": 6.838061041292639, + "grad_norm": 1.097379207611084, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 95220 + }, + { + "epoch": 6.838779174147217, + "grad_norm": 1.1034258604049683, + "learning_rate": 0.0002, + "loss": 0.5792, + "step": 95230 + }, + { + "epoch": 6.839497307001795, + "grad_norm": 1.052120327949524, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 95240 + }, + { + "epoch": 6.840215439856373, + "grad_norm": 1.0844687223434448, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 95250 + }, + { + "epoch": 6.840933572710951, + "grad_norm": 1.1553566455841064, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 95260 + }, + { + "epoch": 6.841651705565529, + "grad_norm": 1.1977533102035522, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 95270 + }, + { + "epoch": 6.842369838420108, + "grad_norm": 0.9635998010635376, + "learning_rate": 0.0002, + "loss": 0.5562, + "step": 95280 + }, + { + "epoch": 6.843087971274686, + "grad_norm": 1.0867844820022583, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 95290 + }, + { + "epoch": 6.843806104129264, + "grad_norm": 1.1252882480621338, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 95300 + }, + { + "epoch": 6.844524236983842, + "grad_norm": 1.1130266189575195, + "learning_rate": 0.0002, + "loss": 0.5468, + "step": 95310 + }, + { + "epoch": 6.84524236983842, + "grad_norm": 1.058863878250122, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 95320 + }, + { + "epoch": 6.845960502692998, + "grad_norm": 1.173840880393982, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 95330 + }, + { + "epoch": 6.846678635547576, + "grad_norm": 1.09446120262146, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 95340 + }, + { + "epoch": 6.847396768402154, + "grad_norm": 1.0762465000152588, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 95350 + }, + { + "epoch": 6.848114901256732, + "grad_norm": 1.0056897401809692, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 95360 + }, + { + "epoch": 6.84883303411131, + "grad_norm": 0.929190456867218, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 95370 + }, + { + "epoch": 6.849551166965889, + "grad_norm": 1.1152058839797974, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 95380 + }, + { + "epoch": 6.850269299820467, + "grad_norm": 1.0163987874984741, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 95390 + }, + { + "epoch": 6.850987432675045, + "grad_norm": 1.1169452667236328, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 95400 + }, + { + "epoch": 6.851705565529623, + "grad_norm": 1.2225226163864136, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 95410 + }, + { + "epoch": 6.852423698384201, + "grad_norm": 1.0833172798156738, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 95420 + }, + { + "epoch": 6.853141831238779, + "grad_norm": 1.0159578323364258, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 95430 + }, + { + "epoch": 6.853859964093357, + "grad_norm": 1.1164990663528442, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 95440 + }, + { + "epoch": 6.854578096947935, + "grad_norm": 1.1340656280517578, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 95450 + }, + { + "epoch": 6.855296229802514, + "grad_norm": 1.1228697299957275, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 95460 + }, + { + "epoch": 6.856014362657092, + "grad_norm": 1.0189276933670044, + "learning_rate": 0.0002, + "loss": 0.6323, + "step": 95470 + }, + { + "epoch": 6.85673249551167, + "grad_norm": 1.1692779064178467, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 95480 + }, + { + "epoch": 6.857450628366248, + "grad_norm": 1.0779703855514526, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 95490 + }, + { + "epoch": 6.858168761220826, + "grad_norm": 1.0127906799316406, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 95500 + }, + { + "epoch": 6.858886894075404, + "grad_norm": 1.2124756574630737, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 95510 + }, + { + "epoch": 6.859605026929982, + "grad_norm": 1.0948219299316406, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 95520 + }, + { + "epoch": 6.86032315978456, + "grad_norm": 0.8796268701553345, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 95530 + }, + { + "epoch": 6.861041292639138, + "grad_norm": 1.0725175142288208, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 95540 + }, + { + "epoch": 6.861759425493716, + "grad_norm": 0.9067171812057495, + "learning_rate": 0.0002, + "loss": 0.5748, + "step": 95550 + }, + { + "epoch": 6.862477558348294, + "grad_norm": 1.0576670169830322, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 95560 + }, + { + "epoch": 6.863195691202873, + "grad_norm": 0.9622264504432678, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 95570 + }, + { + "epoch": 6.863913824057451, + "grad_norm": 1.0197248458862305, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 95580 + }, + { + "epoch": 6.864631956912029, + "grad_norm": 0.9197335243225098, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 95590 + }, + { + "epoch": 6.865350089766607, + "grad_norm": 1.0169627666473389, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 95600 + }, + { + "epoch": 6.866068222621185, + "grad_norm": 0.9868543744087219, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 95610 + }, + { + "epoch": 6.866786355475763, + "grad_norm": 0.9861942529678345, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 95620 + }, + { + "epoch": 6.867504488330341, + "grad_norm": 1.0906847715377808, + "learning_rate": 0.0002, + "loss": 0.5753, + "step": 95630 + }, + { + "epoch": 6.868222621184919, + "grad_norm": 1.2462674379348755, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 95640 + }, + { + "epoch": 6.868940754039498, + "grad_norm": 0.9801536202430725, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 95650 + }, + { + "epoch": 6.869658886894076, + "grad_norm": 1.0568761825561523, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 95660 + }, + { + "epoch": 6.870377019748654, + "grad_norm": 0.8431015014648438, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 95670 + }, + { + "epoch": 6.871095152603232, + "grad_norm": 1.2253447771072388, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 95680 + }, + { + "epoch": 6.87181328545781, + "grad_norm": 0.8862479329109192, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 95690 + }, + { + "epoch": 6.872531418312388, + "grad_norm": 1.0733704566955566, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 95700 + }, + { + "epoch": 6.873249551166966, + "grad_norm": 0.9327288269996643, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 95710 + }, + { + "epoch": 6.873967684021544, + "grad_norm": 0.9877831339836121, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 95720 + }, + { + "epoch": 6.874685816876122, + "grad_norm": 0.9772239327430725, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 95730 + }, + { + "epoch": 6.8754039497307, + "grad_norm": 0.9799681901931763, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 95740 + }, + { + "epoch": 6.876122082585278, + "grad_norm": 1.0650758743286133, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 95750 + }, + { + "epoch": 6.876840215439857, + "grad_norm": 1.068557858467102, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 95760 + }, + { + "epoch": 6.877558348294435, + "grad_norm": 1.1335437297821045, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 95770 + }, + { + "epoch": 6.878276481149013, + "grad_norm": 0.8993158936500549, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 95780 + }, + { + "epoch": 6.878994614003591, + "grad_norm": 1.0593502521514893, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 95790 + }, + { + "epoch": 6.879712746858169, + "grad_norm": 1.2181397676467896, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 95800 + }, + { + "epoch": 6.880430879712747, + "grad_norm": 0.9614198207855225, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 95810 + }, + { + "epoch": 6.881149012567325, + "grad_norm": 1.021591067314148, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 95820 + }, + { + "epoch": 6.881867145421903, + "grad_norm": 1.3752840757369995, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 95830 + }, + { + "epoch": 6.882585278276482, + "grad_norm": 1.236355185508728, + "learning_rate": 0.0002, + "loss": 0.5758, + "step": 95840 + }, + { + "epoch": 6.88330341113106, + "grad_norm": 1.1957523822784424, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 95850 + }, + { + "epoch": 6.884021543985638, + "grad_norm": 0.8793587684631348, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 95860 + }, + { + "epoch": 6.884739676840216, + "grad_norm": 1.202054738998413, + "learning_rate": 0.0002, + "loss": 0.6482, + "step": 95870 + }, + { + "epoch": 6.885457809694794, + "grad_norm": 0.8061116337776184, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 95880 + }, + { + "epoch": 6.886175942549372, + "grad_norm": 1.0037956237792969, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 95890 + }, + { + "epoch": 6.88689407540395, + "grad_norm": 1.006435751914978, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 95900 + }, + { + "epoch": 6.887612208258528, + "grad_norm": 1.141200304031372, + "learning_rate": 0.0002, + "loss": 0.6145, + "step": 95910 + }, + { + "epoch": 6.888330341113106, + "grad_norm": 0.9017927050590515, + "learning_rate": 0.0002, + "loss": 0.6168, + "step": 95920 + }, + { + "epoch": 6.889048473967684, + "grad_norm": 0.9288154244422913, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 95930 + }, + { + "epoch": 6.8897666068222625, + "grad_norm": 1.2263801097869873, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 95940 + }, + { + "epoch": 6.8904847396768405, + "grad_norm": 1.2005410194396973, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 95950 + }, + { + "epoch": 6.8912028725314185, + "grad_norm": 1.0801531076431274, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 95960 + }, + { + "epoch": 6.8919210053859965, + "grad_norm": 1.1115456819534302, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 95970 + }, + { + "epoch": 6.8926391382405745, + "grad_norm": 1.062920093536377, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 95980 + }, + { + "epoch": 6.8933572710951525, + "grad_norm": 0.9343897700309753, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 95990 + }, + { + "epoch": 6.8940754039497305, + "grad_norm": 1.0236390829086304, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 96000 + }, + { + "epoch": 6.8947935368043085, + "grad_norm": 1.0680996179580688, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 96010 + }, + { + "epoch": 6.8955116696588865, + "grad_norm": 1.1796760559082031, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 96020 + }, + { + "epoch": 6.896229802513465, + "grad_norm": 0.9805570840835571, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 96030 + }, + { + "epoch": 6.896947935368043, + "grad_norm": 1.245386004447937, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 96040 + }, + { + "epoch": 6.897666068222621, + "grad_norm": 1.0306174755096436, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 96050 + }, + { + "epoch": 6.898384201077199, + "grad_norm": 1.0599836111068726, + "learning_rate": 0.0002, + "loss": 0.6198, + "step": 96060 + }, + { + "epoch": 6.899102333931777, + "grad_norm": 1.1438795328140259, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 96070 + }, + { + "epoch": 6.899820466786355, + "grad_norm": 0.9044751524925232, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 96080 + }, + { + "epoch": 6.900538599640933, + "grad_norm": 0.9689591526985168, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 96090 + }, + { + "epoch": 6.901256732495511, + "grad_norm": 1.003217339515686, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 96100 + }, + { + "epoch": 6.901974865350089, + "grad_norm": 1.1630250215530396, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 96110 + }, + { + "epoch": 6.902692998204667, + "grad_norm": 1.0304425954818726, + "learning_rate": 0.0002, + "loss": 0.5661, + "step": 96120 + }, + { + "epoch": 6.903411131059246, + "grad_norm": 1.0148587226867676, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 96130 + }, + { + "epoch": 6.904129263913824, + "grad_norm": 1.3722255229949951, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 96140 + }, + { + "epoch": 6.904847396768402, + "grad_norm": 1.1518549919128418, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 96150 + }, + { + "epoch": 6.90556552962298, + "grad_norm": 1.0342949628829956, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 96160 + }, + { + "epoch": 6.906283662477558, + "grad_norm": 1.0178996324539185, + "learning_rate": 0.0002, + "loss": 0.5691, + "step": 96170 + }, + { + "epoch": 6.907001795332136, + "grad_norm": 1.3429099321365356, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 96180 + }, + { + "epoch": 6.907719928186714, + "grad_norm": 1.2281367778778076, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 96190 + }, + { + "epoch": 6.908438061041292, + "grad_norm": 0.8190469145774841, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 96200 + }, + { + "epoch": 6.909156193895871, + "grad_norm": 1.1344635486602783, + "learning_rate": 0.0002, + "loss": 0.5929, + "step": 96210 + }, + { + "epoch": 6.909874326750449, + "grad_norm": 1.0540097951889038, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 96220 + }, + { + "epoch": 6.910592459605027, + "grad_norm": 1.044974446296692, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 96230 + }, + { + "epoch": 6.911310592459605, + "grad_norm": 0.6890087723731995, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 96240 + }, + { + "epoch": 6.912028725314183, + "grad_norm": 1.1266905069351196, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 96250 + }, + { + "epoch": 6.912746858168761, + "grad_norm": 1.3173121213912964, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 96260 + }, + { + "epoch": 6.913464991023339, + "grad_norm": 1.0043895244598389, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 96270 + }, + { + "epoch": 6.914183123877917, + "grad_norm": 1.0634605884552002, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 96280 + }, + { + "epoch": 6.914901256732495, + "grad_norm": 1.234516978263855, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 96290 + }, + { + "epoch": 6.915619389587073, + "grad_norm": 1.042026162147522, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 96300 + }, + { + "epoch": 6.916337522441651, + "grad_norm": 1.063632845878601, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 96310 + }, + { + "epoch": 6.91705565529623, + "grad_norm": 1.0733225345611572, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 96320 + }, + { + "epoch": 6.917773788150808, + "grad_norm": 1.4382662773132324, + "learning_rate": 0.0002, + "loss": 0.6003, + "step": 96330 + }, + { + "epoch": 6.918491921005386, + "grad_norm": 1.19964599609375, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 96340 + }, + { + "epoch": 6.919210053859964, + "grad_norm": 0.9012235403060913, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 96350 + }, + { + "epoch": 6.919928186714542, + "grad_norm": 0.8663099408149719, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 96360 + }, + { + "epoch": 6.92064631956912, + "grad_norm": 0.8944193124771118, + "learning_rate": 0.0002, + "loss": 0.5164, + "step": 96370 + }, + { + "epoch": 6.921364452423698, + "grad_norm": 1.1201437711715698, + "learning_rate": 0.0002, + "loss": 0.5556, + "step": 96380 + }, + { + "epoch": 6.922082585278276, + "grad_norm": 1.0434664487838745, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 96390 + }, + { + "epoch": 6.922800718132855, + "grad_norm": 1.2666915655136108, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96400 + }, + { + "epoch": 6.923518850987433, + "grad_norm": 0.9610332250595093, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 96410 + }, + { + "epoch": 6.924236983842011, + "grad_norm": 1.1521750688552856, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 96420 + }, + { + "epoch": 6.924955116696589, + "grad_norm": 0.921970546245575, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 96430 + }, + { + "epoch": 6.925673249551167, + "grad_norm": 1.1277226209640503, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 96440 + }, + { + "epoch": 6.926391382405745, + "grad_norm": 1.147425889968872, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96450 + }, + { + "epoch": 6.927109515260323, + "grad_norm": 1.0128270387649536, + "learning_rate": 0.0002, + "loss": 0.6032, + "step": 96460 + }, + { + "epoch": 6.927827648114901, + "grad_norm": 1.0726343393325806, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 96470 + }, + { + "epoch": 6.928545780969479, + "grad_norm": 0.9902656078338623, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 96480 + }, + { + "epoch": 6.929263913824057, + "grad_norm": 0.9662004709243774, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 96490 + }, + { + "epoch": 6.929982046678636, + "grad_norm": 0.9595714807510376, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 96500 + }, + { + "epoch": 6.930700179533214, + "grad_norm": 1.0666614770889282, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 96510 + }, + { + "epoch": 6.931418312387792, + "grad_norm": 0.8744403123855591, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 96520 + }, + { + "epoch": 6.93213644524237, + "grad_norm": 1.0382628440856934, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 96530 + }, + { + "epoch": 6.932854578096948, + "grad_norm": 0.9165884256362915, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 96540 + }, + { + "epoch": 6.933572710951526, + "grad_norm": 0.9073842763900757, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 96550 + }, + { + "epoch": 6.934290843806104, + "grad_norm": 1.100635051727295, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 96560 + }, + { + "epoch": 6.935008976660682, + "grad_norm": 1.1503266096115112, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 96570 + }, + { + "epoch": 6.93572710951526, + "grad_norm": 0.9526805281639099, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 96580 + }, + { + "epoch": 6.936445242369839, + "grad_norm": 1.115716814994812, + "learning_rate": 0.0002, + "loss": 0.5959, + "step": 96590 + }, + { + "epoch": 6.937163375224417, + "grad_norm": 1.0669193267822266, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 96600 + }, + { + "epoch": 6.937881508078995, + "grad_norm": 1.0191189050674438, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 96610 + }, + { + "epoch": 6.938599640933573, + "grad_norm": 1.1885946989059448, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 96620 + }, + { + "epoch": 6.939317773788151, + "grad_norm": 0.9806031584739685, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 96630 + }, + { + "epoch": 6.940035906642729, + "grad_norm": 0.9700000286102295, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 96640 + }, + { + "epoch": 6.940754039497307, + "grad_norm": 1.0870105028152466, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 96650 + }, + { + "epoch": 6.941472172351885, + "grad_norm": 0.7441867589950562, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 96660 + }, + { + "epoch": 6.942190305206463, + "grad_norm": 0.8631957173347473, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 96670 + }, + { + "epoch": 6.942908438061041, + "grad_norm": 1.0538444519042969, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 96680 + }, + { + "epoch": 6.94362657091562, + "grad_norm": 1.0235437154769897, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 96690 + }, + { + "epoch": 6.944344703770198, + "grad_norm": 1.069114089012146, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 96700 + }, + { + "epoch": 6.945062836624776, + "grad_norm": 1.0421861410140991, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 96710 + }, + { + "epoch": 6.945780969479354, + "grad_norm": 0.9244136810302734, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 96720 + }, + { + "epoch": 6.946499102333932, + "grad_norm": 0.962041437625885, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 96730 + }, + { + "epoch": 6.94721723518851, + "grad_norm": 1.049677848815918, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 96740 + }, + { + "epoch": 6.947935368043088, + "grad_norm": 1.0276710987091064, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 96750 + }, + { + "epoch": 6.948653500897666, + "grad_norm": 1.036650538444519, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 96760 + }, + { + "epoch": 6.949371633752245, + "grad_norm": 1.0379945039749146, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 96770 + }, + { + "epoch": 6.950089766606823, + "grad_norm": 0.9768070578575134, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 96780 + }, + { + "epoch": 6.950807899461401, + "grad_norm": 1.0515118837356567, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 96790 + }, + { + "epoch": 6.951526032315979, + "grad_norm": 0.9186223149299622, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 96800 + }, + { + "epoch": 6.952244165170557, + "grad_norm": 1.0430902242660522, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 96810 + }, + { + "epoch": 6.952962298025135, + "grad_norm": 0.7750678658485413, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 96820 + }, + { + "epoch": 6.953680430879713, + "grad_norm": 1.1721138954162598, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 96830 + }, + { + "epoch": 6.954398563734291, + "grad_norm": 1.2088165283203125, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 96840 + }, + { + "epoch": 6.955116696588869, + "grad_norm": 0.9956802129745483, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 96850 + }, + { + "epoch": 6.955834829443447, + "grad_norm": 1.0444421768188477, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 96860 + }, + { + "epoch": 6.956552962298025, + "grad_norm": 1.2420955896377563, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 96870 + }, + { + "epoch": 6.957271095152604, + "grad_norm": 1.0187203884124756, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 96880 + }, + { + "epoch": 6.957989228007182, + "grad_norm": 1.0883756875991821, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 96890 + }, + { + "epoch": 6.95870736086176, + "grad_norm": 1.1869568824768066, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 96900 + }, + { + "epoch": 6.959425493716338, + "grad_norm": 1.242119312286377, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 96910 + }, + { + "epoch": 6.960143626570916, + "grad_norm": 1.0262869596481323, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 96920 + }, + { + "epoch": 6.960861759425494, + "grad_norm": 0.9577149152755737, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 96930 + }, + { + "epoch": 6.961579892280072, + "grad_norm": 0.9224622249603271, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 96940 + }, + { + "epoch": 6.96229802513465, + "grad_norm": 1.0761854648590088, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 96950 + }, + { + "epoch": 6.9630161579892285, + "grad_norm": 1.1029279232025146, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 96960 + }, + { + "epoch": 6.9637342908438065, + "grad_norm": 1.1132091283798218, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 96970 + }, + { + "epoch": 6.9644524236983845, + "grad_norm": 0.9723706245422363, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 96980 + }, + { + "epoch": 6.9651705565529625, + "grad_norm": 1.0453037023544312, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 96990 + }, + { + "epoch": 6.9658886894075405, + "grad_norm": 1.16423499584198, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 97000 + }, + { + "epoch": 6.9666068222621185, + "grad_norm": 1.1522771120071411, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 97010 + }, + { + "epoch": 6.9673249551166965, + "grad_norm": 1.020828127861023, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 97020 + }, + { + "epoch": 6.9680430879712745, + "grad_norm": 1.0301889181137085, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 97030 + }, + { + "epoch": 6.9687612208258525, + "grad_norm": 1.0615862607955933, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 97040 + }, + { + "epoch": 6.9694793536804305, + "grad_norm": 1.1750848293304443, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 97050 + }, + { + "epoch": 6.9701974865350085, + "grad_norm": 0.916283905506134, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 97060 + }, + { + "epoch": 6.970915619389587, + "grad_norm": 1.0715203285217285, + "learning_rate": 0.0002, + "loss": 0.6158, + "step": 97070 + }, + { + "epoch": 6.971633752244165, + "grad_norm": 1.1171340942382812, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 97080 + }, + { + "epoch": 6.972351885098743, + "grad_norm": 0.886015772819519, + "learning_rate": 0.0002, + "loss": 0.6361, + "step": 97090 + }, + { + "epoch": 6.973070017953321, + "grad_norm": 0.9498746991157532, + "learning_rate": 0.0002, + "loss": 0.5934, + "step": 97100 + }, + { + "epoch": 6.973788150807899, + "grad_norm": 1.1563011407852173, + "learning_rate": 0.0002, + "loss": 0.5951, + "step": 97110 + }, + { + "epoch": 6.974506283662477, + "grad_norm": 0.9086321592330933, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 97120 + }, + { + "epoch": 6.975224416517055, + "grad_norm": 0.9804864525794983, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 97130 + }, + { + "epoch": 6.975942549371633, + "grad_norm": 1.5005993843078613, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 97140 + }, + { + "epoch": 6.976660682226212, + "grad_norm": 1.1720819473266602, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 97150 + }, + { + "epoch": 6.97737881508079, + "grad_norm": 1.095572590827942, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 97160 + }, + { + "epoch": 6.978096947935368, + "grad_norm": 1.1880861520767212, + "learning_rate": 0.0002, + "loss": 0.5721, + "step": 97170 + }, + { + "epoch": 6.978815080789946, + "grad_norm": 1.0959832668304443, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 97180 + }, + { + "epoch": 6.979533213644524, + "grad_norm": 1.2158745527267456, + "learning_rate": 0.0002, + "loss": 0.5834, + "step": 97190 + }, + { + "epoch": 6.980251346499102, + "grad_norm": 1.0073821544647217, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 97200 + }, + { + "epoch": 6.98096947935368, + "grad_norm": 0.8503464460372925, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 97210 + }, + { + "epoch": 6.981687612208258, + "grad_norm": 0.9399861097335815, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 97220 + }, + { + "epoch": 6.982405745062836, + "grad_norm": 1.1167447566986084, + "learning_rate": 0.0002, + "loss": 0.6135, + "step": 97230 + }, + { + "epoch": 6.983123877917414, + "grad_norm": 1.2710384130477905, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 97240 + }, + { + "epoch": 6.983842010771993, + "grad_norm": 0.8514767289161682, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 97250 + }, + { + "epoch": 6.984560143626571, + "grad_norm": 0.9983348846435547, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 97260 + }, + { + "epoch": 6.985278276481149, + "grad_norm": 1.1713277101516724, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 97270 + }, + { + "epoch": 6.985996409335727, + "grad_norm": 1.346272349357605, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 97280 + }, + { + "epoch": 6.986714542190305, + "grad_norm": 1.0687556266784668, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 97290 + }, + { + "epoch": 6.987432675044883, + "grad_norm": 1.035805106163025, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 97300 + }, + { + "epoch": 6.988150807899461, + "grad_norm": 1.149027705192566, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 97310 + }, + { + "epoch": 6.988868940754039, + "grad_norm": 0.9672921895980835, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 97320 + }, + { + "epoch": 6.989587073608618, + "grad_norm": 1.0306763648986816, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 97330 + }, + { + "epoch": 6.990305206463196, + "grad_norm": 1.1457809209823608, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 97340 + }, + { + "epoch": 6.991023339317774, + "grad_norm": 0.9718224406242371, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 97350 + }, + { + "epoch": 6.991741472172352, + "grad_norm": 0.9872630834579468, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 97360 + }, + { + "epoch": 6.99245960502693, + "grad_norm": 1.0302132368087769, + "learning_rate": 0.0002, + "loss": 0.611, + "step": 97370 + }, + { + "epoch": 6.993177737881508, + "grad_norm": 1.001103162765503, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 97380 + }, + { + "epoch": 6.993895870736086, + "grad_norm": 0.9207047820091248, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 97390 + }, + { + "epoch": 6.994614003590664, + "grad_norm": 1.1986219882965088, + "learning_rate": 0.0002, + "loss": 0.5752, + "step": 97400 + }, + { + "epoch": 6.995332136445242, + "grad_norm": 1.343885064125061, + "learning_rate": 0.0002, + "loss": 0.5938, + "step": 97410 + }, + { + "epoch": 6.99605026929982, + "grad_norm": 1.0611628293991089, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 97420 + }, + { + "epoch": 6.996768402154398, + "grad_norm": 0.9514605402946472, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 97430 + }, + { + "epoch": 6.997486535008977, + "grad_norm": 1.0259917974472046, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 97440 + }, + { + "epoch": 6.998204667863555, + "grad_norm": 1.0735033750534058, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 97450 + }, + { + "epoch": 6.998922800718133, + "grad_norm": 1.053984522819519, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 97460 + }, + { + "epoch": 6.999640933572711, + "grad_norm": 1.0285807847976685, + "learning_rate": 0.0002, + "loss": 0.581, + "step": 97470 + }, + { + "epoch": 7.0, + "eval_loss": 1.168665885925293, + "eval_runtime": 55.1686, + "eval_samples_per_second": 13.287, + "eval_steps_per_second": 1.668, + "step": 97475 + } + ], + "logging_steps": 10, + "max_steps": 111400, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.510924744635187e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-97475/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e7587a22ad2bd20c7b96e5733e0c8d7d70385123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033c07d3e283fc234116f7755cbafb36d6210a0df13f9f88f3009bffa4160700 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..72b03c5ea547a3a6cae0a27ea029d2566ebafc13 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 13925, "epoch_duration": 18356.92157459259, "total_accumulated_duration": 18356.92157459259, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}]} +{"epoch": 2.0, "step": 27850, "epoch_duration": 14989.80767917633, "total_accumulated_duration": 33346.72925376892, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-13925", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}]} +{"epoch": 3.0, "step": 41775, "epoch_duration": 14994.17358160019, "total_accumulated_duration": 48340.90283536911, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}]} +{"epoch": 4.0, "step": 55700, "epoch_duration": 15003.449581623077, "total_accumulated_duration": 63344.35241699219, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}, {"eval_loss": 1.1079821586608887, "eval_runtime": 55.1897, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 3.0, "step": 41775}, {"loss": 0.6472, "grad_norm": 0.8630077838897705, "learning_rate": 0.0002, "epoch": 3.000359066427289, "step": 41780}, {"loss": 0.5843, "grad_norm": 0.8901806473731995, "learning_rate": 0.0002, "epoch": 3.001077199281867, "step": 41790}, {"loss": 0.5789, "grad_norm": 0.8291767835617065, "learning_rate": 0.0002, "epoch": 3.0017953321364454, "step": 41800}, {"loss": 0.6049, "grad_norm": 0.792519211769104, "learning_rate": 0.0002, "epoch": 3.0025134649910235, "step": 41810}, {"loss": 0.6131, "grad_norm": 1.1330063343048096, "learning_rate": 0.0002, "epoch": 3.0032315978456015, "step": 41820}, {"loss": 0.6225, "grad_norm": 0.9401350617408752, "learning_rate": 0.0002, "epoch": 3.0039497307001795, "step": 41830}, {"loss": 0.5924, "grad_norm": 0.8065463304519653, "learning_rate": 0.0002, "epoch": 3.0046678635547575, "step": 41840}, {"loss": 0.6161, "grad_norm": 0.8309979438781738, "learning_rate": 0.0002, "epoch": 3.005385996409336, "step": 41850}, {"loss": 0.6099, "grad_norm": 0.7432689070701599, "learning_rate": 0.0002, "epoch": 3.006104129263914, "step": 41860}, {"loss": 0.5901, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 3.006822262118492, "step": 41870}, {"loss": 0.6211, "grad_norm": 1.4364255666732788, "learning_rate": 0.0002, "epoch": 3.00754039497307, "step": 41880}, {"loss": 0.5988, "grad_norm": 0.9023072123527527, "learning_rate": 0.0002, "epoch": 3.008258527827648, "step": 41890}, {"loss": 0.6296, "grad_norm": 0.7790587544441223, "learning_rate": 0.0002, "epoch": 3.0089766606822264, "step": 41900}, {"loss": 0.5908, "grad_norm": 0.9163706302642822, "learning_rate": 0.0002, "epoch": 3.0096947935368044, "step": 41910}, {"loss": 0.6216, "grad_norm": 0.8147963285446167, "learning_rate": 0.0002, "epoch": 3.0104129263913824, "step": 41920}, {"loss": 0.6546, "grad_norm": 0.8432748913764954, "learning_rate": 0.0002, "epoch": 3.0111310592459604, "step": 41930}, {"loss": 0.5815, "grad_norm": 0.9216182231903076, "learning_rate": 0.0002, "epoch": 3.011849192100539, "step": 41940}, {"loss": 0.6336, "grad_norm": 0.62154221534729, "learning_rate": 0.0002, "epoch": 3.012567324955117, "step": 41950}, {"loss": 0.5868, "grad_norm": 0.8902392387390137, "learning_rate": 0.0002, "epoch": 3.013285457809695, "step": 41960}, {"loss": 0.6205, "grad_norm": 0.9601083993911743, "learning_rate": 0.0002, "epoch": 3.014003590664273, "step": 41970}, {"loss": 0.6001, "grad_norm": 0.8938809037208557, "learning_rate": 0.0002, "epoch": 3.014721723518851, "step": 41980}, {"loss": 0.6215, "grad_norm": 1.0621999502182007, "learning_rate": 0.0002, "epoch": 3.0154398563734293, "step": 41990}, {"loss": 0.6453, "grad_norm": 0.7310585379600525, "learning_rate": 0.0002, "epoch": 3.0161579892280073, "step": 42000}, {"loss": 0.5674, "grad_norm": 0.8475853800773621, "learning_rate": 0.0002, "epoch": 3.0168761220825853, "step": 42010}, {"loss": 0.605, "grad_norm": 0.8509864807128906, "learning_rate": 0.0002, "epoch": 3.0175942549371633, "step": 42020}, {"loss": 0.6487, "grad_norm": 0.7461876273155212, "learning_rate": 0.0002, "epoch": 3.0183123877917413, "step": 42030}, {"loss": 0.6136, "grad_norm": 0.7734265327453613, "learning_rate": 0.0002, "epoch": 3.0190305206463197, "step": 42040}, {"loss": 0.6073, "grad_norm": 0.9056455492973328, "learning_rate": 0.0002, "epoch": 3.0197486535008977, "step": 42050}, {"loss": 0.6015, "grad_norm": 0.9183889031410217, "learning_rate": 0.0002, "epoch": 3.0204667863554757, "step": 42060}, {"loss": 0.6502, "grad_norm": 1.0777326822280884, "learning_rate": 0.0002, "epoch": 3.0211849192100537, "step": 42070}, {"loss": 0.6775, "grad_norm": 0.9217308163642883, "learning_rate": 0.0002, "epoch": 3.021903052064632, "step": 42080}, {"loss": 0.6157, "grad_norm": 0.8220202326774597, "learning_rate": 0.0002, "epoch": 3.02262118491921, "step": 42090}, {"loss": 0.5786, "grad_norm": 0.8454978466033936, "learning_rate": 0.0002, "epoch": 3.023339317773788, "step": 42100}, {"loss": 0.5653, "grad_norm": 0.8116370439529419, "learning_rate": 0.0002, "epoch": 3.024057450628366, "step": 42110}, {"loss": 0.6307, "grad_norm": 0.8064935207366943, "learning_rate": 0.0002, "epoch": 3.024775583482944, "step": 42120}, {"loss": 0.6567, "grad_norm": 0.9718650579452515, "learning_rate": 0.0002, "epoch": 3.0254937163375226, "step": 42130}, {"loss": 0.5936, "grad_norm": 0.8817588090896606, "learning_rate": 0.0002, "epoch": 3.0262118491921006, "step": 42140}, {"loss": 0.5625, "grad_norm": 0.7757318615913391, "learning_rate": 0.0002, "epoch": 3.0269299820466786, "step": 42150}, {"loss": 0.5704, "grad_norm": 0.7500545382499695, "learning_rate": 0.0002, "epoch": 3.0276481149012566, "step": 42160}, {"loss": 0.5635, "grad_norm": 0.72913658618927, "learning_rate": 0.0002, "epoch": 3.0283662477558346, "step": 42170}, {"loss": 0.6354, "grad_norm": 0.7641891837120056, "learning_rate": 0.0002, "epoch": 3.029084380610413, "step": 42180}, {"loss": 0.621, "grad_norm": 0.7682021856307983, "learning_rate": 0.0002, "epoch": 3.029802513464991, "step": 42190}, {"loss": 0.6377, "grad_norm": 0.8145958781242371, "learning_rate": 0.0002, "epoch": 3.030520646319569, "step": 42200}, {"loss": 0.6008, "grad_norm": 1.0546396970748901, "learning_rate": 0.0002, "epoch": 3.031238779174147, "step": 42210}, {"loss": 0.6177, "grad_norm": 0.8222804665565491, "learning_rate": 0.0002, "epoch": 3.0319569120287255, "step": 42220}, {"loss": 0.6264, "grad_norm": 0.8245829343795776, "learning_rate": 0.0002, "epoch": 3.0326750448833035, "step": 42230}, {"loss": 0.5828, "grad_norm": 0.9059963822364807, "learning_rate": 0.0002, "epoch": 3.0333931777378815, "step": 42240}, {"loss": 0.6373, "grad_norm": 1.026747465133667, "learning_rate": 0.0002, "epoch": 3.0341113105924595, "step": 42250}, {"loss": 0.636, "grad_norm": 0.9108404517173767, "learning_rate": 0.0002, "epoch": 3.0348294434470375, "step": 42260}, {"loss": 0.589, "grad_norm": 0.9828516840934753, "learning_rate": 0.0002, "epoch": 3.035547576301616, "step": 42270}, {"loss": 0.6558, "grad_norm": 0.9664266705513, "learning_rate": 0.0002, "epoch": 3.036265709156194, "step": 42280}, {"loss": 0.6157, "grad_norm": 0.7577654719352722, "learning_rate": 0.0002, "epoch": 3.036983842010772, "step": 42290}, {"loss": 0.5849, "grad_norm": 0.8331853151321411, "learning_rate": 0.0002, "epoch": 3.03770197486535, "step": 42300}, {"loss": 0.6335, "grad_norm": 0.8017228245735168, "learning_rate": 0.0002, "epoch": 3.038420107719928, "step": 42310}, {"loss": 0.6148, "grad_norm": 1.0316718816757202, "learning_rate": 0.0002, "epoch": 3.0391382405745064, "step": 42320}, {"loss": 0.5934, "grad_norm": 0.9379803538322449, "learning_rate": 0.0002, "epoch": 3.0398563734290844, "step": 42330}, {"loss": 0.6358, "grad_norm": 0.7554476857185364, "learning_rate": 0.0002, "epoch": 3.0405745062836624, "step": 42340}, {"loss": 0.5951, "grad_norm": 0.7377917766571045, "learning_rate": 0.0002, "epoch": 3.0412926391382404, "step": 42350}, {"loss": 0.5769, "grad_norm": 1.0655276775360107, "learning_rate": 0.0002, "epoch": 3.042010771992819, "step": 42360}, {"loss": 0.5892, "grad_norm": 0.7748511433601379, "learning_rate": 0.0002, "epoch": 3.042728904847397, "step": 42370}, {"loss": 0.6512, "grad_norm": 0.848649799823761, "learning_rate": 0.0002, "epoch": 3.043447037701975, "step": 42380}, {"loss": 0.6411, "grad_norm": 0.7754636406898499, "learning_rate": 0.0002, "epoch": 3.044165170556553, "step": 42390}, {"loss": 0.6665, "grad_norm": 0.8173656463623047, "learning_rate": 0.0002, "epoch": 3.044883303411131, "step": 42400}, {"loss": 0.5877, "grad_norm": 0.7881983518600464, "learning_rate": 0.0002, "epoch": 3.0456014362657093, "step": 42410}, {"loss": 0.5832, "grad_norm": 0.971072256565094, "learning_rate": 0.0002, "epoch": 3.0463195691202873, "step": 42420}, {"loss": 0.6303, "grad_norm": 0.8400143384933472, "learning_rate": 0.0002, "epoch": 3.0470377019748653, "step": 42430}, {"loss": 0.6557, "grad_norm": 1.0028647184371948, "learning_rate": 0.0002, "epoch": 3.0477558348294433, "step": 42440}, {"loss": 0.5949, "grad_norm": 0.9728034734725952, "learning_rate": 0.0002, "epoch": 3.0484739676840213, "step": 42450}, {"loss": 0.6222, "grad_norm": 0.937633752822876, "learning_rate": 0.0002, "epoch": 3.0491921005386, "step": 42460}, {"loss": 0.6254, "grad_norm": 1.0265642404556274, "learning_rate": 0.0002, "epoch": 3.049910233393178, "step": 42470}, {"loss": 0.6078, "grad_norm": 0.9733216762542725, "learning_rate": 0.0002, "epoch": 3.050628366247756, "step": 42480}, {"loss": 0.5766, "grad_norm": 0.7039174437522888, "learning_rate": 0.0002, "epoch": 3.051346499102334, "step": 42490}, {"loss": 0.6422, "grad_norm": 0.7515231370925903, "learning_rate": 0.0002, "epoch": 3.0520646319569122, "step": 42500}, {"loss": 0.5517, "grad_norm": 0.9115300178527832, "learning_rate": 0.0002, "epoch": 3.0527827648114902, "step": 42510}, {"loss": 0.6738, "grad_norm": 0.7403655648231506, "learning_rate": 0.0002, "epoch": 3.0535008976660682, "step": 42520}, {"loss": 0.5528, "grad_norm": 0.7826810479164124, "learning_rate": 0.0002, "epoch": 3.0542190305206462, "step": 42530}, {"loss": 0.6513, "grad_norm": 0.8007349371910095, "learning_rate": 0.0002, "epoch": 3.0549371633752243, "step": 42540}, {"loss": 0.6118, "grad_norm": 0.7975959777832031, "learning_rate": 0.0002, "epoch": 3.0556552962298027, "step": 42550}, {"loss": 0.6157, "grad_norm": 0.9665228128433228, "learning_rate": 0.0002, "epoch": 3.0563734290843807, "step": 42560}, {"loss": 0.6095, "grad_norm": 0.8386123180389404, "learning_rate": 0.0002, "epoch": 3.0570915619389587, "step": 42570}, {"loss": 0.64, "grad_norm": 0.7437782287597656, "learning_rate": 0.0002, "epoch": 3.0578096947935367, "step": 42580}, {"loss": 0.6399, "grad_norm": 0.8360698223114014, "learning_rate": 0.0002, "epoch": 3.0585278276481147, "step": 42590}, {"loss": 0.6259, "grad_norm": 0.8982073664665222, "learning_rate": 0.0002, "epoch": 3.059245960502693, "step": 42600}, {"loss": 0.6235, "grad_norm": 0.9425758719444275, "learning_rate": 0.0002, "epoch": 3.059964093357271, "step": 42610}, {"loss": 0.631, "grad_norm": 0.8567131161689758, "learning_rate": 0.0002, "epoch": 3.060682226211849, "step": 42620}, {"loss": 0.609, "grad_norm": 0.9322942495346069, "learning_rate": 0.0002, "epoch": 3.061400359066427, "step": 42630}, {"loss": 0.6384, "grad_norm": 0.8283235430717468, "learning_rate": 0.0002, "epoch": 3.0621184919210056, "step": 42640}, {"loss": 0.6345, "grad_norm": 0.8457967638969421, "learning_rate": 0.0002, "epoch": 3.0628366247755836, "step": 42650}, {"loss": 0.631, "grad_norm": 0.8205100893974304, "learning_rate": 0.0002, "epoch": 3.0635547576301616, "step": 42660}, {"loss": 0.6094, "grad_norm": 0.8385181427001953, "learning_rate": 0.0002, "epoch": 3.0642728904847396, "step": 42670}, {"loss": 0.6169, "grad_norm": 1.2959390878677368, "learning_rate": 0.0002, "epoch": 3.0649910233393176, "step": 42680}, {"loss": 0.6531, "grad_norm": 0.7150540351867676, "learning_rate": 0.0002, "epoch": 3.065709156193896, "step": 42690}, {"loss": 0.6456, "grad_norm": 0.6647360920906067, "learning_rate": 0.0002, "epoch": 3.066427289048474, "step": 42700}, {"loss": 0.6151, "grad_norm": 0.9148316979408264, "learning_rate": 0.0002, "epoch": 3.067145421903052, "step": 42710}, {"loss": 0.6298, "grad_norm": 0.8606209754943848, "learning_rate": 0.0002, "epoch": 3.06786355475763, "step": 42720}, {"loss": 0.636, "grad_norm": 1.4255632162094116, "learning_rate": 0.0002, "epoch": 3.068581687612208, "step": 42730}, {"loss": 0.6363, "grad_norm": 0.9131710529327393, "learning_rate": 0.0002, "epoch": 3.0692998204667865, "step": 42740}, {"loss": 0.6432, "grad_norm": 0.9560360908508301, "learning_rate": 0.0002, "epoch": 3.0700179533213645, "step": 42750}, {"loss": 0.6259, "grad_norm": 0.9278100728988647, "learning_rate": 0.0002, "epoch": 3.0707360861759425, "step": 42760}, {"loss": 0.6001, "grad_norm": 0.7258471846580505, "learning_rate": 0.0002, "epoch": 3.0714542190305205, "step": 42770}, {"loss": 0.6447, "grad_norm": 1.1537690162658691, "learning_rate": 0.0002, "epoch": 3.072172351885099, "step": 42780}, {"loss": 0.6237, "grad_norm": 0.8562588691711426, "learning_rate": 0.0002, "epoch": 3.072890484739677, "step": 42790}, {"loss": 0.645, "grad_norm": 1.0271626710891724, "learning_rate": 0.0002, "epoch": 3.073608617594255, "step": 42800}, {"loss": 0.6782, "grad_norm": 0.85148024559021, "learning_rate": 0.0002, "epoch": 3.074326750448833, "step": 42810}, {"loss": 0.5905, "grad_norm": 0.805772602558136, "learning_rate": 0.0002, "epoch": 3.075044883303411, "step": 42820}, {"loss": 0.623, "grad_norm": 0.8057122230529785, "learning_rate": 0.0002, "epoch": 3.0757630161579894, "step": 42830}, {"loss": 0.6391, "grad_norm": 0.7997274994850159, "learning_rate": 0.0002, "epoch": 3.0764811490125674, "step": 42840}, {"loss": 0.5965, "grad_norm": 0.8739321231842041, "learning_rate": 0.0002, "epoch": 3.0771992818671454, "step": 42850}, {"loss": 0.6027, "grad_norm": 0.833951473236084, "learning_rate": 0.0002, "epoch": 3.0779174147217234, "step": 42860}, {"loss": 0.6251, "grad_norm": 0.8813839554786682, "learning_rate": 0.0002, "epoch": 3.0786355475763014, "step": 42870}, {"loss": 0.6485, "grad_norm": 0.9020521640777588, "learning_rate": 0.0002, "epoch": 3.07935368043088, "step": 42880}, {"loss": 0.5719, "grad_norm": 0.888148844242096, "learning_rate": 0.0002, "epoch": 3.080071813285458, "step": 42890}, {"loss": 0.6715, "grad_norm": 0.8110589385032654, "learning_rate": 0.0002, "epoch": 3.080789946140036, "step": 42900}, {"loss": 0.5931, "grad_norm": 0.818738579750061, "learning_rate": 0.0002, "epoch": 3.081508078994614, "step": 42910}, {"loss": 0.6723, "grad_norm": 0.9607479572296143, "learning_rate": 0.0002, "epoch": 3.082226211849192, "step": 42920}, {"loss": 0.6045, "grad_norm": 0.8162698745727539, "learning_rate": 0.0002, "epoch": 3.0829443447037703, "step": 42930}, {"loss": 0.5975, "grad_norm": 0.8170801997184753, "learning_rate": 0.0002, "epoch": 3.0836624775583483, "step": 42940}, {"loss": 0.5748, "grad_norm": 0.9250763654708862, "learning_rate": 0.0002, "epoch": 3.0843806104129263, "step": 42950}, {"loss": 0.6651, "grad_norm": 0.898097813129425, "learning_rate": 0.0002, "epoch": 3.0850987432675043, "step": 42960}, {"loss": 0.6573, "grad_norm": 0.9398433566093445, "learning_rate": 0.0002, "epoch": 3.0858168761220828, "step": 42970}, {"loss": 0.6243, "grad_norm": 1.052808165550232, "learning_rate": 0.0002, "epoch": 3.0865350089766608, "step": 42980}, {"loss": 0.6622, "grad_norm": 0.8974723219871521, "learning_rate": 0.0002, "epoch": 3.087253141831239, "step": 42990}, {"loss": 0.6135, "grad_norm": 0.7517408728599548, "learning_rate": 0.0002, "epoch": 3.087971274685817, "step": 43000}, {"loss": 0.6185, "grad_norm": 0.8054485321044922, "learning_rate": 0.0002, "epoch": 3.088689407540395, "step": 43010}, {"loss": 0.6199, "grad_norm": 0.9896154999732971, "learning_rate": 0.0002, "epoch": 3.0894075403949732, "step": 43020}, {"loss": 0.6308, "grad_norm": 0.7887356281280518, "learning_rate": 0.0002, "epoch": 3.0901256732495512, "step": 43030}, {"loss": 0.6173, "grad_norm": 1.0119125843048096, "learning_rate": 0.0002, "epoch": 3.0908438061041292, "step": 43040}, {"loss": 0.6294, "grad_norm": 0.8753892779350281, "learning_rate": 0.0002, "epoch": 3.0915619389587072, "step": 43050}, {"loss": 0.6068, "grad_norm": 0.8322654962539673, "learning_rate": 0.0002, "epoch": 3.0922800718132857, "step": 43060}, {"loss": 0.6237, "grad_norm": 1.0605992078781128, "learning_rate": 0.0002, "epoch": 3.0929982046678637, "step": 43070}, {"loss": 0.6507, "grad_norm": 0.8783912062644958, "learning_rate": 0.0002, "epoch": 3.0937163375224417, "step": 43080}, {"loss": 0.6023, "grad_norm": 0.8839107751846313, "learning_rate": 0.0002, "epoch": 3.0944344703770197, "step": 43090}, {"loss": 0.6588, "grad_norm": 1.1655086278915405, "learning_rate": 0.0002, "epoch": 3.0951526032315977, "step": 43100}, {"loss": 0.6367, "grad_norm": 0.7051523327827454, "learning_rate": 0.0002, "epoch": 3.095870736086176, "step": 43110}, {"loss": 0.5941, "grad_norm": 0.7793807983398438, "learning_rate": 0.0002, "epoch": 3.096588868940754, "step": 43120}, {"loss": 0.6073, "grad_norm": 0.8352194428443909, "learning_rate": 0.0002, "epoch": 3.097307001795332, "step": 43130}, {"loss": 0.6087, "grad_norm": 0.9684847593307495, "learning_rate": 0.0002, "epoch": 3.09802513464991, "step": 43140}, {"loss": 0.6347, "grad_norm": 1.1106340885162354, "learning_rate": 0.0002, "epoch": 3.098743267504488, "step": 43150}, {"loss": 0.6395, "grad_norm": 0.7814911603927612, "learning_rate": 0.0002, "epoch": 3.0994614003590666, "step": 43160}, {"loss": 0.637, "grad_norm": 0.7923110723495483, "learning_rate": 0.0002, "epoch": 3.1001795332136446, "step": 43170}, {"loss": 0.6218, "grad_norm": 0.87022864818573, "learning_rate": 0.0002, "epoch": 3.1008976660682226, "step": 43180}, {"loss": 0.6246, "grad_norm": 0.9352855682373047, "learning_rate": 0.0002, "epoch": 3.1016157989228006, "step": 43190}, {"loss": 0.5943, "grad_norm": 0.8548445105552673, "learning_rate": 0.0002, "epoch": 3.1023339317773786, "step": 43200}, {"loss": 0.6106, "grad_norm": 0.9576025009155273, "learning_rate": 0.0002, "epoch": 3.103052064631957, "step": 43210}, {"loss": 0.6222, "grad_norm": 0.7430430054664612, "learning_rate": 0.0002, "epoch": 3.103770197486535, "step": 43220}, {"loss": 0.6223, "grad_norm": 0.9619144797325134, "learning_rate": 0.0002, "epoch": 3.104488330341113, "step": 43230}, {"loss": 0.6171, "grad_norm": 0.8622338771820068, "learning_rate": 0.0002, "epoch": 3.105206463195691, "step": 43240}, {"loss": 0.6336, "grad_norm": 0.853489339351654, "learning_rate": 0.0002, "epoch": 3.1059245960502695, "step": 43250}, {"loss": 0.635, "grad_norm": 0.9253206849098206, "learning_rate": 0.0002, "epoch": 3.1066427289048475, "step": 43260}, {"loss": 0.68, "grad_norm": 0.9700671434402466, "learning_rate": 0.0002, "epoch": 3.1073608617594255, "step": 43270}, {"loss": 0.6284, "grad_norm": 1.0550731420516968, "learning_rate": 0.0002, "epoch": 3.1080789946140035, "step": 43280}, {"loss": 0.6389, "grad_norm": 0.939452052116394, "learning_rate": 0.0002, "epoch": 3.1087971274685815, "step": 43290}, {"loss": 0.621, "grad_norm": 0.8855276107788086, "learning_rate": 0.0002, "epoch": 3.10951526032316, "step": 43300}, {"loss": 0.5814, "grad_norm": 0.92197185754776, "learning_rate": 0.0002, "epoch": 3.110233393177738, "step": 43310}, {"loss": 0.6341, "grad_norm": 0.8825578689575195, "learning_rate": 0.0002, "epoch": 3.110951526032316, "step": 43320}, {"loss": 0.6412, "grad_norm": 0.9964608550071716, "learning_rate": 0.0002, "epoch": 3.111669658886894, "step": 43330}, {"loss": 0.6074, "grad_norm": 0.9070520401000977, "learning_rate": 0.0002, "epoch": 3.1123877917414724, "step": 43340}, {"loss": 0.6503, "grad_norm": 0.9699633717536926, "learning_rate": 0.0002, "epoch": 3.1131059245960504, "step": 43350}, {"loss": 0.6545, "grad_norm": 0.7384091019630432, "learning_rate": 0.0002, "epoch": 3.1138240574506284, "step": 43360}, {"loss": 0.6644, "grad_norm": 0.9445326328277588, "learning_rate": 0.0002, "epoch": 3.1145421903052064, "step": 43370}, {"loss": 0.6088, "grad_norm": 0.8906524181365967, "learning_rate": 0.0002, "epoch": 3.1152603231597844, "step": 43380}, {"loss": 0.6213, "grad_norm": 0.8850129246711731, "learning_rate": 0.0002, "epoch": 3.115978456014363, "step": 43390}, {"loss": 0.6156, "grad_norm": 0.7091860771179199, "learning_rate": 0.0002, "epoch": 3.116696588868941, "step": 43400}, {"loss": 0.6056, "grad_norm": 0.8992764949798584, "learning_rate": 0.0002, "epoch": 3.117414721723519, "step": 43410}, {"loss": 0.6336, "grad_norm": 0.9166698455810547, "learning_rate": 0.0002, "epoch": 3.118132854578097, "step": 43420}, {"loss": 0.7011, "grad_norm": 1.1195749044418335, "learning_rate": 0.0002, "epoch": 3.118850987432675, "step": 43430}, {"loss": 0.6409, "grad_norm": 0.9414069652557373, "learning_rate": 0.0002, "epoch": 3.1195691202872533, "step": 43440}, {"loss": 0.6533, "grad_norm": 0.7641217112541199, "learning_rate": 0.0002, "epoch": 3.1202872531418313, "step": 43450}, {"loss": 0.6613, "grad_norm": 1.2659285068511963, "learning_rate": 0.0002, "epoch": 3.1210053859964093, "step": 43460}, {"loss": 0.631, "grad_norm": 0.9968213438987732, "learning_rate": 0.0002, "epoch": 3.1217235188509873, "step": 43470}, {"loss": 0.5833, "grad_norm": 0.8819042444229126, "learning_rate": 0.0002, "epoch": 3.1224416517055653, "step": 43480}, {"loss": 0.6819, "grad_norm": 0.9124775528907776, "learning_rate": 0.0002, "epoch": 3.1231597845601438, "step": 43490}, {"loss": 0.675, "grad_norm": 0.868354082107544, "learning_rate": 0.0002, "epoch": 3.1238779174147218, "step": 43500}, {"loss": 0.6348, "grad_norm": 0.7367526292800903, "learning_rate": 0.0002, "epoch": 3.1245960502692998, "step": 43510}, {"loss": 0.6068, "grad_norm": 0.7553679943084717, "learning_rate": 0.0002, "epoch": 3.1253141831238778, "step": 43520}, {"loss": 0.6346, "grad_norm": 0.7970008850097656, "learning_rate": 0.0002, "epoch": 3.126032315978456, "step": 43530}, {"loss": 0.6357, "grad_norm": 0.9117488861083984, "learning_rate": 0.0002, "epoch": 3.126750448833034, "step": 43540}, {"loss": 0.6609, "grad_norm": 0.8004103899002075, "learning_rate": 0.0002, "epoch": 3.127468581687612, "step": 43550}, {"loss": 0.596, "grad_norm": 0.736518919467926, "learning_rate": 0.0002, "epoch": 3.12818671454219, "step": 43560}, {"loss": 0.5945, "grad_norm": 0.8568395376205444, "learning_rate": 0.0002, "epoch": 3.128904847396768, "step": 43570}, {"loss": 0.665, "grad_norm": 0.9344052672386169, "learning_rate": 0.0002, "epoch": 3.1296229802513467, "step": 43580}, {"loss": 0.6403, "grad_norm": 0.7986525297164917, "learning_rate": 0.0002, "epoch": 3.1303411131059247, "step": 43590}, {"loss": 0.61, "grad_norm": 0.8283242583274841, "learning_rate": 0.0002, "epoch": 3.1310592459605027, "step": 43600}, {"loss": 0.6003, "grad_norm": 0.6534292101860046, "learning_rate": 0.0002, "epoch": 3.1317773788150807, "step": 43610}, {"loss": 0.6994, "grad_norm": 0.9585428833961487, "learning_rate": 0.0002, "epoch": 3.132495511669659, "step": 43620}, {"loss": 0.6007, "grad_norm": 0.8299157023429871, "learning_rate": 0.0002, "epoch": 3.133213644524237, "step": 43630}, {"loss": 0.6169, "grad_norm": 0.9050052762031555, "learning_rate": 0.0002, "epoch": 3.133931777378815, "step": 43640}, {"loss": 0.6217, "grad_norm": 1.0457062721252441, "learning_rate": 0.0002, "epoch": 3.134649910233393, "step": 43650}, {"loss": 0.6147, "grad_norm": 0.907691240310669, "learning_rate": 0.0002, "epoch": 3.135368043087971, "step": 43660}, {"loss": 0.5808, "grad_norm": 0.8868935108184814, "learning_rate": 0.0002, "epoch": 3.1360861759425496, "step": 43670}, {"loss": 0.6427, "grad_norm": 0.8585456609725952, "learning_rate": 0.0002, "epoch": 3.1368043087971276, "step": 43680}, {"loss": 0.6242, "grad_norm": 1.0402741432189941, "learning_rate": 0.0002, "epoch": 3.1375224416517056, "step": 43690}, {"loss": 0.641, "grad_norm": 1.0866798162460327, "learning_rate": 0.0002, "epoch": 3.1382405745062836, "step": 43700}, {"loss": 0.6082, "grad_norm": 0.7637296915054321, "learning_rate": 0.0002, "epoch": 3.1389587073608616, "step": 43710}, {"loss": 0.6256, "grad_norm": 0.755235493183136, "learning_rate": 0.0002, "epoch": 3.13967684021544, "step": 43720}, {"loss": 0.6441, "grad_norm": 0.7258853316307068, "learning_rate": 0.0002, "epoch": 3.140394973070018, "step": 43730}, {"loss": 0.5891, "grad_norm": 1.0425268411636353, "learning_rate": 0.0002, "epoch": 3.141113105924596, "step": 43740}, {"loss": 0.6527, "grad_norm": 0.9171959757804871, "learning_rate": 0.0002, "epoch": 3.141831238779174, "step": 43750}, {"loss": 0.6365, "grad_norm": 0.8900150656700134, "learning_rate": 0.0002, "epoch": 3.142549371633752, "step": 43760}, {"loss": 0.6324, "grad_norm": 0.9879246354103088, "learning_rate": 0.0002, "epoch": 3.1432675044883305, "step": 43770}, {"loss": 0.6624, "grad_norm": 0.7853389382362366, "learning_rate": 0.0002, "epoch": 3.1439856373429085, "step": 43780}, {"loss": 0.6259, "grad_norm": 1.0245232582092285, "learning_rate": 0.0002, "epoch": 3.1447037701974865, "step": 43790}, {"loss": 0.6278, "grad_norm": 0.8486390113830566, "learning_rate": 0.0002, "epoch": 3.1454219030520645, "step": 43800}, {"loss": 0.6175, "grad_norm": 0.8536406755447388, "learning_rate": 0.0002, "epoch": 3.146140035906643, "step": 43810}, {"loss": 0.5901, "grad_norm": 0.9653734564781189, "learning_rate": 0.0002, "epoch": 3.146858168761221, "step": 43820}, {"loss": 0.6041, "grad_norm": 0.8292608857154846, "learning_rate": 0.0002, "epoch": 3.147576301615799, "step": 43830}, {"loss": 0.6688, "grad_norm": 1.147524118423462, "learning_rate": 0.0002, "epoch": 3.148294434470377, "step": 43840}, {"loss": 0.6155, "grad_norm": 0.9317546486854553, "learning_rate": 0.0002, "epoch": 3.149012567324955, "step": 43850}, {"loss": 0.6305, "grad_norm": 0.8651045560836792, "learning_rate": 0.0002, "epoch": 3.1497307001795334, "step": 43860}, {"loss": 0.5985, "grad_norm": 0.8718969225883484, "learning_rate": 0.0002, "epoch": 3.1504488330341114, "step": 43870}, {"loss": 0.6206, "grad_norm": 1.0140702724456787, "learning_rate": 0.0002, "epoch": 3.1511669658886894, "step": 43880}, {"loss": 0.5941, "grad_norm": 0.75941401720047, "learning_rate": 0.0002, "epoch": 3.1518850987432674, "step": 43890}, {"loss": 0.5957, "grad_norm": 0.6618940234184265, "learning_rate": 0.0002, "epoch": 3.152603231597846, "step": 43900}, {"loss": 0.6262, "grad_norm": 1.0013338327407837, "learning_rate": 0.0002, "epoch": 3.153321364452424, "step": 43910}, {"loss": 0.6263, "grad_norm": 0.8735299706459045, "learning_rate": 0.0002, "epoch": 3.154039497307002, "step": 43920}, {"loss": 0.627, "grad_norm": 1.141914963722229, "learning_rate": 0.0002, "epoch": 3.15475763016158, "step": 43930}, {"loss": 0.6604, "grad_norm": 1.0916038751602173, "learning_rate": 0.0002, "epoch": 3.155475763016158, "step": 43940}, {"loss": 0.6228, "grad_norm": 0.7042547464370728, "learning_rate": 0.0002, "epoch": 3.1561938958707363, "step": 43950}, {"loss": 0.6069, "grad_norm": 0.9885236620903015, "learning_rate": 0.0002, "epoch": 3.1569120287253143, "step": 43960}, {"loss": 0.5973, "grad_norm": 0.8083009719848633, "learning_rate": 0.0002, "epoch": 3.1576301615798923, "step": 43970}, {"loss": 0.6416, "grad_norm": 1.082627296447754, "learning_rate": 0.0002, "epoch": 3.1583482944344703, "step": 43980}, {"loss": 0.624, "grad_norm": 0.9293290376663208, "learning_rate": 0.0002, "epoch": 3.1590664272890483, "step": 43990}, {"loss": 0.5665, "grad_norm": 0.861003041267395, "learning_rate": 0.0002, "epoch": 3.1597845601436267, "step": 44000}, {"loss": 0.6221, "grad_norm": 0.9565994143486023, "learning_rate": 0.0002, "epoch": 3.1605026929982047, "step": 44010}, {"loss": 0.7038, "grad_norm": 0.9609305262565613, "learning_rate": 0.0002, "epoch": 3.1612208258527827, "step": 44020}, {"loss": 0.6064, "grad_norm": 0.847830593585968, "learning_rate": 0.0002, "epoch": 3.1619389587073607, "step": 44030}, {"loss": 0.6299, "grad_norm": 0.852357804775238, "learning_rate": 0.0002, "epoch": 3.1626570915619387, "step": 44040}, {"loss": 0.5943, "grad_norm": 0.8634562492370605, "learning_rate": 0.0002, "epoch": 3.163375224416517, "step": 44050}, {"loss": 0.6011, "grad_norm": 1.0259950160980225, "learning_rate": 0.0002, "epoch": 3.164093357271095, "step": 44060}, {"loss": 0.7039, "grad_norm": 0.9615250825881958, "learning_rate": 0.0002, "epoch": 3.164811490125673, "step": 44070}, {"loss": 0.6179, "grad_norm": 0.9892165660858154, "learning_rate": 0.0002, "epoch": 3.165529622980251, "step": 44080}, {"loss": 0.6295, "grad_norm": 0.8827354907989502, "learning_rate": 0.0002, "epoch": 3.1662477558348296, "step": 44090}, {"loss": 0.6131, "grad_norm": 0.9258168339729309, "learning_rate": 0.0002, "epoch": 3.1669658886894076, "step": 44100}, {"loss": 0.5746, "grad_norm": 0.7983399033546448, "learning_rate": 0.0002, "epoch": 3.1676840215439857, "step": 44110}, {"loss": 0.6075, "grad_norm": 0.9917809963226318, "learning_rate": 0.0002, "epoch": 3.1684021543985637, "step": 44120}, {"loss": 0.6474, "grad_norm": 1.058927297592163, "learning_rate": 0.0002, "epoch": 3.1691202872531417, "step": 44130}, {"loss": 0.6211, "grad_norm": 1.0095895528793335, "learning_rate": 0.0002, "epoch": 3.16983842010772, "step": 44140}, {"loss": 0.6586, "grad_norm": 0.9032495617866516, "learning_rate": 0.0002, "epoch": 3.170556552962298, "step": 44150}, {"loss": 0.6356, "grad_norm": 0.9391272664070129, "learning_rate": 0.0002, "epoch": 3.171274685816876, "step": 44160}, {"loss": 0.6324, "grad_norm": 0.990755558013916, "learning_rate": 0.0002, "epoch": 3.171992818671454, "step": 44170}, {"loss": 0.5647, "grad_norm": 0.9310759902000427, "learning_rate": 0.0002, "epoch": 3.172710951526032, "step": 44180}, {"loss": 0.6802, "grad_norm": 0.7698856592178345, "learning_rate": 0.0002, "epoch": 3.1734290843806106, "step": 44190}, {"loss": 0.6109, "grad_norm": 0.7735867500305176, "learning_rate": 0.0002, "epoch": 3.1741472172351886, "step": 44200}, {"loss": 0.6252, "grad_norm": 1.1447525024414062, "learning_rate": 0.0002, "epoch": 3.1748653500897666, "step": 44210}, {"loss": 0.6268, "grad_norm": 0.8667060136795044, "learning_rate": 0.0002, "epoch": 3.1755834829443446, "step": 44220}, {"loss": 0.6066, "grad_norm": 0.8596829771995544, "learning_rate": 0.0002, "epoch": 3.176301615798923, "step": 44230}, {"loss": 0.6142, "grad_norm": 0.8607654571533203, "learning_rate": 0.0002, "epoch": 3.177019748653501, "step": 44240}, {"loss": 0.6358, "grad_norm": 0.9346948266029358, "learning_rate": 0.0002, "epoch": 3.177737881508079, "step": 44250}, {"loss": 0.6099, "grad_norm": 0.852344810962677, "learning_rate": 0.0002, "epoch": 3.178456014362657, "step": 44260}, {"loss": 0.5759, "grad_norm": 0.9260450005531311, "learning_rate": 0.0002, "epoch": 3.179174147217235, "step": 44270}, {"loss": 0.6419, "grad_norm": 0.924053430557251, "learning_rate": 0.0002, "epoch": 3.1798922800718135, "step": 44280}, {"loss": 0.6456, "grad_norm": 1.001965045928955, "learning_rate": 0.0002, "epoch": 3.1806104129263915, "step": 44290}, {"loss": 0.6211, "grad_norm": 0.943215012550354, "learning_rate": 0.0002, "epoch": 3.1813285457809695, "step": 44300}, {"loss": 0.6261, "grad_norm": 1.006977915763855, "learning_rate": 0.0002, "epoch": 3.1820466786355475, "step": 44310}, {"loss": 0.6684, "grad_norm": 0.9768950343132019, "learning_rate": 0.0002, "epoch": 3.1827648114901255, "step": 44320}, {"loss": 0.6334, "grad_norm": 0.9297489523887634, "learning_rate": 0.0002, "epoch": 3.183482944344704, "step": 44330}, {"loss": 0.6291, "grad_norm": 0.9110919237136841, "learning_rate": 0.0002, "epoch": 3.184201077199282, "step": 44340}, {"loss": 0.6389, "grad_norm": 0.9821381568908691, "learning_rate": 0.0002, "epoch": 3.18491921005386, "step": 44350}, {"loss": 0.6342, "grad_norm": 0.8451243042945862, "learning_rate": 0.0002, "epoch": 3.185637342908438, "step": 44360}, {"loss": 0.6709, "grad_norm": 0.9676638245582581, "learning_rate": 0.0002, "epoch": 3.1863554757630164, "step": 44370}, {"loss": 0.6506, "grad_norm": 0.9826035499572754, "learning_rate": 0.0002, "epoch": 3.1870736086175944, "step": 44380}, {"loss": 0.6425, "grad_norm": 0.9453121423721313, "learning_rate": 0.0002, "epoch": 3.1877917414721724, "step": 44390}, {"loss": 0.6481, "grad_norm": 0.7766330242156982, "learning_rate": 0.0002, "epoch": 3.1885098743267504, "step": 44400}, {"loss": 0.6369, "grad_norm": 0.9302349090576172, "learning_rate": 0.0002, "epoch": 3.1892280071813284, "step": 44410}, {"loss": 0.5586, "grad_norm": 0.8335331082344055, "learning_rate": 0.0002, "epoch": 3.189946140035907, "step": 44420}, {"loss": 0.673, "grad_norm": 0.6722736358642578, "learning_rate": 0.0002, "epoch": 3.190664272890485, "step": 44430}, {"loss": 0.6809, "grad_norm": 0.9047536849975586, "learning_rate": 0.0002, "epoch": 3.191382405745063, "step": 44440}, {"loss": 0.6085, "grad_norm": 0.9653822183609009, "learning_rate": 0.0002, "epoch": 3.192100538599641, "step": 44450}, {"loss": 0.6071, "grad_norm": 0.7750703692436218, "learning_rate": 0.0002, "epoch": 3.192818671454219, "step": 44460}, {"loss": 0.6323, "grad_norm": 0.7767539024353027, "learning_rate": 0.0002, "epoch": 3.1935368043087973, "step": 44470}, {"loss": 0.6471, "grad_norm": 0.8597778081893921, "learning_rate": 0.0002, "epoch": 3.1942549371633753, "step": 44480}, {"loss": 0.6804, "grad_norm": 1.1711493730545044, "learning_rate": 0.0002, "epoch": 3.1949730700179533, "step": 44490}, {"loss": 0.5917, "grad_norm": 0.9025220274925232, "learning_rate": 0.0002, "epoch": 3.1956912028725313, "step": 44500}, {"loss": 0.6445, "grad_norm": 0.8084979057312012, "learning_rate": 0.0002, "epoch": 3.1964093357271093, "step": 44510}, {"loss": 0.5943, "grad_norm": 0.8475074172019958, "learning_rate": 0.0002, "epoch": 3.1971274685816877, "step": 44520}, {"loss": 0.5959, "grad_norm": 0.9915644526481628, "learning_rate": 0.0002, "epoch": 3.1978456014362657, "step": 44530}, {"loss": 0.627, "grad_norm": 0.992231547832489, "learning_rate": 0.0002, "epoch": 3.1985637342908437, "step": 44540}, {"loss": 0.625, "grad_norm": 0.9804556369781494, "learning_rate": 0.0002, "epoch": 3.1992818671454217, "step": 44550}, {"loss": 0.6534, "grad_norm": 1.045558214187622, "learning_rate": 0.0002, "epoch": 3.2, "step": 44560}, {"loss": 0.6201, "grad_norm": 1.0880261659622192, "learning_rate": 0.0002, "epoch": 3.200718132854578, "step": 44570}, {"loss": 0.6471, "grad_norm": 0.9511138200759888, "learning_rate": 0.0002, "epoch": 3.201436265709156, "step": 44580}, {"loss": 0.5961, "grad_norm": 0.9115344882011414, "learning_rate": 0.0002, "epoch": 3.202154398563734, "step": 44590}, {"loss": 0.6504, "grad_norm": 1.0738362073898315, "learning_rate": 0.0002, "epoch": 3.202872531418312, "step": 44600}, {"loss": 0.6324, "grad_norm": 0.8209697604179382, "learning_rate": 0.0002, "epoch": 3.2035906642728906, "step": 44610}, {"loss": 0.6445, "grad_norm": 0.9220197796821594, "learning_rate": 0.0002, "epoch": 3.2043087971274686, "step": 44620}, {"loss": 0.5798, "grad_norm": 0.8859700560569763, "learning_rate": 0.0002, "epoch": 3.2050269299820466, "step": 44630}, {"loss": 0.6185, "grad_norm": 0.9772757291793823, "learning_rate": 0.0002, "epoch": 3.2057450628366246, "step": 44640}, {"loss": 0.6528, "grad_norm": 0.9385574460029602, "learning_rate": 0.0002, "epoch": 3.206463195691203, "step": 44650}, {"loss": 0.6098, "grad_norm": 0.839958906173706, "learning_rate": 0.0002, "epoch": 3.207181328545781, "step": 44660}, {"loss": 0.6803, "grad_norm": 0.860478401184082, "learning_rate": 0.0002, "epoch": 3.207899461400359, "step": 44670}, {"loss": 0.683, "grad_norm": 0.846886396408081, "learning_rate": 0.0002, "epoch": 3.208617594254937, "step": 44680}, {"loss": 0.6312, "grad_norm": 0.8591006398200989, "learning_rate": 0.0002, "epoch": 3.209335727109515, "step": 44690}, {"loss": 0.6173, "grad_norm": 0.9236023426055908, "learning_rate": 0.0002, "epoch": 3.2100538599640935, "step": 44700}, {"loss": 0.6471, "grad_norm": 0.7348999977111816, "learning_rate": 0.0002, "epoch": 3.2107719928186715, "step": 44710}, {"loss": 0.6239, "grad_norm": 1.0041730403900146, "learning_rate": 0.0002, "epoch": 3.2114901256732495, "step": 44720}, {"loss": 0.6612, "grad_norm": 0.8382687568664551, "learning_rate": 0.0002, "epoch": 3.2122082585278275, "step": 44730}, {"loss": 0.6026, "grad_norm": 0.8253511190414429, "learning_rate": 0.0002, "epoch": 3.2129263913824055, "step": 44740}, {"loss": 0.6129, "grad_norm": 0.9589242935180664, "learning_rate": 0.0002, "epoch": 3.213644524236984, "step": 44750}, {"loss": 0.6476, "grad_norm": 0.8938157558441162, "learning_rate": 0.0002, "epoch": 3.214362657091562, "step": 44760}, {"loss": 0.6811, "grad_norm": 1.0085135698318481, "learning_rate": 0.0002, "epoch": 3.21508078994614, "step": 44770}, {"loss": 0.646, "grad_norm": 0.8647134304046631, "learning_rate": 0.0002, "epoch": 3.215798922800718, "step": 44780}, {"loss": 0.6169, "grad_norm": 1.09453284740448, "learning_rate": 0.0002, "epoch": 3.216517055655296, "step": 44790}, {"loss": 0.6156, "grad_norm": 0.8710666298866272, "learning_rate": 0.0002, "epoch": 3.2172351885098744, "step": 44800}, {"loss": 0.662, "grad_norm": 0.8080880641937256, "learning_rate": 0.0002, "epoch": 3.2179533213644524, "step": 44810}, {"loss": 0.6039, "grad_norm": 1.0440675020217896, "learning_rate": 0.0002, "epoch": 3.2186714542190304, "step": 44820}, {"loss": 0.6629, "grad_norm": 1.1036376953125, "learning_rate": 0.0002, "epoch": 3.2193895870736084, "step": 44830}, {"loss": 0.6474, "grad_norm": 0.8783546686172485, "learning_rate": 0.0002, "epoch": 3.220107719928187, "step": 44840}, {"loss": 0.6286, "grad_norm": 0.7816855907440186, "learning_rate": 0.0002, "epoch": 3.220825852782765, "step": 44850}, {"loss": 0.622, "grad_norm": 1.0099157094955444, "learning_rate": 0.0002, "epoch": 3.221543985637343, "step": 44860}, {"loss": 0.6668, "grad_norm": 1.054928183555603, "learning_rate": 0.0002, "epoch": 3.222262118491921, "step": 44870}, {"loss": 0.6104, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 3.222980251346499, "step": 44880}, {"loss": 0.686, "grad_norm": 0.9730798602104187, "learning_rate": 0.0002, "epoch": 3.2236983842010773, "step": 44890}, {"loss": 0.6533, "grad_norm": 0.7911382913589478, "learning_rate": 0.0002, "epoch": 3.2244165170556554, "step": 44900}, {"loss": 0.6466, "grad_norm": 0.9574400782585144, "learning_rate": 0.0002, "epoch": 3.2251346499102334, "step": 44910}, {"loss": 0.693, "grad_norm": 0.8101068139076233, "learning_rate": 0.0002, "epoch": 3.2258527827648114, "step": 44920}, {"loss": 0.6605, "grad_norm": 0.754146933555603, "learning_rate": 0.0002, "epoch": 3.22657091561939, "step": 44930}, {"loss": 0.6317, "grad_norm": 0.7471939921379089, "learning_rate": 0.0002, "epoch": 3.227289048473968, "step": 44940}, {"loss": 0.6378, "grad_norm": 1.0040855407714844, "learning_rate": 0.0002, "epoch": 3.228007181328546, "step": 44950}, {"loss": 0.6496, "grad_norm": 1.0016074180603027, "learning_rate": 0.0002, "epoch": 3.228725314183124, "step": 44960}, {"loss": 0.6, "grad_norm": 1.0432976484298706, "learning_rate": 0.0002, "epoch": 3.229443447037702, "step": 44970}, {"loss": 0.635, "grad_norm": 0.8517055511474609, "learning_rate": 0.0002, "epoch": 3.2301615798922803, "step": 44980}, {"loss": 0.6168, "grad_norm": 0.9174178242683411, "learning_rate": 0.0002, "epoch": 3.2308797127468583, "step": 44990}, {"loss": 0.6325, "grad_norm": 0.9733774065971375, "learning_rate": 0.0002, "epoch": 3.2315978456014363, "step": 45000}, {"loss": 0.6743, "grad_norm": 0.9074714779853821, "learning_rate": 0.0002, "epoch": 3.2323159784560143, "step": 45010}, {"loss": 0.6372, "grad_norm": 0.8802759051322937, "learning_rate": 0.0002, "epoch": 3.2330341113105923, "step": 45020}, {"loss": 0.6189, "grad_norm": 1.0620871782302856, "learning_rate": 0.0002, "epoch": 3.2337522441651707, "step": 45030}, {"loss": 0.6201, "grad_norm": 0.8069542050361633, "learning_rate": 0.0002, "epoch": 3.2344703770197487, "step": 45040}, {"loss": 0.618, "grad_norm": 0.9139137864112854, "learning_rate": 0.0002, "epoch": 3.2351885098743267, "step": 45050}, {"loss": 0.6389, "grad_norm": 0.8936411142349243, "learning_rate": 0.0002, "epoch": 3.2359066427289047, "step": 45060}, {"loss": 0.6602, "grad_norm": 0.9098079204559326, "learning_rate": 0.0002, "epoch": 3.2366247755834827, "step": 45070}, {"loss": 0.6423, "grad_norm": 1.062953233718872, "learning_rate": 0.0002, "epoch": 3.237342908438061, "step": 45080}, {"loss": 0.6527, "grad_norm": 0.8656470775604248, "learning_rate": 0.0002, "epoch": 3.238061041292639, "step": 45090}, {"loss": 0.6362, "grad_norm": 0.9299449920654297, "learning_rate": 0.0002, "epoch": 3.238779174147217, "step": 45100}, {"loss": 0.6469, "grad_norm": 1.0102022886276245, "learning_rate": 0.0002, "epoch": 3.239497307001795, "step": 45110}, {"loss": 0.5984, "grad_norm": 0.8074561953544617, "learning_rate": 0.0002, "epoch": 3.2402154398563736, "step": 45120}, {"loss": 0.6196, "grad_norm": 1.044105887413025, "learning_rate": 0.0002, "epoch": 3.2409335727109516, "step": 45130}, {"loss": 0.6471, "grad_norm": 0.8742762207984924, "learning_rate": 0.0002, "epoch": 3.2416517055655296, "step": 45140}, {"loss": 0.648, "grad_norm": 0.8240015506744385, "learning_rate": 0.0002, "epoch": 3.2423698384201076, "step": 45150}, {"loss": 0.6599, "grad_norm": 0.8438951373100281, "learning_rate": 0.0002, "epoch": 3.2430879712746856, "step": 45160}, {"loss": 0.6406, "grad_norm": 1.02358877658844, "learning_rate": 0.0002, "epoch": 3.243806104129264, "step": 45170}, {"loss": 0.6581, "grad_norm": 0.8824774026870728, "learning_rate": 0.0002, "epoch": 3.244524236983842, "step": 45180}, {"loss": 0.658, "grad_norm": 0.971015989780426, "learning_rate": 0.0002, "epoch": 3.24524236983842, "step": 45190}, {"loss": 0.6473, "grad_norm": 0.9282383918762207, "learning_rate": 0.0002, "epoch": 3.245960502692998, "step": 45200}, {"loss": 0.6376, "grad_norm": 0.7908362746238708, "learning_rate": 0.0002, "epoch": 3.2466786355475765, "step": 45210}, {"loss": 0.6765, "grad_norm": 1.0721662044525146, "learning_rate": 0.0002, "epoch": 3.2473967684021545, "step": 45220}, {"loss": 0.7102, "grad_norm": 0.9516810774803162, "learning_rate": 0.0002, "epoch": 3.2481149012567325, "step": 45230}, {"loss": 0.6332, "grad_norm": 0.7914131283760071, "learning_rate": 0.0002, "epoch": 3.2488330341113105, "step": 45240}, {"loss": 0.6018, "grad_norm": 0.8492292761802673, "learning_rate": 0.0002, "epoch": 3.2495511669658885, "step": 45250}, {"loss": 0.6272, "grad_norm": 0.8880114555358887, "learning_rate": 0.0002, "epoch": 3.250269299820467, "step": 45260}, {"loss": 0.6394, "grad_norm": 0.7808310985565186, "learning_rate": 0.0002, "epoch": 3.250987432675045, "step": 45270}, {"loss": 0.6161, "grad_norm": 0.8566828966140747, "learning_rate": 0.0002, "epoch": 3.251705565529623, "step": 45280}, {"loss": 0.6408, "grad_norm": 0.7929658889770508, "learning_rate": 0.0002, "epoch": 3.252423698384201, "step": 45290}, {"loss": 0.6182, "grad_norm": 0.678207516670227, "learning_rate": 0.0002, "epoch": 3.253141831238779, "step": 45300}, {"loss": 0.6315, "grad_norm": 0.9963029623031616, "learning_rate": 0.0002, "epoch": 3.2538599640933574, "step": 45310}, {"loss": 0.6496, "grad_norm": 0.835304856300354, "learning_rate": 0.0002, "epoch": 3.2545780969479354, "step": 45320}, {"loss": 0.6099, "grad_norm": 0.7281617522239685, "learning_rate": 0.0002, "epoch": 3.2552962298025134, "step": 45330}, {"loss": 0.6224, "grad_norm": 1.244890570640564, "learning_rate": 0.0002, "epoch": 3.2560143626570914, "step": 45340}, {"loss": 0.6317, "grad_norm": 0.8372750282287598, "learning_rate": 0.0002, "epoch": 3.2567324955116694, "step": 45350}, {"loss": 0.604, "grad_norm": 1.0029667615890503, "learning_rate": 0.0002, "epoch": 3.257450628366248, "step": 45360}, {"loss": 0.596, "grad_norm": 0.8561908602714539, "learning_rate": 0.0002, "epoch": 3.258168761220826, "step": 45370}, {"loss": 0.6185, "grad_norm": 1.0058085918426514, "learning_rate": 0.0002, "epoch": 3.258886894075404, "step": 45380}, {"loss": 0.6415, "grad_norm": 0.7768221497535706, "learning_rate": 0.0002, "epoch": 3.259605026929982, "step": 45390}, {"loss": 0.635, "grad_norm": 0.8443793058395386, "learning_rate": 0.0002, "epoch": 3.2603231597845603, "step": 45400}, {"loss": 0.6579, "grad_norm": 1.0140392780303955, "learning_rate": 0.0002, "epoch": 3.2610412926391383, "step": 45410}, {"loss": 0.6434, "grad_norm": 0.8397058248519897, "learning_rate": 0.0002, "epoch": 3.2617594254937163, "step": 45420}, {"loss": 0.6361, "grad_norm": 0.9717063903808594, "learning_rate": 0.0002, "epoch": 3.2624775583482943, "step": 45430}, {"loss": 0.6837, "grad_norm": 1.0279473066329956, "learning_rate": 0.0002, "epoch": 3.2631956912028723, "step": 45440}, {"loss": 0.6274, "grad_norm": 1.207457184791565, "learning_rate": 0.0002, "epoch": 3.263913824057451, "step": 45450}, {"loss": 0.681, "grad_norm": 0.8121998906135559, "learning_rate": 0.0002, "epoch": 3.264631956912029, "step": 45460}, {"loss": 0.6202, "grad_norm": 1.037733554840088, "learning_rate": 0.0002, "epoch": 3.265350089766607, "step": 45470}, {"loss": 0.6146, "grad_norm": 0.9305754899978638, "learning_rate": 0.0002, "epoch": 3.266068222621185, "step": 45480}, {"loss": 0.6186, "grad_norm": 0.9733602404594421, "learning_rate": 0.0002, "epoch": 3.2667863554757632, "step": 45490}, {"loss": 0.6713, "grad_norm": 0.8345039486885071, "learning_rate": 0.0002, "epoch": 3.2675044883303412, "step": 45500}, {"loss": 0.6315, "grad_norm": 0.8601692318916321, "learning_rate": 0.0002, "epoch": 3.2682226211849192, "step": 45510}, {"loss": 0.5953, "grad_norm": 0.7921277284622192, "learning_rate": 0.0002, "epoch": 3.2689407540394972, "step": 45520}, {"loss": 0.6781, "grad_norm": 0.8324153423309326, "learning_rate": 0.0002, "epoch": 3.2696588868940752, "step": 45530}, {"loss": 0.6413, "grad_norm": 0.85141521692276, "learning_rate": 0.0002, "epoch": 3.2703770197486537, "step": 45540}, {"loss": 0.654, "grad_norm": 0.9399608373641968, "learning_rate": 0.0002, "epoch": 3.2710951526032317, "step": 45550}, {"loss": 0.6364, "grad_norm": 0.9829166531562805, "learning_rate": 0.0002, "epoch": 3.2718132854578097, "step": 45560}, {"loss": 0.627, "grad_norm": 0.9936266541481018, "learning_rate": 0.0002, "epoch": 3.2725314183123877, "step": 45570}, {"loss": 0.6465, "grad_norm": 1.036165714263916, "learning_rate": 0.0002, "epoch": 3.2732495511669657, "step": 45580}, {"loss": 0.6216, "grad_norm": 0.8988680243492126, "learning_rate": 0.0002, "epoch": 3.273967684021544, "step": 45590}, {"loss": 0.6368, "grad_norm": 0.9173405766487122, "learning_rate": 0.0002, "epoch": 3.274685816876122, "step": 45600}, {"loss": 0.6455, "grad_norm": 0.9967324733734131, "learning_rate": 0.0002, "epoch": 3.2754039497307, "step": 45610}, {"loss": 0.6236, "grad_norm": 0.9097777009010315, "learning_rate": 0.0002, "epoch": 3.276122082585278, "step": 45620}, {"loss": 0.632, "grad_norm": 1.0559430122375488, "learning_rate": 0.0002, "epoch": 3.276840215439856, "step": 45630}, {"loss": 0.5999, "grad_norm": 0.9583360552787781, "learning_rate": 0.0002, "epoch": 3.2775583482944346, "step": 45640}, {"loss": 0.6329, "grad_norm": 0.7630334496498108, "learning_rate": 0.0002, "epoch": 3.2782764811490126, "step": 45650}, {"loss": 0.6873, "grad_norm": 0.9955230355262756, "learning_rate": 0.0002, "epoch": 3.2789946140035906, "step": 45660}, {"loss": 0.6216, "grad_norm": 0.8685793876647949, "learning_rate": 0.0002, "epoch": 3.2797127468581686, "step": 45670}, {"loss": 0.6243, "grad_norm": 0.919913113117218, "learning_rate": 0.0002, "epoch": 3.280430879712747, "step": 45680}, {"loss": 0.6334, "grad_norm": 0.826144814491272, "learning_rate": 0.0002, "epoch": 3.281149012567325, "step": 45690}, {"loss": 0.6359, "grad_norm": 0.9750179052352905, "learning_rate": 0.0002, "epoch": 3.281867145421903, "step": 45700}, {"loss": 0.6589, "grad_norm": 0.7931897640228271, "learning_rate": 0.0002, "epoch": 3.282585278276481, "step": 45710}, {"loss": 0.6785, "grad_norm": 1.0380089282989502, "learning_rate": 0.0002, "epoch": 3.283303411131059, "step": 45720}, {"loss": 0.6219, "grad_norm": 0.8220566511154175, "learning_rate": 0.0002, "epoch": 3.2840215439856375, "step": 45730}, {"loss": 0.5737, "grad_norm": 0.9688239693641663, "learning_rate": 0.0002, "epoch": 3.2847396768402155, "step": 45740}, {"loss": 0.603, "grad_norm": 0.8760311603546143, "learning_rate": 0.0002, "epoch": 3.2854578096947935, "step": 45750}, {"loss": 0.6134, "grad_norm": 0.8103382587432861, "learning_rate": 0.0002, "epoch": 3.2861759425493715, "step": 45760}, {"loss": 0.6475, "grad_norm": 0.8835865259170532, "learning_rate": 0.0002, "epoch": 3.28689407540395, "step": 45770}, {"loss": 0.6423, "grad_norm": 0.9021160006523132, "learning_rate": 0.0002, "epoch": 3.287612208258528, "step": 45780}, {"loss": 0.6693, "grad_norm": 0.8182386159896851, "learning_rate": 0.0002, "epoch": 3.288330341113106, "step": 45790}, {"loss": 0.6408, "grad_norm": 0.8555024862289429, "learning_rate": 0.0002, "epoch": 3.289048473967684, "step": 45800}, {"loss": 0.6839, "grad_norm": 1.0982348918914795, "learning_rate": 0.0002, "epoch": 3.289766606822262, "step": 45810}, {"loss": 0.6323, "grad_norm": 1.06246817111969, "learning_rate": 0.0002, "epoch": 3.2904847396768404, "step": 45820}, {"loss": 0.5924, "grad_norm": 1.1727149486541748, "learning_rate": 0.0002, "epoch": 3.2912028725314184, "step": 45830}, {"loss": 0.624, "grad_norm": 0.8224700093269348, "learning_rate": 0.0002, "epoch": 3.2919210053859964, "step": 45840}, {"loss": 0.6445, "grad_norm": 0.8195698261260986, "learning_rate": 0.0002, "epoch": 3.2926391382405744, "step": 45850}, {"loss": 0.6106, "grad_norm": 0.8424476981163025, "learning_rate": 0.0002, "epoch": 3.2933572710951524, "step": 45860}, {"loss": 0.6705, "grad_norm": 0.9804632067680359, "learning_rate": 0.0002, "epoch": 3.294075403949731, "step": 45870}, {"loss": 0.6538, "grad_norm": 0.8701804876327515, "learning_rate": 0.0002, "epoch": 3.294793536804309, "step": 45880}, {"loss": 0.6264, "grad_norm": 0.8876864910125732, "learning_rate": 0.0002, "epoch": 3.295511669658887, "step": 45890}, {"loss": 0.6401, "grad_norm": 1.0105448961257935, "learning_rate": 0.0002, "epoch": 3.296229802513465, "step": 45900}, {"loss": 0.687, "grad_norm": 0.847017228603363, "learning_rate": 0.0002, "epoch": 3.296947935368043, "step": 45910}, {"loss": 0.6433, "grad_norm": 0.7610297799110413, "learning_rate": 0.0002, "epoch": 3.2976660682226213, "step": 45920}, {"loss": 0.6499, "grad_norm": 0.7272670269012451, "learning_rate": 0.0002, "epoch": 3.2983842010771993, "step": 45930}, {"loss": 0.6366, "grad_norm": 0.8243510127067566, "learning_rate": 0.0002, "epoch": 3.2991023339317773, "step": 45940}, {"loss": 0.6498, "grad_norm": 1.0113074779510498, "learning_rate": 0.0002, "epoch": 3.2998204667863553, "step": 45950}, {"loss": 0.6639, "grad_norm": 0.8578087687492371, "learning_rate": 0.0002, "epoch": 3.3005385996409338, "step": 45960}, {"loss": 0.6137, "grad_norm": 0.9511606097221375, "learning_rate": 0.0002, "epoch": 3.3012567324955118, "step": 45970}, {"loss": 0.6115, "grad_norm": 0.8612566590309143, "learning_rate": 0.0002, "epoch": 3.3019748653500898, "step": 45980}, {"loss": 0.6799, "grad_norm": 0.8702331185340881, "learning_rate": 0.0002, "epoch": 3.3026929982046678, "step": 45990}, {"loss": 0.6429, "grad_norm": 1.0229583978652954, "learning_rate": 0.0002, "epoch": 3.3034111310592458, "step": 46000}, {"loss": 0.6054, "grad_norm": 1.1775577068328857, "learning_rate": 0.0002, "epoch": 3.304129263913824, "step": 46010}, {"loss": 0.6958, "grad_norm": 0.9922171831130981, "learning_rate": 0.0002, "epoch": 3.3048473967684022, "step": 46020}, {"loss": 0.6642, "grad_norm": 0.8246880769729614, "learning_rate": 0.0002, "epoch": 3.3055655296229802, "step": 46030}, {"loss": 0.678, "grad_norm": 0.9351653456687927, "learning_rate": 0.0002, "epoch": 3.3062836624775582, "step": 46040}, {"loss": 0.649, "grad_norm": 0.9617429375648499, "learning_rate": 0.0002, "epoch": 3.3070017953321367, "step": 46050}, {"loss": 0.6314, "grad_norm": 0.9753885269165039, "learning_rate": 0.0002, "epoch": 3.3077199281867147, "step": 46060}, {"loss": 0.6434, "grad_norm": 0.8532425165176392, "learning_rate": 0.0002, "epoch": 3.3084380610412927, "step": 46070}, {"loss": 0.6312, "grad_norm": 0.9722012877464294, "learning_rate": 0.0002, "epoch": 3.3091561938958707, "step": 46080}, {"loss": 0.6629, "grad_norm": 0.8950021266937256, "learning_rate": 0.0002, "epoch": 3.3098743267504487, "step": 46090}, {"loss": 0.6278, "grad_norm": 0.8536333441734314, "learning_rate": 0.0002, "epoch": 3.3105924596050267, "step": 46100}, {"loss": 0.6359, "grad_norm": 0.9423946738243103, "learning_rate": 0.0002, "epoch": 3.311310592459605, "step": 46110}, {"loss": 0.6647, "grad_norm": 0.8573169112205505, "learning_rate": 0.0002, "epoch": 3.312028725314183, "step": 46120}, {"loss": 0.6127, "grad_norm": 1.0122376680374146, "learning_rate": 0.0002, "epoch": 3.312746858168761, "step": 46130}, {"loss": 0.6782, "grad_norm": 0.7492560744285583, "learning_rate": 0.0002, "epoch": 3.313464991023339, "step": 46140}, {"loss": 0.6315, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 3.3141831238779176, "step": 46150}, {"loss": 0.6051, "grad_norm": 1.1191970109939575, "learning_rate": 0.0002, "epoch": 3.3149012567324956, "step": 46160}, {"loss": 0.6247, "grad_norm": 0.9847373962402344, "learning_rate": 0.0002, "epoch": 3.3156193895870736, "step": 46170}, {"loss": 0.661, "grad_norm": 0.7315911054611206, "learning_rate": 0.0002, "epoch": 3.3163375224416516, "step": 46180}, {"loss": 0.6017, "grad_norm": 0.8267890214920044, "learning_rate": 0.0002, "epoch": 3.3170556552962296, "step": 46190}, {"loss": 0.6202, "grad_norm": 0.8898099064826965, "learning_rate": 0.0002, "epoch": 3.317773788150808, "step": 46200}, {"loss": 0.651, "grad_norm": 0.8525369167327881, "learning_rate": 0.0002, "epoch": 3.318491921005386, "step": 46210}, {"loss": 0.6705, "grad_norm": 0.8074760437011719, "learning_rate": 0.0002, "epoch": 3.319210053859964, "step": 46220}, {"loss": 0.641, "grad_norm": 0.8473616242408752, "learning_rate": 0.0002, "epoch": 3.319928186714542, "step": 46230}, {"loss": 0.6092, "grad_norm": 0.8678314089775085, "learning_rate": 0.0002, "epoch": 3.3206463195691205, "step": 46240}, {"loss": 0.655, "grad_norm": 0.8718782067298889, "learning_rate": 0.0002, "epoch": 3.3213644524236985, "step": 46250}, {"loss": 0.6266, "grad_norm": 0.9384858012199402, "learning_rate": 0.0002, "epoch": 3.3220825852782765, "step": 46260}, {"loss": 0.6393, "grad_norm": 0.9295032620429993, "learning_rate": 0.0002, "epoch": 3.3228007181328545, "step": 46270}, {"loss": 0.6824, "grad_norm": 0.9472482800483704, "learning_rate": 0.0002, "epoch": 3.3235188509874325, "step": 46280}, {"loss": 0.6177, "grad_norm": 0.7970638275146484, "learning_rate": 0.0002, "epoch": 3.324236983842011, "step": 46290}, {"loss": 0.6431, "grad_norm": 0.9508723020553589, "learning_rate": 0.0002, "epoch": 3.324955116696589, "step": 46300}, {"loss": 0.6126, "grad_norm": 0.9153636693954468, "learning_rate": 0.0002, "epoch": 3.325673249551167, "step": 46310}, {"loss": 0.6042, "grad_norm": 0.7890323400497437, "learning_rate": 0.0002, "epoch": 3.326391382405745, "step": 46320}, {"loss": 0.6525, "grad_norm": 0.8711825609207153, "learning_rate": 0.0002, "epoch": 3.3271095152603234, "step": 46330}, {"loss": 0.6253, "grad_norm": 0.9938926696777344, "learning_rate": 0.0002, "epoch": 3.3278276481149014, "step": 46340}, {"loss": 0.6227, "grad_norm": 0.8497524857521057, "learning_rate": 0.0002, "epoch": 3.3285457809694794, "step": 46350}, {"loss": 0.6472, "grad_norm": 0.9191650748252869, "learning_rate": 0.0002, "epoch": 3.3292639138240574, "step": 46360}, {"loss": 0.6385, "grad_norm": 0.8974085450172424, "learning_rate": 0.0002, "epoch": 3.3299820466786354, "step": 46370}, {"loss": 0.618, "grad_norm": 0.9928934574127197, "learning_rate": 0.0002, "epoch": 3.3307001795332134, "step": 46380}, {"loss": 0.6254, "grad_norm": 0.9011030197143555, "learning_rate": 0.0002, "epoch": 3.331418312387792, "step": 46390}, {"loss": 0.6146, "grad_norm": 0.898594856262207, "learning_rate": 0.0002, "epoch": 3.33213644524237, "step": 46400}, {"loss": 0.6321, "grad_norm": 0.7506672143936157, "learning_rate": 0.0002, "epoch": 3.332854578096948, "step": 46410}, {"loss": 0.6329, "grad_norm": 0.9239172339439392, "learning_rate": 0.0002, "epoch": 3.333572710951526, "step": 46420}, {"loss": 0.6278, "grad_norm": 1.0749682188034058, "learning_rate": 0.0002, "epoch": 3.3342908438061043, "step": 46430}, {"loss": 0.6568, "grad_norm": 0.9262617230415344, "learning_rate": 0.0002, "epoch": 3.3350089766606823, "step": 46440}, {"loss": 0.6034, "grad_norm": 0.8681274056434631, "learning_rate": 0.0002, "epoch": 3.3357271095152603, "step": 46450}, {"loss": 0.6261, "grad_norm": 0.9558620452880859, "learning_rate": 0.0002, "epoch": 3.3364452423698383, "step": 46460}, {"loss": 0.6087, "grad_norm": 0.8907097578048706, "learning_rate": 0.0002, "epoch": 3.3371633752244163, "step": 46470}, {"loss": 0.6356, "grad_norm": 1.0941565036773682, "learning_rate": 0.0002, "epoch": 3.3378815080789948, "step": 46480}, {"loss": 0.6536, "grad_norm": 0.8971590995788574, "learning_rate": 0.0002, "epoch": 3.3385996409335728, "step": 46490}, {"loss": 0.6252, "grad_norm": 1.0315606594085693, "learning_rate": 0.0002, "epoch": 3.3393177737881508, "step": 46500}, {"loss": 0.5819, "grad_norm": 0.7717124223709106, "learning_rate": 0.0002, "epoch": 3.3400359066427288, "step": 46510}, {"loss": 0.612, "grad_norm": 0.8060970902442932, "learning_rate": 0.0002, "epoch": 3.340754039497307, "step": 46520}, {"loss": 0.7036, "grad_norm": 0.969510018825531, "learning_rate": 0.0002, "epoch": 3.341472172351885, "step": 46530}, {"loss": 0.6163, "grad_norm": 0.8837248682975769, "learning_rate": 0.0002, "epoch": 3.342190305206463, "step": 46540}, {"loss": 0.6762, "grad_norm": 0.9561076164245605, "learning_rate": 0.0002, "epoch": 3.342908438061041, "step": 46550}, {"loss": 0.687, "grad_norm": 0.8529208898544312, "learning_rate": 0.0002, "epoch": 3.343626570915619, "step": 46560}, {"loss": 0.611, "grad_norm": 1.1300519704818726, "learning_rate": 0.0002, "epoch": 3.3443447037701977, "step": 46570}, {"loss": 0.6088, "grad_norm": 0.8330956101417542, "learning_rate": 0.0002, "epoch": 3.3450628366247757, "step": 46580}, {"loss": 0.6725, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 3.3457809694793537, "step": 46590}, {"loss": 0.6667, "grad_norm": 1.0470821857452393, "learning_rate": 0.0002, "epoch": 3.3464991023339317, "step": 46600}, {"loss": 0.6408, "grad_norm": 0.9933704137802124, "learning_rate": 0.0002, "epoch": 3.34721723518851, "step": 46610}, {"loss": 0.6416, "grad_norm": 0.8130798935890198, "learning_rate": 0.0002, "epoch": 3.347935368043088, "step": 46620}, {"loss": 0.6576, "grad_norm": 0.9746946692466736, "learning_rate": 0.0002, "epoch": 3.348653500897666, "step": 46630}, {"loss": 0.6254, "grad_norm": 0.8607267141342163, "learning_rate": 0.0002, "epoch": 3.349371633752244, "step": 46640}, {"loss": 0.6639, "grad_norm": 0.800335705280304, "learning_rate": 0.0002, "epoch": 3.350089766606822, "step": 46650}, {"loss": 0.6749, "grad_norm": 1.0083239078521729, "learning_rate": 0.0002, "epoch": 3.3508078994614, "step": 46660}, {"loss": 0.6606, "grad_norm": 1.0774433612823486, "learning_rate": 0.0002, "epoch": 3.3515260323159786, "step": 46670}, {"loss": 0.6408, "grad_norm": 0.9378824234008789, "learning_rate": 0.0002, "epoch": 3.3522441651705566, "step": 46680}, {"loss": 0.5879, "grad_norm": 0.8490564227104187, "learning_rate": 0.0002, "epoch": 3.3529622980251346, "step": 46690}, {"loss": 0.6364, "grad_norm": 1.0415582656860352, "learning_rate": 0.0002, "epoch": 3.3536804308797126, "step": 46700}, {"loss": 0.5813, "grad_norm": 0.8514367938041687, "learning_rate": 0.0002, "epoch": 3.354398563734291, "step": 46710}, {"loss": 0.6847, "grad_norm": 0.7691360712051392, "learning_rate": 0.0002, "epoch": 3.355116696588869, "step": 46720}, {"loss": 0.6295, "grad_norm": 0.8345438241958618, "learning_rate": 0.0002, "epoch": 3.355834829443447, "step": 46730}, {"loss": 0.6093, "grad_norm": 1.023492693901062, "learning_rate": 0.0002, "epoch": 3.356552962298025, "step": 46740}, {"loss": 0.5997, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 3.357271095152603, "step": 46750}, {"loss": 0.6379, "grad_norm": 0.9029248356819153, "learning_rate": 0.0002, "epoch": 3.3579892280071815, "step": 46760}, {"loss": 0.6551, "grad_norm": 0.9109513759613037, "learning_rate": 0.0002, "epoch": 3.3587073608617595, "step": 46770}, {"loss": 0.6616, "grad_norm": 0.7757390141487122, "learning_rate": 0.0002, "epoch": 3.3594254937163375, "step": 46780}, {"loss": 0.6088, "grad_norm": 0.794035792350769, "learning_rate": 0.0002, "epoch": 3.3601436265709155, "step": 46790}, {"loss": 0.6405, "grad_norm": 0.8211429715156555, "learning_rate": 0.0002, "epoch": 3.360861759425494, "step": 46800}, {"loss": 0.6359, "grad_norm": 0.8620322346687317, "learning_rate": 0.0002, "epoch": 3.361579892280072, "step": 46810}, {"loss": 0.6357, "grad_norm": 0.9392538070678711, "learning_rate": 0.0002, "epoch": 3.36229802513465, "step": 46820}, {"loss": 0.6225, "grad_norm": 0.8297873139381409, "learning_rate": 0.0002, "epoch": 3.363016157989228, "step": 46830}, {"loss": 0.639, "grad_norm": 0.9158190488815308, "learning_rate": 0.0002, "epoch": 3.363734290843806, "step": 46840}, {"loss": 0.6168, "grad_norm": 1.1449424028396606, "learning_rate": 0.0002, "epoch": 3.3644524236983844, "step": 46850}, {"loss": 0.6413, "grad_norm": 0.8718444108963013, "learning_rate": 0.0002, "epoch": 3.3651705565529624, "step": 46860}, {"loss": 0.624, "grad_norm": 0.7744014263153076, "learning_rate": 0.0002, "epoch": 3.3658886894075404, "step": 46870}, {"loss": 0.6238, "grad_norm": 0.8392460942268372, "learning_rate": 0.0002, "epoch": 3.3666068222621184, "step": 46880}, {"loss": 0.6753, "grad_norm": 1.0424989461898804, "learning_rate": 0.0002, "epoch": 3.367324955116697, "step": 46890}, {"loss": 0.6038, "grad_norm": 1.4696359634399414, "learning_rate": 0.0002, "epoch": 3.368043087971275, "step": 46900}, {"loss": 0.6525, "grad_norm": 0.9298201203346252, "learning_rate": 0.0002, "epoch": 3.368761220825853, "step": 46910}, {"loss": 0.6351, "grad_norm": 0.8965262770652771, "learning_rate": 0.0002, "epoch": 3.369479353680431, "step": 46920}, {"loss": 0.6505, "grad_norm": 0.9395381808280945, "learning_rate": 0.0002, "epoch": 3.370197486535009, "step": 46930}, {"loss": 0.6161, "grad_norm": 0.9069047570228577, "learning_rate": 0.0002, "epoch": 3.370915619389587, "step": 46940}, {"loss": 0.6576, "grad_norm": 0.9208605885505676, "learning_rate": 0.0002, "epoch": 3.3716337522441653, "step": 46950}, {"loss": 0.6456, "grad_norm": 0.9493077397346497, "learning_rate": 0.0002, "epoch": 3.3723518850987433, "step": 46960}, {"loss": 0.6609, "grad_norm": 1.0804208517074585, "learning_rate": 0.0002, "epoch": 3.3730700179533213, "step": 46970}, {"loss": 0.6267, "grad_norm": 0.9465714693069458, "learning_rate": 0.0002, "epoch": 3.3737881508078993, "step": 46980}, {"loss": 0.6633, "grad_norm": 0.9189882278442383, "learning_rate": 0.0002, "epoch": 3.3745062836624777, "step": 46990}, {"loss": 0.6518, "grad_norm": 1.0199357271194458, "learning_rate": 0.0002, "epoch": 3.3752244165170557, "step": 47000}, {"loss": 0.6645, "grad_norm": 0.8999426960945129, "learning_rate": 0.0002, "epoch": 3.3759425493716337, "step": 47010}, {"loss": 0.637, "grad_norm": 0.8923690319061279, "learning_rate": 0.0002, "epoch": 3.3766606822262117, "step": 47020}, {"loss": 0.6543, "grad_norm": 0.7459347248077393, "learning_rate": 0.0002, "epoch": 3.3773788150807897, "step": 47030}, {"loss": 0.6269, "grad_norm": 0.7702858448028564, "learning_rate": 0.0002, "epoch": 3.378096947935368, "step": 47040}, {"loss": 0.6399, "grad_norm": 0.8296625018119812, "learning_rate": 0.0002, "epoch": 3.378815080789946, "step": 47050}, {"loss": 0.6552, "grad_norm": 1.2952555418014526, "learning_rate": 0.0002, "epoch": 3.379533213644524, "step": 47060}, {"loss": 0.6264, "grad_norm": 0.7778869271278381, "learning_rate": 0.0002, "epoch": 3.380251346499102, "step": 47070}, {"loss": 0.6906, "grad_norm": 0.9151549339294434, "learning_rate": 0.0002, "epoch": 3.3809694793536806, "step": 47080}, {"loss": 0.6443, "grad_norm": 0.7883925437927246, "learning_rate": 0.0002, "epoch": 3.3816876122082586, "step": 47090}, {"loss": 0.6124, "grad_norm": 0.9602295756340027, "learning_rate": 0.0002, "epoch": 3.3824057450628366, "step": 47100}, {"loss": 0.651, "grad_norm": 0.7953121066093445, "learning_rate": 0.0002, "epoch": 3.3831238779174146, "step": 47110}, {"loss": 0.638, "grad_norm": 1.110148549079895, "learning_rate": 0.0002, "epoch": 3.3838420107719926, "step": 47120}, {"loss": 0.6386, "grad_norm": 0.9359608888626099, "learning_rate": 0.0002, "epoch": 3.384560143626571, "step": 47130}, {"loss": 0.6075, "grad_norm": 0.7877762317657471, "learning_rate": 0.0002, "epoch": 3.385278276481149, "step": 47140}, {"loss": 0.6657, "grad_norm": 0.8586933016777039, "learning_rate": 0.0002, "epoch": 3.385996409335727, "step": 47150}, {"loss": 0.6438, "grad_norm": 0.8920878767967224, "learning_rate": 0.0002, "epoch": 3.386714542190305, "step": 47160}, {"loss": 0.6584, "grad_norm": 0.9692603349685669, "learning_rate": 0.0002, "epoch": 3.3874326750448835, "step": 47170}, {"loss": 0.6643, "grad_norm": 0.9038610458374023, "learning_rate": 0.0002, "epoch": 3.3881508078994615, "step": 47180}, {"loss": 0.6002, "grad_norm": 1.6299188137054443, "learning_rate": 0.0002, "epoch": 3.3888689407540395, "step": 47190}, {"loss": 0.6423, "grad_norm": 0.9704291820526123, "learning_rate": 0.0002, "epoch": 3.3895870736086176, "step": 47200}, {"loss": 0.6808, "grad_norm": 0.9503401517868042, "learning_rate": 0.0002, "epoch": 3.3903052064631956, "step": 47210}, {"loss": 0.6871, "grad_norm": 1.0051378011703491, "learning_rate": 0.0002, "epoch": 3.3910233393177736, "step": 47220}, {"loss": 0.6207, "grad_norm": 0.7336357235908508, "learning_rate": 0.0002, "epoch": 3.391741472172352, "step": 47230}, {"loss": 0.6688, "grad_norm": 0.9847398996353149, "learning_rate": 0.0002, "epoch": 3.39245960502693, "step": 47240}, {"loss": 0.6305, "grad_norm": 0.8100917339324951, "learning_rate": 0.0002, "epoch": 3.393177737881508, "step": 47250}, {"loss": 0.6418, "grad_norm": 0.9752838611602783, "learning_rate": 0.0002, "epoch": 3.393895870736086, "step": 47260}, {"loss": 0.6237, "grad_norm": 0.9400623440742493, "learning_rate": 0.0002, "epoch": 3.3946140035906645, "step": 47270}, {"loss": 0.6321, "grad_norm": 0.7310057878494263, "learning_rate": 0.0002, "epoch": 3.3953321364452425, "step": 47280}, {"loss": 0.6209, "grad_norm": 0.8898789286613464, "learning_rate": 0.0002, "epoch": 3.3960502692998205, "step": 47290}, {"loss": 0.6496, "grad_norm": 1.0157585144042969, "learning_rate": 0.0002, "epoch": 3.3967684021543985, "step": 47300}, {"loss": 0.6497, "grad_norm": 0.9108527898788452, "learning_rate": 0.0002, "epoch": 3.3974865350089765, "step": 47310}, {"loss": 0.5928, "grad_norm": 0.9796249270439148, "learning_rate": 0.0002, "epoch": 3.398204667863555, "step": 47320}, {"loss": 0.6169, "grad_norm": 0.8176435232162476, "learning_rate": 0.0002, "epoch": 3.398922800718133, "step": 47330}, {"loss": 0.6279, "grad_norm": 0.9981188178062439, "learning_rate": 0.0002, "epoch": 3.399640933572711, "step": 47340}, {"loss": 0.6657, "grad_norm": 0.9774404764175415, "learning_rate": 0.0002, "epoch": 3.400359066427289, "step": 47350}, {"loss": 0.68, "grad_norm": 0.8624991774559021, "learning_rate": 0.0002, "epoch": 3.4010771992818674, "step": 47360}, {"loss": 0.6597, "grad_norm": 0.9191665053367615, "learning_rate": 0.0002, "epoch": 3.4017953321364454, "step": 47370}, {"loss": 0.6249, "grad_norm": 0.7971290946006775, "learning_rate": 0.0002, "epoch": 3.4025134649910234, "step": 47380}, {"loss": 0.617, "grad_norm": 0.8336732983589172, "learning_rate": 0.0002, "epoch": 3.4032315978456014, "step": 47390}, {"loss": 0.6435, "grad_norm": 0.7730334401130676, "learning_rate": 0.0002, "epoch": 3.4039497307001794, "step": 47400}, {"loss": 0.6348, "grad_norm": 0.8559145927429199, "learning_rate": 0.0002, "epoch": 3.404667863554758, "step": 47410}, {"loss": 0.6466, "grad_norm": 1.0261447429656982, "learning_rate": 0.0002, "epoch": 3.405385996409336, "step": 47420}, {"loss": 0.6556, "grad_norm": 0.9931781888008118, "learning_rate": 0.0002, "epoch": 3.406104129263914, "step": 47430}, {"loss": 0.6226, "grad_norm": 0.8971807360649109, "learning_rate": 0.0002, "epoch": 3.406822262118492, "step": 47440}, {"loss": 0.656, "grad_norm": 0.8886999487876892, "learning_rate": 0.0002, "epoch": 3.4075403949730703, "step": 47450}, {"loss": 0.6256, "grad_norm": 0.9551735520362854, "learning_rate": 0.0002, "epoch": 3.4082585278276483, "step": 47460}, {"loss": 0.6646, "grad_norm": 0.9066859483718872, "learning_rate": 0.0002, "epoch": 3.4089766606822263, "step": 47470}, {"loss": 0.6655, "grad_norm": 0.9192125201225281, "learning_rate": 0.0002, "epoch": 3.4096947935368043, "step": 47480}, {"loss": 0.6197, "grad_norm": 0.9332839250564575, "learning_rate": 0.0002, "epoch": 3.4104129263913823, "step": 47490}, {"loss": 0.6134, "grad_norm": 0.745563805103302, "learning_rate": 0.0002, "epoch": 3.4111310592459603, "step": 47500}, {"loss": 0.6206, "grad_norm": 0.6843905448913574, "learning_rate": 0.0002, "epoch": 3.4118491921005387, "step": 47510}, {"loss": 0.6742, "grad_norm": 0.8063111305236816, "learning_rate": 0.0002, "epoch": 3.4125673249551167, "step": 47520}, {"loss": 0.6138, "grad_norm": 0.9666593670845032, "learning_rate": 0.0002, "epoch": 3.4132854578096947, "step": 47530}, {"loss": 0.635, "grad_norm": 0.8112747073173523, "learning_rate": 0.0002, "epoch": 3.4140035906642727, "step": 47540}, {"loss": 0.6225, "grad_norm": 0.820807933807373, "learning_rate": 0.0002, "epoch": 3.414721723518851, "step": 47550}, {"loss": 0.6262, "grad_norm": 0.8476285338401794, "learning_rate": 0.0002, "epoch": 3.415439856373429, "step": 47560}, {"loss": 0.6134, "grad_norm": 1.0232552289962769, "learning_rate": 0.0002, "epoch": 3.416157989228007, "step": 47570}, {"loss": 0.604, "grad_norm": 0.8749372363090515, "learning_rate": 0.0002, "epoch": 3.416876122082585, "step": 47580}, {"loss": 0.6463, "grad_norm": 0.8117937445640564, "learning_rate": 0.0002, "epoch": 3.417594254937163, "step": 47590}, {"loss": 0.623, "grad_norm": 0.9010460376739502, "learning_rate": 0.0002, "epoch": 3.4183123877917416, "step": 47600}, {"loss": 0.6676, "grad_norm": 0.8955527544021606, "learning_rate": 0.0002, "epoch": 3.4190305206463196, "step": 47610}, {"loss": 0.6424, "grad_norm": 0.884186327457428, "learning_rate": 0.0002, "epoch": 3.4197486535008976, "step": 47620}, {"loss": 0.6377, "grad_norm": 0.8995241522789001, "learning_rate": 0.0002, "epoch": 3.4204667863554756, "step": 47630}, {"loss": 0.651, "grad_norm": 1.0627013444900513, "learning_rate": 0.0002, "epoch": 3.421184919210054, "step": 47640}, {"loss": 0.6338, "grad_norm": 0.8619979619979858, "learning_rate": 0.0002, "epoch": 3.421903052064632, "step": 47650}, {"loss": 0.6483, "grad_norm": 0.9682498574256897, "learning_rate": 0.0002, "epoch": 3.42262118491921, "step": 47660}, {"loss": 0.6006, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "epoch": 3.423339317773788, "step": 47670}, {"loss": 0.6088, "grad_norm": 0.7986962795257568, "learning_rate": 0.0002, "epoch": 3.424057450628366, "step": 47680}, {"loss": 0.6056, "grad_norm": 0.8255957961082458, "learning_rate": 0.0002, "epoch": 3.4247755834829445, "step": 47690}, {"loss": 0.663, "grad_norm": 0.9139757752418518, "learning_rate": 0.0002, "epoch": 3.4254937163375225, "step": 47700}, {"loss": 0.61, "grad_norm": 0.8086292743682861, "learning_rate": 0.0002, "epoch": 3.4262118491921005, "step": 47710}, {"loss": 0.6604, "grad_norm": 0.8852273225784302, "learning_rate": 0.0002, "epoch": 3.4269299820466785, "step": 47720}, {"loss": 0.6168, "grad_norm": 0.7568784356117249, "learning_rate": 0.0002, "epoch": 3.427648114901257, "step": 47730}, {"loss": 0.6559, "grad_norm": 0.8933039903640747, "learning_rate": 0.0002, "epoch": 3.428366247755835, "step": 47740}, {"loss": 0.6406, "grad_norm": 0.8101669549942017, "learning_rate": 0.0002, "epoch": 3.429084380610413, "step": 47750}, {"loss": 0.6287, "grad_norm": 0.7021054625511169, "learning_rate": 0.0002, "epoch": 3.429802513464991, "step": 47760}, {"loss": 0.6159, "grad_norm": 0.8282538652420044, "learning_rate": 0.0002, "epoch": 3.430520646319569, "step": 47770}, {"loss": 0.6439, "grad_norm": 0.8168348670005798, "learning_rate": 0.0002, "epoch": 3.431238779174147, "step": 47780}, {"loss": 0.6265, "grad_norm": 0.9504001140594482, "learning_rate": 0.0002, "epoch": 3.4319569120287254, "step": 47790}, {"loss": 0.6688, "grad_norm": 0.7500190734863281, "learning_rate": 0.0002, "epoch": 3.4326750448833034, "step": 47800}, {"loss": 0.6818, "grad_norm": 0.8645710945129395, "learning_rate": 0.0002, "epoch": 3.4333931777378814, "step": 47810}, {"loss": 0.6268, "grad_norm": 0.8088704943656921, "learning_rate": 0.0002, "epoch": 3.4341113105924594, "step": 47820}, {"loss": 0.6795, "grad_norm": 0.9981673955917358, "learning_rate": 0.0002, "epoch": 3.434829443447038, "step": 47830}, {"loss": 0.6615, "grad_norm": 0.9363315105438232, "learning_rate": 0.0002, "epoch": 3.435547576301616, "step": 47840}, {"loss": 0.6028, "grad_norm": 0.8471030592918396, "learning_rate": 0.0002, "epoch": 3.436265709156194, "step": 47850}, {"loss": 0.6658, "grad_norm": 0.9447668790817261, "learning_rate": 0.0002, "epoch": 3.436983842010772, "step": 47860}, {"loss": 0.6511, "grad_norm": 0.9494127631187439, "learning_rate": 0.0002, "epoch": 3.43770197486535, "step": 47870}, {"loss": 0.6134, "grad_norm": 0.8340432643890381, "learning_rate": 0.0002, "epoch": 3.4384201077199283, "step": 47880}, {"loss": 0.6731, "grad_norm": 0.8466387987136841, "learning_rate": 0.0002, "epoch": 3.4391382405745063, "step": 47890}, {"loss": 0.6552, "grad_norm": 0.9498962759971619, "learning_rate": 0.0002, "epoch": 3.4398563734290843, "step": 47900}, {"loss": 0.6593, "grad_norm": 0.8490501046180725, "learning_rate": 0.0002, "epoch": 3.4405745062836623, "step": 47910}, {"loss": 0.6038, "grad_norm": 0.9506490230560303, "learning_rate": 0.0002, "epoch": 3.441292639138241, "step": 47920}, {"loss": 0.6317, "grad_norm": 0.7944257855415344, "learning_rate": 0.0002, "epoch": 3.442010771992819, "step": 47930}, {"loss": 0.6193, "grad_norm": 0.9725518226623535, "learning_rate": 0.0002, "epoch": 3.442728904847397, "step": 47940}, {"loss": 0.635, "grad_norm": 0.7823024392127991, "learning_rate": 0.0002, "epoch": 3.443447037701975, "step": 47950}, {"loss": 0.6221, "grad_norm": 0.810565173625946, "learning_rate": 0.0002, "epoch": 3.444165170556553, "step": 47960}, {"loss": 0.6519, "grad_norm": 0.9809024333953857, "learning_rate": 0.0002, "epoch": 3.4448833034111312, "step": 47970}, {"loss": 0.6441, "grad_norm": 0.8818578720092773, "learning_rate": 0.0002, "epoch": 3.4456014362657092, "step": 47980}, {"loss": 0.6452, "grad_norm": 0.9843092560768127, "learning_rate": 0.0002, "epoch": 3.4463195691202873, "step": 47990}, {"loss": 0.6076, "grad_norm": 0.916313886642456, "learning_rate": 0.0002, "epoch": 3.4470377019748653, "step": 48000}, {"loss": 0.6399, "grad_norm": 0.908442497253418, "learning_rate": 0.0002, "epoch": 3.4477558348294433, "step": 48010}, {"loss": 0.6263, "grad_norm": 0.9880178570747375, "learning_rate": 0.0002, "epoch": 3.4484739676840217, "step": 48020}, {"loss": 0.6802, "grad_norm": 0.9276854991912842, "learning_rate": 0.0002, "epoch": 3.4491921005385997, "step": 48030}, {"loss": 0.6522, "grad_norm": 1.0879448652267456, "learning_rate": 0.0002, "epoch": 3.4499102333931777, "step": 48040}, {"loss": 0.6362, "grad_norm": 0.7430389523506165, "learning_rate": 0.0002, "epoch": 3.4506283662477557, "step": 48050}, {"loss": 0.6064, "grad_norm": 1.0880072116851807, "learning_rate": 0.0002, "epoch": 3.4513464991023337, "step": 48060}, {"loss": 0.6152, "grad_norm": 1.0424141883850098, "learning_rate": 0.0002, "epoch": 3.452064631956912, "step": 48070}, {"loss": 0.6485, "grad_norm": 0.926330029964447, "learning_rate": 0.0002, "epoch": 3.45278276481149, "step": 48080}, {"loss": 0.6261, "grad_norm": 0.8911219239234924, "learning_rate": 0.0002, "epoch": 3.453500897666068, "step": 48090}, {"loss": 0.6883, "grad_norm": 0.8727201223373413, "learning_rate": 0.0002, "epoch": 3.454219030520646, "step": 48100}, {"loss": 0.6473, "grad_norm": 0.8573940396308899, "learning_rate": 0.0002, "epoch": 3.4549371633752246, "step": 48110}, {"loss": 0.6645, "grad_norm": 1.0427064895629883, "learning_rate": 0.0002, "epoch": 3.4556552962298026, "step": 48120}, {"loss": 0.6489, "grad_norm": 0.8688231706619263, "learning_rate": 0.0002, "epoch": 3.4563734290843806, "step": 48130}, {"loss": 0.5947, "grad_norm": 0.8856009244918823, "learning_rate": 0.0002, "epoch": 3.4570915619389586, "step": 48140}, {"loss": 0.6482, "grad_norm": 0.9535353183746338, "learning_rate": 0.0002, "epoch": 3.4578096947935366, "step": 48150}, {"loss": 0.6435, "grad_norm": 0.9466010928153992, "learning_rate": 0.0002, "epoch": 3.458527827648115, "step": 48160}, {"loss": 0.6231, "grad_norm": 0.9783535599708557, "learning_rate": 0.0002, "epoch": 3.459245960502693, "step": 48170}, {"loss": 0.6926, "grad_norm": 0.8010456562042236, "learning_rate": 0.0002, "epoch": 3.459964093357271, "step": 48180}, {"loss": 0.6141, "grad_norm": 0.8928955793380737, "learning_rate": 0.0002, "epoch": 3.460682226211849, "step": 48190}, {"loss": 0.6699, "grad_norm": 0.7565838694572449, "learning_rate": 0.0002, "epoch": 3.4614003590664275, "step": 48200}, {"loss": 0.6218, "grad_norm": 1.0044180154800415, "learning_rate": 0.0002, "epoch": 3.4621184919210055, "step": 48210}, {"loss": 0.6182, "grad_norm": 0.8161038160324097, "learning_rate": 0.0002, "epoch": 3.4628366247755835, "step": 48220}, {"loss": 0.6869, "grad_norm": 1.1000211238861084, "learning_rate": 0.0002, "epoch": 3.4635547576301615, "step": 48230}, {"loss": 0.7141, "grad_norm": 0.7942240238189697, "learning_rate": 0.0002, "epoch": 3.4642728904847395, "step": 48240}, {"loss": 0.6247, "grad_norm": 0.7546432018280029, "learning_rate": 0.0002, "epoch": 3.464991023339318, "step": 48250}, {"loss": 0.6319, "grad_norm": 0.7705255150794983, "learning_rate": 0.0002, "epoch": 3.465709156193896, "step": 48260}, {"loss": 0.6414, "grad_norm": 0.7958067059516907, "learning_rate": 0.0002, "epoch": 3.466427289048474, "step": 48270}, {"loss": 0.6526, "grad_norm": 0.9199120402336121, "learning_rate": 0.0002, "epoch": 3.467145421903052, "step": 48280}, {"loss": 0.6476, "grad_norm": 1.118672251701355, "learning_rate": 0.0002, "epoch": 3.46786355475763, "step": 48290}, {"loss": 0.6543, "grad_norm": 0.9161015748977661, "learning_rate": 0.0002, "epoch": 3.4685816876122084, "step": 48300}, {"loss": 0.6767, "grad_norm": 1.1086218357086182, "learning_rate": 0.0002, "epoch": 3.4692998204667864, "step": 48310}, {"loss": 0.5917, "grad_norm": 1.0123368501663208, "learning_rate": 0.0002, "epoch": 3.4700179533213644, "step": 48320}, {"loss": 0.6277, "grad_norm": 0.7380602359771729, "learning_rate": 0.0002, "epoch": 3.4707360861759424, "step": 48330}, {"loss": 0.6407, "grad_norm": 0.8967105150222778, "learning_rate": 0.0002, "epoch": 3.4714542190305204, "step": 48340}, {"loss": 0.6526, "grad_norm": 1.0134044885635376, "learning_rate": 0.0002, "epoch": 3.472172351885099, "step": 48350}, {"loss": 0.6436, "grad_norm": 1.080815076828003, "learning_rate": 0.0002, "epoch": 3.472890484739677, "step": 48360}, {"loss": 0.6644, "grad_norm": 1.151721477508545, "learning_rate": 0.0002, "epoch": 3.473608617594255, "step": 48370}, {"loss": 0.6612, "grad_norm": 0.9436505436897278, "learning_rate": 0.0002, "epoch": 3.474326750448833, "step": 48380}, {"loss": 0.6503, "grad_norm": 0.9154609441757202, "learning_rate": 0.0002, "epoch": 3.4750448833034113, "step": 48390}, {"loss": 0.6151, "grad_norm": 0.8943037986755371, "learning_rate": 0.0002, "epoch": 3.4757630161579893, "step": 48400}, {"loss": 0.6316, "grad_norm": 0.936988115310669, "learning_rate": 0.0002, "epoch": 3.4764811490125673, "step": 48410}, {"loss": 0.6638, "grad_norm": 0.826960027217865, "learning_rate": 0.0002, "epoch": 3.4771992818671453, "step": 48420}, {"loss": 0.6242, "grad_norm": 1.0487587451934814, "learning_rate": 0.0002, "epoch": 3.4779174147217233, "step": 48430}, {"loss": 0.6302, "grad_norm": 0.729163646697998, "learning_rate": 0.0002, "epoch": 3.478635547576302, "step": 48440}, {"loss": 0.6115, "grad_norm": 0.8156948089599609, "learning_rate": 0.0002, "epoch": 3.47935368043088, "step": 48450}, {"loss": 0.6455, "grad_norm": 0.8004332184791565, "learning_rate": 0.0002, "epoch": 3.480071813285458, "step": 48460}, {"loss": 0.621, "grad_norm": 0.9632692337036133, "learning_rate": 0.0002, "epoch": 3.480789946140036, "step": 48470}, {"loss": 0.6214, "grad_norm": 1.0950212478637695, "learning_rate": 0.0002, "epoch": 3.4815080789946142, "step": 48480}, {"loss": 0.6659, "grad_norm": 0.8574318885803223, "learning_rate": 0.0002, "epoch": 3.4822262118491922, "step": 48490}, {"loss": 0.6969, "grad_norm": 0.8552606701850891, "learning_rate": 0.0002, "epoch": 3.4829443447037702, "step": 48500}, {"loss": 0.6253, "grad_norm": 0.9698445200920105, "learning_rate": 0.0002, "epoch": 3.4836624775583482, "step": 48510}, {"loss": 0.6844, "grad_norm": 0.9427815675735474, "learning_rate": 0.0002, "epoch": 3.4843806104129262, "step": 48520}, {"loss": 0.6722, "grad_norm": 0.7902070879936218, "learning_rate": 0.0002, "epoch": 3.4850987432675042, "step": 48530}, {"loss": 0.6708, "grad_norm": 1.0300066471099854, "learning_rate": 0.0002, "epoch": 3.4858168761220827, "step": 48540}, {"loss": 0.6113, "grad_norm": 1.1688778400421143, "learning_rate": 0.0002, "epoch": 3.4865350089766607, "step": 48550}, {"loss": 0.5956, "grad_norm": 1.0012071132659912, "learning_rate": 0.0002, "epoch": 3.4872531418312387, "step": 48560}, {"loss": 0.6536, "grad_norm": 1.112094759941101, "learning_rate": 0.0002, "epoch": 3.4879712746858167, "step": 48570}, {"loss": 0.6625, "grad_norm": 0.8547284603118896, "learning_rate": 0.0002, "epoch": 3.488689407540395, "step": 48580}, {"loss": 0.6488, "grad_norm": 0.8827278017997742, "learning_rate": 0.0002, "epoch": 3.489407540394973, "step": 48590}, {"loss": 0.6437, "grad_norm": 0.9255490303039551, "learning_rate": 0.0002, "epoch": 3.490125673249551, "step": 48600}, {"loss": 0.6089, "grad_norm": 0.8000030517578125, "learning_rate": 0.0002, "epoch": 3.490843806104129, "step": 48610}, {"loss": 0.647, "grad_norm": 0.9327391386032104, "learning_rate": 0.0002, "epoch": 3.491561938958707, "step": 48620}, {"loss": 0.6678, "grad_norm": 0.9004138708114624, "learning_rate": 0.0002, "epoch": 3.4922800718132856, "step": 48630}, {"loss": 0.6145, "grad_norm": 0.9886971116065979, "learning_rate": 0.0002, "epoch": 3.4929982046678636, "step": 48640}, {"loss": 0.6309, "grad_norm": 0.9890487194061279, "learning_rate": 0.0002, "epoch": 3.4937163375224416, "step": 48650}, {"loss": 0.655, "grad_norm": 0.7024438977241516, "learning_rate": 0.0002, "epoch": 3.4944344703770196, "step": 48660}, {"loss": 0.6313, "grad_norm": 0.8397303223609924, "learning_rate": 0.0002, "epoch": 3.495152603231598, "step": 48670}, {"loss": 0.6429, "grad_norm": 0.9120950698852539, "learning_rate": 0.0002, "epoch": 3.495870736086176, "step": 48680}, {"loss": 0.631, "grad_norm": 1.057299017906189, "learning_rate": 0.0002, "epoch": 3.496588868940754, "step": 48690}, {"loss": 0.6459, "grad_norm": 0.821325957775116, "learning_rate": 0.0002, "epoch": 3.497307001795332, "step": 48700}, {"loss": 0.6174, "grad_norm": 1.0029970407485962, "learning_rate": 0.0002, "epoch": 3.49802513464991, "step": 48710}, {"loss": 0.6374, "grad_norm": 0.9483712911605835, "learning_rate": 0.0002, "epoch": 3.4987432675044885, "step": 48720}, {"loss": 0.6472, "grad_norm": 0.9637855291366577, "learning_rate": 0.0002, "epoch": 3.4994614003590665, "step": 48730}, {"loss": 0.6639, "grad_norm": 0.6848894357681274, "learning_rate": 0.0002, "epoch": 3.5001795332136445, "step": 48740}, {"loss": 0.6129, "grad_norm": 0.7848573327064514, "learning_rate": 0.0002, "epoch": 3.5008976660682225, "step": 48750}, {"loss": 0.6306, "grad_norm": 1.0341308116912842, "learning_rate": 0.0002, "epoch": 3.501615798922801, "step": 48760}, {"loss": 0.6063, "grad_norm": 0.8858218193054199, "learning_rate": 0.0002, "epoch": 3.502333931777379, "step": 48770}, {"loss": 0.6729, "grad_norm": 0.8366939425468445, "learning_rate": 0.0002, "epoch": 3.503052064631957, "step": 48780}, {"loss": 0.6736, "grad_norm": 0.7926092147827148, "learning_rate": 0.0002, "epoch": 3.503770197486535, "step": 48790}, {"loss": 0.6279, "grad_norm": 0.8503843545913696, "learning_rate": 0.0002, "epoch": 3.504488330341113, "step": 48800}, {"loss": 0.6162, "grad_norm": 0.8867869973182678, "learning_rate": 0.0002, "epoch": 3.505206463195691, "step": 48810}, {"loss": 0.6987, "grad_norm": 1.0336930751800537, "learning_rate": 0.0002, "epoch": 3.5059245960502694, "step": 48820}, {"loss": 0.6333, "grad_norm": 0.8564051985740662, "learning_rate": 0.0002, "epoch": 3.5066427289048474, "step": 48830}, {"loss": 0.6574, "grad_norm": 0.9202605485916138, "learning_rate": 0.0002, "epoch": 3.5073608617594254, "step": 48840}, {"loss": 0.6457, "grad_norm": 0.8838639855384827, "learning_rate": 0.0002, "epoch": 3.508078994614004, "step": 48850}, {"loss": 0.631, "grad_norm": 0.8975196480751038, "learning_rate": 0.0002, "epoch": 3.508797127468582, "step": 48860}, {"loss": 0.6335, "grad_norm": 0.8842370510101318, "learning_rate": 0.0002, "epoch": 3.50951526032316, "step": 48870}, {"loss": 0.6569, "grad_norm": 0.9195886254310608, "learning_rate": 0.0002, "epoch": 3.510233393177738, "step": 48880}, {"loss": 0.6647, "grad_norm": 0.986130952835083, "learning_rate": 0.0002, "epoch": 3.510951526032316, "step": 48890}, {"loss": 0.6676, "grad_norm": 0.8119593858718872, "learning_rate": 0.0002, "epoch": 3.511669658886894, "step": 48900}, {"loss": 0.653, "grad_norm": 0.9027136564254761, "learning_rate": 0.0002, "epoch": 3.5123877917414723, "step": 48910}, {"loss": 0.6731, "grad_norm": 0.8560537099838257, "learning_rate": 0.0002, "epoch": 3.5131059245960503, "step": 48920}, {"loss": 0.7032, "grad_norm": 0.7073559165000916, "learning_rate": 0.0002, "epoch": 3.5138240574506283, "step": 48930}, {"loss": 0.6738, "grad_norm": 0.8753304481506348, "learning_rate": 0.0002, "epoch": 3.5145421903052063, "step": 48940}, {"loss": 0.6366, "grad_norm": 0.9151145815849304, "learning_rate": 0.0002, "epoch": 3.5152603231597848, "step": 48950}, {"loss": 0.6135, "grad_norm": 0.7794315814971924, "learning_rate": 0.0002, "epoch": 3.5159784560143628, "step": 48960}, {"loss": 0.658, "grad_norm": 0.9226023554801941, "learning_rate": 0.0002, "epoch": 3.5166965888689408, "step": 48970}, {"loss": 0.6473, "grad_norm": 0.8442051410675049, "learning_rate": 0.0002, "epoch": 3.5174147217235188, "step": 48980}, {"loss": 0.6267, "grad_norm": 0.9769423007965088, "learning_rate": 0.0002, "epoch": 3.5181328545780968, "step": 48990}, {"loss": 0.6333, "grad_norm": 0.740347146987915, "learning_rate": 0.0002, "epoch": 3.5188509874326748, "step": 49000}, {"loss": 0.6652, "grad_norm": 0.8963457345962524, "learning_rate": 0.0002, "epoch": 3.519569120287253, "step": 49010}, {"loss": 0.6782, "grad_norm": 0.8410176634788513, "learning_rate": 0.0002, "epoch": 3.520287253141831, "step": 49020}, {"loss": 0.6496, "grad_norm": 1.0486022233963013, "learning_rate": 0.0002, "epoch": 3.521005385996409, "step": 49030}, {"loss": 0.6275, "grad_norm": 0.95393967628479, "learning_rate": 0.0002, "epoch": 3.5217235188509877, "step": 49040}, {"loss": 0.6328, "grad_norm": 0.8261157274246216, "learning_rate": 0.0002, "epoch": 3.5224416517055657, "step": 49050}, {"loss": 0.6441, "grad_norm": 0.9321704506874084, "learning_rate": 0.0002, "epoch": 3.5231597845601437, "step": 49060}, {"loss": 0.6202, "grad_norm": 1.2596088647842407, "learning_rate": 0.0002, "epoch": 3.5238779174147217, "step": 49070}, {"loss": 0.6596, "grad_norm": 0.8584637641906738, "learning_rate": 0.0002, "epoch": 3.5245960502692997, "step": 49080}, {"loss": 0.6708, "grad_norm": 0.850520670413971, "learning_rate": 0.0002, "epoch": 3.5253141831238777, "step": 49090}, {"loss": 0.6543, "grad_norm": 0.8915920257568359, "learning_rate": 0.0002, "epoch": 3.526032315978456, "step": 49100}, {"loss": 0.6558, "grad_norm": 0.9070239067077637, "learning_rate": 0.0002, "epoch": 3.526750448833034, "step": 49110}, {"loss": 0.6128, "grad_norm": 0.699878990650177, "learning_rate": 0.0002, "epoch": 3.527468581687612, "step": 49120}, {"loss": 0.6454, "grad_norm": 0.9003779888153076, "learning_rate": 0.0002, "epoch": 3.5281867145421906, "step": 49130}, {"loss": 0.6177, "grad_norm": 0.7886711955070496, "learning_rate": 0.0002, "epoch": 3.5289048473967686, "step": 49140}, {"loss": 0.6499, "grad_norm": 0.7368922233581543, "learning_rate": 0.0002, "epoch": 3.5296229802513466, "step": 49150}, {"loss": 0.6382, "grad_norm": 0.8585197329521179, "learning_rate": 0.0002, "epoch": 3.5303411131059246, "step": 49160}, {"loss": 0.6761, "grad_norm": 1.0205435752868652, "learning_rate": 0.0002, "epoch": 3.5310592459605026, "step": 49170}, {"loss": 0.6544, "grad_norm": 0.8756650686264038, "learning_rate": 0.0002, "epoch": 3.5317773788150806, "step": 49180}, {"loss": 0.6592, "grad_norm": 1.0278643369674683, "learning_rate": 0.0002, "epoch": 3.532495511669659, "step": 49190}, {"loss": 0.6682, "grad_norm": 0.8641911745071411, "learning_rate": 0.0002, "epoch": 3.533213644524237, "step": 49200}, {"loss": 0.6531, "grad_norm": 0.8730159401893616, "learning_rate": 0.0002, "epoch": 3.533931777378815, "step": 49210}, {"loss": 0.636, "grad_norm": 0.918637216091156, "learning_rate": 0.0002, "epoch": 3.534649910233393, "step": 49220}, {"loss": 0.6815, "grad_norm": 1.0467222929000854, "learning_rate": 0.0002, "epoch": 3.5353680430879715, "step": 49230}, {"loss": 0.6554, "grad_norm": 1.005009412765503, "learning_rate": 0.0002, "epoch": 3.5360861759425495, "step": 49240}, {"loss": 0.649, "grad_norm": 0.9775063395500183, "learning_rate": 0.0002, "epoch": 3.5368043087971275, "step": 49250}, {"loss": 0.6527, "grad_norm": 0.8198322057723999, "learning_rate": 0.0002, "epoch": 3.5375224416517055, "step": 49260}, {"loss": 0.664, "grad_norm": 0.8184829354286194, "learning_rate": 0.0002, "epoch": 3.5382405745062835, "step": 49270}, {"loss": 0.6493, "grad_norm": 0.9520270824432373, "learning_rate": 0.0002, "epoch": 3.5389587073608615, "step": 49280}, {"loss": 0.5935, "grad_norm": 0.7816803455352783, "learning_rate": 0.0002, "epoch": 3.53967684021544, "step": 49290}, {"loss": 0.6424, "grad_norm": 0.6915702819824219, "learning_rate": 0.0002, "epoch": 3.540394973070018, "step": 49300}, {"loss": 0.6447, "grad_norm": 0.8282375931739807, "learning_rate": 0.0002, "epoch": 3.541113105924596, "step": 49310}, {"loss": 0.6164, "grad_norm": 1.0797513723373413, "learning_rate": 0.0002, "epoch": 3.5418312387791744, "step": 49320}, {"loss": 0.6836, "grad_norm": 0.868671715259552, "learning_rate": 0.0002, "epoch": 3.5425493716337524, "step": 49330}, {"loss": 0.6453, "grad_norm": 0.8534455895423889, "learning_rate": 0.0002, "epoch": 3.5432675044883304, "step": 49340}, {"loss": 0.6706, "grad_norm": 0.816411554813385, "learning_rate": 0.0002, "epoch": 3.5439856373429084, "step": 49350}, {"loss": 0.6101, "grad_norm": 0.7813423275947571, "learning_rate": 0.0002, "epoch": 3.5447037701974864, "step": 49360}, {"loss": 0.6617, "grad_norm": 0.8002013564109802, "learning_rate": 0.0002, "epoch": 3.5454219030520644, "step": 49370}, {"loss": 0.6667, "grad_norm": 0.9740113615989685, "learning_rate": 0.0002, "epoch": 3.546140035906643, "step": 49380}, {"loss": 0.6938, "grad_norm": 0.9046127200126648, "learning_rate": 0.0002, "epoch": 3.546858168761221, "step": 49390}, {"loss": 0.6444, "grad_norm": 0.8635150194168091, "learning_rate": 0.0002, "epoch": 3.547576301615799, "step": 49400}, {"loss": 0.6273, "grad_norm": 0.9488558769226074, "learning_rate": 0.0002, "epoch": 3.5482944344703773, "step": 49410}, {"loss": 0.6542, "grad_norm": 0.9637090563774109, "learning_rate": 0.0002, "epoch": 3.5490125673249553, "step": 49420}, {"loss": 0.6468, "grad_norm": 1.042245626449585, "learning_rate": 0.0002, "epoch": 3.5497307001795333, "step": 49430}, {"loss": 0.6999, "grad_norm": 0.9076175689697266, "learning_rate": 0.0002, "epoch": 3.5504488330341113, "step": 49440}, {"loss": 0.6192, "grad_norm": 0.8480596542358398, "learning_rate": 0.0002, "epoch": 3.5511669658886893, "step": 49450}, {"loss": 0.6835, "grad_norm": 0.8483007550239563, "learning_rate": 0.0002, "epoch": 3.5518850987432673, "step": 49460}, {"loss": 0.6607, "grad_norm": 0.7855815887451172, "learning_rate": 0.0002, "epoch": 3.5526032315978457, "step": 49470}, {"loss": 0.6364, "grad_norm": 0.8435823917388916, "learning_rate": 0.0002, "epoch": 3.5533213644524237, "step": 49480}, {"loss": 0.6674, "grad_norm": 0.8613026142120361, "learning_rate": 0.0002, "epoch": 3.5540394973070017, "step": 49490}, {"loss": 0.6651, "grad_norm": 0.9654812812805176, "learning_rate": 0.0002, "epoch": 3.5547576301615798, "step": 49500}, {"loss": 0.6471, "grad_norm": 0.8888838887214661, "learning_rate": 0.0002, "epoch": 3.555475763016158, "step": 49510}, {"loss": 0.622, "grad_norm": 0.7718146443367004, "learning_rate": 0.0002, "epoch": 3.556193895870736, "step": 49520}, {"loss": 0.6297, "grad_norm": 0.9487382173538208, "learning_rate": 0.0002, "epoch": 3.556912028725314, "step": 49530}, {"loss": 0.6516, "grad_norm": 0.9256559610366821, "learning_rate": 0.0002, "epoch": 3.557630161579892, "step": 49540}, {"loss": 0.6461, "grad_norm": 0.8879945874214172, "learning_rate": 0.0002, "epoch": 3.55834829443447, "step": 49550}, {"loss": 0.6367, "grad_norm": 0.8498744368553162, "learning_rate": 0.0002, "epoch": 3.559066427289048, "step": 49560}, {"loss": 0.6274, "grad_norm": 0.9550948143005371, "learning_rate": 0.0002, "epoch": 3.5597845601436267, "step": 49570}, {"loss": 0.635, "grad_norm": 0.8386164903640747, "learning_rate": 0.0002, "epoch": 3.5605026929982047, "step": 49580}, {"loss": 0.6495, "grad_norm": 0.925573468208313, "learning_rate": 0.0002, "epoch": 3.5612208258527827, "step": 49590}, {"loss": 0.676, "grad_norm": 0.8867112398147583, "learning_rate": 0.0002, "epoch": 3.561938958707361, "step": 49600}, {"loss": 0.6156, "grad_norm": 0.7638537883758545, "learning_rate": 0.0002, "epoch": 3.562657091561939, "step": 49610}, {"loss": 0.6597, "grad_norm": 0.9491845965385437, "learning_rate": 0.0002, "epoch": 3.563375224416517, "step": 49620}, {"loss": 0.6237, "grad_norm": 0.8384189605712891, "learning_rate": 0.0002, "epoch": 3.564093357271095, "step": 49630}, {"loss": 0.6102, "grad_norm": 0.8850575089454651, "learning_rate": 0.0002, "epoch": 3.564811490125673, "step": 49640}, {"loss": 0.6517, "grad_norm": 1.020916223526001, "learning_rate": 0.0002, "epoch": 3.565529622980251, "step": 49650}, {"loss": 0.6569, "grad_norm": 0.9298280477523804, "learning_rate": 0.0002, "epoch": 3.5662477558348296, "step": 49660}, {"loss": 0.6094, "grad_norm": 0.9795742034912109, "learning_rate": 0.0002, "epoch": 3.5669658886894076, "step": 49670}, {"loss": 0.6147, "grad_norm": 0.9401193261146545, "learning_rate": 0.0002, "epoch": 3.5676840215439856, "step": 49680}, {"loss": 0.622, "grad_norm": 1.0383585691452026, "learning_rate": 0.0002, "epoch": 3.568402154398564, "step": 49690}, {"loss": 0.6304, "grad_norm": 0.8370866179466248, "learning_rate": 0.0002, "epoch": 3.569120287253142, "step": 49700}, {"loss": 0.6356, "grad_norm": 0.8207486271858215, "learning_rate": 0.0002, "epoch": 3.56983842010772, "step": 49710}, {"loss": 0.6328, "grad_norm": 0.8551223278045654, "learning_rate": 0.0002, "epoch": 3.570556552962298, "step": 49720}, {"loss": 0.621, "grad_norm": 0.8041176199913025, "learning_rate": 0.0002, "epoch": 3.571274685816876, "step": 49730}, {"loss": 0.5818, "grad_norm": 0.9862527847290039, "learning_rate": 0.0002, "epoch": 3.571992818671454, "step": 49740}, {"loss": 0.6448, "grad_norm": 0.7557165622711182, "learning_rate": 0.0002, "epoch": 3.5727109515260325, "step": 49750}, {"loss": 0.6484, "grad_norm": 1.0908563137054443, "learning_rate": 0.0002, "epoch": 3.5734290843806105, "step": 49760}, {"loss": 0.6497, "grad_norm": 0.7245369553565979, "learning_rate": 0.0002, "epoch": 3.5741472172351885, "step": 49770}, {"loss": 0.6315, "grad_norm": 0.7851184010505676, "learning_rate": 0.0002, "epoch": 3.5748653500897665, "step": 49780}, {"loss": 0.6245, "grad_norm": 0.9443599581718445, "learning_rate": 0.0002, "epoch": 3.575583482944345, "step": 49790}, {"loss": 0.6481, "grad_norm": 1.021196961402893, "learning_rate": 0.0002, "epoch": 3.576301615798923, "step": 49800}, {"loss": 0.6368, "grad_norm": 0.9099196195602417, "learning_rate": 0.0002, "epoch": 3.577019748653501, "step": 49810}, {"loss": 0.6372, "grad_norm": 0.9397716522216797, "learning_rate": 0.0002, "epoch": 3.577737881508079, "step": 49820}, {"loss": 0.6208, "grad_norm": 0.9214922785758972, "learning_rate": 0.0002, "epoch": 3.578456014362657, "step": 49830}, {"loss": 0.6219, "grad_norm": 1.0053879022598267, "learning_rate": 0.0002, "epoch": 3.579174147217235, "step": 49840}, {"loss": 0.6283, "grad_norm": 0.9415460228919983, "learning_rate": 0.0002, "epoch": 3.5798922800718134, "step": 49850}, {"loss": 0.6759, "grad_norm": 1.0807833671569824, "learning_rate": 0.0002, "epoch": 3.5806104129263914, "step": 49860}, {"loss": 0.6404, "grad_norm": 1.0070871114730835, "learning_rate": 0.0002, "epoch": 3.5813285457809694, "step": 49870}, {"loss": 0.6411, "grad_norm": 0.9707024693489075, "learning_rate": 0.0002, "epoch": 3.582046678635548, "step": 49880}, {"loss": 0.6852, "grad_norm": 0.9979593753814697, "learning_rate": 0.0002, "epoch": 3.582764811490126, "step": 49890}, {"loss": 0.6519, "grad_norm": 0.7238648533821106, "learning_rate": 0.0002, "epoch": 3.583482944344704, "step": 49900}, {"loss": 0.6452, "grad_norm": 0.8168631792068481, "learning_rate": 0.0002, "epoch": 3.584201077199282, "step": 49910}, {"loss": 0.6174, "grad_norm": 0.8156409859657288, "learning_rate": 0.0002, "epoch": 3.58491921005386, "step": 49920}, {"loss": 0.6248, "grad_norm": 0.9256414175033569, "learning_rate": 0.0002, "epoch": 3.585637342908438, "step": 49930}, {"loss": 0.6077, "grad_norm": 1.0090070962905884, "learning_rate": 0.0002, "epoch": 3.5863554757630163, "step": 49940}, {"loss": 0.6016, "grad_norm": 0.8257701992988586, "learning_rate": 0.0002, "epoch": 3.5870736086175943, "step": 49950}, {"loss": 0.6996, "grad_norm": 0.9189013242721558, "learning_rate": 0.0002, "epoch": 3.5877917414721723, "step": 49960}, {"loss": 0.661, "grad_norm": 0.8497788310050964, "learning_rate": 0.0002, "epoch": 3.5885098743267507, "step": 49970}, {"loss": 0.6335, "grad_norm": 0.9596505761146545, "learning_rate": 0.0002, "epoch": 3.5892280071813287, "step": 49980}, {"loss": 0.697, "grad_norm": 0.8773331642150879, "learning_rate": 0.0002, "epoch": 3.5899461400359067, "step": 49990}, {"loss": 0.6259, "grad_norm": 0.8952302932739258, "learning_rate": 0.0002, "epoch": 3.5906642728904847, "step": 50000}, {"loss": 0.6152, "grad_norm": 0.7713809609413147, "learning_rate": 0.0002, "epoch": 3.5913824057450627, "step": 50010}, {"loss": 0.6127, "grad_norm": 1.0151346921920776, "learning_rate": 0.0002, "epoch": 3.5921005385996407, "step": 50020}, {"loss": 0.6093, "grad_norm": 0.8793733716011047, "learning_rate": 0.0002, "epoch": 3.592818671454219, "step": 50030}, {"loss": 0.5986, "grad_norm": 0.8881325721740723, "learning_rate": 0.0002, "epoch": 3.593536804308797, "step": 50040}, {"loss": 0.6351, "grad_norm": 0.9346749782562256, "learning_rate": 0.0002, "epoch": 3.594254937163375, "step": 50050}, {"loss": 0.6501, "grad_norm": 0.8705052137374878, "learning_rate": 0.0002, "epoch": 3.594973070017953, "step": 50060}, {"loss": 0.6753, "grad_norm": 1.039197564125061, "learning_rate": 0.0002, "epoch": 3.5956912028725316, "step": 50070}, {"loss": 0.6565, "grad_norm": 0.7053273320198059, "learning_rate": 0.0002, "epoch": 3.5964093357271096, "step": 50080}, {"loss": 0.6546, "grad_norm": 0.8268665671348572, "learning_rate": 0.0002, "epoch": 3.5971274685816876, "step": 50090}, {"loss": 0.6637, "grad_norm": 0.8921764492988586, "learning_rate": 0.0002, "epoch": 3.5978456014362656, "step": 50100}, {"loss": 0.6827, "grad_norm": 0.9756084680557251, "learning_rate": 0.0002, "epoch": 3.5985637342908436, "step": 50110}, {"loss": 0.6746, "grad_norm": 0.9275530576705933, "learning_rate": 0.0002, "epoch": 3.5992818671454216, "step": 50120}, {"loss": 0.6709, "grad_norm": 0.9030009508132935, "learning_rate": 0.0002, "epoch": 3.6, "step": 50130}, {"loss": 0.6344, "grad_norm": 0.7805638909339905, "learning_rate": 0.0002, "epoch": 3.600718132854578, "step": 50140}, {"loss": 0.6437, "grad_norm": 0.7627325057983398, "learning_rate": 0.0002, "epoch": 3.601436265709156, "step": 50150}, {"loss": 0.6523, "grad_norm": 0.7809714078903198, "learning_rate": 0.0002, "epoch": 3.6021543985637345, "step": 50160}, {"loss": 0.6578, "grad_norm": 0.7910378575325012, "learning_rate": 0.0002, "epoch": 3.6028725314183125, "step": 50170}, {"loss": 0.6522, "grad_norm": 1.004438042640686, "learning_rate": 0.0002, "epoch": 3.6035906642728905, "step": 50180}, {"loss": 0.6657, "grad_norm": 0.825969934463501, "learning_rate": 0.0002, "epoch": 3.6043087971274685, "step": 50190}, {"loss": 0.6788, "grad_norm": 0.8866565227508545, "learning_rate": 0.0002, "epoch": 3.6050269299820465, "step": 50200}, {"loss": 0.6643, "grad_norm": 0.8920543193817139, "learning_rate": 0.0002, "epoch": 3.6057450628366245, "step": 50210}, {"loss": 0.668, "grad_norm": 1.106584906578064, "learning_rate": 0.0002, "epoch": 3.606463195691203, "step": 50220}, {"loss": 0.6878, "grad_norm": 0.916607677936554, "learning_rate": 0.0002, "epoch": 3.607181328545781, "step": 50230}, {"loss": 0.6084, "grad_norm": 0.8014767169952393, "learning_rate": 0.0002, "epoch": 3.607899461400359, "step": 50240}, {"loss": 0.6718, "grad_norm": 0.9556822776794434, "learning_rate": 0.0002, "epoch": 3.608617594254937, "step": 50250}, {"loss": 0.6896, "grad_norm": 0.9630016684532166, "learning_rate": 0.0002, "epoch": 3.6093357271095154, "step": 50260}, {"loss": 0.692, "grad_norm": 0.9862125515937805, "learning_rate": 0.0002, "epoch": 3.6100538599640934, "step": 50270}, {"loss": 0.5981, "grad_norm": 1.0043333768844604, "learning_rate": 0.0002, "epoch": 3.6107719928186714, "step": 50280}, {"loss": 0.6243, "grad_norm": 0.9255319833755493, "learning_rate": 0.0002, "epoch": 3.6114901256732495, "step": 50290}, {"loss": 0.6374, "grad_norm": 1.012023687362671, "learning_rate": 0.0002, "epoch": 3.6122082585278275, "step": 50300}, {"loss": 0.6896, "grad_norm": 1.0701122283935547, "learning_rate": 0.0002, "epoch": 3.612926391382406, "step": 50310}, {"loss": 0.6474, "grad_norm": 0.8270810842514038, "learning_rate": 0.0002, "epoch": 3.613644524236984, "step": 50320}, {"loss": 0.6667, "grad_norm": 0.8881328105926514, "learning_rate": 0.0002, "epoch": 3.614362657091562, "step": 50330}, {"loss": 0.6517, "grad_norm": 0.9536844491958618, "learning_rate": 0.0002, "epoch": 3.61508078994614, "step": 50340}, {"loss": 0.62, "grad_norm": 0.8044326305389404, "learning_rate": 0.0002, "epoch": 3.6157989228007184, "step": 50350}, {"loss": 0.6259, "grad_norm": 0.834591805934906, "learning_rate": 0.0002, "epoch": 3.6165170556552964, "step": 50360}, {"loss": 0.7173, "grad_norm": 0.903752863407135, "learning_rate": 0.0002, "epoch": 3.6172351885098744, "step": 50370}, {"loss": 0.6305, "grad_norm": 0.9148632884025574, "learning_rate": 0.0002, "epoch": 3.6179533213644524, "step": 50380}, {"loss": 0.6624, "grad_norm": 0.9280176162719727, "learning_rate": 0.0002, "epoch": 3.6186714542190304, "step": 50390}, {"loss": 0.6457, "grad_norm": 0.9524136781692505, "learning_rate": 0.0002, "epoch": 3.6193895870736084, "step": 50400}, {"loss": 0.6918, "grad_norm": 1.1751197576522827, "learning_rate": 0.0002, "epoch": 3.620107719928187, "step": 50410}, {"loss": 0.6161, "grad_norm": 1.032279133796692, "learning_rate": 0.0002, "epoch": 3.620825852782765, "step": 50420}, {"loss": 0.6347, "grad_norm": 0.790741503238678, "learning_rate": 0.0002, "epoch": 3.621543985637343, "step": 50430}, {"loss": 0.695, "grad_norm": 0.9584221243858337, "learning_rate": 0.0002, "epoch": 3.6222621184919213, "step": 50440}, {"loss": 0.6393, "grad_norm": 0.7792508006095886, "learning_rate": 0.0002, "epoch": 3.6229802513464993, "step": 50450}, {"loss": 0.6398, "grad_norm": 0.8273448944091797, "learning_rate": 0.0002, "epoch": 3.6236983842010773, "step": 50460}, {"loss": 0.6436, "grad_norm": 0.8001132607460022, "learning_rate": 0.0002, "epoch": 3.6244165170556553, "step": 50470}, {"loss": 0.6499, "grad_norm": 1.077109694480896, "learning_rate": 0.0002, "epoch": 3.6251346499102333, "step": 50480}, {"loss": 0.6587, "grad_norm": 1.111274003982544, "learning_rate": 0.0002, "epoch": 3.6258527827648113, "step": 50490}, {"loss": 0.6842, "grad_norm": 0.7757347822189331, "learning_rate": 0.0002, "epoch": 3.6265709156193897, "step": 50500}, {"loss": 0.6887, "grad_norm": 0.9217049479484558, "learning_rate": 0.0002, "epoch": 3.6272890484739677, "step": 50510}, {"loss": 0.6903, "grad_norm": 0.9362251162528992, "learning_rate": 0.0002, "epoch": 3.6280071813285457, "step": 50520}, {"loss": 0.625, "grad_norm": 0.9435479044914246, "learning_rate": 0.0002, "epoch": 3.6287253141831237, "step": 50530}, {"loss": 0.5869, "grad_norm": 0.7748915553092957, "learning_rate": 0.0002, "epoch": 3.629443447037702, "step": 50540}, {"loss": 0.637, "grad_norm": 0.8238945007324219, "learning_rate": 0.0002, "epoch": 3.63016157989228, "step": 50550}, {"loss": 0.6251, "grad_norm": 0.8421505093574524, "learning_rate": 0.0002, "epoch": 3.630879712746858, "step": 50560}, {"loss": 0.6544, "grad_norm": 1.0272293090820312, "learning_rate": 0.0002, "epoch": 3.631597845601436, "step": 50570}, {"loss": 0.6467, "grad_norm": 0.7643818259239197, "learning_rate": 0.0002, "epoch": 3.632315978456014, "step": 50580}, {"loss": 0.6716, "grad_norm": 0.9756225347518921, "learning_rate": 0.0002, "epoch": 3.6330341113105926, "step": 50590}, {"loss": 0.6534, "grad_norm": 0.9311570525169373, "learning_rate": 0.0002, "epoch": 3.6337522441651706, "step": 50600}, {"loss": 0.6465, "grad_norm": 0.8829827904701233, "learning_rate": 0.0002, "epoch": 3.6344703770197486, "step": 50610}, {"loss": 0.626, "grad_norm": 0.9473454356193542, "learning_rate": 0.0002, "epoch": 3.6351885098743266, "step": 50620}, {"loss": 0.713, "grad_norm": 1.1023668050765991, "learning_rate": 0.0002, "epoch": 3.635906642728905, "step": 50630}, {"loss": 0.6287, "grad_norm": 0.8490299582481384, "learning_rate": 0.0002, "epoch": 3.636624775583483, "step": 50640}, {"loss": 0.6373, "grad_norm": 1.1129392385482788, "learning_rate": 0.0002, "epoch": 3.637342908438061, "step": 50650}, {"loss": 0.7351, "grad_norm": 1.0334501266479492, "learning_rate": 0.0002, "epoch": 3.638061041292639, "step": 50660}, {"loss": 0.69, "grad_norm": 0.8397296667098999, "learning_rate": 0.0002, "epoch": 3.638779174147217, "step": 50670}, {"loss": 0.6075, "grad_norm": 0.7984256744384766, "learning_rate": 0.0002, "epoch": 3.639497307001795, "step": 50680}, {"loss": 0.651, "grad_norm": 1.1182054281234741, "learning_rate": 0.0002, "epoch": 3.6402154398563735, "step": 50690}, {"loss": 0.6511, "grad_norm": 0.8743279576301575, "learning_rate": 0.0002, "epoch": 3.6409335727109515, "step": 50700}, {"loss": 0.6894, "grad_norm": 0.9101628661155701, "learning_rate": 0.0002, "epoch": 3.6416517055655295, "step": 50710}, {"loss": 0.6591, "grad_norm": 0.8866934180259705, "learning_rate": 0.0002, "epoch": 3.642369838420108, "step": 50720}, {"loss": 0.6483, "grad_norm": 0.863945484161377, "learning_rate": 0.0002, "epoch": 3.643087971274686, "step": 50730}, {"loss": 0.6443, "grad_norm": 1.0845744609832764, "learning_rate": 0.0002, "epoch": 3.643806104129264, "step": 50740}, {"loss": 0.6611, "grad_norm": 0.8610911965370178, "learning_rate": 0.0002, "epoch": 3.644524236983842, "step": 50750}, {"loss": 0.6617, "grad_norm": 0.8502625226974487, "learning_rate": 0.0002, "epoch": 3.64524236983842, "step": 50760}, {"loss": 0.6283, "grad_norm": 0.847372829914093, "learning_rate": 0.0002, "epoch": 3.645960502692998, "step": 50770}, {"loss": 0.5724, "grad_norm": 0.8649292588233948, "learning_rate": 0.0002, "epoch": 3.6466786355475764, "step": 50780}, {"loss": 0.6253, "grad_norm": 0.8742905855178833, "learning_rate": 0.0002, "epoch": 3.6473967684021544, "step": 50790}, {"loss": 0.68, "grad_norm": 0.9546048641204834, "learning_rate": 0.0002, "epoch": 3.6481149012567324, "step": 50800}, {"loss": 0.6212, "grad_norm": 0.7893161773681641, "learning_rate": 0.0002, "epoch": 3.6488330341113104, "step": 50810}, {"loss": 0.6328, "grad_norm": 0.9350247979164124, "learning_rate": 0.0002, "epoch": 3.649551166965889, "step": 50820}, {"loss": 0.6893, "grad_norm": 0.772149384021759, "learning_rate": 0.0002, "epoch": 3.650269299820467, "step": 50830}, {"loss": 0.6107, "grad_norm": 0.8281718492507935, "learning_rate": 0.0002, "epoch": 3.650987432675045, "step": 50840}, {"loss": 0.6136, "grad_norm": 0.8063850402832031, "learning_rate": 0.0002, "epoch": 3.651705565529623, "step": 50850}, {"loss": 0.6416, "grad_norm": 0.8101351261138916, "learning_rate": 0.0002, "epoch": 3.652423698384201, "step": 50860}, {"loss": 0.6636, "grad_norm": 0.8747833371162415, "learning_rate": 0.0002, "epoch": 3.6531418312387793, "step": 50870}, {"loss": 0.6575, "grad_norm": 0.9634656310081482, "learning_rate": 0.0002, "epoch": 3.6538599640933573, "step": 50880}, {"loss": 0.6227, "grad_norm": 1.1646045446395874, "learning_rate": 0.0002, "epoch": 3.6545780969479353, "step": 50890}, {"loss": 0.6628, "grad_norm": 0.8538454174995422, "learning_rate": 0.0002, "epoch": 3.6552962298025133, "step": 50900}, {"loss": 0.6488, "grad_norm": 0.7639184594154358, "learning_rate": 0.0002, "epoch": 3.656014362657092, "step": 50910}, {"loss": 0.6495, "grad_norm": 0.8750212788581848, "learning_rate": 0.0002, "epoch": 3.65673249551167, "step": 50920}, {"loss": 0.6601, "grad_norm": 0.9161198735237122, "learning_rate": 0.0002, "epoch": 3.657450628366248, "step": 50930}, {"loss": 0.6809, "grad_norm": 0.7987924814224243, "learning_rate": 0.0002, "epoch": 3.658168761220826, "step": 50940}, {"loss": 0.6228, "grad_norm": 0.8939290642738342, "learning_rate": 0.0002, "epoch": 3.658886894075404, "step": 50950}, {"loss": 0.687, "grad_norm": 0.9803797602653503, "learning_rate": 0.0002, "epoch": 3.659605026929982, "step": 50960}, {"loss": 0.6368, "grad_norm": 1.2423512935638428, "learning_rate": 0.0002, "epoch": 3.6603231597845602, "step": 50970}, {"loss": 0.6477, "grad_norm": 1.0023225545883179, "learning_rate": 0.0002, "epoch": 3.6610412926391382, "step": 50980}, {"loss": 0.6659, "grad_norm": 0.9066677689552307, "learning_rate": 0.0002, "epoch": 3.6617594254937162, "step": 50990}, {"loss": 0.6348, "grad_norm": 0.8906226754188538, "learning_rate": 0.0002, "epoch": 3.6624775583482947, "step": 51000}, {"loss": 0.5967, "grad_norm": 0.7449954152107239, "learning_rate": 0.0002, "epoch": 3.6631956912028727, "step": 51010}, {"loss": 0.6167, "grad_norm": 0.812612771987915, "learning_rate": 0.0002, "epoch": 3.6639138240574507, "step": 51020}, {"loss": 0.6414, "grad_norm": 0.861818253993988, "learning_rate": 0.0002, "epoch": 3.6646319569120287, "step": 51030}, {"loss": 0.6418, "grad_norm": 0.849726676940918, "learning_rate": 0.0002, "epoch": 3.6653500897666067, "step": 51040}, {"loss": 0.6613, "grad_norm": 0.9738494753837585, "learning_rate": 0.0002, "epoch": 3.6660682226211847, "step": 51050}, {"loss": 0.6094, "grad_norm": 0.928989827632904, "learning_rate": 0.0002, "epoch": 3.666786355475763, "step": 51060}, {"loss": 0.623, "grad_norm": 0.9725563526153564, "learning_rate": 0.0002, "epoch": 3.667504488330341, "step": 51070}, {"loss": 0.5967, "grad_norm": 0.9366095066070557, "learning_rate": 0.0002, "epoch": 3.668222621184919, "step": 51080}, {"loss": 0.6175, "grad_norm": 0.8012986779212952, "learning_rate": 0.0002, "epoch": 3.668940754039497, "step": 51090}, {"loss": 0.6428, "grad_norm": 1.0646892786026, "learning_rate": 0.0002, "epoch": 3.6696588868940756, "step": 51100}, {"loss": 0.6333, "grad_norm": 0.7245157361030579, "learning_rate": 0.0002, "epoch": 3.6703770197486536, "step": 51110}, {"loss": 0.6618, "grad_norm": 0.6938936114311218, "learning_rate": 0.0002, "epoch": 3.6710951526032316, "step": 51120}, {"loss": 0.6511, "grad_norm": 0.8461366295814514, "learning_rate": 0.0002, "epoch": 3.6718132854578096, "step": 51130}, {"loss": 0.6168, "grad_norm": 0.8392583131790161, "learning_rate": 0.0002, "epoch": 3.6725314183123876, "step": 51140}, {"loss": 0.6616, "grad_norm": 0.7245259284973145, "learning_rate": 0.0002, "epoch": 3.673249551166966, "step": 51150}, {"loss": 0.6165, "grad_norm": 1.0742167234420776, "learning_rate": 0.0002, "epoch": 3.673967684021544, "step": 51160}, {"loss": 0.6805, "grad_norm": 0.9553889036178589, "learning_rate": 0.0002, "epoch": 3.674685816876122, "step": 51170}, {"loss": 0.6065, "grad_norm": 0.8713715672492981, "learning_rate": 0.0002, "epoch": 3.6754039497307, "step": 51180}, {"loss": 0.599, "grad_norm": 0.7499800324440002, "learning_rate": 0.0002, "epoch": 3.6761220825852785, "step": 51190}, {"loss": 0.7143, "grad_norm": 1.1118139028549194, "learning_rate": 0.0002, "epoch": 3.6768402154398565, "step": 51200}, {"loss": 0.6694, "grad_norm": 0.8146613836288452, "learning_rate": 0.0002, "epoch": 3.6775583482944345, "step": 51210}, {"loss": 0.6528, "grad_norm": 0.9331285357475281, "learning_rate": 0.0002, "epoch": 3.6782764811490125, "step": 51220}, {"loss": 0.6429, "grad_norm": 1.0497597455978394, "learning_rate": 0.0002, "epoch": 3.6789946140035905, "step": 51230}, {"loss": 0.6404, "grad_norm": 0.879814863204956, "learning_rate": 0.0002, "epoch": 3.6797127468581685, "step": 51240}, {"loss": 0.6617, "grad_norm": 0.9896606802940369, "learning_rate": 0.0002, "epoch": 3.680430879712747, "step": 51250}, {"loss": 0.6461, "grad_norm": 0.928236186504364, "learning_rate": 0.0002, "epoch": 3.681149012567325, "step": 51260}, {"loss": 0.6516, "grad_norm": 0.8436732292175293, "learning_rate": 0.0002, "epoch": 3.681867145421903, "step": 51270}, {"loss": 0.6428, "grad_norm": 0.93634432554245, "learning_rate": 0.0002, "epoch": 3.6825852782764814, "step": 51280}, {"loss": 0.6081, "grad_norm": 0.8477143049240112, "learning_rate": 0.0002, "epoch": 3.6833034111310594, "step": 51290}, {"loss": 0.6536, "grad_norm": 0.8720934987068176, "learning_rate": 0.0002, "epoch": 3.6840215439856374, "step": 51300}, {"loss": 0.6523, "grad_norm": 0.7322931289672852, "learning_rate": 0.0002, "epoch": 3.6847396768402154, "step": 51310}, {"loss": 0.6475, "grad_norm": 1.0064427852630615, "learning_rate": 0.0002, "epoch": 3.6854578096947934, "step": 51320}, {"loss": 0.681, "grad_norm": 1.0197817087173462, "learning_rate": 0.0002, "epoch": 3.6861759425493714, "step": 51330}, {"loss": 0.5904, "grad_norm": 0.8764060139656067, "learning_rate": 0.0002, "epoch": 3.68689407540395, "step": 51340}, {"loss": 0.625, "grad_norm": 0.9763964414596558, "learning_rate": 0.0002, "epoch": 3.687612208258528, "step": 51350}, {"loss": 0.6299, "grad_norm": 0.8389105200767517, "learning_rate": 0.0002, "epoch": 3.688330341113106, "step": 51360}, {"loss": 0.6885, "grad_norm": 0.9215750694274902, "learning_rate": 0.0002, "epoch": 3.689048473967684, "step": 51370}, {"loss": 0.6325, "grad_norm": 0.8444913625717163, "learning_rate": 0.0002, "epoch": 3.6897666068222623, "step": 51380}, {"loss": 0.657, "grad_norm": 0.9635153412818909, "learning_rate": 0.0002, "epoch": 3.6904847396768403, "step": 51390}, {"loss": 0.7045, "grad_norm": 1.0397378206253052, "learning_rate": 0.0002, "epoch": 3.6912028725314183, "step": 51400}, {"loss": 0.6635, "grad_norm": 0.9154748320579529, "learning_rate": 0.0002, "epoch": 3.6919210053859963, "step": 51410}, {"loss": 0.6757, "grad_norm": 0.906445324420929, "learning_rate": 0.0002, "epoch": 3.6926391382405743, "step": 51420}, {"loss": 0.6533, "grad_norm": 0.9237992763519287, "learning_rate": 0.0002, "epoch": 3.6933572710951523, "step": 51430}, {"loss": 0.6257, "grad_norm": 0.8796338438987732, "learning_rate": 0.0002, "epoch": 3.6940754039497308, "step": 51440}, {"loss": 0.7063, "grad_norm": 0.8613203763961792, "learning_rate": 0.0002, "epoch": 3.6947935368043088, "step": 51450}, {"loss": 0.6455, "grad_norm": 0.7957607507705688, "learning_rate": 0.0002, "epoch": 3.6955116696588868, "step": 51460}, {"loss": 0.6328, "grad_norm": 0.9183711409568787, "learning_rate": 0.0002, "epoch": 3.6962298025134652, "step": 51470}, {"loss": 0.6289, "grad_norm": 1.0108308792114258, "learning_rate": 0.0002, "epoch": 3.6969479353680432, "step": 51480}, {"loss": 0.668, "grad_norm": 0.7768247127532959, "learning_rate": 0.0002, "epoch": 3.6976660682226212, "step": 51490}, {"loss": 0.6483, "grad_norm": 1.0051485300064087, "learning_rate": 0.0002, "epoch": 3.6983842010771992, "step": 51500}, {"loss": 0.6268, "grad_norm": 0.82451993227005, "learning_rate": 0.0002, "epoch": 3.6991023339317772, "step": 51510}, {"loss": 0.6258, "grad_norm": 0.9542286992073059, "learning_rate": 0.0002, "epoch": 3.6998204667863552, "step": 51520}, {"loss": 0.6415, "grad_norm": 0.693890392780304, "learning_rate": 0.0002, "epoch": 3.7005385996409337, "step": 51530}, {"loss": 0.6445, "grad_norm": 0.9068924784660339, "learning_rate": 0.0002, "epoch": 3.7012567324955117, "step": 51540}, {"loss": 0.6386, "grad_norm": 0.8694922924041748, "learning_rate": 0.0002, "epoch": 3.7019748653500897, "step": 51550}, {"loss": 0.6563, "grad_norm": 0.941081702709198, "learning_rate": 0.0002, "epoch": 3.702692998204668, "step": 51560}, {"loss": 0.6068, "grad_norm": 0.7385984659194946, "learning_rate": 0.0002, "epoch": 3.703411131059246, "step": 51570}, {"loss": 0.6243, "grad_norm": 1.0399216413497925, "learning_rate": 0.0002, "epoch": 3.704129263913824, "step": 51580}, {"loss": 0.6776, "grad_norm": 0.9802294969558716, "learning_rate": 0.0002, "epoch": 3.704847396768402, "step": 51590}, {"loss": 0.6243, "grad_norm": 1.0409669876098633, "learning_rate": 0.0002, "epoch": 3.70556552962298, "step": 51600}, {"loss": 0.6812, "grad_norm": 0.8972786068916321, "learning_rate": 0.0002, "epoch": 3.706283662477558, "step": 51610}, {"loss": 0.5993, "grad_norm": 1.1916245222091675, "learning_rate": 0.0002, "epoch": 3.7070017953321366, "step": 51620}, {"loss": 0.6566, "grad_norm": 0.9545385241508484, "learning_rate": 0.0002, "epoch": 3.7077199281867146, "step": 51630}, {"loss": 0.6497, "grad_norm": 1.0773427486419678, "learning_rate": 0.0002, "epoch": 3.7084380610412926, "step": 51640}, {"loss": 0.6768, "grad_norm": 1.0856024026870728, "learning_rate": 0.0002, "epoch": 3.7091561938958706, "step": 51650}, {"loss": 0.6404, "grad_norm": 0.7678500413894653, "learning_rate": 0.0002, "epoch": 3.709874326750449, "step": 51660}, {"loss": 0.6571, "grad_norm": 0.7276270985603333, "learning_rate": 0.0002, "epoch": 3.710592459605027, "step": 51670}, {"loss": 0.6498, "grad_norm": 0.8859017491340637, "learning_rate": 0.0002, "epoch": 3.711310592459605, "step": 51680}, {"loss": 0.6602, "grad_norm": 0.9037614464759827, "learning_rate": 0.0002, "epoch": 3.712028725314183, "step": 51690}, {"loss": 0.685, "grad_norm": 0.9223412275314331, "learning_rate": 0.0002, "epoch": 3.712746858168761, "step": 51700}, {"loss": 0.647, "grad_norm": 0.8812923431396484, "learning_rate": 0.0002, "epoch": 3.713464991023339, "step": 51710}, {"loss": 0.6546, "grad_norm": 0.8242456912994385, "learning_rate": 0.0002, "epoch": 3.7141831238779175, "step": 51720}, {"loss": 0.6462, "grad_norm": 0.8368834257125854, "learning_rate": 0.0002, "epoch": 3.7149012567324955, "step": 51730}, {"loss": 0.6432, "grad_norm": 0.8624704480171204, "learning_rate": 0.0002, "epoch": 3.7156193895870735, "step": 51740}, {"loss": 0.6367, "grad_norm": 0.9138273596763611, "learning_rate": 0.0002, "epoch": 3.716337522441652, "step": 51750}, {"loss": 0.6717, "grad_norm": 0.8088571429252625, "learning_rate": 0.0002, "epoch": 3.71705565529623, "step": 51760}, {"loss": 0.658, "grad_norm": 0.882808268070221, "learning_rate": 0.0002, "epoch": 3.717773788150808, "step": 51770}, {"loss": 0.6686, "grad_norm": 0.9368035197257996, "learning_rate": 0.0002, "epoch": 3.718491921005386, "step": 51780}, {"loss": 0.6482, "grad_norm": 0.8341794013977051, "learning_rate": 0.0002, "epoch": 3.719210053859964, "step": 51790}, {"loss": 0.6486, "grad_norm": 0.8692073225975037, "learning_rate": 0.0002, "epoch": 3.719928186714542, "step": 51800}, {"loss": 0.6591, "grad_norm": 0.7566918730735779, "learning_rate": 0.0002, "epoch": 3.7206463195691204, "step": 51810}, {"loss": 0.707, "grad_norm": 1.113138198852539, "learning_rate": 0.0002, "epoch": 3.7213644524236984, "step": 51820}, {"loss": 0.6683, "grad_norm": 0.8793158531188965, "learning_rate": 0.0002, "epoch": 3.7220825852782764, "step": 51830}, {"loss": 0.6343, "grad_norm": 0.8856439590454102, "learning_rate": 0.0002, "epoch": 3.722800718132855, "step": 51840}, {"loss": 0.6238, "grad_norm": 1.0182029008865356, "learning_rate": 0.0002, "epoch": 3.723518850987433, "step": 51850}, {"loss": 0.6743, "grad_norm": 1.1177181005477905, "learning_rate": 0.0002, "epoch": 3.724236983842011, "step": 51860}, {"loss": 0.6477, "grad_norm": 0.6600990295410156, "learning_rate": 0.0002, "epoch": 3.724955116696589, "step": 51870}, {"loss": 0.6532, "grad_norm": 1.0563536882400513, "learning_rate": 0.0002, "epoch": 3.725673249551167, "step": 51880}, {"loss": 0.6648, "grad_norm": 1.1067734956741333, "learning_rate": 0.0002, "epoch": 3.726391382405745, "step": 51890}, {"loss": 0.6547, "grad_norm": 1.0204616785049438, "learning_rate": 0.0002, "epoch": 3.7271095152603233, "step": 51900}, {"loss": 0.685, "grad_norm": 0.8647155165672302, "learning_rate": 0.0002, "epoch": 3.7278276481149013, "step": 51910}, {"loss": 0.739, "grad_norm": 1.0754971504211426, "learning_rate": 0.0002, "epoch": 3.7285457809694793, "step": 51920}, {"loss": 0.6535, "grad_norm": 1.0448992252349854, "learning_rate": 0.0002, "epoch": 3.7292639138240573, "step": 51930}, {"loss": 0.6802, "grad_norm": 0.963434100151062, "learning_rate": 0.0002, "epoch": 3.7299820466786358, "step": 51940}, {"loss": 0.6367, "grad_norm": 0.8112701773643494, "learning_rate": 0.0002, "epoch": 3.7307001795332138, "step": 51950}, {"loss": 0.6785, "grad_norm": 0.7975119948387146, "learning_rate": 0.0002, "epoch": 3.7314183123877918, "step": 51960}, {"loss": 0.6748, "grad_norm": 0.7953376173973083, "learning_rate": 0.0002, "epoch": 3.7321364452423698, "step": 51970}, {"loss": 0.6464, "grad_norm": 0.9519981741905212, "learning_rate": 0.0002, "epoch": 3.7328545780969478, "step": 51980}, {"loss": 0.6247, "grad_norm": 0.8705791234970093, "learning_rate": 0.0002, "epoch": 3.7335727109515258, "step": 51990}, {"loss": 0.6876, "grad_norm": 0.870205283164978, "learning_rate": 0.0002, "epoch": 3.734290843806104, "step": 52000}, {"loss": 0.6681, "grad_norm": 0.9558930993080139, "learning_rate": 0.0002, "epoch": 3.735008976660682, "step": 52010}, {"loss": 0.6772, "grad_norm": 0.9330434799194336, "learning_rate": 0.0002, "epoch": 3.73572710951526, "step": 52020}, {"loss": 0.6365, "grad_norm": 0.783620297908783, "learning_rate": 0.0002, "epoch": 3.7364452423698387, "step": 52030}, {"loss": 0.6275, "grad_norm": 0.7575166821479797, "learning_rate": 0.0002, "epoch": 3.7371633752244167, "step": 52040}, {"loss": 0.6859, "grad_norm": 1.0592705011367798, "learning_rate": 0.0002, "epoch": 3.7378815080789947, "step": 52050}, {"loss": 0.6704, "grad_norm": 0.9309433102607727, "learning_rate": 0.0002, "epoch": 3.7385996409335727, "step": 52060}, {"loss": 0.6607, "grad_norm": 0.972861647605896, "learning_rate": 0.0002, "epoch": 3.7393177737881507, "step": 52070}, {"loss": 0.6267, "grad_norm": 0.9318740963935852, "learning_rate": 0.0002, "epoch": 3.7400359066427287, "step": 52080}, {"loss": 0.6404, "grad_norm": 0.7938477396965027, "learning_rate": 0.0002, "epoch": 3.740754039497307, "step": 52090}, {"loss": 0.6451, "grad_norm": 1.1515966653823853, "learning_rate": 0.0002, "epoch": 3.741472172351885, "step": 52100}, {"loss": 0.6179, "grad_norm": 1.076869010925293, "learning_rate": 0.0002, "epoch": 3.742190305206463, "step": 52110}, {"loss": 0.6477, "grad_norm": 0.8516066670417786, "learning_rate": 0.0002, "epoch": 3.7429084380610416, "step": 52120}, {"loss": 0.6741, "grad_norm": 0.6853429079055786, "learning_rate": 0.0002, "epoch": 3.7436265709156196, "step": 52130}, {"loss": 0.6392, "grad_norm": 0.8179695010185242, "learning_rate": 0.0002, "epoch": 3.7443447037701976, "step": 52140}, {"loss": 0.6692, "grad_norm": 0.8395232558250427, "learning_rate": 0.0002, "epoch": 3.7450628366247756, "step": 52150}, {"loss": 0.6902, "grad_norm": 1.0178003311157227, "learning_rate": 0.0002, "epoch": 3.7457809694793536, "step": 52160}, {"loss": 0.6726, "grad_norm": 1.1801023483276367, "learning_rate": 0.0002, "epoch": 3.7464991023339316, "step": 52170}, {"loss": 0.6334, "grad_norm": 0.8215751647949219, "learning_rate": 0.0002, "epoch": 3.74721723518851, "step": 52180}, {"loss": 0.5992, "grad_norm": 1.17083740234375, "learning_rate": 0.0002, "epoch": 3.747935368043088, "step": 52190}, {"loss": 0.6219, "grad_norm": 0.9230290651321411, "learning_rate": 0.0002, "epoch": 3.748653500897666, "step": 52200}, {"loss": 0.6503, "grad_norm": 0.8431521058082581, "learning_rate": 0.0002, "epoch": 3.749371633752244, "step": 52210}, {"loss": 0.6983, "grad_norm": 0.9690840244293213, "learning_rate": 0.0002, "epoch": 3.7500897666068225, "step": 52220}, {"loss": 0.6204, "grad_norm": 1.0022395849227905, "learning_rate": 0.0002, "epoch": 3.7508078994614005, "step": 52230}, {"loss": 0.6683, "grad_norm": 1.0489065647125244, "learning_rate": 0.0002, "epoch": 3.7515260323159785, "step": 52240}, {"loss": 0.6439, "grad_norm": 0.7880696058273315, "learning_rate": 0.0002, "epoch": 3.7522441651705565, "step": 52250}, {"loss": 0.6933, "grad_norm": 1.0255829095840454, "learning_rate": 0.0002, "epoch": 3.7529622980251345, "step": 52260}, {"loss": 0.6631, "grad_norm": 0.8470141291618347, "learning_rate": 0.0002, "epoch": 3.7536804308797125, "step": 52270}, {"loss": 0.5956, "grad_norm": 0.9040523171424866, "learning_rate": 0.0002, "epoch": 3.754398563734291, "step": 52280}, {"loss": 0.6759, "grad_norm": 0.9564392566680908, "learning_rate": 0.0002, "epoch": 3.755116696588869, "step": 52290}, {"loss": 0.6717, "grad_norm": 0.907857358455658, "learning_rate": 0.0002, "epoch": 3.755834829443447, "step": 52300}, {"loss": 0.6821, "grad_norm": 0.8929873704910278, "learning_rate": 0.0002, "epoch": 3.7565529622980254, "step": 52310}, {"loss": 0.655, "grad_norm": 0.854434072971344, "learning_rate": 0.0002, "epoch": 3.7572710951526034, "step": 52320}, {"loss": 0.6668, "grad_norm": 0.8744779229164124, "learning_rate": 0.0002, "epoch": 3.7579892280071814, "step": 52330}, {"loss": 0.6628, "grad_norm": 0.9022667407989502, "learning_rate": 0.0002, "epoch": 3.7587073608617594, "step": 52340}, {"loss": 0.6275, "grad_norm": 0.8884857892990112, "learning_rate": 0.0002, "epoch": 3.7594254937163374, "step": 52350}, {"loss": 0.6585, "grad_norm": 1.0228430032730103, "learning_rate": 0.0002, "epoch": 3.7601436265709154, "step": 52360}, {"loss": 0.6092, "grad_norm": 0.8593528270721436, "learning_rate": 0.0002, "epoch": 3.760861759425494, "step": 52370}, {"loss": 0.664, "grad_norm": 0.9435563087463379, "learning_rate": 0.0002, "epoch": 3.761579892280072, "step": 52380}, {"loss": 0.6326, "grad_norm": 0.7545679807662964, "learning_rate": 0.0002, "epoch": 3.76229802513465, "step": 52390}, {"loss": 0.6628, "grad_norm": 0.9411585927009583, "learning_rate": 0.0002, "epoch": 3.7630161579892283, "step": 52400}, {"loss": 0.62, "grad_norm": 0.9764377474784851, "learning_rate": 0.0002, "epoch": 3.7637342908438063, "step": 52410}, {"loss": 0.671, "grad_norm": 1.0718384981155396, "learning_rate": 0.0002, "epoch": 3.7644524236983843, "step": 52420}, {"loss": 0.6654, "grad_norm": 0.8765230774879456, "learning_rate": 0.0002, "epoch": 3.7651705565529623, "step": 52430}, {"loss": 0.6602, "grad_norm": 0.9275036454200745, "learning_rate": 0.0002, "epoch": 3.7658886894075403, "step": 52440}, {"loss": 0.6098, "grad_norm": 0.967410147190094, "learning_rate": 0.0002, "epoch": 3.7666068222621183, "step": 52450}, {"loss": 0.6195, "grad_norm": 0.7738949060440063, "learning_rate": 0.0002, "epoch": 3.7673249551166967, "step": 52460}, {"loss": 0.6054, "grad_norm": 1.0828070640563965, "learning_rate": 0.0002, "epoch": 3.7680430879712747, "step": 52470}, {"loss": 0.6208, "grad_norm": 0.9570213556289673, "learning_rate": 0.0002, "epoch": 3.7687612208258527, "step": 52480}, {"loss": 0.6703, "grad_norm": 1.0688215494155884, "learning_rate": 0.0002, "epoch": 3.7694793536804307, "step": 52490}, {"loss": 0.5993, "grad_norm": 0.7970073223114014, "learning_rate": 0.0002, "epoch": 3.770197486535009, "step": 52500}, {"loss": 0.6537, "grad_norm": 0.7132976651191711, "learning_rate": 0.0002, "epoch": 3.770915619389587, "step": 52510}, {"loss": 0.6571, "grad_norm": 1.152268648147583, "learning_rate": 0.0002, "epoch": 3.771633752244165, "step": 52520}, {"loss": 0.6548, "grad_norm": 0.8645235896110535, "learning_rate": 0.0002, "epoch": 3.772351885098743, "step": 52530}, {"loss": 0.6918, "grad_norm": 0.7725570201873779, "learning_rate": 0.0002, "epoch": 3.773070017953321, "step": 52540}, {"loss": 0.6796, "grad_norm": 0.9718102812767029, "learning_rate": 0.0002, "epoch": 3.773788150807899, "step": 52550}, {"loss": 0.6298, "grad_norm": 0.7568017840385437, "learning_rate": 0.0002, "epoch": 3.7745062836624776, "step": 52560}, {"loss": 0.6652, "grad_norm": 0.9578912854194641, "learning_rate": 0.0002, "epoch": 3.7752244165170556, "step": 52570}, {"loss": 0.6417, "grad_norm": 0.8657314777374268, "learning_rate": 0.0002, "epoch": 3.7759425493716336, "step": 52580}, {"loss": 0.6552, "grad_norm": 0.7564393281936646, "learning_rate": 0.0002, "epoch": 3.776660682226212, "step": 52590}, {"loss": 0.69, "grad_norm": 0.7631160616874695, "learning_rate": 0.0002, "epoch": 3.77737881508079, "step": 52600}, {"loss": 0.6427, "grad_norm": 1.1852056980133057, "learning_rate": 0.0002, "epoch": 3.778096947935368, "step": 52610}, {"loss": 0.6369, "grad_norm": 1.0620790719985962, "learning_rate": 0.0002, "epoch": 3.778815080789946, "step": 52620}, {"loss": 0.6782, "grad_norm": 0.8677777647972107, "learning_rate": 0.0002, "epoch": 3.779533213644524, "step": 52630}, {"loss": 0.6249, "grad_norm": 0.9913218021392822, "learning_rate": 0.0002, "epoch": 3.780251346499102, "step": 52640}, {"loss": 0.625, "grad_norm": 0.9868429899215698, "learning_rate": 0.0002, "epoch": 3.7809694793536806, "step": 52650}, {"loss": 0.6252, "grad_norm": 0.8791782259941101, "learning_rate": 0.0002, "epoch": 3.7816876122082586, "step": 52660}, {"loss": 0.6675, "grad_norm": 0.9503955245018005, "learning_rate": 0.0002, "epoch": 3.7824057450628366, "step": 52670}, {"loss": 0.6406, "grad_norm": 0.8647131323814392, "learning_rate": 0.0002, "epoch": 3.7831238779174146, "step": 52680}, {"loss": 0.6654, "grad_norm": 0.9819629788398743, "learning_rate": 0.0002, "epoch": 3.783842010771993, "step": 52690}, {"loss": 0.593, "grad_norm": 0.8548610210418701, "learning_rate": 0.0002, "epoch": 3.784560143626571, "step": 52700}, {"loss": 0.6614, "grad_norm": 0.8706230521202087, "learning_rate": 0.0002, "epoch": 3.785278276481149, "step": 52710}, {"loss": 0.6326, "grad_norm": 1.0032461881637573, "learning_rate": 0.0002, "epoch": 3.785996409335727, "step": 52720}, {"loss": 0.6172, "grad_norm": 1.0578246116638184, "learning_rate": 0.0002, "epoch": 3.786714542190305, "step": 52730}, {"loss": 0.6392, "grad_norm": 0.9854007363319397, "learning_rate": 0.0002, "epoch": 3.7874326750448835, "step": 52740}, {"loss": 0.6462, "grad_norm": 0.8389187455177307, "learning_rate": 0.0002, "epoch": 3.7881508078994615, "step": 52750}, {"loss": 0.6515, "grad_norm": 0.9192399978637695, "learning_rate": 0.0002, "epoch": 3.7888689407540395, "step": 52760}, {"loss": 0.6436, "grad_norm": 0.9518283605575562, "learning_rate": 0.0002, "epoch": 3.7895870736086175, "step": 52770}, {"loss": 0.6548, "grad_norm": 1.1296825408935547, "learning_rate": 0.0002, "epoch": 3.790305206463196, "step": 52780}, {"loss": 0.6073, "grad_norm": 1.0589144229888916, "learning_rate": 0.0002, "epoch": 3.791023339317774, "step": 52790}, {"loss": 0.6593, "grad_norm": 0.8954343199729919, "learning_rate": 0.0002, "epoch": 3.791741472172352, "step": 52800}, {"loss": 0.6678, "grad_norm": 0.8283370733261108, "learning_rate": 0.0002, "epoch": 3.79245960502693, "step": 52810}, {"loss": 0.6865, "grad_norm": 0.910642683506012, "learning_rate": 0.0002, "epoch": 3.793177737881508, "step": 52820}, {"loss": 0.6672, "grad_norm": 0.9255108833312988, "learning_rate": 0.0002, "epoch": 3.793895870736086, "step": 52830}, {"loss": 0.6836, "grad_norm": 0.8773723244667053, "learning_rate": 0.0002, "epoch": 3.7946140035906644, "step": 52840}, {"loss": 0.6815, "grad_norm": 0.8454240560531616, "learning_rate": 0.0002, "epoch": 3.7953321364452424, "step": 52850}, {"loss": 0.6594, "grad_norm": 0.7636052966117859, "learning_rate": 0.0002, "epoch": 3.7960502692998204, "step": 52860}, {"loss": 0.6663, "grad_norm": 0.9358382821083069, "learning_rate": 0.0002, "epoch": 3.796768402154399, "step": 52870}, {"loss": 0.6761, "grad_norm": 0.9662801623344421, "learning_rate": 0.0002, "epoch": 3.797486535008977, "step": 52880}, {"loss": 0.6749, "grad_norm": 0.995907187461853, "learning_rate": 0.0002, "epoch": 3.798204667863555, "step": 52890}, {"loss": 0.6715, "grad_norm": 0.8700127005577087, "learning_rate": 0.0002, "epoch": 3.798922800718133, "step": 52900}, {"loss": 0.6554, "grad_norm": 0.8987792134284973, "learning_rate": 0.0002, "epoch": 3.799640933572711, "step": 52910}, {"loss": 0.6655, "grad_norm": 0.9753904938697815, "learning_rate": 0.0002, "epoch": 3.800359066427289, "step": 52920}, {"loss": 0.6536, "grad_norm": 0.7873555421829224, "learning_rate": 0.0002, "epoch": 3.8010771992818673, "step": 52930}, {"loss": 0.6233, "grad_norm": 0.8177929520606995, "learning_rate": 0.0002, "epoch": 3.8017953321364453, "step": 52940}, {"loss": 0.6508, "grad_norm": 0.8865532279014587, "learning_rate": 0.0002, "epoch": 3.8025134649910233, "step": 52950}, {"loss": 0.6922, "grad_norm": 0.9113775491714478, "learning_rate": 0.0002, "epoch": 3.8032315978456013, "step": 52960}, {"loss": 0.6382, "grad_norm": 0.9424585700035095, "learning_rate": 0.0002, "epoch": 3.8039497307001797, "step": 52970}, {"loss": 0.6694, "grad_norm": 0.8347237706184387, "learning_rate": 0.0002, "epoch": 3.8046678635547577, "step": 52980}, {"loss": 0.643, "grad_norm": 0.826863169670105, "learning_rate": 0.0002, "epoch": 3.8053859964093357, "step": 52990}, {"loss": 0.639, "grad_norm": 0.7313310503959656, "learning_rate": 0.0002, "epoch": 3.8061041292639137, "step": 53000}, {"loss": 0.6831, "grad_norm": 0.8352667093276978, "learning_rate": 0.0002, "epoch": 3.8068222621184917, "step": 53010}, {"loss": 0.6265, "grad_norm": 0.748461127281189, "learning_rate": 0.0002, "epoch": 3.80754039497307, "step": 53020}, {"loss": 0.6433, "grad_norm": 0.943256139755249, "learning_rate": 0.0002, "epoch": 3.808258527827648, "step": 53030}, {"loss": 0.6702, "grad_norm": 1.0448410511016846, "learning_rate": 0.0002, "epoch": 3.808976660682226, "step": 53040}, {"loss": 0.6901, "grad_norm": 0.9047636985778809, "learning_rate": 0.0002, "epoch": 3.809694793536804, "step": 53050}, {"loss": 0.6774, "grad_norm": 0.8594381213188171, "learning_rate": 0.0002, "epoch": 3.8104129263913826, "step": 53060}, {"loss": 0.6664, "grad_norm": 0.7593536972999573, "learning_rate": 0.0002, "epoch": 3.8111310592459606, "step": 53070}, {"loss": 0.6651, "grad_norm": 0.7189019918441772, "learning_rate": 0.0002, "epoch": 3.8118491921005386, "step": 53080}, {"loss": 0.6657, "grad_norm": 0.8569809198379517, "learning_rate": 0.0002, "epoch": 3.8125673249551166, "step": 53090}, {"loss": 0.6689, "grad_norm": 0.923378050327301, "learning_rate": 0.0002, "epoch": 3.8132854578096946, "step": 53100}, {"loss": 0.6168, "grad_norm": 0.9088824391365051, "learning_rate": 0.0002, "epoch": 3.8140035906642726, "step": 53110}, {"loss": 0.6514, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 3.814721723518851, "step": 53120}, {"loss": 0.6182, "grad_norm": 0.8389552235603333, "learning_rate": 0.0002, "epoch": 3.815439856373429, "step": 53130}, {"loss": 0.6779, "grad_norm": 0.7940975427627563, "learning_rate": 0.0002, "epoch": 3.816157989228007, "step": 53140}, {"loss": 0.6825, "grad_norm": 0.8389907479286194, "learning_rate": 0.0002, "epoch": 3.8168761220825855, "step": 53150}, {"loss": 0.6763, "grad_norm": 0.774206280708313, "learning_rate": 0.0002, "epoch": 3.8175942549371635, "step": 53160}, {"loss": 0.7011, "grad_norm": 1.189447283744812, "learning_rate": 0.0002, "epoch": 3.8183123877917415, "step": 53170}, {"loss": 0.6206, "grad_norm": 0.9875882863998413, "learning_rate": 0.0002, "epoch": 3.8190305206463195, "step": 53180}, {"loss": 0.6254, "grad_norm": 0.9205945134162903, "learning_rate": 0.0002, "epoch": 3.8197486535008975, "step": 53190}, {"loss": 0.5845, "grad_norm": 0.8312796354293823, "learning_rate": 0.0002, "epoch": 3.8204667863554755, "step": 53200}, {"loss": 0.6415, "grad_norm": 0.9755756855010986, "learning_rate": 0.0002, "epoch": 3.821184919210054, "step": 53210}, {"loss": 0.6657, "grad_norm": 1.0722965002059937, "learning_rate": 0.0002, "epoch": 3.821903052064632, "step": 53220}, {"loss": 0.6547, "grad_norm": 0.7720510959625244, "learning_rate": 0.0002, "epoch": 3.82262118491921, "step": 53230}, {"loss": 0.6383, "grad_norm": 1.020147681236267, "learning_rate": 0.0002, "epoch": 3.823339317773788, "step": 53240}, {"loss": 0.6491, "grad_norm": 0.8241816759109497, "learning_rate": 0.0002, "epoch": 3.8240574506283664, "step": 53250}, {"loss": 0.6914, "grad_norm": 0.8939895629882812, "learning_rate": 0.0002, "epoch": 3.8247755834829444, "step": 53260}, {"loss": 0.6725, "grad_norm": 1.010852336883545, "learning_rate": 0.0002, "epoch": 3.8254937163375224, "step": 53270}, {"loss": 0.6841, "grad_norm": 0.8201420307159424, "learning_rate": 0.0002, "epoch": 3.8262118491921004, "step": 53280}, {"loss": 0.6739, "grad_norm": 0.8797973990440369, "learning_rate": 0.0002, "epoch": 3.8269299820466784, "step": 53290}, {"loss": 0.658, "grad_norm": 0.9034950137138367, "learning_rate": 0.0002, "epoch": 3.827648114901257, "step": 53300}, {"loss": 0.6314, "grad_norm": 0.926802933216095, "learning_rate": 0.0002, "epoch": 3.828366247755835, "step": 53310}, {"loss": 0.6526, "grad_norm": 1.0205509662628174, "learning_rate": 0.0002, "epoch": 3.829084380610413, "step": 53320}, {"loss": 0.6596, "grad_norm": 0.9524099230766296, "learning_rate": 0.0002, "epoch": 3.829802513464991, "step": 53330}, {"loss": 0.6796, "grad_norm": 0.9692625999450684, "learning_rate": 0.0002, "epoch": 3.8305206463195693, "step": 53340}, {"loss": 0.628, "grad_norm": 0.7255275845527649, "learning_rate": 0.0002, "epoch": 3.8312387791741473, "step": 53350}, {"loss": 0.6104, "grad_norm": 0.7199059724807739, "learning_rate": 0.0002, "epoch": 3.8319569120287253, "step": 53360}, {"loss": 0.6703, "grad_norm": 1.004464864730835, "learning_rate": 0.0002, "epoch": 3.8326750448833034, "step": 53370}, {"loss": 0.7032, "grad_norm": 0.9092583060264587, "learning_rate": 0.0002, "epoch": 3.8333931777378814, "step": 53380}, {"loss": 0.6811, "grad_norm": 0.945091724395752, "learning_rate": 0.0002, "epoch": 3.8341113105924594, "step": 53390}, {"loss": 0.611, "grad_norm": 0.7980135679244995, "learning_rate": 0.0002, "epoch": 3.834829443447038, "step": 53400}, {"loss": 0.6604, "grad_norm": 0.7812868356704712, "learning_rate": 0.0002, "epoch": 3.835547576301616, "step": 53410}, {"loss": 0.6104, "grad_norm": 0.8957077860832214, "learning_rate": 0.0002, "epoch": 3.836265709156194, "step": 53420}, {"loss": 0.6754, "grad_norm": 0.9119600653648376, "learning_rate": 0.0002, "epoch": 3.8369838420107722, "step": 53430}, {"loss": 0.7346, "grad_norm": 0.8208187222480774, "learning_rate": 0.0002, "epoch": 3.8377019748653503, "step": 53440}, {"loss": 0.6549, "grad_norm": 0.7930439114570618, "learning_rate": 0.0002, "epoch": 3.8384201077199283, "step": 53450}, {"loss": 0.6192, "grad_norm": 0.8937777280807495, "learning_rate": 0.0002, "epoch": 3.8391382405745063, "step": 53460}, {"loss": 0.5954, "grad_norm": 0.7583796977996826, "learning_rate": 0.0002, "epoch": 3.8398563734290843, "step": 53470}, {"loss": 0.6217, "grad_norm": 1.0735969543457031, "learning_rate": 0.0002, "epoch": 3.8405745062836623, "step": 53480}, {"loss": 0.6472, "grad_norm": 1.1106033325195312, "learning_rate": 0.0002, "epoch": 3.8412926391382407, "step": 53490}, {"loss": 0.6813, "grad_norm": 1.092631220817566, "learning_rate": 0.0002, "epoch": 3.8420107719928187, "step": 53500}, {"loss": 0.6437, "grad_norm": 0.9961787462234497, "learning_rate": 0.0002, "epoch": 3.8427289048473967, "step": 53510}, {"loss": 0.6382, "grad_norm": 0.833831250667572, "learning_rate": 0.0002, "epoch": 3.8434470377019747, "step": 53520}, {"loss": 0.6403, "grad_norm": 1.0000009536743164, "learning_rate": 0.0002, "epoch": 3.844165170556553, "step": 53530}, {"loss": 0.6824, "grad_norm": 0.9784213304519653, "learning_rate": 0.0002, "epoch": 3.844883303411131, "step": 53540}, {"loss": 0.6816, "grad_norm": 0.8582558035850525, "learning_rate": 0.0002, "epoch": 3.845601436265709, "step": 53550}, {"loss": 0.5944, "grad_norm": 0.8267415761947632, "learning_rate": 0.0002, "epoch": 3.846319569120287, "step": 53560}, {"loss": 0.6562, "grad_norm": 0.8783000111579895, "learning_rate": 0.0002, "epoch": 3.847037701974865, "step": 53570}, {"loss": 0.6795, "grad_norm": 0.9866999983787537, "learning_rate": 0.0002, "epoch": 3.8477558348294436, "step": 53580}, {"loss": 0.7222, "grad_norm": 0.8459296226501465, "learning_rate": 0.0002, "epoch": 3.8484739676840216, "step": 53590}, {"loss": 0.6748, "grad_norm": 0.9804834723472595, "learning_rate": 0.0002, "epoch": 3.8491921005385996, "step": 53600}, {"loss": 0.6115, "grad_norm": 0.951074481010437, "learning_rate": 0.0002, "epoch": 3.8499102333931776, "step": 53610}, {"loss": 0.5914, "grad_norm": 0.8020104169845581, "learning_rate": 0.0002, "epoch": 3.850628366247756, "step": 53620}, {"loss": 0.6237, "grad_norm": 0.9296963214874268, "learning_rate": 0.0002, "epoch": 3.851346499102334, "step": 53630}, {"loss": 0.6384, "grad_norm": 0.8983652591705322, "learning_rate": 0.0002, "epoch": 3.852064631956912, "step": 53640}, {"loss": 0.6855, "grad_norm": 1.031858205795288, "learning_rate": 0.0002, "epoch": 3.85278276481149, "step": 53650}, {"loss": 0.622, "grad_norm": 0.8943952918052673, "learning_rate": 0.0002, "epoch": 3.853500897666068, "step": 53660}, {"loss": 0.6745, "grad_norm": 1.0072312355041504, "learning_rate": 0.0002, "epoch": 3.854219030520646, "step": 53670}, {"loss": 0.677, "grad_norm": 1.0604884624481201, "learning_rate": 0.0002, "epoch": 3.8549371633752245, "step": 53680}, {"loss": 0.5873, "grad_norm": 0.834223210811615, "learning_rate": 0.0002, "epoch": 3.8556552962298025, "step": 53690}, {"loss": 0.665, "grad_norm": 0.9872867465019226, "learning_rate": 0.0002, "epoch": 3.8563734290843805, "step": 53700}, {"loss": 0.6689, "grad_norm": 0.7999459505081177, "learning_rate": 0.0002, "epoch": 3.857091561938959, "step": 53710}, {"loss": 0.6744, "grad_norm": 0.717722475528717, "learning_rate": 0.0002, "epoch": 3.857809694793537, "step": 53720}, {"loss": 0.6348, "grad_norm": 1.0675442218780518, "learning_rate": 0.0002, "epoch": 3.858527827648115, "step": 53730}, {"loss": 0.6141, "grad_norm": 0.9789777398109436, "learning_rate": 0.0002, "epoch": 3.859245960502693, "step": 53740}, {"loss": 0.6455, "grad_norm": 0.9318669438362122, "learning_rate": 0.0002, "epoch": 3.859964093357271, "step": 53750}, {"loss": 0.6587, "grad_norm": 0.9848631024360657, "learning_rate": 0.0002, "epoch": 3.860682226211849, "step": 53760}, {"loss": 0.6202, "grad_norm": 0.8754391670227051, "learning_rate": 0.0002, "epoch": 3.8614003590664274, "step": 53770}, {"loss": 0.6411, "grad_norm": 0.9024585485458374, "learning_rate": 0.0002, "epoch": 3.8621184919210054, "step": 53780}, {"loss": 0.6643, "grad_norm": 0.8974794745445251, "learning_rate": 0.0002, "epoch": 3.8628366247755834, "step": 53790}, {"loss": 0.6729, "grad_norm": 0.8342790603637695, "learning_rate": 0.0002, "epoch": 3.8635547576301614, "step": 53800}, {"loss": 0.6322, "grad_norm": 0.8177682757377625, "learning_rate": 0.0002, "epoch": 3.86427289048474, "step": 53810}, {"loss": 0.6525, "grad_norm": 1.0259089469909668, "learning_rate": 0.0002, "epoch": 3.864991023339318, "step": 53820}, {"loss": 0.6508, "grad_norm": 1.042290210723877, "learning_rate": 0.0002, "epoch": 3.865709156193896, "step": 53830}, {"loss": 0.6963, "grad_norm": 0.7316540479660034, "learning_rate": 0.0002, "epoch": 3.866427289048474, "step": 53840}, {"loss": 0.6491, "grad_norm": 0.9384970664978027, "learning_rate": 0.0002, "epoch": 3.867145421903052, "step": 53850}, {"loss": 0.6689, "grad_norm": 0.9273143410682678, "learning_rate": 0.0002, "epoch": 3.86786355475763, "step": 53860}, {"loss": 0.6443, "grad_norm": 1.1183570623397827, "learning_rate": 0.0002, "epoch": 3.8685816876122083, "step": 53870}, {"loss": 0.6712, "grad_norm": 0.9455275535583496, "learning_rate": 0.0002, "epoch": 3.8692998204667863, "step": 53880}, {"loss": 0.6662, "grad_norm": 0.8702114820480347, "learning_rate": 0.0002, "epoch": 3.8700179533213643, "step": 53890}, {"loss": 0.7032, "grad_norm": 0.8751053214073181, "learning_rate": 0.0002, "epoch": 3.870736086175943, "step": 53900}, {"loss": 0.6398, "grad_norm": 0.9793110489845276, "learning_rate": 0.0002, "epoch": 3.871454219030521, "step": 53910}, {"loss": 0.6577, "grad_norm": 0.9705014824867249, "learning_rate": 0.0002, "epoch": 3.872172351885099, "step": 53920}, {"loss": 0.751, "grad_norm": 1.051504373550415, "learning_rate": 0.0002, "epoch": 3.872890484739677, "step": 53930}, {"loss": 0.6606, "grad_norm": 0.8590622544288635, "learning_rate": 0.0002, "epoch": 3.873608617594255, "step": 53940}, {"loss": 0.6495, "grad_norm": 0.7828099727630615, "learning_rate": 0.0002, "epoch": 3.874326750448833, "step": 53950}, {"loss": 0.6294, "grad_norm": 0.86341792345047, "learning_rate": 0.0002, "epoch": 3.8750448833034112, "step": 53960}, {"loss": 0.6677, "grad_norm": 1.114670991897583, "learning_rate": 0.0002, "epoch": 3.8757630161579892, "step": 53970}, {"loss": 0.6533, "grad_norm": 0.8559519052505493, "learning_rate": 0.0002, "epoch": 3.8764811490125672, "step": 53980}, {"loss": 0.6517, "grad_norm": 1.0518953800201416, "learning_rate": 0.0002, "epoch": 3.8771992818671457, "step": 53990}, {"loss": 0.6359, "grad_norm": 0.7157500982284546, "learning_rate": 0.0002, "epoch": 3.8779174147217237, "step": 54000}, {"loss": 0.6847, "grad_norm": 0.8390372395515442, "learning_rate": 0.0002, "epoch": 3.8786355475763017, "step": 54010}, {"loss": 0.6376, "grad_norm": 0.8486756086349487, "learning_rate": 0.0002, "epoch": 3.8793536804308797, "step": 54020}, {"loss": 0.6184, "grad_norm": 0.8361587524414062, "learning_rate": 0.0002, "epoch": 3.8800718132854577, "step": 54030}, {"loss": 0.6552, "grad_norm": 0.9490554928779602, "learning_rate": 0.0002, "epoch": 3.8807899461400357, "step": 54040}, {"loss": 0.6653, "grad_norm": 1.0311323404312134, "learning_rate": 0.0002, "epoch": 3.881508078994614, "step": 54050}, {"loss": 0.6484, "grad_norm": 0.84800124168396, "learning_rate": 0.0002, "epoch": 3.882226211849192, "step": 54060}, {"loss": 0.6995, "grad_norm": 0.8940879702568054, "learning_rate": 0.0002, "epoch": 3.88294434470377, "step": 54070}, {"loss": 0.6157, "grad_norm": 0.985542356967926, "learning_rate": 0.0002, "epoch": 3.883662477558348, "step": 54080}, {"loss": 0.6221, "grad_norm": 0.8846475481987, "learning_rate": 0.0002, "epoch": 3.8843806104129266, "step": 54090}, {"loss": 0.6656, "grad_norm": 0.9186338186264038, "learning_rate": 0.0002, "epoch": 3.8850987432675046, "step": 54100}, {"loss": 0.6367, "grad_norm": 1.106598973274231, "learning_rate": 0.0002, "epoch": 3.8858168761220826, "step": 54110}, {"loss": 0.6311, "grad_norm": 0.8167300224304199, "learning_rate": 0.0002, "epoch": 3.8865350089766606, "step": 54120}, {"loss": 0.694, "grad_norm": 0.9153622984886169, "learning_rate": 0.0002, "epoch": 3.8872531418312386, "step": 54130}, {"loss": 0.6669, "grad_norm": 0.8464475274085999, "learning_rate": 0.0002, "epoch": 3.8879712746858166, "step": 54140}, {"loss": 0.6658, "grad_norm": 0.8889452815055847, "learning_rate": 0.0002, "epoch": 3.888689407540395, "step": 54150}, {"loss": 0.6291, "grad_norm": 0.7861065864562988, "learning_rate": 0.0002, "epoch": 3.889407540394973, "step": 54160}, {"loss": 0.6315, "grad_norm": 0.882674515247345, "learning_rate": 0.0002, "epoch": 3.890125673249551, "step": 54170}, {"loss": 0.6223, "grad_norm": 0.8503835201263428, "learning_rate": 0.0002, "epoch": 3.8908438061041295, "step": 54180}, {"loss": 0.6176, "grad_norm": 0.888455331325531, "learning_rate": 0.0002, "epoch": 3.8915619389587075, "step": 54190}, {"loss": 0.6985, "grad_norm": 1.0473699569702148, "learning_rate": 0.0002, "epoch": 3.8922800718132855, "step": 54200}, {"loss": 0.6513, "grad_norm": 0.9548208713531494, "learning_rate": 0.0002, "epoch": 3.8929982046678635, "step": 54210}, {"loss": 0.6089, "grad_norm": 0.9158754944801331, "learning_rate": 0.0002, "epoch": 3.8937163375224415, "step": 54220}, {"loss": 0.6352, "grad_norm": 0.9001154899597168, "learning_rate": 0.0002, "epoch": 3.8944344703770195, "step": 54230}, {"loss": 0.6657, "grad_norm": 0.9736626148223877, "learning_rate": 0.0002, "epoch": 3.895152603231598, "step": 54240}, {"loss": 0.7248, "grad_norm": 0.8809846043586731, "learning_rate": 0.0002, "epoch": 3.895870736086176, "step": 54250}, {"loss": 0.6364, "grad_norm": 0.887583315372467, "learning_rate": 0.0002, "epoch": 3.896588868940754, "step": 54260}, {"loss": 0.6252, "grad_norm": 0.8395712971687317, "learning_rate": 0.0002, "epoch": 3.8973070017953324, "step": 54270}, {"loss": 0.681, "grad_norm": 0.8391315937042236, "learning_rate": 0.0002, "epoch": 3.8980251346499104, "step": 54280}, {"loss": 0.6352, "grad_norm": 0.8210049271583557, "learning_rate": 0.0002, "epoch": 3.8987432675044884, "step": 54290}, {"loss": 0.6484, "grad_norm": 1.1364530324935913, "learning_rate": 0.0002, "epoch": 3.8994614003590664, "step": 54300}, {"loss": 0.6383, "grad_norm": 0.7712056636810303, "learning_rate": 0.0002, "epoch": 3.9001795332136444, "step": 54310}, {"loss": 0.6516, "grad_norm": 0.9466049671173096, "learning_rate": 0.0002, "epoch": 3.9008976660682224, "step": 54320}, {"loss": 0.6938, "grad_norm": 1.0367140769958496, "learning_rate": 0.0002, "epoch": 3.901615798922801, "step": 54330}, {"loss": 0.672, "grad_norm": 1.0168321132659912, "learning_rate": 0.0002, "epoch": 3.902333931777379, "step": 54340}, {"loss": 0.6306, "grad_norm": 0.7830407619476318, "learning_rate": 0.0002, "epoch": 3.903052064631957, "step": 54350}, {"loss": 0.7198, "grad_norm": 0.9649789333343506, "learning_rate": 0.0002, "epoch": 3.903770197486535, "step": 54360}, {"loss": 0.6644, "grad_norm": 0.681077778339386, "learning_rate": 0.0002, "epoch": 3.9044883303411133, "step": 54370}, {"loss": 0.6677, "grad_norm": 0.8970136046409607, "learning_rate": 0.0002, "epoch": 3.9052064631956913, "step": 54380}, {"loss": 0.6581, "grad_norm": 0.9155173301696777, "learning_rate": 0.0002, "epoch": 3.9059245960502693, "step": 54390}, {"loss": 0.6711, "grad_norm": 1.0447794198989868, "learning_rate": 0.0002, "epoch": 3.9066427289048473, "step": 54400}, {"loss": 0.6883, "grad_norm": 0.7823813557624817, "learning_rate": 0.0002, "epoch": 3.9073608617594253, "step": 54410}, {"loss": 0.6688, "grad_norm": 0.9289445877075195, "learning_rate": 0.0002, "epoch": 3.9080789946140033, "step": 54420}, {"loss": 0.7024, "grad_norm": 0.9983111619949341, "learning_rate": 0.0002, "epoch": 3.9087971274685818, "step": 54430}, {"loss": 0.6687, "grad_norm": 0.7952495813369751, "learning_rate": 0.0002, "epoch": 3.9095152603231598, "step": 54440}, {"loss": 0.6118, "grad_norm": 0.8045601844787598, "learning_rate": 0.0002, "epoch": 3.9102333931777378, "step": 54450}, {"loss": 0.6388, "grad_norm": 0.936585009098053, "learning_rate": 0.0002, "epoch": 3.910951526032316, "step": 54460}, {"loss": 0.6217, "grad_norm": 0.745793879032135, "learning_rate": 0.0002, "epoch": 3.911669658886894, "step": 54470}, {"loss": 0.6814, "grad_norm": 0.9137616157531738, "learning_rate": 0.0002, "epoch": 3.912387791741472, "step": 54480}, {"loss": 0.6792, "grad_norm": 0.826316237449646, "learning_rate": 0.0002, "epoch": 3.9131059245960502, "step": 54490}, {"loss": 0.6914, "grad_norm": 0.94313645362854, "learning_rate": 0.0002, "epoch": 3.9138240574506282, "step": 54500}, {"loss": 0.62, "grad_norm": 1.045893907546997, "learning_rate": 0.0002, "epoch": 3.9145421903052062, "step": 54510}, {"loss": 0.5841, "grad_norm": 0.9122704863548279, "learning_rate": 0.0002, "epoch": 3.9152603231597847, "step": 54520}, {"loss": 0.7029, "grad_norm": 1.0999689102172852, "learning_rate": 0.0002, "epoch": 3.9159784560143627, "step": 54530}, {"loss": 0.6387, "grad_norm": 0.9281555414199829, "learning_rate": 0.0002, "epoch": 3.9166965888689407, "step": 54540}, {"loss": 0.6227, "grad_norm": 1.1439622640609741, "learning_rate": 0.0002, "epoch": 3.917414721723519, "step": 54550}, {"loss": 0.6733, "grad_norm": 0.9375617504119873, "learning_rate": 0.0002, "epoch": 3.918132854578097, "step": 54560}, {"loss": 0.6503, "grad_norm": 0.92906653881073, "learning_rate": 0.0002, "epoch": 3.918850987432675, "step": 54570}, {"loss": 0.6361, "grad_norm": 1.0840893983840942, "learning_rate": 0.0002, "epoch": 3.919569120287253, "step": 54580}, {"loss": 0.6476, "grad_norm": 0.8145509362220764, "learning_rate": 0.0002, "epoch": 3.920287253141831, "step": 54590}, {"loss": 0.6826, "grad_norm": 0.973737895488739, "learning_rate": 0.0002, "epoch": 3.921005385996409, "step": 54600}, {"loss": 0.6822, "grad_norm": 0.9302353858947754, "learning_rate": 0.0002, "epoch": 3.9217235188509876, "step": 54610}, {"loss": 0.6522, "grad_norm": 0.9167897701263428, "learning_rate": 0.0002, "epoch": 3.9224416517055656, "step": 54620}, {"loss": 0.6783, "grad_norm": 0.8096851706504822, "learning_rate": 0.0002, "epoch": 3.9231597845601436, "step": 54630}, {"loss": 0.6369, "grad_norm": 0.8006368279457092, "learning_rate": 0.0002, "epoch": 3.9238779174147216, "step": 54640}, {"loss": 0.6533, "grad_norm": 0.7800863981246948, "learning_rate": 0.0002, "epoch": 3.9245960502693, "step": 54650}, {"loss": 0.6518, "grad_norm": 1.0331560373306274, "learning_rate": 0.0002, "epoch": 3.925314183123878, "step": 54660}, {"loss": 0.6764, "grad_norm": 1.0057517290115356, "learning_rate": 0.0002, "epoch": 3.926032315978456, "step": 54670}, {"loss": 0.6636, "grad_norm": 0.8920564651489258, "learning_rate": 0.0002, "epoch": 3.926750448833034, "step": 54680}, {"loss": 0.6432, "grad_norm": 0.7704599499702454, "learning_rate": 0.0002, "epoch": 3.927468581687612, "step": 54690}, {"loss": 0.6532, "grad_norm": 0.827032208442688, "learning_rate": 0.0002, "epoch": 3.92818671454219, "step": 54700}, {"loss": 0.7083, "grad_norm": 1.0019268989562988, "learning_rate": 0.0002, "epoch": 3.9289048473967685, "step": 54710}, {"loss": 0.6026, "grad_norm": 0.862033486366272, "learning_rate": 0.0002, "epoch": 3.9296229802513465, "step": 54720}, {"loss": 0.599, "grad_norm": 0.8965592980384827, "learning_rate": 0.0002, "epoch": 3.9303411131059245, "step": 54730}, {"loss": 0.6739, "grad_norm": 0.7689077854156494, "learning_rate": 0.0002, "epoch": 3.931059245960503, "step": 54740}, {"loss": 0.6401, "grad_norm": 0.846276581287384, "learning_rate": 0.0002, "epoch": 3.931777378815081, "step": 54750}, {"loss": 0.6942, "grad_norm": 0.8932713866233826, "learning_rate": 0.0002, "epoch": 3.932495511669659, "step": 54760}, {"loss": 0.6697, "grad_norm": 0.9711386561393738, "learning_rate": 0.0002, "epoch": 3.933213644524237, "step": 54770}, {"loss": 0.6672, "grad_norm": 0.9290250539779663, "learning_rate": 0.0002, "epoch": 3.933931777378815, "step": 54780}, {"loss": 0.6365, "grad_norm": 1.0897367000579834, "learning_rate": 0.0002, "epoch": 3.934649910233393, "step": 54790}, {"loss": 0.6647, "grad_norm": 0.8451842665672302, "learning_rate": 0.0002, "epoch": 3.9353680430879714, "step": 54800}, {"loss": 0.6705, "grad_norm": 0.8400090336799622, "learning_rate": 0.0002, "epoch": 3.9360861759425494, "step": 54810}, {"loss": 0.6577, "grad_norm": 0.951383650302887, "learning_rate": 0.0002, "epoch": 3.9368043087971274, "step": 54820}, {"loss": 0.654, "grad_norm": 0.848838210105896, "learning_rate": 0.0002, "epoch": 3.937522441651706, "step": 54830}, {"loss": 0.6852, "grad_norm": 0.735763669013977, "learning_rate": 0.0002, "epoch": 3.938240574506284, "step": 54840}, {"loss": 0.6574, "grad_norm": 0.979037344455719, "learning_rate": 0.0002, "epoch": 3.938958707360862, "step": 54850}, {"loss": 0.5851, "grad_norm": 0.933674693107605, "learning_rate": 0.0002, "epoch": 3.93967684021544, "step": 54860}, {"loss": 0.6931, "grad_norm": 0.835593044757843, "learning_rate": 0.0002, "epoch": 3.940394973070018, "step": 54870}, {"loss": 0.6967, "grad_norm": 1.0034281015396118, "learning_rate": 0.0002, "epoch": 3.941113105924596, "step": 54880}, {"loss": 0.6442, "grad_norm": 0.9732975959777832, "learning_rate": 0.0002, "epoch": 3.9418312387791743, "step": 54890}, {"loss": 0.6657, "grad_norm": 0.9666336178779602, "learning_rate": 0.0002, "epoch": 3.9425493716337523, "step": 54900}, {"loss": 0.6521, "grad_norm": 0.755310595035553, "learning_rate": 0.0002, "epoch": 3.9432675044883303, "step": 54910}, {"loss": 0.6562, "grad_norm": 0.8732092976570129, "learning_rate": 0.0002, "epoch": 3.9439856373429083, "step": 54920}, {"loss": 0.6486, "grad_norm": 1.139453649520874, "learning_rate": 0.0002, "epoch": 3.9447037701974867, "step": 54930}, {"loss": 0.6609, "grad_norm": 0.9044837951660156, "learning_rate": 0.0002, "epoch": 3.9454219030520647, "step": 54940}, {"loss": 0.6344, "grad_norm": 1.0496679544448853, "learning_rate": 0.0002, "epoch": 3.9461400359066428, "step": 54950}, {"loss": 0.6471, "grad_norm": 1.0099035501480103, "learning_rate": 0.0002, "epoch": 3.9468581687612208, "step": 54960}, {"loss": 0.6143, "grad_norm": 1.0694963932037354, "learning_rate": 0.0002, "epoch": 3.9475763016157988, "step": 54970}, {"loss": 0.6209, "grad_norm": 1.0012997388839722, "learning_rate": 0.0002, "epoch": 3.9482944344703768, "step": 54980}, {"loss": 0.7379, "grad_norm": 0.8910513520240784, "learning_rate": 0.0002, "epoch": 3.949012567324955, "step": 54990}, {"loss": 0.7184, "grad_norm": 1.0267579555511475, "learning_rate": 0.0002, "epoch": 3.949730700179533, "step": 55000}, {"loss": 0.6844, "grad_norm": 0.9786432385444641, "learning_rate": 0.0002, "epoch": 3.950448833034111, "step": 55010}, {"loss": 0.6499, "grad_norm": 0.8703538775444031, "learning_rate": 0.0002, "epoch": 3.9511669658886897, "step": 55020}, {"loss": 0.5989, "grad_norm": 0.8970484137535095, "learning_rate": 0.0002, "epoch": 3.9518850987432677, "step": 55030}, {"loss": 0.659, "grad_norm": 0.8781577944755554, "learning_rate": 0.0002, "epoch": 3.9526032315978457, "step": 55040}, {"loss": 0.6944, "grad_norm": 0.8040280938148499, "learning_rate": 0.0002, "epoch": 3.9533213644524237, "step": 55050}, {"loss": 0.6359, "grad_norm": 0.851926326751709, "learning_rate": 0.0002, "epoch": 3.9540394973070017, "step": 55060}, {"loss": 0.6806, "grad_norm": 0.8597240447998047, "learning_rate": 0.0002, "epoch": 3.9547576301615797, "step": 55070}, {"loss": 0.6499, "grad_norm": 0.9461944699287415, "learning_rate": 0.0002, "epoch": 3.955475763016158, "step": 55080}, {"loss": 0.6222, "grad_norm": 0.7576611042022705, "learning_rate": 0.0002, "epoch": 3.956193895870736, "step": 55090}, {"loss": 0.6735, "grad_norm": 0.9484710693359375, "learning_rate": 0.0002, "epoch": 3.956912028725314, "step": 55100}, {"loss": 0.6586, "grad_norm": 0.9487117528915405, "learning_rate": 0.0002, "epoch": 3.957630161579892, "step": 55110}, {"loss": 0.6632, "grad_norm": 0.870090663433075, "learning_rate": 0.0002, "epoch": 3.9583482944344706, "step": 55120}, {"loss": 0.6786, "grad_norm": 0.8496458530426025, "learning_rate": 0.0002, "epoch": 3.9590664272890486, "step": 55130}, {"loss": 0.6631, "grad_norm": 1.0121779441833496, "learning_rate": 0.0002, "epoch": 3.9597845601436266, "step": 55140}, {"loss": 0.7005, "grad_norm": 0.8912323713302612, "learning_rate": 0.0002, "epoch": 3.9605026929982046, "step": 55150}, {"loss": 0.6398, "grad_norm": 0.8398444652557373, "learning_rate": 0.0002, "epoch": 3.9612208258527826, "step": 55160}, {"loss": 0.6183, "grad_norm": 0.8046348690986633, "learning_rate": 0.0002, "epoch": 3.961938958707361, "step": 55170}, {"loss": 0.6357, "grad_norm": 1.0369254350662231, "learning_rate": 0.0002, "epoch": 3.962657091561939, "step": 55180}, {"loss": 0.6053, "grad_norm": 1.172431230545044, "learning_rate": 0.0002, "epoch": 3.963375224416517, "step": 55190}, {"loss": 0.643, "grad_norm": 0.8093554377555847, "learning_rate": 0.0002, "epoch": 3.964093357271095, "step": 55200}, {"loss": 0.6416, "grad_norm": 0.8851078748703003, "learning_rate": 0.0002, "epoch": 3.9648114901256735, "step": 55210}, {"loss": 0.6516, "grad_norm": 0.7494266033172607, "learning_rate": 0.0002, "epoch": 3.9655296229802515, "step": 55220}, {"loss": 0.629, "grad_norm": 0.9556898474693298, "learning_rate": 0.0002, "epoch": 3.9662477558348295, "step": 55230}, {"loss": 0.6481, "grad_norm": 1.016017198562622, "learning_rate": 0.0002, "epoch": 3.9669658886894075, "step": 55240}, {"loss": 0.7185, "grad_norm": 0.8425998091697693, "learning_rate": 0.0002, "epoch": 3.9676840215439855, "step": 55250}, {"loss": 0.6609, "grad_norm": 0.717673122882843, "learning_rate": 0.0002, "epoch": 3.9684021543985635, "step": 55260}, {"loss": 0.6453, "grad_norm": 0.8366572856903076, "learning_rate": 0.0002, "epoch": 3.969120287253142, "step": 55270}, {"loss": 0.6841, "grad_norm": 0.8981583118438721, "learning_rate": 0.0002, "epoch": 3.96983842010772, "step": 55280}, {"loss": 0.6351, "grad_norm": 0.8868781328201294, "learning_rate": 0.0002, "epoch": 3.970556552962298, "step": 55290}, {"loss": 0.6755, "grad_norm": 1.0632785558700562, "learning_rate": 0.0002, "epoch": 3.9712746858168764, "step": 55300}, {"loss": 0.6433, "grad_norm": 0.8813109993934631, "learning_rate": 0.0002, "epoch": 3.9719928186714544, "step": 55310}, {"loss": 0.5699, "grad_norm": 0.8225542306900024, "learning_rate": 0.0002, "epoch": 3.9727109515260324, "step": 55320}, {"loss": 0.6591, "grad_norm": 1.1391420364379883, "learning_rate": 0.0002, "epoch": 3.9734290843806104, "step": 55330}, {"loss": 0.6551, "grad_norm": 1.0371832847595215, "learning_rate": 0.0002, "epoch": 3.9741472172351884, "step": 55340}, {"loss": 0.7538, "grad_norm": 1.0542186498641968, "learning_rate": 0.0002, "epoch": 3.9748653500897664, "step": 55350}, {"loss": 0.6799, "grad_norm": 1.0178009271621704, "learning_rate": 0.0002, "epoch": 3.975583482944345, "step": 55360}, {"loss": 0.6394, "grad_norm": 0.7927802205085754, "learning_rate": 0.0002, "epoch": 3.976301615798923, "step": 55370}, {"loss": 0.6632, "grad_norm": 0.9350495934486389, "learning_rate": 0.0002, "epoch": 3.977019748653501, "step": 55380}, {"loss": 0.6889, "grad_norm": 1.0240116119384766, "learning_rate": 0.0002, "epoch": 3.977737881508079, "step": 55390}, {"loss": 0.6756, "grad_norm": 1.0279067754745483, "learning_rate": 0.0002, "epoch": 3.9784560143626573, "step": 55400}, {"loss": 0.6979, "grad_norm": 1.1228227615356445, "learning_rate": 0.0002, "epoch": 3.9791741472172353, "step": 55410}, {"loss": 0.6595, "grad_norm": 0.9500134587287903, "learning_rate": 0.0002, "epoch": 3.9798922800718133, "step": 55420}, {"loss": 0.6875, "grad_norm": 0.9229732155799866, "learning_rate": 0.0002, "epoch": 3.9806104129263913, "step": 55430}, {"loss": 0.6742, "grad_norm": 0.7946729063987732, "learning_rate": 0.0002, "epoch": 3.9813285457809693, "step": 55440}, {"loss": 0.6643, "grad_norm": 0.9987489581108093, "learning_rate": 0.0002, "epoch": 3.9820466786355477, "step": 55450}, {"loss": 0.6642, "grad_norm": 0.9670467972755432, "learning_rate": 0.0002, "epoch": 3.9827648114901257, "step": 55460}, {"loss": 0.6603, "grad_norm": 0.835028350353241, "learning_rate": 0.0002, "epoch": 3.9834829443447037, "step": 55470}, {"loss": 0.6198, "grad_norm": 0.8678702712059021, "learning_rate": 0.0002, "epoch": 3.9842010771992817, "step": 55480}, {"loss": 0.6581, "grad_norm": 0.8581197261810303, "learning_rate": 0.0002, "epoch": 3.98491921005386, "step": 55490}, {"loss": 0.614, "grad_norm": 0.779848039150238, "learning_rate": 0.0002, "epoch": 3.985637342908438, "step": 55500}, {"loss": 0.634, "grad_norm": 0.8827589154243469, "learning_rate": 0.0002, "epoch": 3.986355475763016, "step": 55510}, {"loss": 0.624, "grad_norm": 1.0108301639556885, "learning_rate": 0.0002, "epoch": 3.987073608617594, "step": 55520}, {"loss": 0.6553, "grad_norm": 0.8506004214286804, "learning_rate": 0.0002, "epoch": 3.987791741472172, "step": 55530}, {"loss": 0.6229, "grad_norm": 1.0297727584838867, "learning_rate": 0.0002, "epoch": 3.98850987432675, "step": 55540}, {"loss": 0.6551, "grad_norm": 0.8579224944114685, "learning_rate": 0.0002, "epoch": 3.9892280071813286, "step": 55550}, {"loss": 0.6491, "grad_norm": 0.8503788113594055, "learning_rate": 0.0002, "epoch": 3.9899461400359066, "step": 55560}, {"loss": 0.6941, "grad_norm": 1.1144801378250122, "learning_rate": 0.0002, "epoch": 3.9906642728904846, "step": 55570}, {"loss": 0.6956, "grad_norm": 0.8418305516242981, "learning_rate": 0.0002, "epoch": 3.991382405745063, "step": 55580}, {"loss": 0.6226, "grad_norm": 1.0065871477127075, "learning_rate": 0.0002, "epoch": 3.992100538599641, "step": 55590}, {"loss": 0.6775, "grad_norm": 0.8160259127616882, "learning_rate": 0.0002, "epoch": 3.992818671454219, "step": 55600}, {"loss": 0.624, "grad_norm": 0.8678009510040283, "learning_rate": 0.0002, "epoch": 3.993536804308797, "step": 55610}, {"loss": 0.6552, "grad_norm": 0.863465428352356, "learning_rate": 0.0002, "epoch": 3.994254937163375, "step": 55620}, {"loss": 0.6764, "grad_norm": 0.9242135286331177, "learning_rate": 0.0002, "epoch": 3.994973070017953, "step": 55630}, {"loss": 0.6774, "grad_norm": 1.0285470485687256, "learning_rate": 0.0002, "epoch": 3.9956912028725315, "step": 55640}, {"loss": 0.6882, "grad_norm": 0.8953320384025574, "learning_rate": 0.0002, "epoch": 3.9964093357271095, "step": 55650}, {"loss": 0.6935, "grad_norm": 0.915892481803894, "learning_rate": 0.0002, "epoch": 3.9971274685816875, "step": 55660}, {"loss": 0.641, "grad_norm": 0.8235118985176086, "learning_rate": 0.0002, "epoch": 3.9978456014362656, "step": 55670}, {"loss": 0.6417, "grad_norm": 1.0178656578063965, "learning_rate": 0.0002, "epoch": 3.998563734290844, "step": 55680}, {"loss": 0.6635, "grad_norm": 0.9926803708076477, "learning_rate": 0.0002, "epoch": 3.999281867145422, "step": 55690}, {"loss": 0.6476, "grad_norm": 0.9213629961013794, "learning_rate": 0.0002, "epoch": 4.0, "step": 55700}]} +{"epoch": 5.0, "step": 69625, "epoch_duration": 15459.427813053131, "total_accumulated_duration": 78803.78023004532, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}, {"eval_loss": 1.1079821586608887, "eval_runtime": 55.1897, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 3.0, "step": 41775}, {"loss": 0.6472, "grad_norm": 0.8630077838897705, "learning_rate": 0.0002, "epoch": 3.000359066427289, "step": 41780}, {"loss": 0.5843, "grad_norm": 0.8901806473731995, "learning_rate": 0.0002, "epoch": 3.001077199281867, "step": 41790}, {"loss": 0.5789, "grad_norm": 0.8291767835617065, "learning_rate": 0.0002, "epoch": 3.0017953321364454, "step": 41800}, {"loss": 0.6049, "grad_norm": 0.792519211769104, "learning_rate": 0.0002, "epoch": 3.0025134649910235, "step": 41810}, {"loss": 0.6131, "grad_norm": 1.1330063343048096, "learning_rate": 0.0002, "epoch": 3.0032315978456015, "step": 41820}, {"loss": 0.6225, "grad_norm": 0.9401350617408752, "learning_rate": 0.0002, "epoch": 3.0039497307001795, "step": 41830}, {"loss": 0.5924, "grad_norm": 0.8065463304519653, "learning_rate": 0.0002, "epoch": 3.0046678635547575, "step": 41840}, {"loss": 0.6161, "grad_norm": 0.8309979438781738, "learning_rate": 0.0002, "epoch": 3.005385996409336, "step": 41850}, {"loss": 0.6099, "grad_norm": 0.7432689070701599, "learning_rate": 0.0002, "epoch": 3.006104129263914, "step": 41860}, {"loss": 0.5901, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 3.006822262118492, "step": 41870}, {"loss": 0.6211, "grad_norm": 1.4364255666732788, "learning_rate": 0.0002, "epoch": 3.00754039497307, "step": 41880}, {"loss": 0.5988, "grad_norm": 0.9023072123527527, "learning_rate": 0.0002, "epoch": 3.008258527827648, "step": 41890}, {"loss": 0.6296, "grad_norm": 0.7790587544441223, "learning_rate": 0.0002, "epoch": 3.0089766606822264, "step": 41900}, {"loss": 0.5908, "grad_norm": 0.9163706302642822, "learning_rate": 0.0002, "epoch": 3.0096947935368044, "step": 41910}, {"loss": 0.6216, "grad_norm": 0.8147963285446167, "learning_rate": 0.0002, "epoch": 3.0104129263913824, "step": 41920}, {"loss": 0.6546, "grad_norm": 0.8432748913764954, "learning_rate": 0.0002, "epoch": 3.0111310592459604, "step": 41930}, {"loss": 0.5815, "grad_norm": 0.9216182231903076, "learning_rate": 0.0002, "epoch": 3.011849192100539, "step": 41940}, {"loss": 0.6336, "grad_norm": 0.62154221534729, "learning_rate": 0.0002, "epoch": 3.012567324955117, "step": 41950}, {"loss": 0.5868, "grad_norm": 0.8902392387390137, "learning_rate": 0.0002, "epoch": 3.013285457809695, "step": 41960}, {"loss": 0.6205, "grad_norm": 0.9601083993911743, "learning_rate": 0.0002, "epoch": 3.014003590664273, "step": 41970}, {"loss": 0.6001, "grad_norm": 0.8938809037208557, "learning_rate": 0.0002, "epoch": 3.014721723518851, "step": 41980}, {"loss": 0.6215, "grad_norm": 1.0621999502182007, "learning_rate": 0.0002, "epoch": 3.0154398563734293, "step": 41990}, {"loss": 0.6453, "grad_norm": 0.7310585379600525, "learning_rate": 0.0002, "epoch": 3.0161579892280073, "step": 42000}, {"loss": 0.5674, "grad_norm": 0.8475853800773621, "learning_rate": 0.0002, "epoch": 3.0168761220825853, "step": 42010}, {"loss": 0.605, "grad_norm": 0.8509864807128906, "learning_rate": 0.0002, "epoch": 3.0175942549371633, "step": 42020}, {"loss": 0.6487, "grad_norm": 0.7461876273155212, "learning_rate": 0.0002, "epoch": 3.0183123877917413, "step": 42030}, {"loss": 0.6136, "grad_norm": 0.7734265327453613, "learning_rate": 0.0002, "epoch": 3.0190305206463197, "step": 42040}, {"loss": 0.6073, "grad_norm": 0.9056455492973328, "learning_rate": 0.0002, "epoch": 3.0197486535008977, "step": 42050}, {"loss": 0.6015, "grad_norm": 0.9183889031410217, "learning_rate": 0.0002, "epoch": 3.0204667863554757, "step": 42060}, {"loss": 0.6502, "grad_norm": 1.0777326822280884, "learning_rate": 0.0002, "epoch": 3.0211849192100537, "step": 42070}, {"loss": 0.6775, "grad_norm": 0.9217308163642883, "learning_rate": 0.0002, "epoch": 3.021903052064632, "step": 42080}, {"loss": 0.6157, "grad_norm": 0.8220202326774597, "learning_rate": 0.0002, "epoch": 3.02262118491921, "step": 42090}, {"loss": 0.5786, "grad_norm": 0.8454978466033936, "learning_rate": 0.0002, "epoch": 3.023339317773788, "step": 42100}, {"loss": 0.5653, "grad_norm": 0.8116370439529419, "learning_rate": 0.0002, "epoch": 3.024057450628366, "step": 42110}, {"loss": 0.6307, "grad_norm": 0.8064935207366943, "learning_rate": 0.0002, "epoch": 3.024775583482944, "step": 42120}, {"loss": 0.6567, "grad_norm": 0.9718650579452515, "learning_rate": 0.0002, "epoch": 3.0254937163375226, "step": 42130}, {"loss": 0.5936, "grad_norm": 0.8817588090896606, "learning_rate": 0.0002, "epoch": 3.0262118491921006, "step": 42140}, {"loss": 0.5625, "grad_norm": 0.7757318615913391, "learning_rate": 0.0002, "epoch": 3.0269299820466786, "step": 42150}, {"loss": 0.5704, "grad_norm": 0.7500545382499695, "learning_rate": 0.0002, "epoch": 3.0276481149012566, "step": 42160}, {"loss": 0.5635, "grad_norm": 0.72913658618927, "learning_rate": 0.0002, "epoch": 3.0283662477558346, "step": 42170}, {"loss": 0.6354, "grad_norm": 0.7641891837120056, "learning_rate": 0.0002, "epoch": 3.029084380610413, "step": 42180}, {"loss": 0.621, "grad_norm": 0.7682021856307983, "learning_rate": 0.0002, "epoch": 3.029802513464991, "step": 42190}, {"loss": 0.6377, "grad_norm": 0.8145958781242371, "learning_rate": 0.0002, "epoch": 3.030520646319569, "step": 42200}, {"loss": 0.6008, "grad_norm": 1.0546396970748901, "learning_rate": 0.0002, "epoch": 3.031238779174147, "step": 42210}, {"loss": 0.6177, "grad_norm": 0.8222804665565491, "learning_rate": 0.0002, "epoch": 3.0319569120287255, "step": 42220}, {"loss": 0.6264, "grad_norm": 0.8245829343795776, "learning_rate": 0.0002, "epoch": 3.0326750448833035, "step": 42230}, {"loss": 0.5828, "grad_norm": 0.9059963822364807, "learning_rate": 0.0002, "epoch": 3.0333931777378815, "step": 42240}, {"loss": 0.6373, "grad_norm": 1.026747465133667, "learning_rate": 0.0002, "epoch": 3.0341113105924595, "step": 42250}, {"loss": 0.636, "grad_norm": 0.9108404517173767, "learning_rate": 0.0002, "epoch": 3.0348294434470375, "step": 42260}, {"loss": 0.589, "grad_norm": 0.9828516840934753, "learning_rate": 0.0002, "epoch": 3.035547576301616, "step": 42270}, {"loss": 0.6558, "grad_norm": 0.9664266705513, "learning_rate": 0.0002, "epoch": 3.036265709156194, "step": 42280}, {"loss": 0.6157, "grad_norm": 0.7577654719352722, "learning_rate": 0.0002, "epoch": 3.036983842010772, "step": 42290}, {"loss": 0.5849, "grad_norm": 0.8331853151321411, "learning_rate": 0.0002, "epoch": 3.03770197486535, "step": 42300}, {"loss": 0.6335, "grad_norm": 0.8017228245735168, "learning_rate": 0.0002, "epoch": 3.038420107719928, "step": 42310}, {"loss": 0.6148, "grad_norm": 1.0316718816757202, "learning_rate": 0.0002, "epoch": 3.0391382405745064, "step": 42320}, {"loss": 0.5934, "grad_norm": 0.9379803538322449, "learning_rate": 0.0002, "epoch": 3.0398563734290844, "step": 42330}, {"loss": 0.6358, "grad_norm": 0.7554476857185364, "learning_rate": 0.0002, "epoch": 3.0405745062836624, "step": 42340}, {"loss": 0.5951, "grad_norm": 0.7377917766571045, "learning_rate": 0.0002, "epoch": 3.0412926391382404, "step": 42350}, {"loss": 0.5769, "grad_norm": 1.0655276775360107, "learning_rate": 0.0002, "epoch": 3.042010771992819, "step": 42360}, {"loss": 0.5892, "grad_norm": 0.7748511433601379, "learning_rate": 0.0002, "epoch": 3.042728904847397, "step": 42370}, {"loss": 0.6512, "grad_norm": 0.848649799823761, "learning_rate": 0.0002, "epoch": 3.043447037701975, "step": 42380}, {"loss": 0.6411, "grad_norm": 0.7754636406898499, "learning_rate": 0.0002, "epoch": 3.044165170556553, "step": 42390}, {"loss": 0.6665, "grad_norm": 0.8173656463623047, "learning_rate": 0.0002, "epoch": 3.044883303411131, "step": 42400}, {"loss": 0.5877, "grad_norm": 0.7881983518600464, "learning_rate": 0.0002, "epoch": 3.0456014362657093, "step": 42410}, {"loss": 0.5832, "grad_norm": 0.971072256565094, "learning_rate": 0.0002, "epoch": 3.0463195691202873, "step": 42420}, {"loss": 0.6303, "grad_norm": 0.8400143384933472, "learning_rate": 0.0002, "epoch": 3.0470377019748653, "step": 42430}, {"loss": 0.6557, "grad_norm": 1.0028647184371948, "learning_rate": 0.0002, "epoch": 3.0477558348294433, "step": 42440}, {"loss": 0.5949, "grad_norm": 0.9728034734725952, "learning_rate": 0.0002, "epoch": 3.0484739676840213, "step": 42450}, {"loss": 0.6222, "grad_norm": 0.937633752822876, "learning_rate": 0.0002, "epoch": 3.0491921005386, "step": 42460}, {"loss": 0.6254, "grad_norm": 1.0265642404556274, "learning_rate": 0.0002, "epoch": 3.049910233393178, "step": 42470}, {"loss": 0.6078, "grad_norm": 0.9733216762542725, "learning_rate": 0.0002, "epoch": 3.050628366247756, "step": 42480}, {"loss": 0.5766, "grad_norm": 0.7039174437522888, "learning_rate": 0.0002, "epoch": 3.051346499102334, "step": 42490}, {"loss": 0.6422, "grad_norm": 0.7515231370925903, "learning_rate": 0.0002, "epoch": 3.0520646319569122, "step": 42500}, {"loss": 0.5517, "grad_norm": 0.9115300178527832, "learning_rate": 0.0002, "epoch": 3.0527827648114902, "step": 42510}, {"loss": 0.6738, "grad_norm": 0.7403655648231506, "learning_rate": 0.0002, "epoch": 3.0535008976660682, "step": 42520}, {"loss": 0.5528, "grad_norm": 0.7826810479164124, "learning_rate": 0.0002, "epoch": 3.0542190305206462, "step": 42530}, {"loss": 0.6513, "grad_norm": 0.8007349371910095, "learning_rate": 0.0002, "epoch": 3.0549371633752243, "step": 42540}, {"loss": 0.6118, "grad_norm": 0.7975959777832031, "learning_rate": 0.0002, "epoch": 3.0556552962298027, "step": 42550}, {"loss": 0.6157, "grad_norm": 0.9665228128433228, "learning_rate": 0.0002, "epoch": 3.0563734290843807, "step": 42560}, {"loss": 0.6095, "grad_norm": 0.8386123180389404, "learning_rate": 0.0002, "epoch": 3.0570915619389587, "step": 42570}, {"loss": 0.64, "grad_norm": 0.7437782287597656, "learning_rate": 0.0002, "epoch": 3.0578096947935367, "step": 42580}, {"loss": 0.6399, "grad_norm": 0.8360698223114014, "learning_rate": 0.0002, "epoch": 3.0585278276481147, "step": 42590}, {"loss": 0.6259, "grad_norm": 0.8982073664665222, "learning_rate": 0.0002, "epoch": 3.059245960502693, "step": 42600}, {"loss": 0.6235, "grad_norm": 0.9425758719444275, "learning_rate": 0.0002, "epoch": 3.059964093357271, "step": 42610}, {"loss": 0.631, "grad_norm": 0.8567131161689758, "learning_rate": 0.0002, "epoch": 3.060682226211849, "step": 42620}, {"loss": 0.609, "grad_norm": 0.9322942495346069, "learning_rate": 0.0002, "epoch": 3.061400359066427, "step": 42630}, {"loss": 0.6384, "grad_norm": 0.8283235430717468, "learning_rate": 0.0002, "epoch": 3.0621184919210056, "step": 42640}, {"loss": 0.6345, "grad_norm": 0.8457967638969421, "learning_rate": 0.0002, "epoch": 3.0628366247755836, "step": 42650}, {"loss": 0.631, "grad_norm": 0.8205100893974304, "learning_rate": 0.0002, "epoch": 3.0635547576301616, "step": 42660}, {"loss": 0.6094, "grad_norm": 0.8385181427001953, "learning_rate": 0.0002, "epoch": 3.0642728904847396, "step": 42670}, {"loss": 0.6169, "grad_norm": 1.2959390878677368, "learning_rate": 0.0002, "epoch": 3.0649910233393176, "step": 42680}, {"loss": 0.6531, "grad_norm": 0.7150540351867676, "learning_rate": 0.0002, "epoch": 3.065709156193896, "step": 42690}, {"loss": 0.6456, "grad_norm": 0.6647360920906067, "learning_rate": 0.0002, "epoch": 3.066427289048474, "step": 42700}, {"loss": 0.6151, "grad_norm": 0.9148316979408264, "learning_rate": 0.0002, "epoch": 3.067145421903052, "step": 42710}, {"loss": 0.6298, "grad_norm": 0.8606209754943848, "learning_rate": 0.0002, "epoch": 3.06786355475763, "step": 42720}, {"loss": 0.636, "grad_norm": 1.4255632162094116, "learning_rate": 0.0002, "epoch": 3.068581687612208, "step": 42730}, {"loss": 0.6363, "grad_norm": 0.9131710529327393, "learning_rate": 0.0002, "epoch": 3.0692998204667865, "step": 42740}, {"loss": 0.6432, "grad_norm": 0.9560360908508301, "learning_rate": 0.0002, "epoch": 3.0700179533213645, "step": 42750}, {"loss": 0.6259, "grad_norm": 0.9278100728988647, "learning_rate": 0.0002, "epoch": 3.0707360861759425, "step": 42760}, {"loss": 0.6001, "grad_norm": 0.7258471846580505, "learning_rate": 0.0002, "epoch": 3.0714542190305205, "step": 42770}, {"loss": 0.6447, "grad_norm": 1.1537690162658691, "learning_rate": 0.0002, "epoch": 3.072172351885099, "step": 42780}, {"loss": 0.6237, "grad_norm": 0.8562588691711426, "learning_rate": 0.0002, "epoch": 3.072890484739677, "step": 42790}, {"loss": 0.645, "grad_norm": 1.0271626710891724, "learning_rate": 0.0002, "epoch": 3.073608617594255, "step": 42800}, {"loss": 0.6782, "grad_norm": 0.85148024559021, "learning_rate": 0.0002, "epoch": 3.074326750448833, "step": 42810}, {"loss": 0.5905, "grad_norm": 0.805772602558136, "learning_rate": 0.0002, "epoch": 3.075044883303411, "step": 42820}, {"loss": 0.623, "grad_norm": 0.8057122230529785, "learning_rate": 0.0002, "epoch": 3.0757630161579894, "step": 42830}, {"loss": 0.6391, "grad_norm": 0.7997274994850159, "learning_rate": 0.0002, "epoch": 3.0764811490125674, "step": 42840}, {"loss": 0.5965, "grad_norm": 0.8739321231842041, "learning_rate": 0.0002, "epoch": 3.0771992818671454, "step": 42850}, {"loss": 0.6027, "grad_norm": 0.833951473236084, "learning_rate": 0.0002, "epoch": 3.0779174147217234, "step": 42860}, {"loss": 0.6251, "grad_norm": 0.8813839554786682, "learning_rate": 0.0002, "epoch": 3.0786355475763014, "step": 42870}, {"loss": 0.6485, "grad_norm": 0.9020521640777588, "learning_rate": 0.0002, "epoch": 3.07935368043088, "step": 42880}, {"loss": 0.5719, "grad_norm": 0.888148844242096, "learning_rate": 0.0002, "epoch": 3.080071813285458, "step": 42890}, {"loss": 0.6715, "grad_norm": 0.8110589385032654, "learning_rate": 0.0002, "epoch": 3.080789946140036, "step": 42900}, {"loss": 0.5931, "grad_norm": 0.818738579750061, "learning_rate": 0.0002, "epoch": 3.081508078994614, "step": 42910}, {"loss": 0.6723, "grad_norm": 0.9607479572296143, "learning_rate": 0.0002, "epoch": 3.082226211849192, "step": 42920}, {"loss": 0.6045, "grad_norm": 0.8162698745727539, "learning_rate": 0.0002, "epoch": 3.0829443447037703, "step": 42930}, {"loss": 0.5975, "grad_norm": 0.8170801997184753, "learning_rate": 0.0002, "epoch": 3.0836624775583483, "step": 42940}, {"loss": 0.5748, "grad_norm": 0.9250763654708862, "learning_rate": 0.0002, "epoch": 3.0843806104129263, "step": 42950}, {"loss": 0.6651, "grad_norm": 0.898097813129425, "learning_rate": 0.0002, "epoch": 3.0850987432675043, "step": 42960}, {"loss": 0.6573, "grad_norm": 0.9398433566093445, "learning_rate": 0.0002, "epoch": 3.0858168761220828, "step": 42970}, {"loss": 0.6243, "grad_norm": 1.052808165550232, "learning_rate": 0.0002, "epoch": 3.0865350089766608, "step": 42980}, {"loss": 0.6622, "grad_norm": 0.8974723219871521, "learning_rate": 0.0002, "epoch": 3.087253141831239, "step": 42990}, {"loss": 0.6135, "grad_norm": 0.7517408728599548, "learning_rate": 0.0002, "epoch": 3.087971274685817, "step": 43000}, {"loss": 0.6185, "grad_norm": 0.8054485321044922, "learning_rate": 0.0002, "epoch": 3.088689407540395, "step": 43010}, {"loss": 0.6199, "grad_norm": 0.9896154999732971, "learning_rate": 0.0002, "epoch": 3.0894075403949732, "step": 43020}, {"loss": 0.6308, "grad_norm": 0.7887356281280518, "learning_rate": 0.0002, "epoch": 3.0901256732495512, "step": 43030}, {"loss": 0.6173, "grad_norm": 1.0119125843048096, "learning_rate": 0.0002, "epoch": 3.0908438061041292, "step": 43040}, {"loss": 0.6294, "grad_norm": 0.8753892779350281, "learning_rate": 0.0002, "epoch": 3.0915619389587072, "step": 43050}, {"loss": 0.6068, "grad_norm": 0.8322654962539673, "learning_rate": 0.0002, "epoch": 3.0922800718132857, "step": 43060}, {"loss": 0.6237, "grad_norm": 1.0605992078781128, "learning_rate": 0.0002, "epoch": 3.0929982046678637, "step": 43070}, {"loss": 0.6507, "grad_norm": 0.8783912062644958, "learning_rate": 0.0002, "epoch": 3.0937163375224417, "step": 43080}, {"loss": 0.6023, "grad_norm": 0.8839107751846313, "learning_rate": 0.0002, "epoch": 3.0944344703770197, "step": 43090}, {"loss": 0.6588, "grad_norm": 1.1655086278915405, "learning_rate": 0.0002, "epoch": 3.0951526032315977, "step": 43100}, {"loss": 0.6367, "grad_norm": 0.7051523327827454, "learning_rate": 0.0002, "epoch": 3.095870736086176, "step": 43110}, {"loss": 0.5941, "grad_norm": 0.7793807983398438, "learning_rate": 0.0002, "epoch": 3.096588868940754, "step": 43120}, {"loss": 0.6073, "grad_norm": 0.8352194428443909, "learning_rate": 0.0002, "epoch": 3.097307001795332, "step": 43130}, {"loss": 0.6087, "grad_norm": 0.9684847593307495, "learning_rate": 0.0002, "epoch": 3.09802513464991, "step": 43140}, {"loss": 0.6347, "grad_norm": 1.1106340885162354, "learning_rate": 0.0002, "epoch": 3.098743267504488, "step": 43150}, {"loss": 0.6395, "grad_norm": 0.7814911603927612, "learning_rate": 0.0002, "epoch": 3.0994614003590666, "step": 43160}, {"loss": 0.637, "grad_norm": 0.7923110723495483, "learning_rate": 0.0002, "epoch": 3.1001795332136446, "step": 43170}, {"loss": 0.6218, "grad_norm": 0.87022864818573, "learning_rate": 0.0002, "epoch": 3.1008976660682226, "step": 43180}, {"loss": 0.6246, "grad_norm": 0.9352855682373047, "learning_rate": 0.0002, "epoch": 3.1016157989228006, "step": 43190}, {"loss": 0.5943, "grad_norm": 0.8548445105552673, "learning_rate": 0.0002, "epoch": 3.1023339317773786, "step": 43200}, {"loss": 0.6106, "grad_norm": 0.9576025009155273, "learning_rate": 0.0002, "epoch": 3.103052064631957, "step": 43210}, {"loss": 0.6222, "grad_norm": 0.7430430054664612, "learning_rate": 0.0002, "epoch": 3.103770197486535, "step": 43220}, {"loss": 0.6223, "grad_norm": 0.9619144797325134, "learning_rate": 0.0002, "epoch": 3.104488330341113, "step": 43230}, {"loss": 0.6171, "grad_norm": 0.8622338771820068, "learning_rate": 0.0002, "epoch": 3.105206463195691, "step": 43240}, {"loss": 0.6336, "grad_norm": 0.853489339351654, "learning_rate": 0.0002, "epoch": 3.1059245960502695, "step": 43250}, {"loss": 0.635, "grad_norm": 0.9253206849098206, "learning_rate": 0.0002, "epoch": 3.1066427289048475, "step": 43260}, {"loss": 0.68, "grad_norm": 0.9700671434402466, "learning_rate": 0.0002, "epoch": 3.1073608617594255, "step": 43270}, {"loss": 0.6284, "grad_norm": 1.0550731420516968, "learning_rate": 0.0002, "epoch": 3.1080789946140035, "step": 43280}, {"loss": 0.6389, "grad_norm": 0.939452052116394, "learning_rate": 0.0002, "epoch": 3.1087971274685815, "step": 43290}, {"loss": 0.621, "grad_norm": 0.8855276107788086, "learning_rate": 0.0002, "epoch": 3.10951526032316, "step": 43300}, {"loss": 0.5814, "grad_norm": 0.92197185754776, "learning_rate": 0.0002, "epoch": 3.110233393177738, "step": 43310}, {"loss": 0.6341, "grad_norm": 0.8825578689575195, "learning_rate": 0.0002, "epoch": 3.110951526032316, "step": 43320}, {"loss": 0.6412, "grad_norm": 0.9964608550071716, "learning_rate": 0.0002, "epoch": 3.111669658886894, "step": 43330}, {"loss": 0.6074, "grad_norm": 0.9070520401000977, "learning_rate": 0.0002, "epoch": 3.1123877917414724, "step": 43340}, {"loss": 0.6503, "grad_norm": 0.9699633717536926, "learning_rate": 0.0002, "epoch": 3.1131059245960504, "step": 43350}, {"loss": 0.6545, "grad_norm": 0.7384091019630432, "learning_rate": 0.0002, "epoch": 3.1138240574506284, "step": 43360}, {"loss": 0.6644, "grad_norm": 0.9445326328277588, "learning_rate": 0.0002, "epoch": 3.1145421903052064, "step": 43370}, {"loss": 0.6088, "grad_norm": 0.8906524181365967, "learning_rate": 0.0002, "epoch": 3.1152603231597844, "step": 43380}, {"loss": 0.6213, "grad_norm": 0.8850129246711731, "learning_rate": 0.0002, "epoch": 3.115978456014363, "step": 43390}, {"loss": 0.6156, "grad_norm": 0.7091860771179199, "learning_rate": 0.0002, "epoch": 3.116696588868941, "step": 43400}, {"loss": 0.6056, "grad_norm": 0.8992764949798584, "learning_rate": 0.0002, "epoch": 3.117414721723519, "step": 43410}, {"loss": 0.6336, "grad_norm": 0.9166698455810547, "learning_rate": 0.0002, "epoch": 3.118132854578097, "step": 43420}, {"loss": 0.7011, "grad_norm": 1.1195749044418335, "learning_rate": 0.0002, "epoch": 3.118850987432675, "step": 43430}, {"loss": 0.6409, "grad_norm": 0.9414069652557373, "learning_rate": 0.0002, "epoch": 3.1195691202872533, "step": 43440}, {"loss": 0.6533, "grad_norm": 0.7641217112541199, "learning_rate": 0.0002, "epoch": 3.1202872531418313, "step": 43450}, {"loss": 0.6613, "grad_norm": 1.2659285068511963, "learning_rate": 0.0002, "epoch": 3.1210053859964093, "step": 43460}, {"loss": 0.631, "grad_norm": 0.9968213438987732, "learning_rate": 0.0002, "epoch": 3.1217235188509873, "step": 43470}, {"loss": 0.5833, "grad_norm": 0.8819042444229126, "learning_rate": 0.0002, "epoch": 3.1224416517055653, "step": 43480}, {"loss": 0.6819, "grad_norm": 0.9124775528907776, "learning_rate": 0.0002, "epoch": 3.1231597845601438, "step": 43490}, {"loss": 0.675, "grad_norm": 0.868354082107544, "learning_rate": 0.0002, "epoch": 3.1238779174147218, "step": 43500}, {"loss": 0.6348, "grad_norm": 0.7367526292800903, "learning_rate": 0.0002, "epoch": 3.1245960502692998, "step": 43510}, {"loss": 0.6068, "grad_norm": 0.7553679943084717, "learning_rate": 0.0002, "epoch": 3.1253141831238778, "step": 43520}, {"loss": 0.6346, "grad_norm": 0.7970008850097656, "learning_rate": 0.0002, "epoch": 3.126032315978456, "step": 43530}, {"loss": 0.6357, "grad_norm": 0.9117488861083984, "learning_rate": 0.0002, "epoch": 3.126750448833034, "step": 43540}, {"loss": 0.6609, "grad_norm": 0.8004103899002075, "learning_rate": 0.0002, "epoch": 3.127468581687612, "step": 43550}, {"loss": 0.596, "grad_norm": 0.736518919467926, "learning_rate": 0.0002, "epoch": 3.12818671454219, "step": 43560}, {"loss": 0.5945, "grad_norm": 0.8568395376205444, "learning_rate": 0.0002, "epoch": 3.128904847396768, "step": 43570}, {"loss": 0.665, "grad_norm": 0.9344052672386169, "learning_rate": 0.0002, "epoch": 3.1296229802513467, "step": 43580}, {"loss": 0.6403, "grad_norm": 0.7986525297164917, "learning_rate": 0.0002, "epoch": 3.1303411131059247, "step": 43590}, {"loss": 0.61, "grad_norm": 0.8283242583274841, "learning_rate": 0.0002, "epoch": 3.1310592459605027, "step": 43600}, {"loss": 0.6003, "grad_norm": 0.6534292101860046, "learning_rate": 0.0002, "epoch": 3.1317773788150807, "step": 43610}, {"loss": 0.6994, "grad_norm": 0.9585428833961487, "learning_rate": 0.0002, "epoch": 3.132495511669659, "step": 43620}, {"loss": 0.6007, "grad_norm": 0.8299157023429871, "learning_rate": 0.0002, "epoch": 3.133213644524237, "step": 43630}, {"loss": 0.6169, "grad_norm": 0.9050052762031555, "learning_rate": 0.0002, "epoch": 3.133931777378815, "step": 43640}, {"loss": 0.6217, "grad_norm": 1.0457062721252441, "learning_rate": 0.0002, "epoch": 3.134649910233393, "step": 43650}, {"loss": 0.6147, "grad_norm": 0.907691240310669, "learning_rate": 0.0002, "epoch": 3.135368043087971, "step": 43660}, {"loss": 0.5808, "grad_norm": 0.8868935108184814, "learning_rate": 0.0002, "epoch": 3.1360861759425496, "step": 43670}, {"loss": 0.6427, "grad_norm": 0.8585456609725952, "learning_rate": 0.0002, "epoch": 3.1368043087971276, "step": 43680}, {"loss": 0.6242, "grad_norm": 1.0402741432189941, "learning_rate": 0.0002, "epoch": 3.1375224416517056, "step": 43690}, {"loss": 0.641, "grad_norm": 1.0866798162460327, "learning_rate": 0.0002, "epoch": 3.1382405745062836, "step": 43700}, {"loss": 0.6082, "grad_norm": 0.7637296915054321, "learning_rate": 0.0002, "epoch": 3.1389587073608616, "step": 43710}, {"loss": 0.6256, "grad_norm": 0.755235493183136, "learning_rate": 0.0002, "epoch": 3.13967684021544, "step": 43720}, {"loss": 0.6441, "grad_norm": 0.7258853316307068, "learning_rate": 0.0002, "epoch": 3.140394973070018, "step": 43730}, {"loss": 0.5891, "grad_norm": 1.0425268411636353, "learning_rate": 0.0002, "epoch": 3.141113105924596, "step": 43740}, {"loss": 0.6527, "grad_norm": 0.9171959757804871, "learning_rate": 0.0002, "epoch": 3.141831238779174, "step": 43750}, {"loss": 0.6365, "grad_norm": 0.8900150656700134, "learning_rate": 0.0002, "epoch": 3.142549371633752, "step": 43760}, {"loss": 0.6324, "grad_norm": 0.9879246354103088, "learning_rate": 0.0002, "epoch": 3.1432675044883305, "step": 43770}, {"loss": 0.6624, "grad_norm": 0.7853389382362366, "learning_rate": 0.0002, "epoch": 3.1439856373429085, "step": 43780}, {"loss": 0.6259, "grad_norm": 1.0245232582092285, "learning_rate": 0.0002, "epoch": 3.1447037701974865, "step": 43790}, {"loss": 0.6278, "grad_norm": 0.8486390113830566, "learning_rate": 0.0002, "epoch": 3.1454219030520645, "step": 43800}, {"loss": 0.6175, "grad_norm": 0.8536406755447388, "learning_rate": 0.0002, "epoch": 3.146140035906643, "step": 43810}, {"loss": 0.5901, "grad_norm": 0.9653734564781189, "learning_rate": 0.0002, "epoch": 3.146858168761221, "step": 43820}, {"loss": 0.6041, "grad_norm": 0.8292608857154846, "learning_rate": 0.0002, "epoch": 3.147576301615799, "step": 43830}, {"loss": 0.6688, "grad_norm": 1.147524118423462, "learning_rate": 0.0002, "epoch": 3.148294434470377, "step": 43840}, {"loss": 0.6155, "grad_norm": 0.9317546486854553, "learning_rate": 0.0002, "epoch": 3.149012567324955, "step": 43850}, {"loss": 0.6305, "grad_norm": 0.8651045560836792, "learning_rate": 0.0002, "epoch": 3.1497307001795334, "step": 43860}, {"loss": 0.5985, "grad_norm": 0.8718969225883484, "learning_rate": 0.0002, "epoch": 3.1504488330341114, "step": 43870}, {"loss": 0.6206, "grad_norm": 1.0140702724456787, "learning_rate": 0.0002, "epoch": 3.1511669658886894, "step": 43880}, {"loss": 0.5941, "grad_norm": 0.75941401720047, "learning_rate": 0.0002, "epoch": 3.1518850987432674, "step": 43890}, {"loss": 0.5957, "grad_norm": 0.6618940234184265, "learning_rate": 0.0002, "epoch": 3.152603231597846, "step": 43900}, {"loss": 0.6262, "grad_norm": 1.0013338327407837, "learning_rate": 0.0002, "epoch": 3.153321364452424, "step": 43910}, {"loss": 0.6263, "grad_norm": 0.8735299706459045, "learning_rate": 0.0002, "epoch": 3.154039497307002, "step": 43920}, {"loss": 0.627, "grad_norm": 1.141914963722229, "learning_rate": 0.0002, "epoch": 3.15475763016158, "step": 43930}, {"loss": 0.6604, "grad_norm": 1.0916038751602173, "learning_rate": 0.0002, "epoch": 3.155475763016158, "step": 43940}, {"loss": 0.6228, "grad_norm": 0.7042547464370728, "learning_rate": 0.0002, "epoch": 3.1561938958707363, "step": 43950}, {"loss": 0.6069, "grad_norm": 0.9885236620903015, "learning_rate": 0.0002, "epoch": 3.1569120287253143, "step": 43960}, {"loss": 0.5973, "grad_norm": 0.8083009719848633, "learning_rate": 0.0002, "epoch": 3.1576301615798923, "step": 43970}, {"loss": 0.6416, "grad_norm": 1.082627296447754, "learning_rate": 0.0002, "epoch": 3.1583482944344703, "step": 43980}, {"loss": 0.624, "grad_norm": 0.9293290376663208, "learning_rate": 0.0002, "epoch": 3.1590664272890483, "step": 43990}, {"loss": 0.5665, "grad_norm": 0.861003041267395, "learning_rate": 0.0002, "epoch": 3.1597845601436267, "step": 44000}, {"loss": 0.6221, "grad_norm": 0.9565994143486023, "learning_rate": 0.0002, "epoch": 3.1605026929982047, "step": 44010}, {"loss": 0.7038, "grad_norm": 0.9609305262565613, "learning_rate": 0.0002, "epoch": 3.1612208258527827, "step": 44020}, {"loss": 0.6064, "grad_norm": 0.847830593585968, "learning_rate": 0.0002, "epoch": 3.1619389587073607, "step": 44030}, {"loss": 0.6299, "grad_norm": 0.852357804775238, "learning_rate": 0.0002, "epoch": 3.1626570915619387, "step": 44040}, {"loss": 0.5943, "grad_norm": 0.8634562492370605, "learning_rate": 0.0002, "epoch": 3.163375224416517, "step": 44050}, {"loss": 0.6011, "grad_norm": 1.0259950160980225, "learning_rate": 0.0002, "epoch": 3.164093357271095, "step": 44060}, {"loss": 0.7039, "grad_norm": 0.9615250825881958, "learning_rate": 0.0002, "epoch": 3.164811490125673, "step": 44070}, {"loss": 0.6179, "grad_norm": 0.9892165660858154, "learning_rate": 0.0002, "epoch": 3.165529622980251, "step": 44080}, {"loss": 0.6295, "grad_norm": 0.8827354907989502, "learning_rate": 0.0002, "epoch": 3.1662477558348296, "step": 44090}, {"loss": 0.6131, "grad_norm": 0.9258168339729309, "learning_rate": 0.0002, "epoch": 3.1669658886894076, "step": 44100}, {"loss": 0.5746, "grad_norm": 0.7983399033546448, "learning_rate": 0.0002, "epoch": 3.1676840215439857, "step": 44110}, {"loss": 0.6075, "grad_norm": 0.9917809963226318, "learning_rate": 0.0002, "epoch": 3.1684021543985637, "step": 44120}, {"loss": 0.6474, "grad_norm": 1.058927297592163, "learning_rate": 0.0002, "epoch": 3.1691202872531417, "step": 44130}, {"loss": 0.6211, "grad_norm": 1.0095895528793335, "learning_rate": 0.0002, "epoch": 3.16983842010772, "step": 44140}, {"loss": 0.6586, "grad_norm": 0.9032495617866516, "learning_rate": 0.0002, "epoch": 3.170556552962298, "step": 44150}, {"loss": 0.6356, "grad_norm": 0.9391272664070129, "learning_rate": 0.0002, "epoch": 3.171274685816876, "step": 44160}, {"loss": 0.6324, "grad_norm": 0.990755558013916, "learning_rate": 0.0002, "epoch": 3.171992818671454, "step": 44170}, {"loss": 0.5647, "grad_norm": 0.9310759902000427, "learning_rate": 0.0002, "epoch": 3.172710951526032, "step": 44180}, {"loss": 0.6802, "grad_norm": 0.7698856592178345, "learning_rate": 0.0002, "epoch": 3.1734290843806106, "step": 44190}, {"loss": 0.6109, "grad_norm": 0.7735867500305176, "learning_rate": 0.0002, "epoch": 3.1741472172351886, "step": 44200}, {"loss": 0.6252, "grad_norm": 1.1447525024414062, "learning_rate": 0.0002, "epoch": 3.1748653500897666, "step": 44210}, {"loss": 0.6268, "grad_norm": 0.8667060136795044, "learning_rate": 0.0002, "epoch": 3.1755834829443446, "step": 44220}, {"loss": 0.6066, "grad_norm": 0.8596829771995544, "learning_rate": 0.0002, "epoch": 3.176301615798923, "step": 44230}, {"loss": 0.6142, "grad_norm": 0.8607654571533203, "learning_rate": 0.0002, "epoch": 3.177019748653501, "step": 44240}, {"loss": 0.6358, "grad_norm": 0.9346948266029358, "learning_rate": 0.0002, "epoch": 3.177737881508079, "step": 44250}, {"loss": 0.6099, "grad_norm": 0.852344810962677, "learning_rate": 0.0002, "epoch": 3.178456014362657, "step": 44260}, {"loss": 0.5759, "grad_norm": 0.9260450005531311, "learning_rate": 0.0002, "epoch": 3.179174147217235, "step": 44270}, {"loss": 0.6419, "grad_norm": 0.924053430557251, "learning_rate": 0.0002, "epoch": 3.1798922800718135, "step": 44280}, {"loss": 0.6456, "grad_norm": 1.001965045928955, "learning_rate": 0.0002, "epoch": 3.1806104129263915, "step": 44290}, {"loss": 0.6211, "grad_norm": 0.943215012550354, "learning_rate": 0.0002, "epoch": 3.1813285457809695, "step": 44300}, {"loss": 0.6261, "grad_norm": 1.006977915763855, "learning_rate": 0.0002, "epoch": 3.1820466786355475, "step": 44310}, {"loss": 0.6684, "grad_norm": 0.9768950343132019, "learning_rate": 0.0002, "epoch": 3.1827648114901255, "step": 44320}, {"loss": 0.6334, "grad_norm": 0.9297489523887634, "learning_rate": 0.0002, "epoch": 3.183482944344704, "step": 44330}, {"loss": 0.6291, "grad_norm": 0.9110919237136841, "learning_rate": 0.0002, "epoch": 3.184201077199282, "step": 44340}, {"loss": 0.6389, "grad_norm": 0.9821381568908691, "learning_rate": 0.0002, "epoch": 3.18491921005386, "step": 44350}, {"loss": 0.6342, "grad_norm": 0.8451243042945862, "learning_rate": 0.0002, "epoch": 3.185637342908438, "step": 44360}, {"loss": 0.6709, "grad_norm": 0.9676638245582581, "learning_rate": 0.0002, "epoch": 3.1863554757630164, "step": 44370}, {"loss": 0.6506, "grad_norm": 0.9826035499572754, "learning_rate": 0.0002, "epoch": 3.1870736086175944, "step": 44380}, {"loss": 0.6425, "grad_norm": 0.9453121423721313, "learning_rate": 0.0002, "epoch": 3.1877917414721724, "step": 44390}, {"loss": 0.6481, "grad_norm": 0.7766330242156982, "learning_rate": 0.0002, "epoch": 3.1885098743267504, "step": 44400}, {"loss": 0.6369, "grad_norm": 0.9302349090576172, "learning_rate": 0.0002, "epoch": 3.1892280071813284, "step": 44410}, {"loss": 0.5586, "grad_norm": 0.8335331082344055, "learning_rate": 0.0002, "epoch": 3.189946140035907, "step": 44420}, {"loss": 0.673, "grad_norm": 0.6722736358642578, "learning_rate": 0.0002, "epoch": 3.190664272890485, "step": 44430}, {"loss": 0.6809, "grad_norm": 0.9047536849975586, "learning_rate": 0.0002, "epoch": 3.191382405745063, "step": 44440}, {"loss": 0.6085, "grad_norm": 0.9653822183609009, "learning_rate": 0.0002, "epoch": 3.192100538599641, "step": 44450}, {"loss": 0.6071, "grad_norm": 0.7750703692436218, "learning_rate": 0.0002, "epoch": 3.192818671454219, "step": 44460}, {"loss": 0.6323, "grad_norm": 0.7767539024353027, "learning_rate": 0.0002, "epoch": 3.1935368043087973, "step": 44470}, {"loss": 0.6471, "grad_norm": 0.8597778081893921, "learning_rate": 0.0002, "epoch": 3.1942549371633753, "step": 44480}, {"loss": 0.6804, "grad_norm": 1.1711493730545044, "learning_rate": 0.0002, "epoch": 3.1949730700179533, "step": 44490}, {"loss": 0.5917, "grad_norm": 0.9025220274925232, "learning_rate": 0.0002, "epoch": 3.1956912028725313, "step": 44500}, {"loss": 0.6445, "grad_norm": 0.8084979057312012, "learning_rate": 0.0002, "epoch": 3.1964093357271093, "step": 44510}, {"loss": 0.5943, "grad_norm": 0.8475074172019958, "learning_rate": 0.0002, "epoch": 3.1971274685816877, "step": 44520}, {"loss": 0.5959, "grad_norm": 0.9915644526481628, "learning_rate": 0.0002, "epoch": 3.1978456014362657, "step": 44530}, {"loss": 0.627, "grad_norm": 0.992231547832489, "learning_rate": 0.0002, "epoch": 3.1985637342908437, "step": 44540}, {"loss": 0.625, "grad_norm": 0.9804556369781494, "learning_rate": 0.0002, "epoch": 3.1992818671454217, "step": 44550}, {"loss": 0.6534, "grad_norm": 1.045558214187622, "learning_rate": 0.0002, "epoch": 3.2, "step": 44560}, {"loss": 0.6201, "grad_norm": 1.0880261659622192, "learning_rate": 0.0002, "epoch": 3.200718132854578, "step": 44570}, {"loss": 0.6471, "grad_norm": 0.9511138200759888, "learning_rate": 0.0002, "epoch": 3.201436265709156, "step": 44580}, {"loss": 0.5961, "grad_norm": 0.9115344882011414, "learning_rate": 0.0002, "epoch": 3.202154398563734, "step": 44590}, {"loss": 0.6504, "grad_norm": 1.0738362073898315, "learning_rate": 0.0002, "epoch": 3.202872531418312, "step": 44600}, {"loss": 0.6324, "grad_norm": 0.8209697604179382, "learning_rate": 0.0002, "epoch": 3.2035906642728906, "step": 44610}, {"loss": 0.6445, "grad_norm": 0.9220197796821594, "learning_rate": 0.0002, "epoch": 3.2043087971274686, "step": 44620}, {"loss": 0.5798, "grad_norm": 0.8859700560569763, "learning_rate": 0.0002, "epoch": 3.2050269299820466, "step": 44630}, {"loss": 0.6185, "grad_norm": 0.9772757291793823, "learning_rate": 0.0002, "epoch": 3.2057450628366246, "step": 44640}, {"loss": 0.6528, "grad_norm": 0.9385574460029602, "learning_rate": 0.0002, "epoch": 3.206463195691203, "step": 44650}, {"loss": 0.6098, "grad_norm": 0.839958906173706, "learning_rate": 0.0002, "epoch": 3.207181328545781, "step": 44660}, {"loss": 0.6803, "grad_norm": 0.860478401184082, "learning_rate": 0.0002, "epoch": 3.207899461400359, "step": 44670}, {"loss": 0.683, "grad_norm": 0.846886396408081, "learning_rate": 0.0002, "epoch": 3.208617594254937, "step": 44680}, {"loss": 0.6312, "grad_norm": 0.8591006398200989, "learning_rate": 0.0002, "epoch": 3.209335727109515, "step": 44690}, {"loss": 0.6173, "grad_norm": 0.9236023426055908, "learning_rate": 0.0002, "epoch": 3.2100538599640935, "step": 44700}, {"loss": 0.6471, "grad_norm": 0.7348999977111816, "learning_rate": 0.0002, "epoch": 3.2107719928186715, "step": 44710}, {"loss": 0.6239, "grad_norm": 1.0041730403900146, "learning_rate": 0.0002, "epoch": 3.2114901256732495, "step": 44720}, {"loss": 0.6612, "grad_norm": 0.8382687568664551, "learning_rate": 0.0002, "epoch": 3.2122082585278275, "step": 44730}, {"loss": 0.6026, "grad_norm": 0.8253511190414429, "learning_rate": 0.0002, "epoch": 3.2129263913824055, "step": 44740}, {"loss": 0.6129, "grad_norm": 0.9589242935180664, "learning_rate": 0.0002, "epoch": 3.213644524236984, "step": 44750}, {"loss": 0.6476, "grad_norm": 0.8938157558441162, "learning_rate": 0.0002, "epoch": 3.214362657091562, "step": 44760}, {"loss": 0.6811, "grad_norm": 1.0085135698318481, "learning_rate": 0.0002, "epoch": 3.21508078994614, "step": 44770}, {"loss": 0.646, "grad_norm": 0.8647134304046631, "learning_rate": 0.0002, "epoch": 3.215798922800718, "step": 44780}, {"loss": 0.6169, "grad_norm": 1.09453284740448, "learning_rate": 0.0002, "epoch": 3.216517055655296, "step": 44790}, {"loss": 0.6156, "grad_norm": 0.8710666298866272, "learning_rate": 0.0002, "epoch": 3.2172351885098744, "step": 44800}, {"loss": 0.662, "grad_norm": 0.8080880641937256, "learning_rate": 0.0002, "epoch": 3.2179533213644524, "step": 44810}, {"loss": 0.6039, "grad_norm": 1.0440675020217896, "learning_rate": 0.0002, "epoch": 3.2186714542190304, "step": 44820}, {"loss": 0.6629, "grad_norm": 1.1036376953125, "learning_rate": 0.0002, "epoch": 3.2193895870736084, "step": 44830}, {"loss": 0.6474, "grad_norm": 0.8783546686172485, "learning_rate": 0.0002, "epoch": 3.220107719928187, "step": 44840}, {"loss": 0.6286, "grad_norm": 0.7816855907440186, "learning_rate": 0.0002, "epoch": 3.220825852782765, "step": 44850}, {"loss": 0.622, "grad_norm": 1.0099157094955444, "learning_rate": 0.0002, "epoch": 3.221543985637343, "step": 44860}, {"loss": 0.6668, "grad_norm": 1.054928183555603, "learning_rate": 0.0002, "epoch": 3.222262118491921, "step": 44870}, {"loss": 0.6104, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 3.222980251346499, "step": 44880}, {"loss": 0.686, "grad_norm": 0.9730798602104187, "learning_rate": 0.0002, "epoch": 3.2236983842010773, "step": 44890}, {"loss": 0.6533, "grad_norm": 0.7911382913589478, "learning_rate": 0.0002, "epoch": 3.2244165170556554, "step": 44900}, {"loss": 0.6466, "grad_norm": 0.9574400782585144, "learning_rate": 0.0002, "epoch": 3.2251346499102334, "step": 44910}, {"loss": 0.693, "grad_norm": 0.8101068139076233, "learning_rate": 0.0002, "epoch": 3.2258527827648114, "step": 44920}, {"loss": 0.6605, "grad_norm": 0.754146933555603, "learning_rate": 0.0002, "epoch": 3.22657091561939, "step": 44930}, {"loss": 0.6317, "grad_norm": 0.7471939921379089, "learning_rate": 0.0002, "epoch": 3.227289048473968, "step": 44940}, {"loss": 0.6378, "grad_norm": 1.0040855407714844, "learning_rate": 0.0002, "epoch": 3.228007181328546, "step": 44950}, {"loss": 0.6496, "grad_norm": 1.0016074180603027, "learning_rate": 0.0002, "epoch": 3.228725314183124, "step": 44960}, {"loss": 0.6, "grad_norm": 1.0432976484298706, "learning_rate": 0.0002, "epoch": 3.229443447037702, "step": 44970}, {"loss": 0.635, "grad_norm": 0.8517055511474609, "learning_rate": 0.0002, "epoch": 3.2301615798922803, "step": 44980}, {"loss": 0.6168, "grad_norm": 0.9174178242683411, "learning_rate": 0.0002, "epoch": 3.2308797127468583, "step": 44990}, {"loss": 0.6325, "grad_norm": 0.9733774065971375, "learning_rate": 0.0002, "epoch": 3.2315978456014363, "step": 45000}, {"loss": 0.6743, "grad_norm": 0.9074714779853821, "learning_rate": 0.0002, "epoch": 3.2323159784560143, "step": 45010}, {"loss": 0.6372, "grad_norm": 0.8802759051322937, "learning_rate": 0.0002, "epoch": 3.2330341113105923, "step": 45020}, {"loss": 0.6189, "grad_norm": 1.0620871782302856, "learning_rate": 0.0002, "epoch": 3.2337522441651707, "step": 45030}, {"loss": 0.6201, "grad_norm": 0.8069542050361633, "learning_rate": 0.0002, "epoch": 3.2344703770197487, "step": 45040}, {"loss": 0.618, "grad_norm": 0.9139137864112854, "learning_rate": 0.0002, "epoch": 3.2351885098743267, "step": 45050}, {"loss": 0.6389, "grad_norm": 0.8936411142349243, "learning_rate": 0.0002, "epoch": 3.2359066427289047, "step": 45060}, {"loss": 0.6602, "grad_norm": 0.9098079204559326, "learning_rate": 0.0002, "epoch": 3.2366247755834827, "step": 45070}, {"loss": 0.6423, "grad_norm": 1.062953233718872, "learning_rate": 0.0002, "epoch": 3.237342908438061, "step": 45080}, {"loss": 0.6527, "grad_norm": 0.8656470775604248, "learning_rate": 0.0002, "epoch": 3.238061041292639, "step": 45090}, {"loss": 0.6362, "grad_norm": 0.9299449920654297, "learning_rate": 0.0002, "epoch": 3.238779174147217, "step": 45100}, {"loss": 0.6469, "grad_norm": 1.0102022886276245, "learning_rate": 0.0002, "epoch": 3.239497307001795, "step": 45110}, {"loss": 0.5984, "grad_norm": 0.8074561953544617, "learning_rate": 0.0002, "epoch": 3.2402154398563736, "step": 45120}, {"loss": 0.6196, "grad_norm": 1.044105887413025, "learning_rate": 0.0002, "epoch": 3.2409335727109516, "step": 45130}, {"loss": 0.6471, "grad_norm": 0.8742762207984924, "learning_rate": 0.0002, "epoch": 3.2416517055655296, "step": 45140}, {"loss": 0.648, "grad_norm": 0.8240015506744385, "learning_rate": 0.0002, "epoch": 3.2423698384201076, "step": 45150}, {"loss": 0.6599, "grad_norm": 0.8438951373100281, "learning_rate": 0.0002, "epoch": 3.2430879712746856, "step": 45160}, {"loss": 0.6406, "grad_norm": 1.02358877658844, "learning_rate": 0.0002, "epoch": 3.243806104129264, "step": 45170}, {"loss": 0.6581, "grad_norm": 0.8824774026870728, "learning_rate": 0.0002, "epoch": 3.244524236983842, "step": 45180}, {"loss": 0.658, "grad_norm": 0.971015989780426, "learning_rate": 0.0002, "epoch": 3.24524236983842, "step": 45190}, {"loss": 0.6473, "grad_norm": 0.9282383918762207, "learning_rate": 0.0002, "epoch": 3.245960502692998, "step": 45200}, {"loss": 0.6376, "grad_norm": 0.7908362746238708, "learning_rate": 0.0002, "epoch": 3.2466786355475765, "step": 45210}, {"loss": 0.6765, "grad_norm": 1.0721662044525146, "learning_rate": 0.0002, "epoch": 3.2473967684021545, "step": 45220}, {"loss": 0.7102, "grad_norm": 0.9516810774803162, "learning_rate": 0.0002, "epoch": 3.2481149012567325, "step": 45230}, {"loss": 0.6332, "grad_norm": 0.7914131283760071, "learning_rate": 0.0002, "epoch": 3.2488330341113105, "step": 45240}, {"loss": 0.6018, "grad_norm": 0.8492292761802673, "learning_rate": 0.0002, "epoch": 3.2495511669658885, "step": 45250}, {"loss": 0.6272, "grad_norm": 0.8880114555358887, "learning_rate": 0.0002, "epoch": 3.250269299820467, "step": 45260}, {"loss": 0.6394, "grad_norm": 0.7808310985565186, "learning_rate": 0.0002, "epoch": 3.250987432675045, "step": 45270}, {"loss": 0.6161, "grad_norm": 0.8566828966140747, "learning_rate": 0.0002, "epoch": 3.251705565529623, "step": 45280}, {"loss": 0.6408, "grad_norm": 0.7929658889770508, "learning_rate": 0.0002, "epoch": 3.252423698384201, "step": 45290}, {"loss": 0.6182, "grad_norm": 0.678207516670227, "learning_rate": 0.0002, "epoch": 3.253141831238779, "step": 45300}, {"loss": 0.6315, "grad_norm": 0.9963029623031616, "learning_rate": 0.0002, "epoch": 3.2538599640933574, "step": 45310}, {"loss": 0.6496, "grad_norm": 0.835304856300354, "learning_rate": 0.0002, "epoch": 3.2545780969479354, "step": 45320}, {"loss": 0.6099, "grad_norm": 0.7281617522239685, "learning_rate": 0.0002, "epoch": 3.2552962298025134, "step": 45330}, {"loss": 0.6224, "grad_norm": 1.244890570640564, "learning_rate": 0.0002, "epoch": 3.2560143626570914, "step": 45340}, {"loss": 0.6317, "grad_norm": 0.8372750282287598, "learning_rate": 0.0002, "epoch": 3.2567324955116694, "step": 45350}, {"loss": 0.604, "grad_norm": 1.0029667615890503, "learning_rate": 0.0002, "epoch": 3.257450628366248, "step": 45360}, {"loss": 0.596, "grad_norm": 0.8561908602714539, "learning_rate": 0.0002, "epoch": 3.258168761220826, "step": 45370}, {"loss": 0.6185, "grad_norm": 1.0058085918426514, "learning_rate": 0.0002, "epoch": 3.258886894075404, "step": 45380}, {"loss": 0.6415, "grad_norm": 0.7768221497535706, "learning_rate": 0.0002, "epoch": 3.259605026929982, "step": 45390}, {"loss": 0.635, "grad_norm": 0.8443793058395386, "learning_rate": 0.0002, "epoch": 3.2603231597845603, "step": 45400}, {"loss": 0.6579, "grad_norm": 1.0140392780303955, "learning_rate": 0.0002, "epoch": 3.2610412926391383, "step": 45410}, {"loss": 0.6434, "grad_norm": 0.8397058248519897, "learning_rate": 0.0002, "epoch": 3.2617594254937163, "step": 45420}, {"loss": 0.6361, "grad_norm": 0.9717063903808594, "learning_rate": 0.0002, "epoch": 3.2624775583482943, "step": 45430}, {"loss": 0.6837, "grad_norm": 1.0279473066329956, "learning_rate": 0.0002, "epoch": 3.2631956912028723, "step": 45440}, {"loss": 0.6274, "grad_norm": 1.207457184791565, "learning_rate": 0.0002, "epoch": 3.263913824057451, "step": 45450}, {"loss": 0.681, "grad_norm": 0.8121998906135559, "learning_rate": 0.0002, "epoch": 3.264631956912029, "step": 45460}, {"loss": 0.6202, "grad_norm": 1.037733554840088, "learning_rate": 0.0002, "epoch": 3.265350089766607, "step": 45470}, {"loss": 0.6146, "grad_norm": 0.9305754899978638, "learning_rate": 0.0002, "epoch": 3.266068222621185, "step": 45480}, {"loss": 0.6186, "grad_norm": 0.9733602404594421, "learning_rate": 0.0002, "epoch": 3.2667863554757632, "step": 45490}, {"loss": 0.6713, "grad_norm": 0.8345039486885071, "learning_rate": 0.0002, "epoch": 3.2675044883303412, "step": 45500}, {"loss": 0.6315, "grad_norm": 0.8601692318916321, "learning_rate": 0.0002, "epoch": 3.2682226211849192, "step": 45510}, {"loss": 0.5953, "grad_norm": 0.7921277284622192, "learning_rate": 0.0002, "epoch": 3.2689407540394972, "step": 45520}, {"loss": 0.6781, "grad_norm": 0.8324153423309326, "learning_rate": 0.0002, "epoch": 3.2696588868940752, "step": 45530}, {"loss": 0.6413, "grad_norm": 0.85141521692276, "learning_rate": 0.0002, "epoch": 3.2703770197486537, "step": 45540}, {"loss": 0.654, "grad_norm": 0.9399608373641968, "learning_rate": 0.0002, "epoch": 3.2710951526032317, "step": 45550}, {"loss": 0.6364, "grad_norm": 0.9829166531562805, "learning_rate": 0.0002, "epoch": 3.2718132854578097, "step": 45560}, {"loss": 0.627, "grad_norm": 0.9936266541481018, "learning_rate": 0.0002, "epoch": 3.2725314183123877, "step": 45570}, {"loss": 0.6465, "grad_norm": 1.036165714263916, "learning_rate": 0.0002, "epoch": 3.2732495511669657, "step": 45580}, {"loss": 0.6216, "grad_norm": 0.8988680243492126, "learning_rate": 0.0002, "epoch": 3.273967684021544, "step": 45590}, {"loss": 0.6368, "grad_norm": 0.9173405766487122, "learning_rate": 0.0002, "epoch": 3.274685816876122, "step": 45600}, {"loss": 0.6455, "grad_norm": 0.9967324733734131, "learning_rate": 0.0002, "epoch": 3.2754039497307, "step": 45610}, {"loss": 0.6236, "grad_norm": 0.9097777009010315, "learning_rate": 0.0002, "epoch": 3.276122082585278, "step": 45620}, {"loss": 0.632, "grad_norm": 1.0559430122375488, "learning_rate": 0.0002, "epoch": 3.276840215439856, "step": 45630}, {"loss": 0.5999, "grad_norm": 0.9583360552787781, "learning_rate": 0.0002, "epoch": 3.2775583482944346, "step": 45640}, {"loss": 0.6329, "grad_norm": 0.7630334496498108, "learning_rate": 0.0002, "epoch": 3.2782764811490126, "step": 45650}, {"loss": 0.6873, "grad_norm": 0.9955230355262756, "learning_rate": 0.0002, "epoch": 3.2789946140035906, "step": 45660}, {"loss": 0.6216, "grad_norm": 0.8685793876647949, "learning_rate": 0.0002, "epoch": 3.2797127468581686, "step": 45670}, {"loss": 0.6243, "grad_norm": 0.919913113117218, "learning_rate": 0.0002, "epoch": 3.280430879712747, "step": 45680}, {"loss": 0.6334, "grad_norm": 0.826144814491272, "learning_rate": 0.0002, "epoch": 3.281149012567325, "step": 45690}, {"loss": 0.6359, "grad_norm": 0.9750179052352905, "learning_rate": 0.0002, "epoch": 3.281867145421903, "step": 45700}, {"loss": 0.6589, "grad_norm": 0.7931897640228271, "learning_rate": 0.0002, "epoch": 3.282585278276481, "step": 45710}, {"loss": 0.6785, "grad_norm": 1.0380089282989502, "learning_rate": 0.0002, "epoch": 3.283303411131059, "step": 45720}, {"loss": 0.6219, "grad_norm": 0.8220566511154175, "learning_rate": 0.0002, "epoch": 3.2840215439856375, "step": 45730}, {"loss": 0.5737, "grad_norm": 0.9688239693641663, "learning_rate": 0.0002, "epoch": 3.2847396768402155, "step": 45740}, {"loss": 0.603, "grad_norm": 0.8760311603546143, "learning_rate": 0.0002, "epoch": 3.2854578096947935, "step": 45750}, {"loss": 0.6134, "grad_norm": 0.8103382587432861, "learning_rate": 0.0002, "epoch": 3.2861759425493715, "step": 45760}, {"loss": 0.6475, "grad_norm": 0.8835865259170532, "learning_rate": 0.0002, "epoch": 3.28689407540395, "step": 45770}, {"loss": 0.6423, "grad_norm": 0.9021160006523132, "learning_rate": 0.0002, "epoch": 3.287612208258528, "step": 45780}, {"loss": 0.6693, "grad_norm": 0.8182386159896851, "learning_rate": 0.0002, "epoch": 3.288330341113106, "step": 45790}, {"loss": 0.6408, "grad_norm": 0.8555024862289429, "learning_rate": 0.0002, "epoch": 3.289048473967684, "step": 45800}, {"loss": 0.6839, "grad_norm": 1.0982348918914795, "learning_rate": 0.0002, "epoch": 3.289766606822262, "step": 45810}, {"loss": 0.6323, "grad_norm": 1.06246817111969, "learning_rate": 0.0002, "epoch": 3.2904847396768404, "step": 45820}, {"loss": 0.5924, "grad_norm": 1.1727149486541748, "learning_rate": 0.0002, "epoch": 3.2912028725314184, "step": 45830}, {"loss": 0.624, "grad_norm": 0.8224700093269348, "learning_rate": 0.0002, "epoch": 3.2919210053859964, "step": 45840}, {"loss": 0.6445, "grad_norm": 0.8195698261260986, "learning_rate": 0.0002, "epoch": 3.2926391382405744, "step": 45850}, {"loss": 0.6106, "grad_norm": 0.8424476981163025, "learning_rate": 0.0002, "epoch": 3.2933572710951524, "step": 45860}, {"loss": 0.6705, "grad_norm": 0.9804632067680359, "learning_rate": 0.0002, "epoch": 3.294075403949731, "step": 45870}, {"loss": 0.6538, "grad_norm": 0.8701804876327515, "learning_rate": 0.0002, "epoch": 3.294793536804309, "step": 45880}, {"loss": 0.6264, "grad_norm": 0.8876864910125732, "learning_rate": 0.0002, "epoch": 3.295511669658887, "step": 45890}, {"loss": 0.6401, "grad_norm": 1.0105448961257935, "learning_rate": 0.0002, "epoch": 3.296229802513465, "step": 45900}, {"loss": 0.687, "grad_norm": 0.847017228603363, "learning_rate": 0.0002, "epoch": 3.296947935368043, "step": 45910}, {"loss": 0.6433, "grad_norm": 0.7610297799110413, "learning_rate": 0.0002, "epoch": 3.2976660682226213, "step": 45920}, {"loss": 0.6499, "grad_norm": 0.7272670269012451, "learning_rate": 0.0002, "epoch": 3.2983842010771993, "step": 45930}, {"loss": 0.6366, "grad_norm": 0.8243510127067566, "learning_rate": 0.0002, "epoch": 3.2991023339317773, "step": 45940}, {"loss": 0.6498, "grad_norm": 1.0113074779510498, "learning_rate": 0.0002, "epoch": 3.2998204667863553, "step": 45950}, {"loss": 0.6639, "grad_norm": 0.8578087687492371, "learning_rate": 0.0002, "epoch": 3.3005385996409338, "step": 45960}, {"loss": 0.6137, "grad_norm": 0.9511606097221375, "learning_rate": 0.0002, "epoch": 3.3012567324955118, "step": 45970}, {"loss": 0.6115, "grad_norm": 0.8612566590309143, "learning_rate": 0.0002, "epoch": 3.3019748653500898, "step": 45980}, {"loss": 0.6799, "grad_norm": 0.8702331185340881, "learning_rate": 0.0002, "epoch": 3.3026929982046678, "step": 45990}, {"loss": 0.6429, "grad_norm": 1.0229583978652954, "learning_rate": 0.0002, "epoch": 3.3034111310592458, "step": 46000}, {"loss": 0.6054, "grad_norm": 1.1775577068328857, "learning_rate": 0.0002, "epoch": 3.304129263913824, "step": 46010}, {"loss": 0.6958, "grad_norm": 0.9922171831130981, "learning_rate": 0.0002, "epoch": 3.3048473967684022, "step": 46020}, {"loss": 0.6642, "grad_norm": 0.8246880769729614, "learning_rate": 0.0002, "epoch": 3.3055655296229802, "step": 46030}, {"loss": 0.678, "grad_norm": 0.9351653456687927, "learning_rate": 0.0002, "epoch": 3.3062836624775582, "step": 46040}, {"loss": 0.649, "grad_norm": 0.9617429375648499, "learning_rate": 0.0002, "epoch": 3.3070017953321367, "step": 46050}, {"loss": 0.6314, "grad_norm": 0.9753885269165039, "learning_rate": 0.0002, "epoch": 3.3077199281867147, "step": 46060}, {"loss": 0.6434, "grad_norm": 0.8532425165176392, "learning_rate": 0.0002, "epoch": 3.3084380610412927, "step": 46070}, {"loss": 0.6312, "grad_norm": 0.9722012877464294, "learning_rate": 0.0002, "epoch": 3.3091561938958707, "step": 46080}, {"loss": 0.6629, "grad_norm": 0.8950021266937256, "learning_rate": 0.0002, "epoch": 3.3098743267504487, "step": 46090}, {"loss": 0.6278, "grad_norm": 0.8536333441734314, "learning_rate": 0.0002, "epoch": 3.3105924596050267, "step": 46100}, {"loss": 0.6359, "grad_norm": 0.9423946738243103, "learning_rate": 0.0002, "epoch": 3.311310592459605, "step": 46110}, {"loss": 0.6647, "grad_norm": 0.8573169112205505, "learning_rate": 0.0002, "epoch": 3.312028725314183, "step": 46120}, {"loss": 0.6127, "grad_norm": 1.0122376680374146, "learning_rate": 0.0002, "epoch": 3.312746858168761, "step": 46130}, {"loss": 0.6782, "grad_norm": 0.7492560744285583, "learning_rate": 0.0002, "epoch": 3.313464991023339, "step": 46140}, {"loss": 0.6315, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 3.3141831238779176, "step": 46150}, {"loss": 0.6051, "grad_norm": 1.1191970109939575, "learning_rate": 0.0002, "epoch": 3.3149012567324956, "step": 46160}, {"loss": 0.6247, "grad_norm": 0.9847373962402344, "learning_rate": 0.0002, "epoch": 3.3156193895870736, "step": 46170}, {"loss": 0.661, "grad_norm": 0.7315911054611206, "learning_rate": 0.0002, "epoch": 3.3163375224416516, "step": 46180}, {"loss": 0.6017, "grad_norm": 0.8267890214920044, "learning_rate": 0.0002, "epoch": 3.3170556552962296, "step": 46190}, {"loss": 0.6202, "grad_norm": 0.8898099064826965, "learning_rate": 0.0002, "epoch": 3.317773788150808, "step": 46200}, {"loss": 0.651, "grad_norm": 0.8525369167327881, "learning_rate": 0.0002, "epoch": 3.318491921005386, "step": 46210}, {"loss": 0.6705, "grad_norm": 0.8074760437011719, "learning_rate": 0.0002, "epoch": 3.319210053859964, "step": 46220}, {"loss": 0.641, "grad_norm": 0.8473616242408752, "learning_rate": 0.0002, "epoch": 3.319928186714542, "step": 46230}, {"loss": 0.6092, "grad_norm": 0.8678314089775085, "learning_rate": 0.0002, "epoch": 3.3206463195691205, "step": 46240}, {"loss": 0.655, "grad_norm": 0.8718782067298889, "learning_rate": 0.0002, "epoch": 3.3213644524236985, "step": 46250}, {"loss": 0.6266, "grad_norm": 0.9384858012199402, "learning_rate": 0.0002, "epoch": 3.3220825852782765, "step": 46260}, {"loss": 0.6393, "grad_norm": 0.9295032620429993, "learning_rate": 0.0002, "epoch": 3.3228007181328545, "step": 46270}, {"loss": 0.6824, "grad_norm": 0.9472482800483704, "learning_rate": 0.0002, "epoch": 3.3235188509874325, "step": 46280}, {"loss": 0.6177, "grad_norm": 0.7970638275146484, "learning_rate": 0.0002, "epoch": 3.324236983842011, "step": 46290}, {"loss": 0.6431, "grad_norm": 0.9508723020553589, "learning_rate": 0.0002, "epoch": 3.324955116696589, "step": 46300}, {"loss": 0.6126, "grad_norm": 0.9153636693954468, "learning_rate": 0.0002, "epoch": 3.325673249551167, "step": 46310}, {"loss": 0.6042, "grad_norm": 0.7890323400497437, "learning_rate": 0.0002, "epoch": 3.326391382405745, "step": 46320}, {"loss": 0.6525, "grad_norm": 0.8711825609207153, "learning_rate": 0.0002, "epoch": 3.3271095152603234, "step": 46330}, {"loss": 0.6253, "grad_norm": 0.9938926696777344, "learning_rate": 0.0002, "epoch": 3.3278276481149014, "step": 46340}, {"loss": 0.6227, "grad_norm": 0.8497524857521057, "learning_rate": 0.0002, "epoch": 3.3285457809694794, "step": 46350}, {"loss": 0.6472, "grad_norm": 0.9191650748252869, "learning_rate": 0.0002, "epoch": 3.3292639138240574, "step": 46360}, {"loss": 0.6385, "grad_norm": 0.8974085450172424, "learning_rate": 0.0002, "epoch": 3.3299820466786354, "step": 46370}, {"loss": 0.618, "grad_norm": 0.9928934574127197, "learning_rate": 0.0002, "epoch": 3.3307001795332134, "step": 46380}, {"loss": 0.6254, "grad_norm": 0.9011030197143555, "learning_rate": 0.0002, "epoch": 3.331418312387792, "step": 46390}, {"loss": 0.6146, "grad_norm": 0.898594856262207, "learning_rate": 0.0002, "epoch": 3.33213644524237, "step": 46400}, {"loss": 0.6321, "grad_norm": 0.7506672143936157, "learning_rate": 0.0002, "epoch": 3.332854578096948, "step": 46410}, {"loss": 0.6329, "grad_norm": 0.9239172339439392, "learning_rate": 0.0002, "epoch": 3.333572710951526, "step": 46420}, {"loss": 0.6278, "grad_norm": 1.0749682188034058, "learning_rate": 0.0002, "epoch": 3.3342908438061043, "step": 46430}, {"loss": 0.6568, "grad_norm": 0.9262617230415344, "learning_rate": 0.0002, "epoch": 3.3350089766606823, "step": 46440}, {"loss": 0.6034, "grad_norm": 0.8681274056434631, "learning_rate": 0.0002, "epoch": 3.3357271095152603, "step": 46450}, {"loss": 0.6261, "grad_norm": 0.9558620452880859, "learning_rate": 0.0002, "epoch": 3.3364452423698383, "step": 46460}, {"loss": 0.6087, "grad_norm": 0.8907097578048706, "learning_rate": 0.0002, "epoch": 3.3371633752244163, "step": 46470}, {"loss": 0.6356, "grad_norm": 1.0941565036773682, "learning_rate": 0.0002, "epoch": 3.3378815080789948, "step": 46480}, {"loss": 0.6536, "grad_norm": 0.8971590995788574, "learning_rate": 0.0002, "epoch": 3.3385996409335728, "step": 46490}, {"loss": 0.6252, "grad_norm": 1.0315606594085693, "learning_rate": 0.0002, "epoch": 3.3393177737881508, "step": 46500}, {"loss": 0.5819, "grad_norm": 0.7717124223709106, "learning_rate": 0.0002, "epoch": 3.3400359066427288, "step": 46510}, {"loss": 0.612, "grad_norm": 0.8060970902442932, "learning_rate": 0.0002, "epoch": 3.340754039497307, "step": 46520}, {"loss": 0.7036, "grad_norm": 0.969510018825531, "learning_rate": 0.0002, "epoch": 3.341472172351885, "step": 46530}, {"loss": 0.6163, "grad_norm": 0.8837248682975769, "learning_rate": 0.0002, "epoch": 3.342190305206463, "step": 46540}, {"loss": 0.6762, "grad_norm": 0.9561076164245605, "learning_rate": 0.0002, "epoch": 3.342908438061041, "step": 46550}, {"loss": 0.687, "grad_norm": 0.8529208898544312, "learning_rate": 0.0002, "epoch": 3.343626570915619, "step": 46560}, {"loss": 0.611, "grad_norm": 1.1300519704818726, "learning_rate": 0.0002, "epoch": 3.3443447037701977, "step": 46570}, {"loss": 0.6088, "grad_norm": 0.8330956101417542, "learning_rate": 0.0002, "epoch": 3.3450628366247757, "step": 46580}, {"loss": 0.6725, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 3.3457809694793537, "step": 46590}, {"loss": 0.6667, "grad_norm": 1.0470821857452393, "learning_rate": 0.0002, "epoch": 3.3464991023339317, "step": 46600}, {"loss": 0.6408, "grad_norm": 0.9933704137802124, "learning_rate": 0.0002, "epoch": 3.34721723518851, "step": 46610}, {"loss": 0.6416, "grad_norm": 0.8130798935890198, "learning_rate": 0.0002, "epoch": 3.347935368043088, "step": 46620}, {"loss": 0.6576, "grad_norm": 0.9746946692466736, "learning_rate": 0.0002, "epoch": 3.348653500897666, "step": 46630}, {"loss": 0.6254, "grad_norm": 0.8607267141342163, "learning_rate": 0.0002, "epoch": 3.349371633752244, "step": 46640}, {"loss": 0.6639, "grad_norm": 0.800335705280304, "learning_rate": 0.0002, "epoch": 3.350089766606822, "step": 46650}, {"loss": 0.6749, "grad_norm": 1.0083239078521729, "learning_rate": 0.0002, "epoch": 3.3508078994614, "step": 46660}, {"loss": 0.6606, "grad_norm": 1.0774433612823486, "learning_rate": 0.0002, "epoch": 3.3515260323159786, "step": 46670}, {"loss": 0.6408, "grad_norm": 0.9378824234008789, "learning_rate": 0.0002, "epoch": 3.3522441651705566, "step": 46680}, {"loss": 0.5879, "grad_norm": 0.8490564227104187, "learning_rate": 0.0002, "epoch": 3.3529622980251346, "step": 46690}, {"loss": 0.6364, "grad_norm": 1.0415582656860352, "learning_rate": 0.0002, "epoch": 3.3536804308797126, "step": 46700}, {"loss": 0.5813, "grad_norm": 0.8514367938041687, "learning_rate": 0.0002, "epoch": 3.354398563734291, "step": 46710}, {"loss": 0.6847, "grad_norm": 0.7691360712051392, "learning_rate": 0.0002, "epoch": 3.355116696588869, "step": 46720}, {"loss": 0.6295, "grad_norm": 0.8345438241958618, "learning_rate": 0.0002, "epoch": 3.355834829443447, "step": 46730}, {"loss": 0.6093, "grad_norm": 1.023492693901062, "learning_rate": 0.0002, "epoch": 3.356552962298025, "step": 46740}, {"loss": 0.5997, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 3.357271095152603, "step": 46750}, {"loss": 0.6379, "grad_norm": 0.9029248356819153, "learning_rate": 0.0002, "epoch": 3.3579892280071815, "step": 46760}, {"loss": 0.6551, "grad_norm": 0.9109513759613037, "learning_rate": 0.0002, "epoch": 3.3587073608617595, "step": 46770}, {"loss": 0.6616, "grad_norm": 0.7757390141487122, "learning_rate": 0.0002, "epoch": 3.3594254937163375, "step": 46780}, {"loss": 0.6088, "grad_norm": 0.794035792350769, "learning_rate": 0.0002, "epoch": 3.3601436265709155, "step": 46790}, {"loss": 0.6405, "grad_norm": 0.8211429715156555, "learning_rate": 0.0002, "epoch": 3.360861759425494, "step": 46800}, {"loss": 0.6359, "grad_norm": 0.8620322346687317, "learning_rate": 0.0002, "epoch": 3.361579892280072, "step": 46810}, {"loss": 0.6357, "grad_norm": 0.9392538070678711, "learning_rate": 0.0002, "epoch": 3.36229802513465, "step": 46820}, {"loss": 0.6225, "grad_norm": 0.8297873139381409, "learning_rate": 0.0002, "epoch": 3.363016157989228, "step": 46830}, {"loss": 0.639, "grad_norm": 0.9158190488815308, "learning_rate": 0.0002, "epoch": 3.363734290843806, "step": 46840}, {"loss": 0.6168, "grad_norm": 1.1449424028396606, "learning_rate": 0.0002, "epoch": 3.3644524236983844, "step": 46850}, {"loss": 0.6413, "grad_norm": 0.8718444108963013, "learning_rate": 0.0002, "epoch": 3.3651705565529624, "step": 46860}, {"loss": 0.624, "grad_norm": 0.7744014263153076, "learning_rate": 0.0002, "epoch": 3.3658886894075404, "step": 46870}, {"loss": 0.6238, "grad_norm": 0.8392460942268372, "learning_rate": 0.0002, "epoch": 3.3666068222621184, "step": 46880}, {"loss": 0.6753, "grad_norm": 1.0424989461898804, "learning_rate": 0.0002, "epoch": 3.367324955116697, "step": 46890}, {"loss": 0.6038, "grad_norm": 1.4696359634399414, "learning_rate": 0.0002, "epoch": 3.368043087971275, "step": 46900}, {"loss": 0.6525, "grad_norm": 0.9298201203346252, "learning_rate": 0.0002, "epoch": 3.368761220825853, "step": 46910}, {"loss": 0.6351, "grad_norm": 0.8965262770652771, "learning_rate": 0.0002, "epoch": 3.369479353680431, "step": 46920}, {"loss": 0.6505, "grad_norm": 0.9395381808280945, "learning_rate": 0.0002, "epoch": 3.370197486535009, "step": 46930}, {"loss": 0.6161, "grad_norm": 0.9069047570228577, "learning_rate": 0.0002, "epoch": 3.370915619389587, "step": 46940}, {"loss": 0.6576, "grad_norm": 0.9208605885505676, "learning_rate": 0.0002, "epoch": 3.3716337522441653, "step": 46950}, {"loss": 0.6456, "grad_norm": 0.9493077397346497, "learning_rate": 0.0002, "epoch": 3.3723518850987433, "step": 46960}, {"loss": 0.6609, "grad_norm": 1.0804208517074585, "learning_rate": 0.0002, "epoch": 3.3730700179533213, "step": 46970}, {"loss": 0.6267, "grad_norm": 0.9465714693069458, "learning_rate": 0.0002, "epoch": 3.3737881508078993, "step": 46980}, {"loss": 0.6633, "grad_norm": 0.9189882278442383, "learning_rate": 0.0002, "epoch": 3.3745062836624777, "step": 46990}, {"loss": 0.6518, "grad_norm": 1.0199357271194458, "learning_rate": 0.0002, "epoch": 3.3752244165170557, "step": 47000}, {"loss": 0.6645, "grad_norm": 0.8999426960945129, "learning_rate": 0.0002, "epoch": 3.3759425493716337, "step": 47010}, {"loss": 0.637, "grad_norm": 0.8923690319061279, "learning_rate": 0.0002, "epoch": 3.3766606822262117, "step": 47020}, {"loss": 0.6543, "grad_norm": 0.7459347248077393, "learning_rate": 0.0002, "epoch": 3.3773788150807897, "step": 47030}, {"loss": 0.6269, "grad_norm": 0.7702858448028564, "learning_rate": 0.0002, "epoch": 3.378096947935368, "step": 47040}, {"loss": 0.6399, "grad_norm": 0.8296625018119812, "learning_rate": 0.0002, "epoch": 3.378815080789946, "step": 47050}, {"loss": 0.6552, "grad_norm": 1.2952555418014526, "learning_rate": 0.0002, "epoch": 3.379533213644524, "step": 47060}, {"loss": 0.6264, "grad_norm": 0.7778869271278381, "learning_rate": 0.0002, "epoch": 3.380251346499102, "step": 47070}, {"loss": 0.6906, "grad_norm": 0.9151549339294434, "learning_rate": 0.0002, "epoch": 3.3809694793536806, "step": 47080}, {"loss": 0.6443, "grad_norm": 0.7883925437927246, "learning_rate": 0.0002, "epoch": 3.3816876122082586, "step": 47090}, {"loss": 0.6124, "grad_norm": 0.9602295756340027, "learning_rate": 0.0002, "epoch": 3.3824057450628366, "step": 47100}, {"loss": 0.651, "grad_norm": 0.7953121066093445, "learning_rate": 0.0002, "epoch": 3.3831238779174146, "step": 47110}, {"loss": 0.638, "grad_norm": 1.110148549079895, "learning_rate": 0.0002, "epoch": 3.3838420107719926, "step": 47120}, {"loss": 0.6386, "grad_norm": 0.9359608888626099, "learning_rate": 0.0002, "epoch": 3.384560143626571, "step": 47130}, {"loss": 0.6075, "grad_norm": 0.7877762317657471, "learning_rate": 0.0002, "epoch": 3.385278276481149, "step": 47140}, {"loss": 0.6657, "grad_norm": 0.8586933016777039, "learning_rate": 0.0002, "epoch": 3.385996409335727, "step": 47150}, {"loss": 0.6438, "grad_norm": 0.8920878767967224, "learning_rate": 0.0002, "epoch": 3.386714542190305, "step": 47160}, {"loss": 0.6584, "grad_norm": 0.9692603349685669, "learning_rate": 0.0002, "epoch": 3.3874326750448835, "step": 47170}, {"loss": 0.6643, "grad_norm": 0.9038610458374023, "learning_rate": 0.0002, "epoch": 3.3881508078994615, "step": 47180}, {"loss": 0.6002, "grad_norm": 1.6299188137054443, "learning_rate": 0.0002, "epoch": 3.3888689407540395, "step": 47190}, {"loss": 0.6423, "grad_norm": 0.9704291820526123, "learning_rate": 0.0002, "epoch": 3.3895870736086176, "step": 47200}, {"loss": 0.6808, "grad_norm": 0.9503401517868042, "learning_rate": 0.0002, "epoch": 3.3903052064631956, "step": 47210}, {"loss": 0.6871, "grad_norm": 1.0051378011703491, "learning_rate": 0.0002, "epoch": 3.3910233393177736, "step": 47220}, {"loss": 0.6207, "grad_norm": 0.7336357235908508, "learning_rate": 0.0002, "epoch": 3.391741472172352, "step": 47230}, {"loss": 0.6688, "grad_norm": 0.9847398996353149, "learning_rate": 0.0002, "epoch": 3.39245960502693, "step": 47240}, {"loss": 0.6305, "grad_norm": 0.8100917339324951, "learning_rate": 0.0002, "epoch": 3.393177737881508, "step": 47250}, {"loss": 0.6418, "grad_norm": 0.9752838611602783, "learning_rate": 0.0002, "epoch": 3.393895870736086, "step": 47260}, {"loss": 0.6237, "grad_norm": 0.9400623440742493, "learning_rate": 0.0002, "epoch": 3.3946140035906645, "step": 47270}, {"loss": 0.6321, "grad_norm": 0.7310057878494263, "learning_rate": 0.0002, "epoch": 3.3953321364452425, "step": 47280}, {"loss": 0.6209, "grad_norm": 0.8898789286613464, "learning_rate": 0.0002, "epoch": 3.3960502692998205, "step": 47290}, {"loss": 0.6496, "grad_norm": 1.0157585144042969, "learning_rate": 0.0002, "epoch": 3.3967684021543985, "step": 47300}, {"loss": 0.6497, "grad_norm": 0.9108527898788452, "learning_rate": 0.0002, "epoch": 3.3974865350089765, "step": 47310}, {"loss": 0.5928, "grad_norm": 0.9796249270439148, "learning_rate": 0.0002, "epoch": 3.398204667863555, "step": 47320}, {"loss": 0.6169, "grad_norm": 0.8176435232162476, "learning_rate": 0.0002, "epoch": 3.398922800718133, "step": 47330}, {"loss": 0.6279, "grad_norm": 0.9981188178062439, "learning_rate": 0.0002, "epoch": 3.399640933572711, "step": 47340}, {"loss": 0.6657, "grad_norm": 0.9774404764175415, "learning_rate": 0.0002, "epoch": 3.400359066427289, "step": 47350}, {"loss": 0.68, "grad_norm": 0.8624991774559021, "learning_rate": 0.0002, "epoch": 3.4010771992818674, "step": 47360}, {"loss": 0.6597, "grad_norm": 0.9191665053367615, "learning_rate": 0.0002, "epoch": 3.4017953321364454, "step": 47370}, {"loss": 0.6249, "grad_norm": 0.7971290946006775, "learning_rate": 0.0002, "epoch": 3.4025134649910234, "step": 47380}, {"loss": 0.617, "grad_norm": 0.8336732983589172, "learning_rate": 0.0002, "epoch": 3.4032315978456014, "step": 47390}, {"loss": 0.6435, "grad_norm": 0.7730334401130676, "learning_rate": 0.0002, "epoch": 3.4039497307001794, "step": 47400}, {"loss": 0.6348, "grad_norm": 0.8559145927429199, "learning_rate": 0.0002, "epoch": 3.404667863554758, "step": 47410}, {"loss": 0.6466, "grad_norm": 1.0261447429656982, "learning_rate": 0.0002, "epoch": 3.405385996409336, "step": 47420}, {"loss": 0.6556, "grad_norm": 0.9931781888008118, "learning_rate": 0.0002, "epoch": 3.406104129263914, "step": 47430}, {"loss": 0.6226, "grad_norm": 0.8971807360649109, "learning_rate": 0.0002, "epoch": 3.406822262118492, "step": 47440}, {"loss": 0.656, "grad_norm": 0.8886999487876892, "learning_rate": 0.0002, "epoch": 3.4075403949730703, "step": 47450}, {"loss": 0.6256, "grad_norm": 0.9551735520362854, "learning_rate": 0.0002, "epoch": 3.4082585278276483, "step": 47460}, {"loss": 0.6646, "grad_norm": 0.9066859483718872, "learning_rate": 0.0002, "epoch": 3.4089766606822263, "step": 47470}, {"loss": 0.6655, "grad_norm": 0.9192125201225281, "learning_rate": 0.0002, "epoch": 3.4096947935368043, "step": 47480}, {"loss": 0.6197, "grad_norm": 0.9332839250564575, "learning_rate": 0.0002, "epoch": 3.4104129263913823, "step": 47490}, {"loss": 0.6134, "grad_norm": 0.745563805103302, "learning_rate": 0.0002, "epoch": 3.4111310592459603, "step": 47500}, {"loss": 0.6206, "grad_norm": 0.6843905448913574, "learning_rate": 0.0002, "epoch": 3.4118491921005387, "step": 47510}, {"loss": 0.6742, "grad_norm": 0.8063111305236816, "learning_rate": 0.0002, "epoch": 3.4125673249551167, "step": 47520}, {"loss": 0.6138, "grad_norm": 0.9666593670845032, "learning_rate": 0.0002, "epoch": 3.4132854578096947, "step": 47530}, {"loss": 0.635, "grad_norm": 0.8112747073173523, "learning_rate": 0.0002, "epoch": 3.4140035906642727, "step": 47540}, {"loss": 0.6225, "grad_norm": 0.820807933807373, "learning_rate": 0.0002, "epoch": 3.414721723518851, "step": 47550}, {"loss": 0.6262, "grad_norm": 0.8476285338401794, "learning_rate": 0.0002, "epoch": 3.415439856373429, "step": 47560}, {"loss": 0.6134, "grad_norm": 1.0232552289962769, "learning_rate": 0.0002, "epoch": 3.416157989228007, "step": 47570}, {"loss": 0.604, "grad_norm": 0.8749372363090515, "learning_rate": 0.0002, "epoch": 3.416876122082585, "step": 47580}, {"loss": 0.6463, "grad_norm": 0.8117937445640564, "learning_rate": 0.0002, "epoch": 3.417594254937163, "step": 47590}, {"loss": 0.623, "grad_norm": 0.9010460376739502, "learning_rate": 0.0002, "epoch": 3.4183123877917416, "step": 47600}, {"loss": 0.6676, "grad_norm": 0.8955527544021606, "learning_rate": 0.0002, "epoch": 3.4190305206463196, "step": 47610}, {"loss": 0.6424, "grad_norm": 0.884186327457428, "learning_rate": 0.0002, "epoch": 3.4197486535008976, "step": 47620}, {"loss": 0.6377, "grad_norm": 0.8995241522789001, "learning_rate": 0.0002, "epoch": 3.4204667863554756, "step": 47630}, {"loss": 0.651, "grad_norm": 1.0627013444900513, "learning_rate": 0.0002, "epoch": 3.421184919210054, "step": 47640}, {"loss": 0.6338, "grad_norm": 0.8619979619979858, "learning_rate": 0.0002, "epoch": 3.421903052064632, "step": 47650}, {"loss": 0.6483, "grad_norm": 0.9682498574256897, "learning_rate": 0.0002, "epoch": 3.42262118491921, "step": 47660}, {"loss": 0.6006, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "epoch": 3.423339317773788, "step": 47670}, {"loss": 0.6088, "grad_norm": 0.7986962795257568, "learning_rate": 0.0002, "epoch": 3.424057450628366, "step": 47680}, {"loss": 0.6056, "grad_norm": 0.8255957961082458, "learning_rate": 0.0002, "epoch": 3.4247755834829445, "step": 47690}, {"loss": 0.663, "grad_norm": 0.9139757752418518, "learning_rate": 0.0002, "epoch": 3.4254937163375225, "step": 47700}, {"loss": 0.61, "grad_norm": 0.8086292743682861, "learning_rate": 0.0002, "epoch": 3.4262118491921005, "step": 47710}, {"loss": 0.6604, "grad_norm": 0.8852273225784302, "learning_rate": 0.0002, "epoch": 3.4269299820466785, "step": 47720}, {"loss": 0.6168, "grad_norm": 0.7568784356117249, "learning_rate": 0.0002, "epoch": 3.427648114901257, "step": 47730}, {"loss": 0.6559, "grad_norm": 0.8933039903640747, "learning_rate": 0.0002, "epoch": 3.428366247755835, "step": 47740}, {"loss": 0.6406, "grad_norm": 0.8101669549942017, "learning_rate": 0.0002, "epoch": 3.429084380610413, "step": 47750}, {"loss": 0.6287, "grad_norm": 0.7021054625511169, "learning_rate": 0.0002, "epoch": 3.429802513464991, "step": 47760}, {"loss": 0.6159, "grad_norm": 0.8282538652420044, "learning_rate": 0.0002, "epoch": 3.430520646319569, "step": 47770}, {"loss": 0.6439, "grad_norm": 0.8168348670005798, "learning_rate": 0.0002, "epoch": 3.431238779174147, "step": 47780}, {"loss": 0.6265, "grad_norm": 0.9504001140594482, "learning_rate": 0.0002, "epoch": 3.4319569120287254, "step": 47790}, {"loss": 0.6688, "grad_norm": 0.7500190734863281, "learning_rate": 0.0002, "epoch": 3.4326750448833034, "step": 47800}, {"loss": 0.6818, "grad_norm": 0.8645710945129395, "learning_rate": 0.0002, "epoch": 3.4333931777378814, "step": 47810}, {"loss": 0.6268, "grad_norm": 0.8088704943656921, "learning_rate": 0.0002, "epoch": 3.4341113105924594, "step": 47820}, {"loss": 0.6795, "grad_norm": 0.9981673955917358, "learning_rate": 0.0002, "epoch": 3.434829443447038, "step": 47830}, {"loss": 0.6615, "grad_norm": 0.9363315105438232, "learning_rate": 0.0002, "epoch": 3.435547576301616, "step": 47840}, {"loss": 0.6028, "grad_norm": 0.8471030592918396, "learning_rate": 0.0002, "epoch": 3.436265709156194, "step": 47850}, {"loss": 0.6658, "grad_norm": 0.9447668790817261, "learning_rate": 0.0002, "epoch": 3.436983842010772, "step": 47860}, {"loss": 0.6511, "grad_norm": 0.9494127631187439, "learning_rate": 0.0002, "epoch": 3.43770197486535, "step": 47870}, {"loss": 0.6134, "grad_norm": 0.8340432643890381, "learning_rate": 0.0002, "epoch": 3.4384201077199283, "step": 47880}, {"loss": 0.6731, "grad_norm": 0.8466387987136841, "learning_rate": 0.0002, "epoch": 3.4391382405745063, "step": 47890}, {"loss": 0.6552, "grad_norm": 0.9498962759971619, "learning_rate": 0.0002, "epoch": 3.4398563734290843, "step": 47900}, {"loss": 0.6593, "grad_norm": 0.8490501046180725, "learning_rate": 0.0002, "epoch": 3.4405745062836623, "step": 47910}, {"loss": 0.6038, "grad_norm": 0.9506490230560303, "learning_rate": 0.0002, "epoch": 3.441292639138241, "step": 47920}, {"loss": 0.6317, "grad_norm": 0.7944257855415344, "learning_rate": 0.0002, "epoch": 3.442010771992819, "step": 47930}, {"loss": 0.6193, "grad_norm": 0.9725518226623535, "learning_rate": 0.0002, "epoch": 3.442728904847397, "step": 47940}, {"loss": 0.635, "grad_norm": 0.7823024392127991, "learning_rate": 0.0002, "epoch": 3.443447037701975, "step": 47950}, {"loss": 0.6221, "grad_norm": 0.810565173625946, "learning_rate": 0.0002, "epoch": 3.444165170556553, "step": 47960}, {"loss": 0.6519, "grad_norm": 0.9809024333953857, "learning_rate": 0.0002, "epoch": 3.4448833034111312, "step": 47970}, {"loss": 0.6441, "grad_norm": 0.8818578720092773, "learning_rate": 0.0002, "epoch": 3.4456014362657092, "step": 47980}, {"loss": 0.6452, "grad_norm": 0.9843092560768127, "learning_rate": 0.0002, "epoch": 3.4463195691202873, "step": 47990}, {"loss": 0.6076, "grad_norm": 0.916313886642456, "learning_rate": 0.0002, "epoch": 3.4470377019748653, "step": 48000}, {"loss": 0.6399, "grad_norm": 0.908442497253418, "learning_rate": 0.0002, "epoch": 3.4477558348294433, "step": 48010}, {"loss": 0.6263, "grad_norm": 0.9880178570747375, "learning_rate": 0.0002, "epoch": 3.4484739676840217, "step": 48020}, {"loss": 0.6802, "grad_norm": 0.9276854991912842, "learning_rate": 0.0002, "epoch": 3.4491921005385997, "step": 48030}, {"loss": 0.6522, "grad_norm": 1.0879448652267456, "learning_rate": 0.0002, "epoch": 3.4499102333931777, "step": 48040}, {"loss": 0.6362, "grad_norm": 0.7430389523506165, "learning_rate": 0.0002, "epoch": 3.4506283662477557, "step": 48050}, {"loss": 0.6064, "grad_norm": 1.0880072116851807, "learning_rate": 0.0002, "epoch": 3.4513464991023337, "step": 48060}, {"loss": 0.6152, "grad_norm": 1.0424141883850098, "learning_rate": 0.0002, "epoch": 3.452064631956912, "step": 48070}, {"loss": 0.6485, "grad_norm": 0.926330029964447, "learning_rate": 0.0002, "epoch": 3.45278276481149, "step": 48080}, {"loss": 0.6261, "grad_norm": 0.8911219239234924, "learning_rate": 0.0002, "epoch": 3.453500897666068, "step": 48090}, {"loss": 0.6883, "grad_norm": 0.8727201223373413, "learning_rate": 0.0002, "epoch": 3.454219030520646, "step": 48100}, {"loss": 0.6473, "grad_norm": 0.8573940396308899, "learning_rate": 0.0002, "epoch": 3.4549371633752246, "step": 48110}, {"loss": 0.6645, "grad_norm": 1.0427064895629883, "learning_rate": 0.0002, "epoch": 3.4556552962298026, "step": 48120}, {"loss": 0.6489, "grad_norm": 0.8688231706619263, "learning_rate": 0.0002, "epoch": 3.4563734290843806, "step": 48130}, {"loss": 0.5947, "grad_norm": 0.8856009244918823, "learning_rate": 0.0002, "epoch": 3.4570915619389586, "step": 48140}, {"loss": 0.6482, "grad_norm": 0.9535353183746338, "learning_rate": 0.0002, "epoch": 3.4578096947935366, "step": 48150}, {"loss": 0.6435, "grad_norm": 0.9466010928153992, "learning_rate": 0.0002, "epoch": 3.458527827648115, "step": 48160}, {"loss": 0.6231, "grad_norm": 0.9783535599708557, "learning_rate": 0.0002, "epoch": 3.459245960502693, "step": 48170}, {"loss": 0.6926, "grad_norm": 0.8010456562042236, "learning_rate": 0.0002, "epoch": 3.459964093357271, "step": 48180}, {"loss": 0.6141, "grad_norm": 0.8928955793380737, "learning_rate": 0.0002, "epoch": 3.460682226211849, "step": 48190}, {"loss": 0.6699, "grad_norm": 0.7565838694572449, "learning_rate": 0.0002, "epoch": 3.4614003590664275, "step": 48200}, {"loss": 0.6218, "grad_norm": 1.0044180154800415, "learning_rate": 0.0002, "epoch": 3.4621184919210055, "step": 48210}, {"loss": 0.6182, "grad_norm": 0.8161038160324097, "learning_rate": 0.0002, "epoch": 3.4628366247755835, "step": 48220}, {"loss": 0.6869, "grad_norm": 1.1000211238861084, "learning_rate": 0.0002, "epoch": 3.4635547576301615, "step": 48230}, {"loss": 0.7141, "grad_norm": 0.7942240238189697, "learning_rate": 0.0002, "epoch": 3.4642728904847395, "step": 48240}, {"loss": 0.6247, "grad_norm": 0.7546432018280029, "learning_rate": 0.0002, "epoch": 3.464991023339318, "step": 48250}, {"loss": 0.6319, "grad_norm": 0.7705255150794983, "learning_rate": 0.0002, "epoch": 3.465709156193896, "step": 48260}, {"loss": 0.6414, "grad_norm": 0.7958067059516907, "learning_rate": 0.0002, "epoch": 3.466427289048474, "step": 48270}, {"loss": 0.6526, "grad_norm": 0.9199120402336121, "learning_rate": 0.0002, "epoch": 3.467145421903052, "step": 48280}, {"loss": 0.6476, "grad_norm": 1.118672251701355, "learning_rate": 0.0002, "epoch": 3.46786355475763, "step": 48290}, {"loss": 0.6543, "grad_norm": 0.9161015748977661, "learning_rate": 0.0002, "epoch": 3.4685816876122084, "step": 48300}, {"loss": 0.6767, "grad_norm": 1.1086218357086182, "learning_rate": 0.0002, "epoch": 3.4692998204667864, "step": 48310}, {"loss": 0.5917, "grad_norm": 1.0123368501663208, "learning_rate": 0.0002, "epoch": 3.4700179533213644, "step": 48320}, {"loss": 0.6277, "grad_norm": 0.7380602359771729, "learning_rate": 0.0002, "epoch": 3.4707360861759424, "step": 48330}, {"loss": 0.6407, "grad_norm": 0.8967105150222778, "learning_rate": 0.0002, "epoch": 3.4714542190305204, "step": 48340}, {"loss": 0.6526, "grad_norm": 1.0134044885635376, "learning_rate": 0.0002, "epoch": 3.472172351885099, "step": 48350}, {"loss": 0.6436, "grad_norm": 1.080815076828003, "learning_rate": 0.0002, "epoch": 3.472890484739677, "step": 48360}, {"loss": 0.6644, "grad_norm": 1.151721477508545, "learning_rate": 0.0002, "epoch": 3.473608617594255, "step": 48370}, {"loss": 0.6612, "grad_norm": 0.9436505436897278, "learning_rate": 0.0002, "epoch": 3.474326750448833, "step": 48380}, {"loss": 0.6503, "grad_norm": 0.9154609441757202, "learning_rate": 0.0002, "epoch": 3.4750448833034113, "step": 48390}, {"loss": 0.6151, "grad_norm": 0.8943037986755371, "learning_rate": 0.0002, "epoch": 3.4757630161579893, "step": 48400}, {"loss": 0.6316, "grad_norm": 0.936988115310669, "learning_rate": 0.0002, "epoch": 3.4764811490125673, "step": 48410}, {"loss": 0.6638, "grad_norm": 0.826960027217865, "learning_rate": 0.0002, "epoch": 3.4771992818671453, "step": 48420}, {"loss": 0.6242, "grad_norm": 1.0487587451934814, "learning_rate": 0.0002, "epoch": 3.4779174147217233, "step": 48430}, {"loss": 0.6302, "grad_norm": 0.729163646697998, "learning_rate": 0.0002, "epoch": 3.478635547576302, "step": 48440}, {"loss": 0.6115, "grad_norm": 0.8156948089599609, "learning_rate": 0.0002, "epoch": 3.47935368043088, "step": 48450}, {"loss": 0.6455, "grad_norm": 0.8004332184791565, "learning_rate": 0.0002, "epoch": 3.480071813285458, "step": 48460}, {"loss": 0.621, "grad_norm": 0.9632692337036133, "learning_rate": 0.0002, "epoch": 3.480789946140036, "step": 48470}, {"loss": 0.6214, "grad_norm": 1.0950212478637695, "learning_rate": 0.0002, "epoch": 3.4815080789946142, "step": 48480}, {"loss": 0.6659, "grad_norm": 0.8574318885803223, "learning_rate": 0.0002, "epoch": 3.4822262118491922, "step": 48490}, {"loss": 0.6969, "grad_norm": 0.8552606701850891, "learning_rate": 0.0002, "epoch": 3.4829443447037702, "step": 48500}, {"loss": 0.6253, "grad_norm": 0.9698445200920105, "learning_rate": 0.0002, "epoch": 3.4836624775583482, "step": 48510}, {"loss": 0.6844, "grad_norm": 0.9427815675735474, "learning_rate": 0.0002, "epoch": 3.4843806104129262, "step": 48520}, {"loss": 0.6722, "grad_norm": 0.7902070879936218, "learning_rate": 0.0002, "epoch": 3.4850987432675042, "step": 48530}, {"loss": 0.6708, "grad_norm": 1.0300066471099854, "learning_rate": 0.0002, "epoch": 3.4858168761220827, "step": 48540}, {"loss": 0.6113, "grad_norm": 1.1688778400421143, "learning_rate": 0.0002, "epoch": 3.4865350089766607, "step": 48550}, {"loss": 0.5956, "grad_norm": 1.0012071132659912, "learning_rate": 0.0002, "epoch": 3.4872531418312387, "step": 48560}, {"loss": 0.6536, "grad_norm": 1.112094759941101, "learning_rate": 0.0002, "epoch": 3.4879712746858167, "step": 48570}, {"loss": 0.6625, "grad_norm": 0.8547284603118896, "learning_rate": 0.0002, "epoch": 3.488689407540395, "step": 48580}, {"loss": 0.6488, "grad_norm": 0.8827278017997742, "learning_rate": 0.0002, "epoch": 3.489407540394973, "step": 48590}, {"loss": 0.6437, "grad_norm": 0.9255490303039551, "learning_rate": 0.0002, "epoch": 3.490125673249551, "step": 48600}, {"loss": 0.6089, "grad_norm": 0.8000030517578125, "learning_rate": 0.0002, "epoch": 3.490843806104129, "step": 48610}, {"loss": 0.647, "grad_norm": 0.9327391386032104, "learning_rate": 0.0002, "epoch": 3.491561938958707, "step": 48620}, {"loss": 0.6678, "grad_norm": 0.9004138708114624, "learning_rate": 0.0002, "epoch": 3.4922800718132856, "step": 48630}, {"loss": 0.6145, "grad_norm": 0.9886971116065979, "learning_rate": 0.0002, "epoch": 3.4929982046678636, "step": 48640}, {"loss": 0.6309, "grad_norm": 0.9890487194061279, "learning_rate": 0.0002, "epoch": 3.4937163375224416, "step": 48650}, {"loss": 0.655, "grad_norm": 0.7024438977241516, "learning_rate": 0.0002, "epoch": 3.4944344703770196, "step": 48660}, {"loss": 0.6313, "grad_norm": 0.8397303223609924, "learning_rate": 0.0002, "epoch": 3.495152603231598, "step": 48670}, {"loss": 0.6429, "grad_norm": 0.9120950698852539, "learning_rate": 0.0002, "epoch": 3.495870736086176, "step": 48680}, {"loss": 0.631, "grad_norm": 1.057299017906189, "learning_rate": 0.0002, "epoch": 3.496588868940754, "step": 48690}, {"loss": 0.6459, "grad_norm": 0.821325957775116, "learning_rate": 0.0002, "epoch": 3.497307001795332, "step": 48700}, {"loss": 0.6174, "grad_norm": 1.0029970407485962, "learning_rate": 0.0002, "epoch": 3.49802513464991, "step": 48710}, {"loss": 0.6374, "grad_norm": 0.9483712911605835, "learning_rate": 0.0002, "epoch": 3.4987432675044885, "step": 48720}, {"loss": 0.6472, "grad_norm": 0.9637855291366577, "learning_rate": 0.0002, "epoch": 3.4994614003590665, "step": 48730}, {"loss": 0.6639, "grad_norm": 0.6848894357681274, "learning_rate": 0.0002, "epoch": 3.5001795332136445, "step": 48740}, {"loss": 0.6129, "grad_norm": 0.7848573327064514, "learning_rate": 0.0002, "epoch": 3.5008976660682225, "step": 48750}, {"loss": 0.6306, "grad_norm": 1.0341308116912842, "learning_rate": 0.0002, "epoch": 3.501615798922801, "step": 48760}, {"loss": 0.6063, "grad_norm": 0.8858218193054199, "learning_rate": 0.0002, "epoch": 3.502333931777379, "step": 48770}, {"loss": 0.6729, "grad_norm": 0.8366939425468445, "learning_rate": 0.0002, "epoch": 3.503052064631957, "step": 48780}, {"loss": 0.6736, "grad_norm": 0.7926092147827148, "learning_rate": 0.0002, "epoch": 3.503770197486535, "step": 48790}, {"loss": 0.6279, "grad_norm": 0.8503843545913696, "learning_rate": 0.0002, "epoch": 3.504488330341113, "step": 48800}, {"loss": 0.6162, "grad_norm": 0.8867869973182678, "learning_rate": 0.0002, "epoch": 3.505206463195691, "step": 48810}, {"loss": 0.6987, "grad_norm": 1.0336930751800537, "learning_rate": 0.0002, "epoch": 3.5059245960502694, "step": 48820}, {"loss": 0.6333, "grad_norm": 0.8564051985740662, "learning_rate": 0.0002, "epoch": 3.5066427289048474, "step": 48830}, {"loss": 0.6574, "grad_norm": 0.9202605485916138, "learning_rate": 0.0002, "epoch": 3.5073608617594254, "step": 48840}, {"loss": 0.6457, "grad_norm": 0.8838639855384827, "learning_rate": 0.0002, "epoch": 3.508078994614004, "step": 48850}, {"loss": 0.631, "grad_norm": 0.8975196480751038, "learning_rate": 0.0002, "epoch": 3.508797127468582, "step": 48860}, {"loss": 0.6335, "grad_norm": 0.8842370510101318, "learning_rate": 0.0002, "epoch": 3.50951526032316, "step": 48870}, {"loss": 0.6569, "grad_norm": 0.9195886254310608, "learning_rate": 0.0002, "epoch": 3.510233393177738, "step": 48880}, {"loss": 0.6647, "grad_norm": 0.986130952835083, "learning_rate": 0.0002, "epoch": 3.510951526032316, "step": 48890}, {"loss": 0.6676, "grad_norm": 0.8119593858718872, "learning_rate": 0.0002, "epoch": 3.511669658886894, "step": 48900}, {"loss": 0.653, "grad_norm": 0.9027136564254761, "learning_rate": 0.0002, "epoch": 3.5123877917414723, "step": 48910}, {"loss": 0.6731, "grad_norm": 0.8560537099838257, "learning_rate": 0.0002, "epoch": 3.5131059245960503, "step": 48920}, {"loss": 0.7032, "grad_norm": 0.7073559165000916, "learning_rate": 0.0002, "epoch": 3.5138240574506283, "step": 48930}, {"loss": 0.6738, "grad_norm": 0.8753304481506348, "learning_rate": 0.0002, "epoch": 3.5145421903052063, "step": 48940}, {"loss": 0.6366, "grad_norm": 0.9151145815849304, "learning_rate": 0.0002, "epoch": 3.5152603231597848, "step": 48950}, {"loss": 0.6135, "grad_norm": 0.7794315814971924, "learning_rate": 0.0002, "epoch": 3.5159784560143628, "step": 48960}, {"loss": 0.658, "grad_norm": 0.9226023554801941, "learning_rate": 0.0002, "epoch": 3.5166965888689408, "step": 48970}, {"loss": 0.6473, "grad_norm": 0.8442051410675049, "learning_rate": 0.0002, "epoch": 3.5174147217235188, "step": 48980}, {"loss": 0.6267, "grad_norm": 0.9769423007965088, "learning_rate": 0.0002, "epoch": 3.5181328545780968, "step": 48990}, {"loss": 0.6333, "grad_norm": 0.740347146987915, "learning_rate": 0.0002, "epoch": 3.5188509874326748, "step": 49000}, {"loss": 0.6652, "grad_norm": 0.8963457345962524, "learning_rate": 0.0002, "epoch": 3.519569120287253, "step": 49010}, {"loss": 0.6782, "grad_norm": 0.8410176634788513, "learning_rate": 0.0002, "epoch": 3.520287253141831, "step": 49020}, {"loss": 0.6496, "grad_norm": 1.0486022233963013, "learning_rate": 0.0002, "epoch": 3.521005385996409, "step": 49030}, {"loss": 0.6275, "grad_norm": 0.95393967628479, "learning_rate": 0.0002, "epoch": 3.5217235188509877, "step": 49040}, {"loss": 0.6328, "grad_norm": 0.8261157274246216, "learning_rate": 0.0002, "epoch": 3.5224416517055657, "step": 49050}, {"loss": 0.6441, "grad_norm": 0.9321704506874084, "learning_rate": 0.0002, "epoch": 3.5231597845601437, "step": 49060}, {"loss": 0.6202, "grad_norm": 1.2596088647842407, "learning_rate": 0.0002, "epoch": 3.5238779174147217, "step": 49070}, {"loss": 0.6596, "grad_norm": 0.8584637641906738, "learning_rate": 0.0002, "epoch": 3.5245960502692997, "step": 49080}, {"loss": 0.6708, "grad_norm": 0.850520670413971, "learning_rate": 0.0002, "epoch": 3.5253141831238777, "step": 49090}, {"loss": 0.6543, "grad_norm": 0.8915920257568359, "learning_rate": 0.0002, "epoch": 3.526032315978456, "step": 49100}, {"loss": 0.6558, "grad_norm": 0.9070239067077637, "learning_rate": 0.0002, "epoch": 3.526750448833034, "step": 49110}, {"loss": 0.6128, "grad_norm": 0.699878990650177, "learning_rate": 0.0002, "epoch": 3.527468581687612, "step": 49120}, {"loss": 0.6454, "grad_norm": 0.9003779888153076, "learning_rate": 0.0002, "epoch": 3.5281867145421906, "step": 49130}, {"loss": 0.6177, "grad_norm": 0.7886711955070496, "learning_rate": 0.0002, "epoch": 3.5289048473967686, "step": 49140}, {"loss": 0.6499, "grad_norm": 0.7368922233581543, "learning_rate": 0.0002, "epoch": 3.5296229802513466, "step": 49150}, {"loss": 0.6382, "grad_norm": 0.8585197329521179, "learning_rate": 0.0002, "epoch": 3.5303411131059246, "step": 49160}, {"loss": 0.6761, "grad_norm": 1.0205435752868652, "learning_rate": 0.0002, "epoch": 3.5310592459605026, "step": 49170}, {"loss": 0.6544, "grad_norm": 0.8756650686264038, "learning_rate": 0.0002, "epoch": 3.5317773788150806, "step": 49180}, {"loss": 0.6592, "grad_norm": 1.0278643369674683, "learning_rate": 0.0002, "epoch": 3.532495511669659, "step": 49190}, {"loss": 0.6682, "grad_norm": 0.8641911745071411, "learning_rate": 0.0002, "epoch": 3.533213644524237, "step": 49200}, {"loss": 0.6531, "grad_norm": 0.8730159401893616, "learning_rate": 0.0002, "epoch": 3.533931777378815, "step": 49210}, {"loss": 0.636, "grad_norm": 0.918637216091156, "learning_rate": 0.0002, "epoch": 3.534649910233393, "step": 49220}, {"loss": 0.6815, "grad_norm": 1.0467222929000854, "learning_rate": 0.0002, "epoch": 3.5353680430879715, "step": 49230}, {"loss": 0.6554, "grad_norm": 1.005009412765503, "learning_rate": 0.0002, "epoch": 3.5360861759425495, "step": 49240}, {"loss": 0.649, "grad_norm": 0.9775063395500183, "learning_rate": 0.0002, "epoch": 3.5368043087971275, "step": 49250}, {"loss": 0.6527, "grad_norm": 0.8198322057723999, "learning_rate": 0.0002, "epoch": 3.5375224416517055, "step": 49260}, {"loss": 0.664, "grad_norm": 0.8184829354286194, "learning_rate": 0.0002, "epoch": 3.5382405745062835, "step": 49270}, {"loss": 0.6493, "grad_norm": 0.9520270824432373, "learning_rate": 0.0002, "epoch": 3.5389587073608615, "step": 49280}, {"loss": 0.5935, "grad_norm": 0.7816803455352783, "learning_rate": 0.0002, "epoch": 3.53967684021544, "step": 49290}, {"loss": 0.6424, "grad_norm": 0.6915702819824219, "learning_rate": 0.0002, "epoch": 3.540394973070018, "step": 49300}, {"loss": 0.6447, "grad_norm": 0.8282375931739807, "learning_rate": 0.0002, "epoch": 3.541113105924596, "step": 49310}, {"loss": 0.6164, "grad_norm": 1.0797513723373413, "learning_rate": 0.0002, "epoch": 3.5418312387791744, "step": 49320}, {"loss": 0.6836, "grad_norm": 0.868671715259552, "learning_rate": 0.0002, "epoch": 3.5425493716337524, "step": 49330}, {"loss": 0.6453, "grad_norm": 0.8534455895423889, "learning_rate": 0.0002, "epoch": 3.5432675044883304, "step": 49340}, {"loss": 0.6706, "grad_norm": 0.816411554813385, "learning_rate": 0.0002, "epoch": 3.5439856373429084, "step": 49350}, {"loss": 0.6101, "grad_norm": 0.7813423275947571, "learning_rate": 0.0002, "epoch": 3.5447037701974864, "step": 49360}, {"loss": 0.6617, "grad_norm": 0.8002013564109802, "learning_rate": 0.0002, "epoch": 3.5454219030520644, "step": 49370}, {"loss": 0.6667, "grad_norm": 0.9740113615989685, "learning_rate": 0.0002, "epoch": 3.546140035906643, "step": 49380}, {"loss": 0.6938, "grad_norm": 0.9046127200126648, "learning_rate": 0.0002, "epoch": 3.546858168761221, "step": 49390}, {"loss": 0.6444, "grad_norm": 0.8635150194168091, "learning_rate": 0.0002, "epoch": 3.547576301615799, "step": 49400}, {"loss": 0.6273, "grad_norm": 0.9488558769226074, "learning_rate": 0.0002, "epoch": 3.5482944344703773, "step": 49410}, {"loss": 0.6542, "grad_norm": 0.9637090563774109, "learning_rate": 0.0002, "epoch": 3.5490125673249553, "step": 49420}, {"loss": 0.6468, "grad_norm": 1.042245626449585, "learning_rate": 0.0002, "epoch": 3.5497307001795333, "step": 49430}, {"loss": 0.6999, "grad_norm": 0.9076175689697266, "learning_rate": 0.0002, "epoch": 3.5504488330341113, "step": 49440}, {"loss": 0.6192, "grad_norm": 0.8480596542358398, "learning_rate": 0.0002, "epoch": 3.5511669658886893, "step": 49450}, {"loss": 0.6835, "grad_norm": 0.8483007550239563, "learning_rate": 0.0002, "epoch": 3.5518850987432673, "step": 49460}, {"loss": 0.6607, "grad_norm": 0.7855815887451172, "learning_rate": 0.0002, "epoch": 3.5526032315978457, "step": 49470}, {"loss": 0.6364, "grad_norm": 0.8435823917388916, "learning_rate": 0.0002, "epoch": 3.5533213644524237, "step": 49480}, {"loss": 0.6674, "grad_norm": 0.8613026142120361, "learning_rate": 0.0002, "epoch": 3.5540394973070017, "step": 49490}, {"loss": 0.6651, "grad_norm": 0.9654812812805176, "learning_rate": 0.0002, "epoch": 3.5547576301615798, "step": 49500}, {"loss": 0.6471, "grad_norm": 0.8888838887214661, "learning_rate": 0.0002, "epoch": 3.555475763016158, "step": 49510}, {"loss": 0.622, "grad_norm": 0.7718146443367004, "learning_rate": 0.0002, "epoch": 3.556193895870736, "step": 49520}, {"loss": 0.6297, "grad_norm": 0.9487382173538208, "learning_rate": 0.0002, "epoch": 3.556912028725314, "step": 49530}, {"loss": 0.6516, "grad_norm": 0.9256559610366821, "learning_rate": 0.0002, "epoch": 3.557630161579892, "step": 49540}, {"loss": 0.6461, "grad_norm": 0.8879945874214172, "learning_rate": 0.0002, "epoch": 3.55834829443447, "step": 49550}, {"loss": 0.6367, "grad_norm": 0.8498744368553162, "learning_rate": 0.0002, "epoch": 3.559066427289048, "step": 49560}, {"loss": 0.6274, "grad_norm": 0.9550948143005371, "learning_rate": 0.0002, "epoch": 3.5597845601436267, "step": 49570}, {"loss": 0.635, "grad_norm": 0.8386164903640747, "learning_rate": 0.0002, "epoch": 3.5605026929982047, "step": 49580}, {"loss": 0.6495, "grad_norm": 0.925573468208313, "learning_rate": 0.0002, "epoch": 3.5612208258527827, "step": 49590}, {"loss": 0.676, "grad_norm": 0.8867112398147583, "learning_rate": 0.0002, "epoch": 3.561938958707361, "step": 49600}, {"loss": 0.6156, "grad_norm": 0.7638537883758545, "learning_rate": 0.0002, "epoch": 3.562657091561939, "step": 49610}, {"loss": 0.6597, "grad_norm": 0.9491845965385437, "learning_rate": 0.0002, "epoch": 3.563375224416517, "step": 49620}, {"loss": 0.6237, "grad_norm": 0.8384189605712891, "learning_rate": 0.0002, "epoch": 3.564093357271095, "step": 49630}, {"loss": 0.6102, "grad_norm": 0.8850575089454651, "learning_rate": 0.0002, "epoch": 3.564811490125673, "step": 49640}, {"loss": 0.6517, "grad_norm": 1.020916223526001, "learning_rate": 0.0002, "epoch": 3.565529622980251, "step": 49650}, {"loss": 0.6569, "grad_norm": 0.9298280477523804, "learning_rate": 0.0002, "epoch": 3.5662477558348296, "step": 49660}, {"loss": 0.6094, "grad_norm": 0.9795742034912109, "learning_rate": 0.0002, "epoch": 3.5669658886894076, "step": 49670}, {"loss": 0.6147, "grad_norm": 0.9401193261146545, "learning_rate": 0.0002, "epoch": 3.5676840215439856, "step": 49680}, {"loss": 0.622, "grad_norm": 1.0383585691452026, "learning_rate": 0.0002, "epoch": 3.568402154398564, "step": 49690}, {"loss": 0.6304, "grad_norm": 0.8370866179466248, "learning_rate": 0.0002, "epoch": 3.569120287253142, "step": 49700}, {"loss": 0.6356, "grad_norm": 0.8207486271858215, "learning_rate": 0.0002, "epoch": 3.56983842010772, "step": 49710}, {"loss": 0.6328, "grad_norm": 0.8551223278045654, "learning_rate": 0.0002, "epoch": 3.570556552962298, "step": 49720}, {"loss": 0.621, "grad_norm": 0.8041176199913025, "learning_rate": 0.0002, "epoch": 3.571274685816876, "step": 49730}, {"loss": 0.5818, "grad_norm": 0.9862527847290039, "learning_rate": 0.0002, "epoch": 3.571992818671454, "step": 49740}, {"loss": 0.6448, "grad_norm": 0.7557165622711182, "learning_rate": 0.0002, "epoch": 3.5727109515260325, "step": 49750}, {"loss": 0.6484, "grad_norm": 1.0908563137054443, "learning_rate": 0.0002, "epoch": 3.5734290843806105, "step": 49760}, {"loss": 0.6497, "grad_norm": 0.7245369553565979, "learning_rate": 0.0002, "epoch": 3.5741472172351885, "step": 49770}, {"loss": 0.6315, "grad_norm": 0.7851184010505676, "learning_rate": 0.0002, "epoch": 3.5748653500897665, "step": 49780}, {"loss": 0.6245, "grad_norm": 0.9443599581718445, "learning_rate": 0.0002, "epoch": 3.575583482944345, "step": 49790}, {"loss": 0.6481, "grad_norm": 1.021196961402893, "learning_rate": 0.0002, "epoch": 3.576301615798923, "step": 49800}, {"loss": 0.6368, "grad_norm": 0.9099196195602417, "learning_rate": 0.0002, "epoch": 3.577019748653501, "step": 49810}, {"loss": 0.6372, "grad_norm": 0.9397716522216797, "learning_rate": 0.0002, "epoch": 3.577737881508079, "step": 49820}, {"loss": 0.6208, "grad_norm": 0.9214922785758972, "learning_rate": 0.0002, "epoch": 3.578456014362657, "step": 49830}, {"loss": 0.6219, "grad_norm": 1.0053879022598267, "learning_rate": 0.0002, "epoch": 3.579174147217235, "step": 49840}, {"loss": 0.6283, "grad_norm": 0.9415460228919983, "learning_rate": 0.0002, "epoch": 3.5798922800718134, "step": 49850}, {"loss": 0.6759, "grad_norm": 1.0807833671569824, "learning_rate": 0.0002, "epoch": 3.5806104129263914, "step": 49860}, {"loss": 0.6404, "grad_norm": 1.0070871114730835, "learning_rate": 0.0002, "epoch": 3.5813285457809694, "step": 49870}, {"loss": 0.6411, "grad_norm": 0.9707024693489075, "learning_rate": 0.0002, "epoch": 3.582046678635548, "step": 49880}, {"loss": 0.6852, "grad_norm": 0.9979593753814697, "learning_rate": 0.0002, "epoch": 3.582764811490126, "step": 49890}, {"loss": 0.6519, "grad_norm": 0.7238648533821106, "learning_rate": 0.0002, "epoch": 3.583482944344704, "step": 49900}, {"loss": 0.6452, "grad_norm": 0.8168631792068481, "learning_rate": 0.0002, "epoch": 3.584201077199282, "step": 49910}, {"loss": 0.6174, "grad_norm": 0.8156409859657288, "learning_rate": 0.0002, "epoch": 3.58491921005386, "step": 49920}, {"loss": 0.6248, "grad_norm": 0.9256414175033569, "learning_rate": 0.0002, "epoch": 3.585637342908438, "step": 49930}, {"loss": 0.6077, "grad_norm": 1.0090070962905884, "learning_rate": 0.0002, "epoch": 3.5863554757630163, "step": 49940}, {"loss": 0.6016, "grad_norm": 0.8257701992988586, "learning_rate": 0.0002, "epoch": 3.5870736086175943, "step": 49950}, {"loss": 0.6996, "grad_norm": 0.9189013242721558, "learning_rate": 0.0002, "epoch": 3.5877917414721723, "step": 49960}, {"loss": 0.661, "grad_norm": 0.8497788310050964, "learning_rate": 0.0002, "epoch": 3.5885098743267507, "step": 49970}, {"loss": 0.6335, "grad_norm": 0.9596505761146545, "learning_rate": 0.0002, "epoch": 3.5892280071813287, "step": 49980}, {"loss": 0.697, "grad_norm": 0.8773331642150879, "learning_rate": 0.0002, "epoch": 3.5899461400359067, "step": 49990}, {"loss": 0.6259, "grad_norm": 0.8952302932739258, "learning_rate": 0.0002, "epoch": 3.5906642728904847, "step": 50000}, {"loss": 0.6152, "grad_norm": 0.7713809609413147, "learning_rate": 0.0002, "epoch": 3.5913824057450627, "step": 50010}, {"loss": 0.6127, "grad_norm": 1.0151346921920776, "learning_rate": 0.0002, "epoch": 3.5921005385996407, "step": 50020}, {"loss": 0.6093, "grad_norm": 0.8793733716011047, "learning_rate": 0.0002, "epoch": 3.592818671454219, "step": 50030}, {"loss": 0.5986, "grad_norm": 0.8881325721740723, "learning_rate": 0.0002, "epoch": 3.593536804308797, "step": 50040}, {"loss": 0.6351, "grad_norm": 0.9346749782562256, "learning_rate": 0.0002, "epoch": 3.594254937163375, "step": 50050}, {"loss": 0.6501, "grad_norm": 0.8705052137374878, "learning_rate": 0.0002, "epoch": 3.594973070017953, "step": 50060}, {"loss": 0.6753, "grad_norm": 1.039197564125061, "learning_rate": 0.0002, "epoch": 3.5956912028725316, "step": 50070}, {"loss": 0.6565, "grad_norm": 0.7053273320198059, "learning_rate": 0.0002, "epoch": 3.5964093357271096, "step": 50080}, {"loss": 0.6546, "grad_norm": 0.8268665671348572, "learning_rate": 0.0002, "epoch": 3.5971274685816876, "step": 50090}, {"loss": 0.6637, "grad_norm": 0.8921764492988586, "learning_rate": 0.0002, "epoch": 3.5978456014362656, "step": 50100}, {"loss": 0.6827, "grad_norm": 0.9756084680557251, "learning_rate": 0.0002, "epoch": 3.5985637342908436, "step": 50110}, {"loss": 0.6746, "grad_norm": 0.9275530576705933, "learning_rate": 0.0002, "epoch": 3.5992818671454216, "step": 50120}, {"loss": 0.6709, "grad_norm": 0.9030009508132935, "learning_rate": 0.0002, "epoch": 3.6, "step": 50130}, {"loss": 0.6344, "grad_norm": 0.7805638909339905, "learning_rate": 0.0002, "epoch": 3.600718132854578, "step": 50140}, {"loss": 0.6437, "grad_norm": 0.7627325057983398, "learning_rate": 0.0002, "epoch": 3.601436265709156, "step": 50150}, {"loss": 0.6523, "grad_norm": 0.7809714078903198, "learning_rate": 0.0002, "epoch": 3.6021543985637345, "step": 50160}, {"loss": 0.6578, "grad_norm": 0.7910378575325012, "learning_rate": 0.0002, "epoch": 3.6028725314183125, "step": 50170}, {"loss": 0.6522, "grad_norm": 1.004438042640686, "learning_rate": 0.0002, "epoch": 3.6035906642728905, "step": 50180}, {"loss": 0.6657, "grad_norm": 0.825969934463501, "learning_rate": 0.0002, "epoch": 3.6043087971274685, "step": 50190}, {"loss": 0.6788, "grad_norm": 0.8866565227508545, "learning_rate": 0.0002, "epoch": 3.6050269299820465, "step": 50200}, {"loss": 0.6643, "grad_norm": 0.8920543193817139, "learning_rate": 0.0002, "epoch": 3.6057450628366245, "step": 50210}, {"loss": 0.668, "grad_norm": 1.106584906578064, "learning_rate": 0.0002, "epoch": 3.606463195691203, "step": 50220}, {"loss": 0.6878, "grad_norm": 0.916607677936554, "learning_rate": 0.0002, "epoch": 3.607181328545781, "step": 50230}, {"loss": 0.6084, "grad_norm": 0.8014767169952393, "learning_rate": 0.0002, "epoch": 3.607899461400359, "step": 50240}, {"loss": 0.6718, "grad_norm": 0.9556822776794434, "learning_rate": 0.0002, "epoch": 3.608617594254937, "step": 50250}, {"loss": 0.6896, "grad_norm": 0.9630016684532166, "learning_rate": 0.0002, "epoch": 3.6093357271095154, "step": 50260}, {"loss": 0.692, "grad_norm": 0.9862125515937805, "learning_rate": 0.0002, "epoch": 3.6100538599640934, "step": 50270}, {"loss": 0.5981, "grad_norm": 1.0043333768844604, "learning_rate": 0.0002, "epoch": 3.6107719928186714, "step": 50280}, {"loss": 0.6243, "grad_norm": 0.9255319833755493, "learning_rate": 0.0002, "epoch": 3.6114901256732495, "step": 50290}, {"loss": 0.6374, "grad_norm": 1.012023687362671, "learning_rate": 0.0002, "epoch": 3.6122082585278275, "step": 50300}, {"loss": 0.6896, "grad_norm": 1.0701122283935547, "learning_rate": 0.0002, "epoch": 3.612926391382406, "step": 50310}, {"loss": 0.6474, "grad_norm": 0.8270810842514038, "learning_rate": 0.0002, "epoch": 3.613644524236984, "step": 50320}, {"loss": 0.6667, "grad_norm": 0.8881328105926514, "learning_rate": 0.0002, "epoch": 3.614362657091562, "step": 50330}, {"loss": 0.6517, "grad_norm": 0.9536844491958618, "learning_rate": 0.0002, "epoch": 3.61508078994614, "step": 50340}, {"loss": 0.62, "grad_norm": 0.8044326305389404, "learning_rate": 0.0002, "epoch": 3.6157989228007184, "step": 50350}, {"loss": 0.6259, "grad_norm": 0.834591805934906, "learning_rate": 0.0002, "epoch": 3.6165170556552964, "step": 50360}, {"loss": 0.7173, "grad_norm": 0.903752863407135, "learning_rate": 0.0002, "epoch": 3.6172351885098744, "step": 50370}, {"loss": 0.6305, "grad_norm": 0.9148632884025574, "learning_rate": 0.0002, "epoch": 3.6179533213644524, "step": 50380}, {"loss": 0.6624, "grad_norm": 0.9280176162719727, "learning_rate": 0.0002, "epoch": 3.6186714542190304, "step": 50390}, {"loss": 0.6457, "grad_norm": 0.9524136781692505, "learning_rate": 0.0002, "epoch": 3.6193895870736084, "step": 50400}, {"loss": 0.6918, "grad_norm": 1.1751197576522827, "learning_rate": 0.0002, "epoch": 3.620107719928187, "step": 50410}, {"loss": 0.6161, "grad_norm": 1.032279133796692, "learning_rate": 0.0002, "epoch": 3.620825852782765, "step": 50420}, {"loss": 0.6347, "grad_norm": 0.790741503238678, "learning_rate": 0.0002, "epoch": 3.621543985637343, "step": 50430}, {"loss": 0.695, "grad_norm": 0.9584221243858337, "learning_rate": 0.0002, "epoch": 3.6222621184919213, "step": 50440}, {"loss": 0.6393, "grad_norm": 0.7792508006095886, "learning_rate": 0.0002, "epoch": 3.6229802513464993, "step": 50450}, {"loss": 0.6398, "grad_norm": 0.8273448944091797, "learning_rate": 0.0002, "epoch": 3.6236983842010773, "step": 50460}, {"loss": 0.6436, "grad_norm": 0.8001132607460022, "learning_rate": 0.0002, "epoch": 3.6244165170556553, "step": 50470}, {"loss": 0.6499, "grad_norm": 1.077109694480896, "learning_rate": 0.0002, "epoch": 3.6251346499102333, "step": 50480}, {"loss": 0.6587, "grad_norm": 1.111274003982544, "learning_rate": 0.0002, "epoch": 3.6258527827648113, "step": 50490}, {"loss": 0.6842, "grad_norm": 0.7757347822189331, "learning_rate": 0.0002, "epoch": 3.6265709156193897, "step": 50500}, {"loss": 0.6887, "grad_norm": 0.9217049479484558, "learning_rate": 0.0002, "epoch": 3.6272890484739677, "step": 50510}, {"loss": 0.6903, "grad_norm": 0.9362251162528992, "learning_rate": 0.0002, "epoch": 3.6280071813285457, "step": 50520}, {"loss": 0.625, "grad_norm": 0.9435479044914246, "learning_rate": 0.0002, "epoch": 3.6287253141831237, "step": 50530}, {"loss": 0.5869, "grad_norm": 0.7748915553092957, "learning_rate": 0.0002, "epoch": 3.629443447037702, "step": 50540}, {"loss": 0.637, "grad_norm": 0.8238945007324219, "learning_rate": 0.0002, "epoch": 3.63016157989228, "step": 50550}, {"loss": 0.6251, "grad_norm": 0.8421505093574524, "learning_rate": 0.0002, "epoch": 3.630879712746858, "step": 50560}, {"loss": 0.6544, "grad_norm": 1.0272293090820312, "learning_rate": 0.0002, "epoch": 3.631597845601436, "step": 50570}, {"loss": 0.6467, "grad_norm": 0.7643818259239197, "learning_rate": 0.0002, "epoch": 3.632315978456014, "step": 50580}, {"loss": 0.6716, "grad_norm": 0.9756225347518921, "learning_rate": 0.0002, "epoch": 3.6330341113105926, "step": 50590}, {"loss": 0.6534, "grad_norm": 0.9311570525169373, "learning_rate": 0.0002, "epoch": 3.6337522441651706, "step": 50600}, {"loss": 0.6465, "grad_norm": 0.8829827904701233, "learning_rate": 0.0002, "epoch": 3.6344703770197486, "step": 50610}, {"loss": 0.626, "grad_norm": 0.9473454356193542, "learning_rate": 0.0002, "epoch": 3.6351885098743266, "step": 50620}, {"loss": 0.713, "grad_norm": 1.1023668050765991, "learning_rate": 0.0002, "epoch": 3.635906642728905, "step": 50630}, {"loss": 0.6287, "grad_norm": 0.8490299582481384, "learning_rate": 0.0002, "epoch": 3.636624775583483, "step": 50640}, {"loss": 0.6373, "grad_norm": 1.1129392385482788, "learning_rate": 0.0002, "epoch": 3.637342908438061, "step": 50650}, {"loss": 0.7351, "grad_norm": 1.0334501266479492, "learning_rate": 0.0002, "epoch": 3.638061041292639, "step": 50660}, {"loss": 0.69, "grad_norm": 0.8397296667098999, "learning_rate": 0.0002, "epoch": 3.638779174147217, "step": 50670}, {"loss": 0.6075, "grad_norm": 0.7984256744384766, "learning_rate": 0.0002, "epoch": 3.639497307001795, "step": 50680}, {"loss": 0.651, "grad_norm": 1.1182054281234741, "learning_rate": 0.0002, "epoch": 3.6402154398563735, "step": 50690}, {"loss": 0.6511, "grad_norm": 0.8743279576301575, "learning_rate": 0.0002, "epoch": 3.6409335727109515, "step": 50700}, {"loss": 0.6894, "grad_norm": 0.9101628661155701, "learning_rate": 0.0002, "epoch": 3.6416517055655295, "step": 50710}, {"loss": 0.6591, "grad_norm": 0.8866934180259705, "learning_rate": 0.0002, "epoch": 3.642369838420108, "step": 50720}, {"loss": 0.6483, "grad_norm": 0.863945484161377, "learning_rate": 0.0002, "epoch": 3.643087971274686, "step": 50730}, {"loss": 0.6443, "grad_norm": 1.0845744609832764, "learning_rate": 0.0002, "epoch": 3.643806104129264, "step": 50740}, {"loss": 0.6611, "grad_norm": 0.8610911965370178, "learning_rate": 0.0002, "epoch": 3.644524236983842, "step": 50750}, {"loss": 0.6617, "grad_norm": 0.8502625226974487, "learning_rate": 0.0002, "epoch": 3.64524236983842, "step": 50760}, {"loss": 0.6283, "grad_norm": 0.847372829914093, "learning_rate": 0.0002, "epoch": 3.645960502692998, "step": 50770}, {"loss": 0.5724, "grad_norm": 0.8649292588233948, "learning_rate": 0.0002, "epoch": 3.6466786355475764, "step": 50780}, {"loss": 0.6253, "grad_norm": 0.8742905855178833, "learning_rate": 0.0002, "epoch": 3.6473967684021544, "step": 50790}, {"loss": 0.68, "grad_norm": 0.9546048641204834, "learning_rate": 0.0002, "epoch": 3.6481149012567324, "step": 50800}, {"loss": 0.6212, "grad_norm": 0.7893161773681641, "learning_rate": 0.0002, "epoch": 3.6488330341113104, "step": 50810}, {"loss": 0.6328, "grad_norm": 0.9350247979164124, "learning_rate": 0.0002, "epoch": 3.649551166965889, "step": 50820}, {"loss": 0.6893, "grad_norm": 0.772149384021759, "learning_rate": 0.0002, "epoch": 3.650269299820467, "step": 50830}, {"loss": 0.6107, "grad_norm": 0.8281718492507935, "learning_rate": 0.0002, "epoch": 3.650987432675045, "step": 50840}, {"loss": 0.6136, "grad_norm": 0.8063850402832031, "learning_rate": 0.0002, "epoch": 3.651705565529623, "step": 50850}, {"loss": 0.6416, "grad_norm": 0.8101351261138916, "learning_rate": 0.0002, "epoch": 3.652423698384201, "step": 50860}, {"loss": 0.6636, "grad_norm": 0.8747833371162415, "learning_rate": 0.0002, "epoch": 3.6531418312387793, "step": 50870}, {"loss": 0.6575, "grad_norm": 0.9634656310081482, "learning_rate": 0.0002, "epoch": 3.6538599640933573, "step": 50880}, {"loss": 0.6227, "grad_norm": 1.1646045446395874, "learning_rate": 0.0002, "epoch": 3.6545780969479353, "step": 50890}, {"loss": 0.6628, "grad_norm": 0.8538454174995422, "learning_rate": 0.0002, "epoch": 3.6552962298025133, "step": 50900}, {"loss": 0.6488, "grad_norm": 0.7639184594154358, "learning_rate": 0.0002, "epoch": 3.656014362657092, "step": 50910}, {"loss": 0.6495, "grad_norm": 0.8750212788581848, "learning_rate": 0.0002, "epoch": 3.65673249551167, "step": 50920}, {"loss": 0.6601, "grad_norm": 0.9161198735237122, "learning_rate": 0.0002, "epoch": 3.657450628366248, "step": 50930}, {"loss": 0.6809, "grad_norm": 0.7987924814224243, "learning_rate": 0.0002, "epoch": 3.658168761220826, "step": 50940}, {"loss": 0.6228, "grad_norm": 0.8939290642738342, "learning_rate": 0.0002, "epoch": 3.658886894075404, "step": 50950}, {"loss": 0.687, "grad_norm": 0.9803797602653503, "learning_rate": 0.0002, "epoch": 3.659605026929982, "step": 50960}, {"loss": 0.6368, "grad_norm": 1.2423512935638428, "learning_rate": 0.0002, "epoch": 3.6603231597845602, "step": 50970}, {"loss": 0.6477, "grad_norm": 1.0023225545883179, "learning_rate": 0.0002, "epoch": 3.6610412926391382, "step": 50980}, {"loss": 0.6659, "grad_norm": 0.9066677689552307, "learning_rate": 0.0002, "epoch": 3.6617594254937162, "step": 50990}, {"loss": 0.6348, "grad_norm": 0.8906226754188538, "learning_rate": 0.0002, "epoch": 3.6624775583482947, "step": 51000}, {"loss": 0.5967, "grad_norm": 0.7449954152107239, "learning_rate": 0.0002, "epoch": 3.6631956912028727, "step": 51010}, {"loss": 0.6167, "grad_norm": 0.812612771987915, "learning_rate": 0.0002, "epoch": 3.6639138240574507, "step": 51020}, {"loss": 0.6414, "grad_norm": 0.861818253993988, "learning_rate": 0.0002, "epoch": 3.6646319569120287, "step": 51030}, {"loss": 0.6418, "grad_norm": 0.849726676940918, "learning_rate": 0.0002, "epoch": 3.6653500897666067, "step": 51040}, {"loss": 0.6613, "grad_norm": 0.9738494753837585, "learning_rate": 0.0002, "epoch": 3.6660682226211847, "step": 51050}, {"loss": 0.6094, "grad_norm": 0.928989827632904, "learning_rate": 0.0002, "epoch": 3.666786355475763, "step": 51060}, {"loss": 0.623, "grad_norm": 0.9725563526153564, "learning_rate": 0.0002, "epoch": 3.667504488330341, "step": 51070}, {"loss": 0.5967, "grad_norm": 0.9366095066070557, "learning_rate": 0.0002, "epoch": 3.668222621184919, "step": 51080}, {"loss": 0.6175, "grad_norm": 0.8012986779212952, "learning_rate": 0.0002, "epoch": 3.668940754039497, "step": 51090}, {"loss": 0.6428, "grad_norm": 1.0646892786026, "learning_rate": 0.0002, "epoch": 3.6696588868940756, "step": 51100}, {"loss": 0.6333, "grad_norm": 0.7245157361030579, "learning_rate": 0.0002, "epoch": 3.6703770197486536, "step": 51110}, {"loss": 0.6618, "grad_norm": 0.6938936114311218, "learning_rate": 0.0002, "epoch": 3.6710951526032316, "step": 51120}, {"loss": 0.6511, "grad_norm": 0.8461366295814514, "learning_rate": 0.0002, "epoch": 3.6718132854578096, "step": 51130}, {"loss": 0.6168, "grad_norm": 0.8392583131790161, "learning_rate": 0.0002, "epoch": 3.6725314183123876, "step": 51140}, {"loss": 0.6616, "grad_norm": 0.7245259284973145, "learning_rate": 0.0002, "epoch": 3.673249551166966, "step": 51150}, {"loss": 0.6165, "grad_norm": 1.0742167234420776, "learning_rate": 0.0002, "epoch": 3.673967684021544, "step": 51160}, {"loss": 0.6805, "grad_norm": 0.9553889036178589, "learning_rate": 0.0002, "epoch": 3.674685816876122, "step": 51170}, {"loss": 0.6065, "grad_norm": 0.8713715672492981, "learning_rate": 0.0002, "epoch": 3.6754039497307, "step": 51180}, {"loss": 0.599, "grad_norm": 0.7499800324440002, "learning_rate": 0.0002, "epoch": 3.6761220825852785, "step": 51190}, {"loss": 0.7143, "grad_norm": 1.1118139028549194, "learning_rate": 0.0002, "epoch": 3.6768402154398565, "step": 51200}, {"loss": 0.6694, "grad_norm": 0.8146613836288452, "learning_rate": 0.0002, "epoch": 3.6775583482944345, "step": 51210}, {"loss": 0.6528, "grad_norm": 0.9331285357475281, "learning_rate": 0.0002, "epoch": 3.6782764811490125, "step": 51220}, {"loss": 0.6429, "grad_norm": 1.0497597455978394, "learning_rate": 0.0002, "epoch": 3.6789946140035905, "step": 51230}, {"loss": 0.6404, "grad_norm": 0.879814863204956, "learning_rate": 0.0002, "epoch": 3.6797127468581685, "step": 51240}, {"loss": 0.6617, "grad_norm": 0.9896606802940369, "learning_rate": 0.0002, "epoch": 3.680430879712747, "step": 51250}, {"loss": 0.6461, "grad_norm": 0.928236186504364, "learning_rate": 0.0002, "epoch": 3.681149012567325, "step": 51260}, {"loss": 0.6516, "grad_norm": 0.8436732292175293, "learning_rate": 0.0002, "epoch": 3.681867145421903, "step": 51270}, {"loss": 0.6428, "grad_norm": 0.93634432554245, "learning_rate": 0.0002, "epoch": 3.6825852782764814, "step": 51280}, {"loss": 0.6081, "grad_norm": 0.8477143049240112, "learning_rate": 0.0002, "epoch": 3.6833034111310594, "step": 51290}, {"loss": 0.6536, "grad_norm": 0.8720934987068176, "learning_rate": 0.0002, "epoch": 3.6840215439856374, "step": 51300}, {"loss": 0.6523, "grad_norm": 0.7322931289672852, "learning_rate": 0.0002, "epoch": 3.6847396768402154, "step": 51310}, {"loss": 0.6475, "grad_norm": 1.0064427852630615, "learning_rate": 0.0002, "epoch": 3.6854578096947934, "step": 51320}, {"loss": 0.681, "grad_norm": 1.0197817087173462, "learning_rate": 0.0002, "epoch": 3.6861759425493714, "step": 51330}, {"loss": 0.5904, "grad_norm": 0.8764060139656067, "learning_rate": 0.0002, "epoch": 3.68689407540395, "step": 51340}, {"loss": 0.625, "grad_norm": 0.9763964414596558, "learning_rate": 0.0002, "epoch": 3.687612208258528, "step": 51350}, {"loss": 0.6299, "grad_norm": 0.8389105200767517, "learning_rate": 0.0002, "epoch": 3.688330341113106, "step": 51360}, {"loss": 0.6885, "grad_norm": 0.9215750694274902, "learning_rate": 0.0002, "epoch": 3.689048473967684, "step": 51370}, {"loss": 0.6325, "grad_norm": 0.8444913625717163, "learning_rate": 0.0002, "epoch": 3.6897666068222623, "step": 51380}, {"loss": 0.657, "grad_norm": 0.9635153412818909, "learning_rate": 0.0002, "epoch": 3.6904847396768403, "step": 51390}, {"loss": 0.7045, "grad_norm": 1.0397378206253052, "learning_rate": 0.0002, "epoch": 3.6912028725314183, "step": 51400}, {"loss": 0.6635, "grad_norm": 0.9154748320579529, "learning_rate": 0.0002, "epoch": 3.6919210053859963, "step": 51410}, {"loss": 0.6757, "grad_norm": 0.906445324420929, "learning_rate": 0.0002, "epoch": 3.6926391382405743, "step": 51420}, {"loss": 0.6533, "grad_norm": 0.9237992763519287, "learning_rate": 0.0002, "epoch": 3.6933572710951523, "step": 51430}, {"loss": 0.6257, "grad_norm": 0.8796338438987732, "learning_rate": 0.0002, "epoch": 3.6940754039497308, "step": 51440}, {"loss": 0.7063, "grad_norm": 0.8613203763961792, "learning_rate": 0.0002, "epoch": 3.6947935368043088, "step": 51450}, {"loss": 0.6455, "grad_norm": 0.7957607507705688, "learning_rate": 0.0002, "epoch": 3.6955116696588868, "step": 51460}, {"loss": 0.6328, "grad_norm": 0.9183711409568787, "learning_rate": 0.0002, "epoch": 3.6962298025134652, "step": 51470}, {"loss": 0.6289, "grad_norm": 1.0108308792114258, "learning_rate": 0.0002, "epoch": 3.6969479353680432, "step": 51480}, {"loss": 0.668, "grad_norm": 0.7768247127532959, "learning_rate": 0.0002, "epoch": 3.6976660682226212, "step": 51490}, {"loss": 0.6483, "grad_norm": 1.0051485300064087, "learning_rate": 0.0002, "epoch": 3.6983842010771992, "step": 51500}, {"loss": 0.6268, "grad_norm": 0.82451993227005, "learning_rate": 0.0002, "epoch": 3.6991023339317772, "step": 51510}, {"loss": 0.6258, "grad_norm": 0.9542286992073059, "learning_rate": 0.0002, "epoch": 3.6998204667863552, "step": 51520}, {"loss": 0.6415, "grad_norm": 0.693890392780304, "learning_rate": 0.0002, "epoch": 3.7005385996409337, "step": 51530}, {"loss": 0.6445, "grad_norm": 0.9068924784660339, "learning_rate": 0.0002, "epoch": 3.7012567324955117, "step": 51540}, {"loss": 0.6386, "grad_norm": 0.8694922924041748, "learning_rate": 0.0002, "epoch": 3.7019748653500897, "step": 51550}, {"loss": 0.6563, "grad_norm": 0.941081702709198, "learning_rate": 0.0002, "epoch": 3.702692998204668, "step": 51560}, {"loss": 0.6068, "grad_norm": 0.7385984659194946, "learning_rate": 0.0002, "epoch": 3.703411131059246, "step": 51570}, {"loss": 0.6243, "grad_norm": 1.0399216413497925, "learning_rate": 0.0002, "epoch": 3.704129263913824, "step": 51580}, {"loss": 0.6776, "grad_norm": 0.9802294969558716, "learning_rate": 0.0002, "epoch": 3.704847396768402, "step": 51590}, {"loss": 0.6243, "grad_norm": 1.0409669876098633, "learning_rate": 0.0002, "epoch": 3.70556552962298, "step": 51600}, {"loss": 0.6812, "grad_norm": 0.8972786068916321, "learning_rate": 0.0002, "epoch": 3.706283662477558, "step": 51610}, {"loss": 0.5993, "grad_norm": 1.1916245222091675, "learning_rate": 0.0002, "epoch": 3.7070017953321366, "step": 51620}, {"loss": 0.6566, "grad_norm": 0.9545385241508484, "learning_rate": 0.0002, "epoch": 3.7077199281867146, "step": 51630}, {"loss": 0.6497, "grad_norm": 1.0773427486419678, "learning_rate": 0.0002, "epoch": 3.7084380610412926, "step": 51640}, {"loss": 0.6768, "grad_norm": 1.0856024026870728, "learning_rate": 0.0002, "epoch": 3.7091561938958706, "step": 51650}, {"loss": 0.6404, "grad_norm": 0.7678500413894653, "learning_rate": 0.0002, "epoch": 3.709874326750449, "step": 51660}, {"loss": 0.6571, "grad_norm": 0.7276270985603333, "learning_rate": 0.0002, "epoch": 3.710592459605027, "step": 51670}, {"loss": 0.6498, "grad_norm": 0.8859017491340637, "learning_rate": 0.0002, "epoch": 3.711310592459605, "step": 51680}, {"loss": 0.6602, "grad_norm": 0.9037614464759827, "learning_rate": 0.0002, "epoch": 3.712028725314183, "step": 51690}, {"loss": 0.685, "grad_norm": 0.9223412275314331, "learning_rate": 0.0002, "epoch": 3.712746858168761, "step": 51700}, {"loss": 0.647, "grad_norm": 0.8812923431396484, "learning_rate": 0.0002, "epoch": 3.713464991023339, "step": 51710}, {"loss": 0.6546, "grad_norm": 0.8242456912994385, "learning_rate": 0.0002, "epoch": 3.7141831238779175, "step": 51720}, {"loss": 0.6462, "grad_norm": 0.8368834257125854, "learning_rate": 0.0002, "epoch": 3.7149012567324955, "step": 51730}, {"loss": 0.6432, "grad_norm": 0.8624704480171204, "learning_rate": 0.0002, "epoch": 3.7156193895870735, "step": 51740}, {"loss": 0.6367, "grad_norm": 0.9138273596763611, "learning_rate": 0.0002, "epoch": 3.716337522441652, "step": 51750}, {"loss": 0.6717, "grad_norm": 0.8088571429252625, "learning_rate": 0.0002, "epoch": 3.71705565529623, "step": 51760}, {"loss": 0.658, "grad_norm": 0.882808268070221, "learning_rate": 0.0002, "epoch": 3.717773788150808, "step": 51770}, {"loss": 0.6686, "grad_norm": 0.9368035197257996, "learning_rate": 0.0002, "epoch": 3.718491921005386, "step": 51780}, {"loss": 0.6482, "grad_norm": 0.8341794013977051, "learning_rate": 0.0002, "epoch": 3.719210053859964, "step": 51790}, {"loss": 0.6486, "grad_norm": 0.8692073225975037, "learning_rate": 0.0002, "epoch": 3.719928186714542, "step": 51800}, {"loss": 0.6591, "grad_norm": 0.7566918730735779, "learning_rate": 0.0002, "epoch": 3.7206463195691204, "step": 51810}, {"loss": 0.707, "grad_norm": 1.113138198852539, "learning_rate": 0.0002, "epoch": 3.7213644524236984, "step": 51820}, {"loss": 0.6683, "grad_norm": 0.8793158531188965, "learning_rate": 0.0002, "epoch": 3.7220825852782764, "step": 51830}, {"loss": 0.6343, "grad_norm": 0.8856439590454102, "learning_rate": 0.0002, "epoch": 3.722800718132855, "step": 51840}, {"loss": 0.6238, "grad_norm": 1.0182029008865356, "learning_rate": 0.0002, "epoch": 3.723518850987433, "step": 51850}, {"loss": 0.6743, "grad_norm": 1.1177181005477905, "learning_rate": 0.0002, "epoch": 3.724236983842011, "step": 51860}, {"loss": 0.6477, "grad_norm": 0.6600990295410156, "learning_rate": 0.0002, "epoch": 3.724955116696589, "step": 51870}, {"loss": 0.6532, "grad_norm": 1.0563536882400513, "learning_rate": 0.0002, "epoch": 3.725673249551167, "step": 51880}, {"loss": 0.6648, "grad_norm": 1.1067734956741333, "learning_rate": 0.0002, "epoch": 3.726391382405745, "step": 51890}, {"loss": 0.6547, "grad_norm": 1.0204616785049438, "learning_rate": 0.0002, "epoch": 3.7271095152603233, "step": 51900}, {"loss": 0.685, "grad_norm": 0.8647155165672302, "learning_rate": 0.0002, "epoch": 3.7278276481149013, "step": 51910}, {"loss": 0.739, "grad_norm": 1.0754971504211426, "learning_rate": 0.0002, "epoch": 3.7285457809694793, "step": 51920}, {"loss": 0.6535, "grad_norm": 1.0448992252349854, "learning_rate": 0.0002, "epoch": 3.7292639138240573, "step": 51930}, {"loss": 0.6802, "grad_norm": 0.963434100151062, "learning_rate": 0.0002, "epoch": 3.7299820466786358, "step": 51940}, {"loss": 0.6367, "grad_norm": 0.8112701773643494, "learning_rate": 0.0002, "epoch": 3.7307001795332138, "step": 51950}, {"loss": 0.6785, "grad_norm": 0.7975119948387146, "learning_rate": 0.0002, "epoch": 3.7314183123877918, "step": 51960}, {"loss": 0.6748, "grad_norm": 0.7953376173973083, "learning_rate": 0.0002, "epoch": 3.7321364452423698, "step": 51970}, {"loss": 0.6464, "grad_norm": 0.9519981741905212, "learning_rate": 0.0002, "epoch": 3.7328545780969478, "step": 51980}, {"loss": 0.6247, "grad_norm": 0.8705791234970093, "learning_rate": 0.0002, "epoch": 3.7335727109515258, "step": 51990}, {"loss": 0.6876, "grad_norm": 0.870205283164978, "learning_rate": 0.0002, "epoch": 3.734290843806104, "step": 52000}, {"loss": 0.6681, "grad_norm": 0.9558930993080139, "learning_rate": 0.0002, "epoch": 3.735008976660682, "step": 52010}, {"loss": 0.6772, "grad_norm": 0.9330434799194336, "learning_rate": 0.0002, "epoch": 3.73572710951526, "step": 52020}, {"loss": 0.6365, "grad_norm": 0.783620297908783, "learning_rate": 0.0002, "epoch": 3.7364452423698387, "step": 52030}, {"loss": 0.6275, "grad_norm": 0.7575166821479797, "learning_rate": 0.0002, "epoch": 3.7371633752244167, "step": 52040}, {"loss": 0.6859, "grad_norm": 1.0592705011367798, "learning_rate": 0.0002, "epoch": 3.7378815080789947, "step": 52050}, {"loss": 0.6704, "grad_norm": 0.9309433102607727, "learning_rate": 0.0002, "epoch": 3.7385996409335727, "step": 52060}, {"loss": 0.6607, "grad_norm": 0.972861647605896, "learning_rate": 0.0002, "epoch": 3.7393177737881507, "step": 52070}, {"loss": 0.6267, "grad_norm": 0.9318740963935852, "learning_rate": 0.0002, "epoch": 3.7400359066427287, "step": 52080}, {"loss": 0.6404, "grad_norm": 0.7938477396965027, "learning_rate": 0.0002, "epoch": 3.740754039497307, "step": 52090}, {"loss": 0.6451, "grad_norm": 1.1515966653823853, "learning_rate": 0.0002, "epoch": 3.741472172351885, "step": 52100}, {"loss": 0.6179, "grad_norm": 1.076869010925293, "learning_rate": 0.0002, "epoch": 3.742190305206463, "step": 52110}, {"loss": 0.6477, "grad_norm": 0.8516066670417786, "learning_rate": 0.0002, "epoch": 3.7429084380610416, "step": 52120}, {"loss": 0.6741, "grad_norm": 0.6853429079055786, "learning_rate": 0.0002, "epoch": 3.7436265709156196, "step": 52130}, {"loss": 0.6392, "grad_norm": 0.8179695010185242, "learning_rate": 0.0002, "epoch": 3.7443447037701976, "step": 52140}, {"loss": 0.6692, "grad_norm": 0.8395232558250427, "learning_rate": 0.0002, "epoch": 3.7450628366247756, "step": 52150}, {"loss": 0.6902, "grad_norm": 1.0178003311157227, "learning_rate": 0.0002, "epoch": 3.7457809694793536, "step": 52160}, {"loss": 0.6726, "grad_norm": 1.1801023483276367, "learning_rate": 0.0002, "epoch": 3.7464991023339316, "step": 52170}, {"loss": 0.6334, "grad_norm": 0.8215751647949219, "learning_rate": 0.0002, "epoch": 3.74721723518851, "step": 52180}, {"loss": 0.5992, "grad_norm": 1.17083740234375, "learning_rate": 0.0002, "epoch": 3.747935368043088, "step": 52190}, {"loss": 0.6219, "grad_norm": 0.9230290651321411, "learning_rate": 0.0002, "epoch": 3.748653500897666, "step": 52200}, {"loss": 0.6503, "grad_norm": 0.8431521058082581, "learning_rate": 0.0002, "epoch": 3.749371633752244, "step": 52210}, {"loss": 0.6983, "grad_norm": 0.9690840244293213, "learning_rate": 0.0002, "epoch": 3.7500897666068225, "step": 52220}, {"loss": 0.6204, "grad_norm": 1.0022395849227905, "learning_rate": 0.0002, "epoch": 3.7508078994614005, "step": 52230}, {"loss": 0.6683, "grad_norm": 1.0489065647125244, "learning_rate": 0.0002, "epoch": 3.7515260323159785, "step": 52240}, {"loss": 0.6439, "grad_norm": 0.7880696058273315, "learning_rate": 0.0002, "epoch": 3.7522441651705565, "step": 52250}, {"loss": 0.6933, "grad_norm": 1.0255829095840454, "learning_rate": 0.0002, "epoch": 3.7529622980251345, "step": 52260}, {"loss": 0.6631, "grad_norm": 0.8470141291618347, "learning_rate": 0.0002, "epoch": 3.7536804308797125, "step": 52270}, {"loss": 0.5956, "grad_norm": 0.9040523171424866, "learning_rate": 0.0002, "epoch": 3.754398563734291, "step": 52280}, {"loss": 0.6759, "grad_norm": 0.9564392566680908, "learning_rate": 0.0002, "epoch": 3.755116696588869, "step": 52290}, {"loss": 0.6717, "grad_norm": 0.907857358455658, "learning_rate": 0.0002, "epoch": 3.755834829443447, "step": 52300}, {"loss": 0.6821, "grad_norm": 0.8929873704910278, "learning_rate": 0.0002, "epoch": 3.7565529622980254, "step": 52310}, {"loss": 0.655, "grad_norm": 0.854434072971344, "learning_rate": 0.0002, "epoch": 3.7572710951526034, "step": 52320}, {"loss": 0.6668, "grad_norm": 0.8744779229164124, "learning_rate": 0.0002, "epoch": 3.7579892280071814, "step": 52330}, {"loss": 0.6628, "grad_norm": 0.9022667407989502, "learning_rate": 0.0002, "epoch": 3.7587073608617594, "step": 52340}, {"loss": 0.6275, "grad_norm": 0.8884857892990112, "learning_rate": 0.0002, "epoch": 3.7594254937163374, "step": 52350}, {"loss": 0.6585, "grad_norm": 1.0228430032730103, "learning_rate": 0.0002, "epoch": 3.7601436265709154, "step": 52360}, {"loss": 0.6092, "grad_norm": 0.8593528270721436, "learning_rate": 0.0002, "epoch": 3.760861759425494, "step": 52370}, {"loss": 0.664, "grad_norm": 0.9435563087463379, "learning_rate": 0.0002, "epoch": 3.761579892280072, "step": 52380}, {"loss": 0.6326, "grad_norm": 0.7545679807662964, "learning_rate": 0.0002, "epoch": 3.76229802513465, "step": 52390}, {"loss": 0.6628, "grad_norm": 0.9411585927009583, "learning_rate": 0.0002, "epoch": 3.7630161579892283, "step": 52400}, {"loss": 0.62, "grad_norm": 0.9764377474784851, "learning_rate": 0.0002, "epoch": 3.7637342908438063, "step": 52410}, {"loss": 0.671, "grad_norm": 1.0718384981155396, "learning_rate": 0.0002, "epoch": 3.7644524236983843, "step": 52420}, {"loss": 0.6654, "grad_norm": 0.8765230774879456, "learning_rate": 0.0002, "epoch": 3.7651705565529623, "step": 52430}, {"loss": 0.6602, "grad_norm": 0.9275036454200745, "learning_rate": 0.0002, "epoch": 3.7658886894075403, "step": 52440}, {"loss": 0.6098, "grad_norm": 0.967410147190094, "learning_rate": 0.0002, "epoch": 3.7666068222621183, "step": 52450}, {"loss": 0.6195, "grad_norm": 0.7738949060440063, "learning_rate": 0.0002, "epoch": 3.7673249551166967, "step": 52460}, {"loss": 0.6054, "grad_norm": 1.0828070640563965, "learning_rate": 0.0002, "epoch": 3.7680430879712747, "step": 52470}, {"loss": 0.6208, "grad_norm": 0.9570213556289673, "learning_rate": 0.0002, "epoch": 3.7687612208258527, "step": 52480}, {"loss": 0.6703, "grad_norm": 1.0688215494155884, "learning_rate": 0.0002, "epoch": 3.7694793536804307, "step": 52490}, {"loss": 0.5993, "grad_norm": 0.7970073223114014, "learning_rate": 0.0002, "epoch": 3.770197486535009, "step": 52500}, {"loss": 0.6537, "grad_norm": 0.7132976651191711, "learning_rate": 0.0002, "epoch": 3.770915619389587, "step": 52510}, {"loss": 0.6571, "grad_norm": 1.152268648147583, "learning_rate": 0.0002, "epoch": 3.771633752244165, "step": 52520}, {"loss": 0.6548, "grad_norm": 0.8645235896110535, "learning_rate": 0.0002, "epoch": 3.772351885098743, "step": 52530}, {"loss": 0.6918, "grad_norm": 0.7725570201873779, "learning_rate": 0.0002, "epoch": 3.773070017953321, "step": 52540}, {"loss": 0.6796, "grad_norm": 0.9718102812767029, "learning_rate": 0.0002, "epoch": 3.773788150807899, "step": 52550}, {"loss": 0.6298, "grad_norm": 0.7568017840385437, "learning_rate": 0.0002, "epoch": 3.7745062836624776, "step": 52560}, {"loss": 0.6652, "grad_norm": 0.9578912854194641, "learning_rate": 0.0002, "epoch": 3.7752244165170556, "step": 52570}, {"loss": 0.6417, "grad_norm": 0.8657314777374268, "learning_rate": 0.0002, "epoch": 3.7759425493716336, "step": 52580}, {"loss": 0.6552, "grad_norm": 0.7564393281936646, "learning_rate": 0.0002, "epoch": 3.776660682226212, "step": 52590}, {"loss": 0.69, "grad_norm": 0.7631160616874695, "learning_rate": 0.0002, "epoch": 3.77737881508079, "step": 52600}, {"loss": 0.6427, "grad_norm": 1.1852056980133057, "learning_rate": 0.0002, "epoch": 3.778096947935368, "step": 52610}, {"loss": 0.6369, "grad_norm": 1.0620790719985962, "learning_rate": 0.0002, "epoch": 3.778815080789946, "step": 52620}, {"loss": 0.6782, "grad_norm": 0.8677777647972107, "learning_rate": 0.0002, "epoch": 3.779533213644524, "step": 52630}, {"loss": 0.6249, "grad_norm": 0.9913218021392822, "learning_rate": 0.0002, "epoch": 3.780251346499102, "step": 52640}, {"loss": 0.625, "grad_norm": 0.9868429899215698, "learning_rate": 0.0002, "epoch": 3.7809694793536806, "step": 52650}, {"loss": 0.6252, "grad_norm": 0.8791782259941101, "learning_rate": 0.0002, "epoch": 3.7816876122082586, "step": 52660}, {"loss": 0.6675, "grad_norm": 0.9503955245018005, "learning_rate": 0.0002, "epoch": 3.7824057450628366, "step": 52670}, {"loss": 0.6406, "grad_norm": 0.8647131323814392, "learning_rate": 0.0002, "epoch": 3.7831238779174146, "step": 52680}, {"loss": 0.6654, "grad_norm": 0.9819629788398743, "learning_rate": 0.0002, "epoch": 3.783842010771993, "step": 52690}, {"loss": 0.593, "grad_norm": 0.8548610210418701, "learning_rate": 0.0002, "epoch": 3.784560143626571, "step": 52700}, {"loss": 0.6614, "grad_norm": 0.8706230521202087, "learning_rate": 0.0002, "epoch": 3.785278276481149, "step": 52710}, {"loss": 0.6326, "grad_norm": 1.0032461881637573, "learning_rate": 0.0002, "epoch": 3.785996409335727, "step": 52720}, {"loss": 0.6172, "grad_norm": 1.0578246116638184, "learning_rate": 0.0002, "epoch": 3.786714542190305, "step": 52730}, {"loss": 0.6392, "grad_norm": 0.9854007363319397, "learning_rate": 0.0002, "epoch": 3.7874326750448835, "step": 52740}, {"loss": 0.6462, "grad_norm": 0.8389187455177307, "learning_rate": 0.0002, "epoch": 3.7881508078994615, "step": 52750}, {"loss": 0.6515, "grad_norm": 0.9192399978637695, "learning_rate": 0.0002, "epoch": 3.7888689407540395, "step": 52760}, {"loss": 0.6436, "grad_norm": 0.9518283605575562, "learning_rate": 0.0002, "epoch": 3.7895870736086175, "step": 52770}, {"loss": 0.6548, "grad_norm": 1.1296825408935547, "learning_rate": 0.0002, "epoch": 3.790305206463196, "step": 52780}, {"loss": 0.6073, "grad_norm": 1.0589144229888916, "learning_rate": 0.0002, "epoch": 3.791023339317774, "step": 52790}, {"loss": 0.6593, "grad_norm": 0.8954343199729919, "learning_rate": 0.0002, "epoch": 3.791741472172352, "step": 52800}, {"loss": 0.6678, "grad_norm": 0.8283370733261108, "learning_rate": 0.0002, "epoch": 3.79245960502693, "step": 52810}, {"loss": 0.6865, "grad_norm": 0.910642683506012, "learning_rate": 0.0002, "epoch": 3.793177737881508, "step": 52820}, {"loss": 0.6672, "grad_norm": 0.9255108833312988, "learning_rate": 0.0002, "epoch": 3.793895870736086, "step": 52830}, {"loss": 0.6836, "grad_norm": 0.8773723244667053, "learning_rate": 0.0002, "epoch": 3.7946140035906644, "step": 52840}, {"loss": 0.6815, "grad_norm": 0.8454240560531616, "learning_rate": 0.0002, "epoch": 3.7953321364452424, "step": 52850}, {"loss": 0.6594, "grad_norm": 0.7636052966117859, "learning_rate": 0.0002, "epoch": 3.7960502692998204, "step": 52860}, {"loss": 0.6663, "grad_norm": 0.9358382821083069, "learning_rate": 0.0002, "epoch": 3.796768402154399, "step": 52870}, {"loss": 0.6761, "grad_norm": 0.9662801623344421, "learning_rate": 0.0002, "epoch": 3.797486535008977, "step": 52880}, {"loss": 0.6749, "grad_norm": 0.995907187461853, "learning_rate": 0.0002, "epoch": 3.798204667863555, "step": 52890}, {"loss": 0.6715, "grad_norm": 0.8700127005577087, "learning_rate": 0.0002, "epoch": 3.798922800718133, "step": 52900}, {"loss": 0.6554, "grad_norm": 0.8987792134284973, "learning_rate": 0.0002, "epoch": 3.799640933572711, "step": 52910}, {"loss": 0.6655, "grad_norm": 0.9753904938697815, "learning_rate": 0.0002, "epoch": 3.800359066427289, "step": 52920}, {"loss": 0.6536, "grad_norm": 0.7873555421829224, "learning_rate": 0.0002, "epoch": 3.8010771992818673, "step": 52930}, {"loss": 0.6233, "grad_norm": 0.8177929520606995, "learning_rate": 0.0002, "epoch": 3.8017953321364453, "step": 52940}, {"loss": 0.6508, "grad_norm": 0.8865532279014587, "learning_rate": 0.0002, "epoch": 3.8025134649910233, "step": 52950}, {"loss": 0.6922, "grad_norm": 0.9113775491714478, "learning_rate": 0.0002, "epoch": 3.8032315978456013, "step": 52960}, {"loss": 0.6382, "grad_norm": 0.9424585700035095, "learning_rate": 0.0002, "epoch": 3.8039497307001797, "step": 52970}, {"loss": 0.6694, "grad_norm": 0.8347237706184387, "learning_rate": 0.0002, "epoch": 3.8046678635547577, "step": 52980}, {"loss": 0.643, "grad_norm": 0.826863169670105, "learning_rate": 0.0002, "epoch": 3.8053859964093357, "step": 52990}, {"loss": 0.639, "grad_norm": 0.7313310503959656, "learning_rate": 0.0002, "epoch": 3.8061041292639137, "step": 53000}, {"loss": 0.6831, "grad_norm": 0.8352667093276978, "learning_rate": 0.0002, "epoch": 3.8068222621184917, "step": 53010}, {"loss": 0.6265, "grad_norm": 0.748461127281189, "learning_rate": 0.0002, "epoch": 3.80754039497307, "step": 53020}, {"loss": 0.6433, "grad_norm": 0.943256139755249, "learning_rate": 0.0002, "epoch": 3.808258527827648, "step": 53030}, {"loss": 0.6702, "grad_norm": 1.0448410511016846, "learning_rate": 0.0002, "epoch": 3.808976660682226, "step": 53040}, {"loss": 0.6901, "grad_norm": 0.9047636985778809, "learning_rate": 0.0002, "epoch": 3.809694793536804, "step": 53050}, {"loss": 0.6774, "grad_norm": 0.8594381213188171, "learning_rate": 0.0002, "epoch": 3.8104129263913826, "step": 53060}, {"loss": 0.6664, "grad_norm": 0.7593536972999573, "learning_rate": 0.0002, "epoch": 3.8111310592459606, "step": 53070}, {"loss": 0.6651, "grad_norm": 0.7189019918441772, "learning_rate": 0.0002, "epoch": 3.8118491921005386, "step": 53080}, {"loss": 0.6657, "grad_norm": 0.8569809198379517, "learning_rate": 0.0002, "epoch": 3.8125673249551166, "step": 53090}, {"loss": 0.6689, "grad_norm": 0.923378050327301, "learning_rate": 0.0002, "epoch": 3.8132854578096946, "step": 53100}, {"loss": 0.6168, "grad_norm": 0.9088824391365051, "learning_rate": 0.0002, "epoch": 3.8140035906642726, "step": 53110}, {"loss": 0.6514, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 3.814721723518851, "step": 53120}, {"loss": 0.6182, "grad_norm": 0.8389552235603333, "learning_rate": 0.0002, "epoch": 3.815439856373429, "step": 53130}, {"loss": 0.6779, "grad_norm": 0.7940975427627563, "learning_rate": 0.0002, "epoch": 3.816157989228007, "step": 53140}, {"loss": 0.6825, "grad_norm": 0.8389907479286194, "learning_rate": 0.0002, "epoch": 3.8168761220825855, "step": 53150}, {"loss": 0.6763, "grad_norm": 0.774206280708313, "learning_rate": 0.0002, "epoch": 3.8175942549371635, "step": 53160}, {"loss": 0.7011, "grad_norm": 1.189447283744812, "learning_rate": 0.0002, "epoch": 3.8183123877917415, "step": 53170}, {"loss": 0.6206, "grad_norm": 0.9875882863998413, "learning_rate": 0.0002, "epoch": 3.8190305206463195, "step": 53180}, {"loss": 0.6254, "grad_norm": 0.9205945134162903, "learning_rate": 0.0002, "epoch": 3.8197486535008975, "step": 53190}, {"loss": 0.5845, "grad_norm": 0.8312796354293823, "learning_rate": 0.0002, "epoch": 3.8204667863554755, "step": 53200}, {"loss": 0.6415, "grad_norm": 0.9755756855010986, "learning_rate": 0.0002, "epoch": 3.821184919210054, "step": 53210}, {"loss": 0.6657, "grad_norm": 1.0722965002059937, "learning_rate": 0.0002, "epoch": 3.821903052064632, "step": 53220}, {"loss": 0.6547, "grad_norm": 0.7720510959625244, "learning_rate": 0.0002, "epoch": 3.82262118491921, "step": 53230}, {"loss": 0.6383, "grad_norm": 1.020147681236267, "learning_rate": 0.0002, "epoch": 3.823339317773788, "step": 53240}, {"loss": 0.6491, "grad_norm": 0.8241816759109497, "learning_rate": 0.0002, "epoch": 3.8240574506283664, "step": 53250}, {"loss": 0.6914, "grad_norm": 0.8939895629882812, "learning_rate": 0.0002, "epoch": 3.8247755834829444, "step": 53260}, {"loss": 0.6725, "grad_norm": 1.010852336883545, "learning_rate": 0.0002, "epoch": 3.8254937163375224, "step": 53270}, {"loss": 0.6841, "grad_norm": 0.8201420307159424, "learning_rate": 0.0002, "epoch": 3.8262118491921004, "step": 53280}, {"loss": 0.6739, "grad_norm": 0.8797973990440369, "learning_rate": 0.0002, "epoch": 3.8269299820466784, "step": 53290}, {"loss": 0.658, "grad_norm": 0.9034950137138367, "learning_rate": 0.0002, "epoch": 3.827648114901257, "step": 53300}, {"loss": 0.6314, "grad_norm": 0.926802933216095, "learning_rate": 0.0002, "epoch": 3.828366247755835, "step": 53310}, {"loss": 0.6526, "grad_norm": 1.0205509662628174, "learning_rate": 0.0002, "epoch": 3.829084380610413, "step": 53320}, {"loss": 0.6596, "grad_norm": 0.9524099230766296, "learning_rate": 0.0002, "epoch": 3.829802513464991, "step": 53330}, {"loss": 0.6796, "grad_norm": 0.9692625999450684, "learning_rate": 0.0002, "epoch": 3.8305206463195693, "step": 53340}, {"loss": 0.628, "grad_norm": 0.7255275845527649, "learning_rate": 0.0002, "epoch": 3.8312387791741473, "step": 53350}, {"loss": 0.6104, "grad_norm": 0.7199059724807739, "learning_rate": 0.0002, "epoch": 3.8319569120287253, "step": 53360}, {"loss": 0.6703, "grad_norm": 1.004464864730835, "learning_rate": 0.0002, "epoch": 3.8326750448833034, "step": 53370}, {"loss": 0.7032, "grad_norm": 0.9092583060264587, "learning_rate": 0.0002, "epoch": 3.8333931777378814, "step": 53380}, {"loss": 0.6811, "grad_norm": 0.945091724395752, "learning_rate": 0.0002, "epoch": 3.8341113105924594, "step": 53390}, {"loss": 0.611, "grad_norm": 0.7980135679244995, "learning_rate": 0.0002, "epoch": 3.834829443447038, "step": 53400}, {"loss": 0.6604, "grad_norm": 0.7812868356704712, "learning_rate": 0.0002, "epoch": 3.835547576301616, "step": 53410}, {"loss": 0.6104, "grad_norm": 0.8957077860832214, "learning_rate": 0.0002, "epoch": 3.836265709156194, "step": 53420}, {"loss": 0.6754, "grad_norm": 0.9119600653648376, "learning_rate": 0.0002, "epoch": 3.8369838420107722, "step": 53430}, {"loss": 0.7346, "grad_norm": 0.8208187222480774, "learning_rate": 0.0002, "epoch": 3.8377019748653503, "step": 53440}, {"loss": 0.6549, "grad_norm": 0.7930439114570618, "learning_rate": 0.0002, "epoch": 3.8384201077199283, "step": 53450}, {"loss": 0.6192, "grad_norm": 0.8937777280807495, "learning_rate": 0.0002, "epoch": 3.8391382405745063, "step": 53460}, {"loss": 0.5954, "grad_norm": 0.7583796977996826, "learning_rate": 0.0002, "epoch": 3.8398563734290843, "step": 53470}, {"loss": 0.6217, "grad_norm": 1.0735969543457031, "learning_rate": 0.0002, "epoch": 3.8405745062836623, "step": 53480}, {"loss": 0.6472, "grad_norm": 1.1106033325195312, "learning_rate": 0.0002, "epoch": 3.8412926391382407, "step": 53490}, {"loss": 0.6813, "grad_norm": 1.092631220817566, "learning_rate": 0.0002, "epoch": 3.8420107719928187, "step": 53500}, {"loss": 0.6437, "grad_norm": 0.9961787462234497, "learning_rate": 0.0002, "epoch": 3.8427289048473967, "step": 53510}, {"loss": 0.6382, "grad_norm": 0.833831250667572, "learning_rate": 0.0002, "epoch": 3.8434470377019747, "step": 53520}, {"loss": 0.6403, "grad_norm": 1.0000009536743164, "learning_rate": 0.0002, "epoch": 3.844165170556553, "step": 53530}, {"loss": 0.6824, "grad_norm": 0.9784213304519653, "learning_rate": 0.0002, "epoch": 3.844883303411131, "step": 53540}, {"loss": 0.6816, "grad_norm": 0.8582558035850525, "learning_rate": 0.0002, "epoch": 3.845601436265709, "step": 53550}, {"loss": 0.5944, "grad_norm": 0.8267415761947632, "learning_rate": 0.0002, "epoch": 3.846319569120287, "step": 53560}, {"loss": 0.6562, "grad_norm": 0.8783000111579895, "learning_rate": 0.0002, "epoch": 3.847037701974865, "step": 53570}, {"loss": 0.6795, "grad_norm": 0.9866999983787537, "learning_rate": 0.0002, "epoch": 3.8477558348294436, "step": 53580}, {"loss": 0.7222, "grad_norm": 0.8459296226501465, "learning_rate": 0.0002, "epoch": 3.8484739676840216, "step": 53590}, {"loss": 0.6748, "grad_norm": 0.9804834723472595, "learning_rate": 0.0002, "epoch": 3.8491921005385996, "step": 53600}, {"loss": 0.6115, "grad_norm": 0.951074481010437, "learning_rate": 0.0002, "epoch": 3.8499102333931776, "step": 53610}, {"loss": 0.5914, "grad_norm": 0.8020104169845581, "learning_rate": 0.0002, "epoch": 3.850628366247756, "step": 53620}, {"loss": 0.6237, "grad_norm": 0.9296963214874268, "learning_rate": 0.0002, "epoch": 3.851346499102334, "step": 53630}, {"loss": 0.6384, "grad_norm": 0.8983652591705322, "learning_rate": 0.0002, "epoch": 3.852064631956912, "step": 53640}, {"loss": 0.6855, "grad_norm": 1.031858205795288, "learning_rate": 0.0002, "epoch": 3.85278276481149, "step": 53650}, {"loss": 0.622, "grad_norm": 0.8943952918052673, "learning_rate": 0.0002, "epoch": 3.853500897666068, "step": 53660}, {"loss": 0.6745, "grad_norm": 1.0072312355041504, "learning_rate": 0.0002, "epoch": 3.854219030520646, "step": 53670}, {"loss": 0.677, "grad_norm": 1.0604884624481201, "learning_rate": 0.0002, "epoch": 3.8549371633752245, "step": 53680}, {"loss": 0.5873, "grad_norm": 0.834223210811615, "learning_rate": 0.0002, "epoch": 3.8556552962298025, "step": 53690}, {"loss": 0.665, "grad_norm": 0.9872867465019226, "learning_rate": 0.0002, "epoch": 3.8563734290843805, "step": 53700}, {"loss": 0.6689, "grad_norm": 0.7999459505081177, "learning_rate": 0.0002, "epoch": 3.857091561938959, "step": 53710}, {"loss": 0.6744, "grad_norm": 0.717722475528717, "learning_rate": 0.0002, "epoch": 3.857809694793537, "step": 53720}, {"loss": 0.6348, "grad_norm": 1.0675442218780518, "learning_rate": 0.0002, "epoch": 3.858527827648115, "step": 53730}, {"loss": 0.6141, "grad_norm": 0.9789777398109436, "learning_rate": 0.0002, "epoch": 3.859245960502693, "step": 53740}, {"loss": 0.6455, "grad_norm": 0.9318669438362122, "learning_rate": 0.0002, "epoch": 3.859964093357271, "step": 53750}, {"loss": 0.6587, "grad_norm": 0.9848631024360657, "learning_rate": 0.0002, "epoch": 3.860682226211849, "step": 53760}, {"loss": 0.6202, "grad_norm": 0.8754391670227051, "learning_rate": 0.0002, "epoch": 3.8614003590664274, "step": 53770}, {"loss": 0.6411, "grad_norm": 0.9024585485458374, "learning_rate": 0.0002, "epoch": 3.8621184919210054, "step": 53780}, {"loss": 0.6643, "grad_norm": 0.8974794745445251, "learning_rate": 0.0002, "epoch": 3.8628366247755834, "step": 53790}, {"loss": 0.6729, "grad_norm": 0.8342790603637695, "learning_rate": 0.0002, "epoch": 3.8635547576301614, "step": 53800}, {"loss": 0.6322, "grad_norm": 0.8177682757377625, "learning_rate": 0.0002, "epoch": 3.86427289048474, "step": 53810}, {"loss": 0.6525, "grad_norm": 1.0259089469909668, "learning_rate": 0.0002, "epoch": 3.864991023339318, "step": 53820}, {"loss": 0.6508, "grad_norm": 1.042290210723877, "learning_rate": 0.0002, "epoch": 3.865709156193896, "step": 53830}, {"loss": 0.6963, "grad_norm": 0.7316540479660034, "learning_rate": 0.0002, "epoch": 3.866427289048474, "step": 53840}, {"loss": 0.6491, "grad_norm": 0.9384970664978027, "learning_rate": 0.0002, "epoch": 3.867145421903052, "step": 53850}, {"loss": 0.6689, "grad_norm": 0.9273143410682678, "learning_rate": 0.0002, "epoch": 3.86786355475763, "step": 53860}, {"loss": 0.6443, "grad_norm": 1.1183570623397827, "learning_rate": 0.0002, "epoch": 3.8685816876122083, "step": 53870}, {"loss": 0.6712, "grad_norm": 0.9455275535583496, "learning_rate": 0.0002, "epoch": 3.8692998204667863, "step": 53880}, {"loss": 0.6662, "grad_norm": 0.8702114820480347, "learning_rate": 0.0002, "epoch": 3.8700179533213643, "step": 53890}, {"loss": 0.7032, "grad_norm": 0.8751053214073181, "learning_rate": 0.0002, "epoch": 3.870736086175943, "step": 53900}, {"loss": 0.6398, "grad_norm": 0.9793110489845276, "learning_rate": 0.0002, "epoch": 3.871454219030521, "step": 53910}, {"loss": 0.6577, "grad_norm": 0.9705014824867249, "learning_rate": 0.0002, "epoch": 3.872172351885099, "step": 53920}, {"loss": 0.751, "grad_norm": 1.051504373550415, "learning_rate": 0.0002, "epoch": 3.872890484739677, "step": 53930}, {"loss": 0.6606, "grad_norm": 0.8590622544288635, "learning_rate": 0.0002, "epoch": 3.873608617594255, "step": 53940}, {"loss": 0.6495, "grad_norm": 0.7828099727630615, "learning_rate": 0.0002, "epoch": 3.874326750448833, "step": 53950}, {"loss": 0.6294, "grad_norm": 0.86341792345047, "learning_rate": 0.0002, "epoch": 3.8750448833034112, "step": 53960}, {"loss": 0.6677, "grad_norm": 1.114670991897583, "learning_rate": 0.0002, "epoch": 3.8757630161579892, "step": 53970}, {"loss": 0.6533, "grad_norm": 0.8559519052505493, "learning_rate": 0.0002, "epoch": 3.8764811490125672, "step": 53980}, {"loss": 0.6517, "grad_norm": 1.0518953800201416, "learning_rate": 0.0002, "epoch": 3.8771992818671457, "step": 53990}, {"loss": 0.6359, "grad_norm": 0.7157500982284546, "learning_rate": 0.0002, "epoch": 3.8779174147217237, "step": 54000}, {"loss": 0.6847, "grad_norm": 0.8390372395515442, "learning_rate": 0.0002, "epoch": 3.8786355475763017, "step": 54010}, {"loss": 0.6376, "grad_norm": 0.8486756086349487, "learning_rate": 0.0002, "epoch": 3.8793536804308797, "step": 54020}, {"loss": 0.6184, "grad_norm": 0.8361587524414062, "learning_rate": 0.0002, "epoch": 3.8800718132854577, "step": 54030}, {"loss": 0.6552, "grad_norm": 0.9490554928779602, "learning_rate": 0.0002, "epoch": 3.8807899461400357, "step": 54040}, {"loss": 0.6653, "grad_norm": 1.0311323404312134, "learning_rate": 0.0002, "epoch": 3.881508078994614, "step": 54050}, {"loss": 0.6484, "grad_norm": 0.84800124168396, "learning_rate": 0.0002, "epoch": 3.882226211849192, "step": 54060}, {"loss": 0.6995, "grad_norm": 0.8940879702568054, "learning_rate": 0.0002, "epoch": 3.88294434470377, "step": 54070}, {"loss": 0.6157, "grad_norm": 0.985542356967926, "learning_rate": 0.0002, "epoch": 3.883662477558348, "step": 54080}, {"loss": 0.6221, "grad_norm": 0.8846475481987, "learning_rate": 0.0002, "epoch": 3.8843806104129266, "step": 54090}, {"loss": 0.6656, "grad_norm": 0.9186338186264038, "learning_rate": 0.0002, "epoch": 3.8850987432675046, "step": 54100}, {"loss": 0.6367, "grad_norm": 1.106598973274231, "learning_rate": 0.0002, "epoch": 3.8858168761220826, "step": 54110}, {"loss": 0.6311, "grad_norm": 0.8167300224304199, "learning_rate": 0.0002, "epoch": 3.8865350089766606, "step": 54120}, {"loss": 0.694, "grad_norm": 0.9153622984886169, "learning_rate": 0.0002, "epoch": 3.8872531418312386, "step": 54130}, {"loss": 0.6669, "grad_norm": 0.8464475274085999, "learning_rate": 0.0002, "epoch": 3.8879712746858166, "step": 54140}, {"loss": 0.6658, "grad_norm": 0.8889452815055847, "learning_rate": 0.0002, "epoch": 3.888689407540395, "step": 54150}, {"loss": 0.6291, "grad_norm": 0.7861065864562988, "learning_rate": 0.0002, "epoch": 3.889407540394973, "step": 54160}, {"loss": 0.6315, "grad_norm": 0.882674515247345, "learning_rate": 0.0002, "epoch": 3.890125673249551, "step": 54170}, {"loss": 0.6223, "grad_norm": 0.8503835201263428, "learning_rate": 0.0002, "epoch": 3.8908438061041295, "step": 54180}, {"loss": 0.6176, "grad_norm": 0.888455331325531, "learning_rate": 0.0002, "epoch": 3.8915619389587075, "step": 54190}, {"loss": 0.6985, "grad_norm": 1.0473699569702148, "learning_rate": 0.0002, "epoch": 3.8922800718132855, "step": 54200}, {"loss": 0.6513, "grad_norm": 0.9548208713531494, "learning_rate": 0.0002, "epoch": 3.8929982046678635, "step": 54210}, {"loss": 0.6089, "grad_norm": 0.9158754944801331, "learning_rate": 0.0002, "epoch": 3.8937163375224415, "step": 54220}, {"loss": 0.6352, "grad_norm": 0.9001154899597168, "learning_rate": 0.0002, "epoch": 3.8944344703770195, "step": 54230}, {"loss": 0.6657, "grad_norm": 0.9736626148223877, "learning_rate": 0.0002, "epoch": 3.895152603231598, "step": 54240}, {"loss": 0.7248, "grad_norm": 0.8809846043586731, "learning_rate": 0.0002, "epoch": 3.895870736086176, "step": 54250}, {"loss": 0.6364, "grad_norm": 0.887583315372467, "learning_rate": 0.0002, "epoch": 3.896588868940754, "step": 54260}, {"loss": 0.6252, "grad_norm": 0.8395712971687317, "learning_rate": 0.0002, "epoch": 3.8973070017953324, "step": 54270}, {"loss": 0.681, "grad_norm": 0.8391315937042236, "learning_rate": 0.0002, "epoch": 3.8980251346499104, "step": 54280}, {"loss": 0.6352, "grad_norm": 0.8210049271583557, "learning_rate": 0.0002, "epoch": 3.8987432675044884, "step": 54290}, {"loss": 0.6484, "grad_norm": 1.1364530324935913, "learning_rate": 0.0002, "epoch": 3.8994614003590664, "step": 54300}, {"loss": 0.6383, "grad_norm": 0.7712056636810303, "learning_rate": 0.0002, "epoch": 3.9001795332136444, "step": 54310}, {"loss": 0.6516, "grad_norm": 0.9466049671173096, "learning_rate": 0.0002, "epoch": 3.9008976660682224, "step": 54320}, {"loss": 0.6938, "grad_norm": 1.0367140769958496, "learning_rate": 0.0002, "epoch": 3.901615798922801, "step": 54330}, {"loss": 0.672, "grad_norm": 1.0168321132659912, "learning_rate": 0.0002, "epoch": 3.902333931777379, "step": 54340}, {"loss": 0.6306, "grad_norm": 0.7830407619476318, "learning_rate": 0.0002, "epoch": 3.903052064631957, "step": 54350}, {"loss": 0.7198, "grad_norm": 0.9649789333343506, "learning_rate": 0.0002, "epoch": 3.903770197486535, "step": 54360}, {"loss": 0.6644, "grad_norm": 0.681077778339386, "learning_rate": 0.0002, "epoch": 3.9044883303411133, "step": 54370}, {"loss": 0.6677, "grad_norm": 0.8970136046409607, "learning_rate": 0.0002, "epoch": 3.9052064631956913, "step": 54380}, {"loss": 0.6581, "grad_norm": 0.9155173301696777, "learning_rate": 0.0002, "epoch": 3.9059245960502693, "step": 54390}, {"loss": 0.6711, "grad_norm": 1.0447794198989868, "learning_rate": 0.0002, "epoch": 3.9066427289048473, "step": 54400}, {"loss": 0.6883, "grad_norm": 0.7823813557624817, "learning_rate": 0.0002, "epoch": 3.9073608617594253, "step": 54410}, {"loss": 0.6688, "grad_norm": 0.9289445877075195, "learning_rate": 0.0002, "epoch": 3.9080789946140033, "step": 54420}, {"loss": 0.7024, "grad_norm": 0.9983111619949341, "learning_rate": 0.0002, "epoch": 3.9087971274685818, "step": 54430}, {"loss": 0.6687, "grad_norm": 0.7952495813369751, "learning_rate": 0.0002, "epoch": 3.9095152603231598, "step": 54440}, {"loss": 0.6118, "grad_norm": 0.8045601844787598, "learning_rate": 0.0002, "epoch": 3.9102333931777378, "step": 54450}, {"loss": 0.6388, "grad_norm": 0.936585009098053, "learning_rate": 0.0002, "epoch": 3.910951526032316, "step": 54460}, {"loss": 0.6217, "grad_norm": 0.745793879032135, "learning_rate": 0.0002, "epoch": 3.911669658886894, "step": 54470}, {"loss": 0.6814, "grad_norm": 0.9137616157531738, "learning_rate": 0.0002, "epoch": 3.912387791741472, "step": 54480}, {"loss": 0.6792, "grad_norm": 0.826316237449646, "learning_rate": 0.0002, "epoch": 3.9131059245960502, "step": 54490}, {"loss": 0.6914, "grad_norm": 0.94313645362854, "learning_rate": 0.0002, "epoch": 3.9138240574506282, "step": 54500}, {"loss": 0.62, "grad_norm": 1.045893907546997, "learning_rate": 0.0002, "epoch": 3.9145421903052062, "step": 54510}, {"loss": 0.5841, "grad_norm": 0.9122704863548279, "learning_rate": 0.0002, "epoch": 3.9152603231597847, "step": 54520}, {"loss": 0.7029, "grad_norm": 1.0999689102172852, "learning_rate": 0.0002, "epoch": 3.9159784560143627, "step": 54530}, {"loss": 0.6387, "grad_norm": 0.9281555414199829, "learning_rate": 0.0002, "epoch": 3.9166965888689407, "step": 54540}, {"loss": 0.6227, "grad_norm": 1.1439622640609741, "learning_rate": 0.0002, "epoch": 3.917414721723519, "step": 54550}, {"loss": 0.6733, "grad_norm": 0.9375617504119873, "learning_rate": 0.0002, "epoch": 3.918132854578097, "step": 54560}, {"loss": 0.6503, "grad_norm": 0.92906653881073, "learning_rate": 0.0002, "epoch": 3.918850987432675, "step": 54570}, {"loss": 0.6361, "grad_norm": 1.0840893983840942, "learning_rate": 0.0002, "epoch": 3.919569120287253, "step": 54580}, {"loss": 0.6476, "grad_norm": 0.8145509362220764, "learning_rate": 0.0002, "epoch": 3.920287253141831, "step": 54590}, {"loss": 0.6826, "grad_norm": 0.973737895488739, "learning_rate": 0.0002, "epoch": 3.921005385996409, "step": 54600}, {"loss": 0.6822, "grad_norm": 0.9302353858947754, "learning_rate": 0.0002, "epoch": 3.9217235188509876, "step": 54610}, {"loss": 0.6522, "grad_norm": 0.9167897701263428, "learning_rate": 0.0002, "epoch": 3.9224416517055656, "step": 54620}, {"loss": 0.6783, "grad_norm": 0.8096851706504822, "learning_rate": 0.0002, "epoch": 3.9231597845601436, "step": 54630}, {"loss": 0.6369, "grad_norm": 0.8006368279457092, "learning_rate": 0.0002, "epoch": 3.9238779174147216, "step": 54640}, {"loss": 0.6533, "grad_norm": 0.7800863981246948, "learning_rate": 0.0002, "epoch": 3.9245960502693, "step": 54650}, {"loss": 0.6518, "grad_norm": 1.0331560373306274, "learning_rate": 0.0002, "epoch": 3.925314183123878, "step": 54660}, {"loss": 0.6764, "grad_norm": 1.0057517290115356, "learning_rate": 0.0002, "epoch": 3.926032315978456, "step": 54670}, {"loss": 0.6636, "grad_norm": 0.8920564651489258, "learning_rate": 0.0002, "epoch": 3.926750448833034, "step": 54680}, {"loss": 0.6432, "grad_norm": 0.7704599499702454, "learning_rate": 0.0002, "epoch": 3.927468581687612, "step": 54690}, {"loss": 0.6532, "grad_norm": 0.827032208442688, "learning_rate": 0.0002, "epoch": 3.92818671454219, "step": 54700}, {"loss": 0.7083, "grad_norm": 1.0019268989562988, "learning_rate": 0.0002, "epoch": 3.9289048473967685, "step": 54710}, {"loss": 0.6026, "grad_norm": 0.862033486366272, "learning_rate": 0.0002, "epoch": 3.9296229802513465, "step": 54720}, {"loss": 0.599, "grad_norm": 0.8965592980384827, "learning_rate": 0.0002, "epoch": 3.9303411131059245, "step": 54730}, {"loss": 0.6739, "grad_norm": 0.7689077854156494, "learning_rate": 0.0002, "epoch": 3.931059245960503, "step": 54740}, {"loss": 0.6401, "grad_norm": 0.846276581287384, "learning_rate": 0.0002, "epoch": 3.931777378815081, "step": 54750}, {"loss": 0.6942, "grad_norm": 0.8932713866233826, "learning_rate": 0.0002, "epoch": 3.932495511669659, "step": 54760}, {"loss": 0.6697, "grad_norm": 0.9711386561393738, "learning_rate": 0.0002, "epoch": 3.933213644524237, "step": 54770}, {"loss": 0.6672, "grad_norm": 0.9290250539779663, "learning_rate": 0.0002, "epoch": 3.933931777378815, "step": 54780}, {"loss": 0.6365, "grad_norm": 1.0897367000579834, "learning_rate": 0.0002, "epoch": 3.934649910233393, "step": 54790}, {"loss": 0.6647, "grad_norm": 0.8451842665672302, "learning_rate": 0.0002, "epoch": 3.9353680430879714, "step": 54800}, {"loss": 0.6705, "grad_norm": 0.8400090336799622, "learning_rate": 0.0002, "epoch": 3.9360861759425494, "step": 54810}, {"loss": 0.6577, "grad_norm": 0.951383650302887, "learning_rate": 0.0002, "epoch": 3.9368043087971274, "step": 54820}, {"loss": 0.654, "grad_norm": 0.848838210105896, "learning_rate": 0.0002, "epoch": 3.937522441651706, "step": 54830}, {"loss": 0.6852, "grad_norm": 0.735763669013977, "learning_rate": 0.0002, "epoch": 3.938240574506284, "step": 54840}, {"loss": 0.6574, "grad_norm": 0.979037344455719, "learning_rate": 0.0002, "epoch": 3.938958707360862, "step": 54850}, {"loss": 0.5851, "grad_norm": 0.933674693107605, "learning_rate": 0.0002, "epoch": 3.93967684021544, "step": 54860}, {"loss": 0.6931, "grad_norm": 0.835593044757843, "learning_rate": 0.0002, "epoch": 3.940394973070018, "step": 54870}, {"loss": 0.6967, "grad_norm": 1.0034281015396118, "learning_rate": 0.0002, "epoch": 3.941113105924596, "step": 54880}, {"loss": 0.6442, "grad_norm": 0.9732975959777832, "learning_rate": 0.0002, "epoch": 3.9418312387791743, "step": 54890}, {"loss": 0.6657, "grad_norm": 0.9666336178779602, "learning_rate": 0.0002, "epoch": 3.9425493716337523, "step": 54900}, {"loss": 0.6521, "grad_norm": 0.755310595035553, "learning_rate": 0.0002, "epoch": 3.9432675044883303, "step": 54910}, {"loss": 0.6562, "grad_norm": 0.8732092976570129, "learning_rate": 0.0002, "epoch": 3.9439856373429083, "step": 54920}, {"loss": 0.6486, "grad_norm": 1.139453649520874, "learning_rate": 0.0002, "epoch": 3.9447037701974867, "step": 54930}, {"loss": 0.6609, "grad_norm": 0.9044837951660156, "learning_rate": 0.0002, "epoch": 3.9454219030520647, "step": 54940}, {"loss": 0.6344, "grad_norm": 1.0496679544448853, "learning_rate": 0.0002, "epoch": 3.9461400359066428, "step": 54950}, {"loss": 0.6471, "grad_norm": 1.0099035501480103, "learning_rate": 0.0002, "epoch": 3.9468581687612208, "step": 54960}, {"loss": 0.6143, "grad_norm": 1.0694963932037354, "learning_rate": 0.0002, "epoch": 3.9475763016157988, "step": 54970}, {"loss": 0.6209, "grad_norm": 1.0012997388839722, "learning_rate": 0.0002, "epoch": 3.9482944344703768, "step": 54980}, {"loss": 0.7379, "grad_norm": 0.8910513520240784, "learning_rate": 0.0002, "epoch": 3.949012567324955, "step": 54990}, {"loss": 0.7184, "grad_norm": 1.0267579555511475, "learning_rate": 0.0002, "epoch": 3.949730700179533, "step": 55000}, {"loss": 0.6844, "grad_norm": 0.9786432385444641, "learning_rate": 0.0002, "epoch": 3.950448833034111, "step": 55010}, {"loss": 0.6499, "grad_norm": 0.8703538775444031, "learning_rate": 0.0002, "epoch": 3.9511669658886897, "step": 55020}, {"loss": 0.5989, "grad_norm": 0.8970484137535095, "learning_rate": 0.0002, "epoch": 3.9518850987432677, "step": 55030}, {"loss": 0.659, "grad_norm": 0.8781577944755554, "learning_rate": 0.0002, "epoch": 3.9526032315978457, "step": 55040}, {"loss": 0.6944, "grad_norm": 0.8040280938148499, "learning_rate": 0.0002, "epoch": 3.9533213644524237, "step": 55050}, {"loss": 0.6359, "grad_norm": 0.851926326751709, "learning_rate": 0.0002, "epoch": 3.9540394973070017, "step": 55060}, {"loss": 0.6806, "grad_norm": 0.8597240447998047, "learning_rate": 0.0002, "epoch": 3.9547576301615797, "step": 55070}, {"loss": 0.6499, "grad_norm": 0.9461944699287415, "learning_rate": 0.0002, "epoch": 3.955475763016158, "step": 55080}, {"loss": 0.6222, "grad_norm": 0.7576611042022705, "learning_rate": 0.0002, "epoch": 3.956193895870736, "step": 55090}, {"loss": 0.6735, "grad_norm": 0.9484710693359375, "learning_rate": 0.0002, "epoch": 3.956912028725314, "step": 55100}, {"loss": 0.6586, "grad_norm": 0.9487117528915405, "learning_rate": 0.0002, "epoch": 3.957630161579892, "step": 55110}, {"loss": 0.6632, "grad_norm": 0.870090663433075, "learning_rate": 0.0002, "epoch": 3.9583482944344706, "step": 55120}, {"loss": 0.6786, "grad_norm": 0.8496458530426025, "learning_rate": 0.0002, "epoch": 3.9590664272890486, "step": 55130}, {"loss": 0.6631, "grad_norm": 1.0121779441833496, "learning_rate": 0.0002, "epoch": 3.9597845601436266, "step": 55140}, {"loss": 0.7005, "grad_norm": 0.8912323713302612, "learning_rate": 0.0002, "epoch": 3.9605026929982046, "step": 55150}, {"loss": 0.6398, "grad_norm": 0.8398444652557373, "learning_rate": 0.0002, "epoch": 3.9612208258527826, "step": 55160}, {"loss": 0.6183, "grad_norm": 0.8046348690986633, "learning_rate": 0.0002, "epoch": 3.961938958707361, "step": 55170}, {"loss": 0.6357, "grad_norm": 1.0369254350662231, "learning_rate": 0.0002, "epoch": 3.962657091561939, "step": 55180}, {"loss": 0.6053, "grad_norm": 1.172431230545044, "learning_rate": 0.0002, "epoch": 3.963375224416517, "step": 55190}, {"loss": 0.643, "grad_norm": 0.8093554377555847, "learning_rate": 0.0002, "epoch": 3.964093357271095, "step": 55200}, {"loss": 0.6416, "grad_norm": 0.8851078748703003, "learning_rate": 0.0002, "epoch": 3.9648114901256735, "step": 55210}, {"loss": 0.6516, "grad_norm": 0.7494266033172607, "learning_rate": 0.0002, "epoch": 3.9655296229802515, "step": 55220}, {"loss": 0.629, "grad_norm": 0.9556898474693298, "learning_rate": 0.0002, "epoch": 3.9662477558348295, "step": 55230}, {"loss": 0.6481, "grad_norm": 1.016017198562622, "learning_rate": 0.0002, "epoch": 3.9669658886894075, "step": 55240}, {"loss": 0.7185, "grad_norm": 0.8425998091697693, "learning_rate": 0.0002, "epoch": 3.9676840215439855, "step": 55250}, {"loss": 0.6609, "grad_norm": 0.717673122882843, "learning_rate": 0.0002, "epoch": 3.9684021543985635, "step": 55260}, {"loss": 0.6453, "grad_norm": 0.8366572856903076, "learning_rate": 0.0002, "epoch": 3.969120287253142, "step": 55270}, {"loss": 0.6841, "grad_norm": 0.8981583118438721, "learning_rate": 0.0002, "epoch": 3.96983842010772, "step": 55280}, {"loss": 0.6351, "grad_norm": 0.8868781328201294, "learning_rate": 0.0002, "epoch": 3.970556552962298, "step": 55290}, {"loss": 0.6755, "grad_norm": 1.0632785558700562, "learning_rate": 0.0002, "epoch": 3.9712746858168764, "step": 55300}, {"loss": 0.6433, "grad_norm": 0.8813109993934631, "learning_rate": 0.0002, "epoch": 3.9719928186714544, "step": 55310}, {"loss": 0.5699, "grad_norm": 0.8225542306900024, "learning_rate": 0.0002, "epoch": 3.9727109515260324, "step": 55320}, {"loss": 0.6591, "grad_norm": 1.1391420364379883, "learning_rate": 0.0002, "epoch": 3.9734290843806104, "step": 55330}, {"loss": 0.6551, "grad_norm": 1.0371832847595215, "learning_rate": 0.0002, "epoch": 3.9741472172351884, "step": 55340}, {"loss": 0.7538, "grad_norm": 1.0542186498641968, "learning_rate": 0.0002, "epoch": 3.9748653500897664, "step": 55350}, {"loss": 0.6799, "grad_norm": 1.0178009271621704, "learning_rate": 0.0002, "epoch": 3.975583482944345, "step": 55360}, {"loss": 0.6394, "grad_norm": 0.7927802205085754, "learning_rate": 0.0002, "epoch": 3.976301615798923, "step": 55370}, {"loss": 0.6632, "grad_norm": 0.9350495934486389, "learning_rate": 0.0002, "epoch": 3.977019748653501, "step": 55380}, {"loss": 0.6889, "grad_norm": 1.0240116119384766, "learning_rate": 0.0002, "epoch": 3.977737881508079, "step": 55390}, {"loss": 0.6756, "grad_norm": 1.0279067754745483, "learning_rate": 0.0002, "epoch": 3.9784560143626573, "step": 55400}, {"loss": 0.6979, "grad_norm": 1.1228227615356445, "learning_rate": 0.0002, "epoch": 3.9791741472172353, "step": 55410}, {"loss": 0.6595, "grad_norm": 0.9500134587287903, "learning_rate": 0.0002, "epoch": 3.9798922800718133, "step": 55420}, {"loss": 0.6875, "grad_norm": 0.9229732155799866, "learning_rate": 0.0002, "epoch": 3.9806104129263913, "step": 55430}, {"loss": 0.6742, "grad_norm": 0.7946729063987732, "learning_rate": 0.0002, "epoch": 3.9813285457809693, "step": 55440}, {"loss": 0.6643, "grad_norm": 0.9987489581108093, "learning_rate": 0.0002, "epoch": 3.9820466786355477, "step": 55450}, {"loss": 0.6642, "grad_norm": 0.9670467972755432, "learning_rate": 0.0002, "epoch": 3.9827648114901257, "step": 55460}, {"loss": 0.6603, "grad_norm": 0.835028350353241, "learning_rate": 0.0002, "epoch": 3.9834829443447037, "step": 55470}, {"loss": 0.6198, "grad_norm": 0.8678702712059021, "learning_rate": 0.0002, "epoch": 3.9842010771992817, "step": 55480}, {"loss": 0.6581, "grad_norm": 0.8581197261810303, "learning_rate": 0.0002, "epoch": 3.98491921005386, "step": 55490}, {"loss": 0.614, "grad_norm": 0.779848039150238, "learning_rate": 0.0002, "epoch": 3.985637342908438, "step": 55500}, {"loss": 0.634, "grad_norm": 0.8827589154243469, "learning_rate": 0.0002, "epoch": 3.986355475763016, "step": 55510}, {"loss": 0.624, "grad_norm": 1.0108301639556885, "learning_rate": 0.0002, "epoch": 3.987073608617594, "step": 55520}, {"loss": 0.6553, "grad_norm": 0.8506004214286804, "learning_rate": 0.0002, "epoch": 3.987791741472172, "step": 55530}, {"loss": 0.6229, "grad_norm": 1.0297727584838867, "learning_rate": 0.0002, "epoch": 3.98850987432675, "step": 55540}, {"loss": 0.6551, "grad_norm": 0.8579224944114685, "learning_rate": 0.0002, "epoch": 3.9892280071813286, "step": 55550}, {"loss": 0.6491, "grad_norm": 0.8503788113594055, "learning_rate": 0.0002, "epoch": 3.9899461400359066, "step": 55560}, {"loss": 0.6941, "grad_norm": 1.1144801378250122, "learning_rate": 0.0002, "epoch": 3.9906642728904846, "step": 55570}, {"loss": 0.6956, "grad_norm": 0.8418305516242981, "learning_rate": 0.0002, "epoch": 3.991382405745063, "step": 55580}, {"loss": 0.6226, "grad_norm": 1.0065871477127075, "learning_rate": 0.0002, "epoch": 3.992100538599641, "step": 55590}, {"loss": 0.6775, "grad_norm": 0.8160259127616882, "learning_rate": 0.0002, "epoch": 3.992818671454219, "step": 55600}, {"loss": 0.624, "grad_norm": 0.8678009510040283, "learning_rate": 0.0002, "epoch": 3.993536804308797, "step": 55610}, {"loss": 0.6552, "grad_norm": 0.863465428352356, "learning_rate": 0.0002, "epoch": 3.994254937163375, "step": 55620}, {"loss": 0.6764, "grad_norm": 0.9242135286331177, "learning_rate": 0.0002, "epoch": 3.994973070017953, "step": 55630}, {"loss": 0.6774, "grad_norm": 1.0285470485687256, "learning_rate": 0.0002, "epoch": 3.9956912028725315, "step": 55640}, {"loss": 0.6882, "grad_norm": 0.8953320384025574, "learning_rate": 0.0002, "epoch": 3.9964093357271095, "step": 55650}, {"loss": 0.6935, "grad_norm": 0.915892481803894, "learning_rate": 0.0002, "epoch": 3.9971274685816875, "step": 55660}, {"loss": 0.641, "grad_norm": 0.8235118985176086, "learning_rate": 0.0002, "epoch": 3.9978456014362656, "step": 55670}, {"loss": 0.6417, "grad_norm": 1.0178656578063965, "learning_rate": 0.0002, "epoch": 3.998563734290844, "step": 55680}, {"loss": 0.6635, "grad_norm": 0.9926803708076477, "learning_rate": 0.0002, "epoch": 3.999281867145422, "step": 55690}, {"loss": 0.6476, "grad_norm": 0.9213629961013794, "learning_rate": 0.0002, "epoch": 4.0, "step": 55700}, {"eval_loss": 1.1152480840682983, "eval_runtime": 55.2237, "eval_samples_per_second": 13.273, "eval_steps_per_second": 1.666, "epoch": 4.0, "step": 55700}, {"loss": 0.6085, "grad_norm": 1.0820496082305908, "learning_rate": 0.0002, "epoch": 4.000718132854578, "step": 55710}, {"loss": 0.5506, "grad_norm": 0.9036441445350647, "learning_rate": 0.0002, "epoch": 4.001436265709156, "step": 55720}, {"loss": 0.5924, "grad_norm": 1.102754831314087, "learning_rate": 0.0002, "epoch": 4.002154398563734, "step": 55730}, {"loss": 0.6192, "grad_norm": 0.98259437084198, "learning_rate": 0.0002, "epoch": 4.002872531418312, "step": 55740}, {"loss": 0.567, "grad_norm": 1.1935845613479614, "learning_rate": 0.0002, "epoch": 4.003590664272891, "step": 55750}, {"loss": 0.6205, "grad_norm": 0.9925830960273743, "learning_rate": 0.0002, "epoch": 4.004308797127469, "step": 55760}, {"loss": 0.5545, "grad_norm": 1.075087070465088, "learning_rate": 0.0002, "epoch": 4.005026929982047, "step": 55770}, {"loss": 0.5591, "grad_norm": 0.8746396899223328, "learning_rate": 0.0002, "epoch": 4.005745062836625, "step": 55780}, {"loss": 0.5745, "grad_norm": 0.7635995745658875, "learning_rate": 0.0002, "epoch": 4.006463195691203, "step": 55790}, {"loss": 0.599, "grad_norm": 0.9064885377883911, "learning_rate": 0.0002, "epoch": 4.007181328545781, "step": 55800}, {"loss": 0.5668, "grad_norm": 1.018478274345398, "learning_rate": 0.0002, "epoch": 4.007899461400359, "step": 55810}, {"loss": 0.5573, "grad_norm": 0.9797589778900146, "learning_rate": 0.0002, "epoch": 4.008617594254937, "step": 55820}, {"loss": 0.5784, "grad_norm": 0.7867457866668701, "learning_rate": 0.0002, "epoch": 4.009335727109515, "step": 55830}, {"loss": 0.5607, "grad_norm": 0.9998070597648621, "learning_rate": 0.0002, "epoch": 4.010053859964093, "step": 55840}, {"loss": 0.5655, "grad_norm": 0.8656311631202698, "learning_rate": 0.0002, "epoch": 4.010771992818672, "step": 55850}, {"loss": 0.533, "grad_norm": 0.945469081401825, "learning_rate": 0.0002, "epoch": 4.01149012567325, "step": 55860}, {"loss": 0.625, "grad_norm": 0.8809926509857178, "learning_rate": 0.0002, "epoch": 4.012208258527828, "step": 55870}, {"loss": 0.5795, "grad_norm": 0.8047897219657898, "learning_rate": 0.0002, "epoch": 4.012926391382406, "step": 55880}, {"loss": 0.5322, "grad_norm": 1.0563900470733643, "learning_rate": 0.0002, "epoch": 4.013644524236984, "step": 55890}, {"loss": 0.5597, "grad_norm": 0.8578300476074219, "learning_rate": 0.0002, "epoch": 4.014362657091562, "step": 55900}, {"loss": 0.5634, "grad_norm": 1.0304765701293945, "learning_rate": 0.0002, "epoch": 4.01508078994614, "step": 55910}, {"loss": 0.558, "grad_norm": 0.8087666034698486, "learning_rate": 0.0002, "epoch": 4.015798922800718, "step": 55920}, {"loss": 0.5557, "grad_norm": 1.0192348957061768, "learning_rate": 0.0002, "epoch": 4.016517055655296, "step": 55930}, {"loss": 0.6269, "grad_norm": 1.061194658279419, "learning_rate": 0.0002, "epoch": 4.017235188509875, "step": 55940}, {"loss": 0.5812, "grad_norm": 0.93668133020401, "learning_rate": 0.0002, "epoch": 4.017953321364453, "step": 55950}, {"loss": 0.6104, "grad_norm": 1.1569286584854126, "learning_rate": 0.0002, "epoch": 4.018671454219031, "step": 55960}, {"loss": 0.5832, "grad_norm": 0.9853817224502563, "learning_rate": 0.0002, "epoch": 4.019389587073609, "step": 55970}, {"loss": 0.6154, "grad_norm": 0.851109504699707, "learning_rate": 0.0002, "epoch": 4.020107719928187, "step": 55980}, {"loss": 0.5993, "grad_norm": 1.053525447845459, "learning_rate": 0.0002, "epoch": 4.020825852782765, "step": 55990}, {"loss": 0.571, "grad_norm": 0.8307225704193115, "learning_rate": 0.0002, "epoch": 4.021543985637343, "step": 56000}, {"loss": 0.5419, "grad_norm": 1.2741150856018066, "learning_rate": 0.0002, "epoch": 4.022262118491921, "step": 56010}, {"loss": 0.6001, "grad_norm": 0.9708344340324402, "learning_rate": 0.0002, "epoch": 4.022980251346499, "step": 56020}, {"loss": 0.5989, "grad_norm": 1.265034556388855, "learning_rate": 0.0002, "epoch": 4.023698384201078, "step": 56030}, {"loss": 0.5852, "grad_norm": 0.9364367723464966, "learning_rate": 0.0002, "epoch": 4.024416517055656, "step": 56040}, {"loss": 0.6108, "grad_norm": 0.8643592000007629, "learning_rate": 0.0002, "epoch": 4.025134649910234, "step": 56050}, {"loss": 0.6074, "grad_norm": 0.9742133021354675, "learning_rate": 0.0002, "epoch": 4.025852782764812, "step": 56060}, {"loss": 0.5699, "grad_norm": 1.1793473958969116, "learning_rate": 0.0002, "epoch": 4.02657091561939, "step": 56070}, {"loss": 0.5911, "grad_norm": 0.9641149044036865, "learning_rate": 0.0002, "epoch": 4.027289048473968, "step": 56080}, {"loss": 0.6083, "grad_norm": 0.9426136016845703, "learning_rate": 0.0002, "epoch": 4.028007181328546, "step": 56090}, {"loss": 0.5692, "grad_norm": 0.9211869835853577, "learning_rate": 0.0002, "epoch": 4.028725314183124, "step": 56100}, {"loss": 0.6109, "grad_norm": 1.1576565504074097, "learning_rate": 0.0002, "epoch": 4.029443447037702, "step": 56110}, {"loss": 0.5684, "grad_norm": 1.0014013051986694, "learning_rate": 0.0002, "epoch": 4.03016157989228, "step": 56120}, {"loss": 0.6017, "grad_norm": 0.9307010769844055, "learning_rate": 0.0002, "epoch": 4.0308797127468585, "step": 56130}, {"loss": 0.5582, "grad_norm": 0.8290148377418518, "learning_rate": 0.0002, "epoch": 4.0315978456014365, "step": 56140}, {"loss": 0.5921, "grad_norm": 1.0648446083068848, "learning_rate": 0.0002, "epoch": 4.0323159784560145, "step": 56150}, {"loss": 0.6116, "grad_norm": 1.1545547246932983, "learning_rate": 0.0002, "epoch": 4.0330341113105925, "step": 56160}, {"loss": 0.6301, "grad_norm": 0.9643545150756836, "learning_rate": 0.0002, "epoch": 4.0337522441651705, "step": 56170}, {"loss": 0.5655, "grad_norm": 0.8913900256156921, "learning_rate": 0.0002, "epoch": 4.0344703770197485, "step": 56180}, {"loss": 0.5897, "grad_norm": 0.9445754289627075, "learning_rate": 0.0002, "epoch": 4.0351885098743265, "step": 56190}, {"loss": 0.6204, "grad_norm": 0.9353124499320984, "learning_rate": 0.0002, "epoch": 4.0359066427289045, "step": 56200}, {"loss": 0.6017, "grad_norm": 1.1780431270599365, "learning_rate": 0.0002, "epoch": 4.0366247755834825, "step": 56210}, {"loss": 0.5767, "grad_norm": 0.9208880662918091, "learning_rate": 0.0002, "epoch": 4.037342908438061, "step": 56220}, {"loss": 0.5367, "grad_norm": 0.9475517272949219, "learning_rate": 0.0002, "epoch": 4.038061041292639, "step": 56230}, {"loss": 0.576, "grad_norm": 0.7478583455085754, "learning_rate": 0.0002, "epoch": 4.038779174147217, "step": 56240}, {"loss": 0.5616, "grad_norm": 1.0026403665542603, "learning_rate": 0.0002, "epoch": 4.039497307001795, "step": 56250}, {"loss": 0.6031, "grad_norm": 0.9664973020553589, "learning_rate": 0.0002, "epoch": 4.040215439856373, "step": 56260}, {"loss": 0.5764, "grad_norm": 1.0655616521835327, "learning_rate": 0.0002, "epoch": 4.040933572710951, "step": 56270}, {"loss": 0.5862, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.041651705565529, "step": 56280}, {"loss": 0.5828, "grad_norm": 0.7982191443443298, "learning_rate": 0.0002, "epoch": 4.042369838420107, "step": 56290}, {"loss": 0.5637, "grad_norm": 0.8304495215415955, "learning_rate": 0.0002, "epoch": 4.043087971274685, "step": 56300}, {"loss": 0.5974, "grad_norm": 0.95123291015625, "learning_rate": 0.0002, "epoch": 4.043806104129264, "step": 56310}, {"loss": 0.617, "grad_norm": 0.9504102468490601, "learning_rate": 0.0002, "epoch": 4.044524236983842, "step": 56320}, {"loss": 0.6143, "grad_norm": 0.7432710528373718, "learning_rate": 0.0002, "epoch": 4.04524236983842, "step": 56330}, {"loss": 0.6157, "grad_norm": 0.9327874183654785, "learning_rate": 0.0002, "epoch": 4.045960502692998, "step": 56340}, {"loss": 0.591, "grad_norm": 0.9161670804023743, "learning_rate": 0.0002, "epoch": 4.046678635547576, "step": 56350}, {"loss": 0.6111, "grad_norm": 0.9371771812438965, "learning_rate": 0.0002, "epoch": 4.047396768402154, "step": 56360}, {"loss": 0.6101, "grad_norm": 1.0332437753677368, "learning_rate": 0.0002, "epoch": 4.048114901256732, "step": 56370}, {"loss": 0.5451, "grad_norm": 0.7346320748329163, "learning_rate": 0.0002, "epoch": 4.04883303411131, "step": 56380}, {"loss": 0.6416, "grad_norm": 0.8247857689857483, "learning_rate": 0.0002, "epoch": 4.049551166965888, "step": 56390}, {"loss": 0.6208, "grad_norm": 0.925325334072113, "learning_rate": 0.0002, "epoch": 4.050269299820466, "step": 56400}, {"loss": 0.558, "grad_norm": 0.7344088554382324, "learning_rate": 0.0002, "epoch": 4.050987432675045, "step": 56410}, {"loss": 0.5978, "grad_norm": 0.9204918146133423, "learning_rate": 0.0002, "epoch": 4.051705565529623, "step": 56420}, {"loss": 0.5788, "grad_norm": 0.8273472785949707, "learning_rate": 0.0002, "epoch": 4.052423698384201, "step": 56430}, {"loss": 0.5551, "grad_norm": 0.9524998068809509, "learning_rate": 0.0002, "epoch": 4.053141831238779, "step": 56440}, {"loss": 0.5836, "grad_norm": 0.9168205857276917, "learning_rate": 0.0002, "epoch": 4.053859964093357, "step": 56450}, {"loss": 0.6035, "grad_norm": 0.9634994864463806, "learning_rate": 0.0002, "epoch": 4.054578096947935, "step": 56460}, {"loss": 0.5907, "grad_norm": 1.2027593851089478, "learning_rate": 0.0002, "epoch": 4.055296229802513, "step": 56470}, {"loss": 0.5691, "grad_norm": 1.2347805500030518, "learning_rate": 0.0002, "epoch": 4.056014362657091, "step": 56480}, {"loss": 0.5789, "grad_norm": 0.8621458411216736, "learning_rate": 0.0002, "epoch": 4.056732495511669, "step": 56490}, {"loss": 0.6082, "grad_norm": 0.9194608330726624, "learning_rate": 0.0002, "epoch": 4.057450628366248, "step": 56500}, {"loss": 0.5667, "grad_norm": 1.0153663158416748, "learning_rate": 0.0002, "epoch": 4.058168761220826, "step": 56510}, {"loss": 0.5908, "grad_norm": 0.9170986413955688, "learning_rate": 0.0002, "epoch": 4.058886894075404, "step": 56520}, {"loss": 0.5672, "grad_norm": 1.033057689666748, "learning_rate": 0.0002, "epoch": 4.059605026929982, "step": 56530}, {"loss": 0.5577, "grad_norm": 1.0125197172164917, "learning_rate": 0.0002, "epoch": 4.06032315978456, "step": 56540}, {"loss": 0.5821, "grad_norm": 0.9429898262023926, "learning_rate": 0.0002, "epoch": 4.061041292639138, "step": 56550}, {"loss": 0.5655, "grad_norm": 0.9242179989814758, "learning_rate": 0.0002, "epoch": 4.061759425493716, "step": 56560}, {"loss": 0.5568, "grad_norm": 0.9365091323852539, "learning_rate": 0.0002, "epoch": 4.062477558348294, "step": 56570}, {"loss": 0.6104, "grad_norm": 0.9148455858230591, "learning_rate": 0.0002, "epoch": 4.063195691202872, "step": 56580}, {"loss": 0.5891, "grad_norm": 0.8546709418296814, "learning_rate": 0.0002, "epoch": 4.063913824057451, "step": 56590}, {"loss": 0.6079, "grad_norm": 0.9743902087211609, "learning_rate": 0.0002, "epoch": 4.064631956912029, "step": 56600}, {"loss": 0.6109, "grad_norm": 1.0599974393844604, "learning_rate": 0.0002, "epoch": 4.065350089766607, "step": 56610}, {"loss": 0.5746, "grad_norm": 0.9677841067314148, "learning_rate": 0.0002, "epoch": 4.066068222621185, "step": 56620}, {"loss": 0.5957, "grad_norm": 0.8892754316329956, "learning_rate": 0.0002, "epoch": 4.066786355475763, "step": 56630}, {"loss": 0.5899, "grad_norm": 0.8837814331054688, "learning_rate": 0.0002, "epoch": 4.067504488330341, "step": 56640}, {"loss": 0.5784, "grad_norm": 0.9284095764160156, "learning_rate": 0.0002, "epoch": 4.068222621184919, "step": 56650}, {"loss": 0.5829, "grad_norm": 1.0163567066192627, "learning_rate": 0.0002, "epoch": 4.068940754039497, "step": 56660}, {"loss": 0.5349, "grad_norm": 0.8713456988334656, "learning_rate": 0.0002, "epoch": 4.069658886894075, "step": 56670}, {"loss": 0.5345, "grad_norm": 0.8356686234474182, "learning_rate": 0.0002, "epoch": 4.070377019748653, "step": 56680}, {"loss": 0.5473, "grad_norm": 0.8998766541481018, "learning_rate": 0.0002, "epoch": 4.071095152603232, "step": 56690}, {"loss": 0.5896, "grad_norm": 1.0441967248916626, "learning_rate": 0.0002, "epoch": 4.07181328545781, "step": 56700}, {"loss": 0.5817, "grad_norm": 0.9313125610351562, "learning_rate": 0.0002, "epoch": 4.072531418312388, "step": 56710}, {"loss": 0.5477, "grad_norm": 0.9912964701652527, "learning_rate": 0.0002, "epoch": 4.073249551166966, "step": 56720}, {"loss": 0.5974, "grad_norm": 0.9048459529876709, "learning_rate": 0.0002, "epoch": 4.073967684021544, "step": 56730}, {"loss": 0.5927, "grad_norm": 1.0248944759368896, "learning_rate": 0.0002, "epoch": 4.074685816876122, "step": 56740}, {"loss": 0.6019, "grad_norm": 1.4526786804199219, "learning_rate": 0.0002, "epoch": 4.0754039497307, "step": 56750}, {"loss": 0.6267, "grad_norm": 0.9813178181648254, "learning_rate": 0.0002, "epoch": 4.076122082585278, "step": 56760}, {"loss": 0.5707, "grad_norm": 1.0686813592910767, "learning_rate": 0.0002, "epoch": 4.076840215439856, "step": 56770}, {"loss": 0.5857, "grad_norm": 1.1093482971191406, "learning_rate": 0.0002, "epoch": 4.077558348294435, "step": 56780}, {"loss": 0.5768, "grad_norm": 0.9377819895744324, "learning_rate": 0.0002, "epoch": 4.078276481149013, "step": 56790}, {"loss": 0.6342, "grad_norm": 0.8043649196624756, "learning_rate": 0.0002, "epoch": 4.078994614003591, "step": 56800}, {"loss": 0.6005, "grad_norm": 0.7995415925979614, "learning_rate": 0.0002, "epoch": 4.079712746858169, "step": 56810}, {"loss": 0.5466, "grad_norm": 1.0076148509979248, "learning_rate": 0.0002, "epoch": 4.080430879712747, "step": 56820}, {"loss": 0.6021, "grad_norm": 0.8192076683044434, "learning_rate": 0.0002, "epoch": 4.081149012567325, "step": 56830}, {"loss": 0.5439, "grad_norm": 0.9226266145706177, "learning_rate": 0.0002, "epoch": 4.081867145421903, "step": 56840}, {"loss": 0.5893, "grad_norm": 0.8877972960472107, "learning_rate": 0.0002, "epoch": 4.082585278276481, "step": 56850}, {"loss": 0.5774, "grad_norm": 0.9578937888145447, "learning_rate": 0.0002, "epoch": 4.083303411131059, "step": 56860}, {"loss": 0.5946, "grad_norm": 0.8929167985916138, "learning_rate": 0.0002, "epoch": 4.084021543985638, "step": 56870}, {"loss": 0.5226, "grad_norm": 1.0015977621078491, "learning_rate": 0.0002, "epoch": 4.084739676840216, "step": 56880}, {"loss": 0.5931, "grad_norm": 0.9768750667572021, "learning_rate": 0.0002, "epoch": 4.085457809694794, "step": 56890}, {"loss": 0.5983, "grad_norm": 1.0834569931030273, "learning_rate": 0.0002, "epoch": 4.086175942549372, "step": 56900}, {"loss": 0.5786, "grad_norm": 0.8761230707168579, "learning_rate": 0.0002, "epoch": 4.08689407540395, "step": 56910}, {"loss": 0.5708, "grad_norm": 1.027064323425293, "learning_rate": 0.0002, "epoch": 4.087612208258528, "step": 56920}, {"loss": 0.601, "grad_norm": 1.130336880683899, "learning_rate": 0.0002, "epoch": 4.088330341113106, "step": 56930}, {"loss": 0.5664, "grad_norm": 0.8157579898834229, "learning_rate": 0.0002, "epoch": 4.089048473967684, "step": 56940}, {"loss": 0.5789, "grad_norm": 1.071175217628479, "learning_rate": 0.0002, "epoch": 4.089766606822262, "step": 56950}, {"loss": 0.5942, "grad_norm": 0.9534492492675781, "learning_rate": 0.0002, "epoch": 4.09048473967684, "step": 56960}, {"loss": 0.5803, "grad_norm": 0.9584037661552429, "learning_rate": 0.0002, "epoch": 4.091202872531419, "step": 56970}, {"loss": 0.5647, "grad_norm": 1.1513131856918335, "learning_rate": 0.0002, "epoch": 4.091921005385997, "step": 56980}, {"loss": 0.5971, "grad_norm": 1.0167666673660278, "learning_rate": 0.0002, "epoch": 4.092639138240575, "step": 56990}, {"loss": 0.5981, "grad_norm": 1.0630987882614136, "learning_rate": 0.0002, "epoch": 4.093357271095153, "step": 57000}, {"loss": 0.5734, "grad_norm": 1.0326893329620361, "learning_rate": 0.0002, "epoch": 4.094075403949731, "step": 57010}, {"loss": 0.572, "grad_norm": 0.9701678156852722, "learning_rate": 0.0002, "epoch": 4.094793536804309, "step": 57020}, {"loss": 0.5815, "grad_norm": 0.839935302734375, "learning_rate": 0.0002, "epoch": 4.095511669658887, "step": 57030}, {"loss": 0.6051, "grad_norm": 0.8995838761329651, "learning_rate": 0.0002, "epoch": 4.096229802513465, "step": 57040}, {"loss": 0.6037, "grad_norm": 0.8039916157722473, "learning_rate": 0.0002, "epoch": 4.096947935368043, "step": 57050}, {"loss": 0.5597, "grad_norm": 1.126122236251831, "learning_rate": 0.0002, "epoch": 4.097666068222622, "step": 57060}, {"loss": 0.5943, "grad_norm": 0.8749837875366211, "learning_rate": 0.0002, "epoch": 4.0983842010772, "step": 57070}, {"loss": 0.6017, "grad_norm": 0.8630341291427612, "learning_rate": 0.0002, "epoch": 4.099102333931778, "step": 57080}, {"loss": 0.6083, "grad_norm": 0.8889496922492981, "learning_rate": 0.0002, "epoch": 4.099820466786356, "step": 57090}, {"loss": 0.5727, "grad_norm": 0.9050310254096985, "learning_rate": 0.0002, "epoch": 4.100538599640934, "step": 57100}, {"loss": 0.5824, "grad_norm": 0.943072497844696, "learning_rate": 0.0002, "epoch": 4.101256732495512, "step": 57110}, {"loss": 0.6036, "grad_norm": 0.9031552672386169, "learning_rate": 0.0002, "epoch": 4.10197486535009, "step": 57120}, {"loss": 0.5913, "grad_norm": 0.939862847328186, "learning_rate": 0.0002, "epoch": 4.102692998204668, "step": 57130}, {"loss": 0.5738, "grad_norm": 0.8080634474754333, "learning_rate": 0.0002, "epoch": 4.103411131059246, "step": 57140}, {"loss": 0.5841, "grad_norm": 0.9181693196296692, "learning_rate": 0.0002, "epoch": 4.1041292639138245, "step": 57150}, {"loss": 0.5561, "grad_norm": 0.9609217643737793, "learning_rate": 0.0002, "epoch": 4.1048473967684025, "step": 57160}, {"loss": 0.5572, "grad_norm": 1.1246516704559326, "learning_rate": 0.0002, "epoch": 4.1055655296229805, "step": 57170}, {"loss": 0.5886, "grad_norm": 1.0616880655288696, "learning_rate": 0.0002, "epoch": 4.1062836624775585, "step": 57180}, {"loss": 0.5579, "grad_norm": 0.9954505562782288, "learning_rate": 0.0002, "epoch": 4.1070017953321365, "step": 57190}, {"loss": 0.5899, "grad_norm": 1.0602279901504517, "learning_rate": 0.0002, "epoch": 4.1077199281867145, "step": 57200}, {"loss": 0.5747, "grad_norm": 0.8984764814376831, "learning_rate": 0.0002, "epoch": 4.1084380610412925, "step": 57210}, {"loss": 0.5502, "grad_norm": 0.845167875289917, "learning_rate": 0.0002, "epoch": 4.1091561938958705, "step": 57220}, {"loss": 0.6147, "grad_norm": 0.7901500463485718, "learning_rate": 0.0002, "epoch": 4.1098743267504485, "step": 57230}, {"loss": 0.5883, "grad_norm": 1.0462526082992554, "learning_rate": 0.0002, "epoch": 4.1105924596050265, "step": 57240}, {"loss": 0.6334, "grad_norm": 0.9098827838897705, "learning_rate": 0.0002, "epoch": 4.111310592459605, "step": 57250}, {"loss": 0.5794, "grad_norm": 0.9234077334403992, "learning_rate": 0.0002, "epoch": 4.112028725314183, "step": 57260}, {"loss": 0.623, "grad_norm": 1.0033560991287231, "learning_rate": 0.0002, "epoch": 4.112746858168761, "step": 57270}, {"loss": 0.5392, "grad_norm": 1.0620051622390747, "learning_rate": 0.0002, "epoch": 4.113464991023339, "step": 57280}, {"loss": 0.6144, "grad_norm": 0.8679345846176147, "learning_rate": 0.0002, "epoch": 4.114183123877917, "step": 57290}, {"loss": 0.5951, "grad_norm": 0.7557345628738403, "learning_rate": 0.0002, "epoch": 4.114901256732495, "step": 57300}, {"loss": 0.575, "grad_norm": 0.8970935344696045, "learning_rate": 0.0002, "epoch": 4.115619389587073, "step": 57310}, {"loss": 0.5595, "grad_norm": 1.0779842138290405, "learning_rate": 0.0002, "epoch": 4.116337522441651, "step": 57320}, {"loss": 0.5532, "grad_norm": 1.2036106586456299, "learning_rate": 0.0002, "epoch": 4.117055655296229, "step": 57330}, {"loss": 0.5959, "grad_norm": 0.8337953686714172, "learning_rate": 0.0002, "epoch": 4.117773788150808, "step": 57340}, {"loss": 0.6128, "grad_norm": 0.9850410223007202, "learning_rate": 0.0002, "epoch": 4.118491921005386, "step": 57350}, {"loss": 0.5676, "grad_norm": 0.8028770685195923, "learning_rate": 0.0002, "epoch": 4.119210053859964, "step": 57360}, {"loss": 0.5693, "grad_norm": 0.8693217039108276, "learning_rate": 0.0002, "epoch": 4.119928186714542, "step": 57370}, {"loss": 0.5897, "grad_norm": 0.8795534372329712, "learning_rate": 0.0002, "epoch": 4.12064631956912, "step": 57380}, {"loss": 0.5692, "grad_norm": 1.0081543922424316, "learning_rate": 0.0002, "epoch": 4.121364452423698, "step": 57390}, {"loss": 0.6027, "grad_norm": 0.8776742219924927, "learning_rate": 0.0002, "epoch": 4.122082585278276, "step": 57400}, {"loss": 0.6418, "grad_norm": 0.8247824311256409, "learning_rate": 0.0002, "epoch": 4.122800718132854, "step": 57410}, {"loss": 0.5537, "grad_norm": 1.1346335411071777, "learning_rate": 0.0002, "epoch": 4.123518850987432, "step": 57420}, {"loss": 0.5949, "grad_norm": 1.0671089887619019, "learning_rate": 0.0002, "epoch": 4.124236983842011, "step": 57430}, {"loss": 0.5908, "grad_norm": 0.8548333048820496, "learning_rate": 0.0002, "epoch": 4.124955116696589, "step": 57440}, {"loss": 0.5967, "grad_norm": 1.0221573114395142, "learning_rate": 0.0002, "epoch": 4.125673249551167, "step": 57450}, {"loss": 0.6238, "grad_norm": 0.9746617674827576, "learning_rate": 0.0002, "epoch": 4.126391382405745, "step": 57460}, {"loss": 0.5855, "grad_norm": 0.8104965090751648, "learning_rate": 0.0002, "epoch": 4.127109515260323, "step": 57470}, {"loss": 0.5724, "grad_norm": 1.0401487350463867, "learning_rate": 0.0002, "epoch": 4.127827648114901, "step": 57480}, {"loss": 0.5956, "grad_norm": 0.8828882575035095, "learning_rate": 0.0002, "epoch": 4.128545780969479, "step": 57490}, {"loss": 0.5851, "grad_norm": 1.0121098756790161, "learning_rate": 0.0002, "epoch": 4.129263913824057, "step": 57500}, {"loss": 0.5923, "grad_norm": 0.8789737820625305, "learning_rate": 0.0002, "epoch": 4.129982046678635, "step": 57510}, {"loss": 0.5929, "grad_norm": 1.0386744737625122, "learning_rate": 0.0002, "epoch": 4.130700179533213, "step": 57520}, {"loss": 0.6104, "grad_norm": 1.0092610120773315, "learning_rate": 0.0002, "epoch": 4.131418312387792, "step": 57530}, {"loss": 0.5974, "grad_norm": 0.8706282377243042, "learning_rate": 0.0002, "epoch": 4.13213644524237, "step": 57540}, {"loss": 0.5829, "grad_norm": 0.9270507097244263, "learning_rate": 0.0002, "epoch": 4.132854578096948, "step": 57550}, {"loss": 0.5826, "grad_norm": 1.0303068161010742, "learning_rate": 0.0002, "epoch": 4.133572710951526, "step": 57560}, {"loss": 0.5515, "grad_norm": 1.1169062852859497, "learning_rate": 0.0002, "epoch": 4.134290843806104, "step": 57570}, {"loss": 0.5848, "grad_norm": 0.8530599474906921, "learning_rate": 0.0002, "epoch": 4.135008976660682, "step": 57580}, {"loss": 0.6231, "grad_norm": 1.1395039558410645, "learning_rate": 0.0002, "epoch": 4.13572710951526, "step": 57590}, {"loss": 0.5739, "grad_norm": 0.8944115042686462, "learning_rate": 0.0002, "epoch": 4.136445242369838, "step": 57600}, {"loss": 0.6212, "grad_norm": 1.137966275215149, "learning_rate": 0.0002, "epoch": 4.137163375224416, "step": 57610}, {"loss": 0.6041, "grad_norm": 0.8244962692260742, "learning_rate": 0.0002, "epoch": 4.137881508078995, "step": 57620}, {"loss": 0.6078, "grad_norm": 1.1935817003250122, "learning_rate": 0.0002, "epoch": 4.138599640933573, "step": 57630}, {"loss": 0.5939, "grad_norm": 0.9774235486984253, "learning_rate": 0.0002, "epoch": 4.139317773788151, "step": 57640}, {"loss": 0.5963, "grad_norm": 1.066219449043274, "learning_rate": 0.0002, "epoch": 4.140035906642729, "step": 57650}, {"loss": 0.6008, "grad_norm": 0.8631396293640137, "learning_rate": 0.0002, "epoch": 4.140754039497307, "step": 57660}, {"loss": 0.5622, "grad_norm": 0.888410747051239, "learning_rate": 0.0002, "epoch": 4.141472172351885, "step": 57670}, {"loss": 0.5675, "grad_norm": 1.002642035484314, "learning_rate": 0.0002, "epoch": 4.142190305206463, "step": 57680}, {"loss": 0.5269, "grad_norm": 1.0092825889587402, "learning_rate": 0.0002, "epoch": 4.142908438061041, "step": 57690}, {"loss": 0.588, "grad_norm": 0.9126971364021301, "learning_rate": 0.0002, "epoch": 4.143626570915619, "step": 57700}, {"loss": 0.5593, "grad_norm": 1.0303562879562378, "learning_rate": 0.0002, "epoch": 4.144344703770198, "step": 57710}, {"loss": 0.6183, "grad_norm": 1.1230897903442383, "learning_rate": 0.0002, "epoch": 4.145062836624776, "step": 57720}, {"loss": 0.5934, "grad_norm": 1.0494099855422974, "learning_rate": 0.0002, "epoch": 4.145780969479354, "step": 57730}, {"loss": 0.6022, "grad_norm": 0.9555442333221436, "learning_rate": 0.0002, "epoch": 4.146499102333932, "step": 57740}, {"loss": 0.609, "grad_norm": 0.8255124092102051, "learning_rate": 0.0002, "epoch": 4.14721723518851, "step": 57750}, {"loss": 0.5659, "grad_norm": 1.097853660583496, "learning_rate": 0.0002, "epoch": 4.147935368043088, "step": 57760}, {"loss": 0.5698, "grad_norm": 1.0272663831710815, "learning_rate": 0.0002, "epoch": 4.148653500897666, "step": 57770}, {"loss": 0.5701, "grad_norm": 1.022571086883545, "learning_rate": 0.0002, "epoch": 4.149371633752244, "step": 57780}, {"loss": 0.579, "grad_norm": 0.964543342590332, "learning_rate": 0.0002, "epoch": 4.150089766606822, "step": 57790}, {"loss": 0.6175, "grad_norm": 0.9251219034194946, "learning_rate": 0.0002, "epoch": 4.1508078994614, "step": 57800}, {"loss": 0.564, "grad_norm": 1.081840991973877, "learning_rate": 0.0002, "epoch": 4.151526032315979, "step": 57810}, {"loss": 0.5956, "grad_norm": 0.8989445567131042, "learning_rate": 0.0002, "epoch": 4.152244165170557, "step": 57820}, {"loss": 0.5849, "grad_norm": 0.903629720211029, "learning_rate": 0.0002, "epoch": 4.152962298025135, "step": 57830}, {"loss": 0.6202, "grad_norm": 0.8985397219657898, "learning_rate": 0.0002, "epoch": 4.153680430879713, "step": 57840}, {"loss": 0.5629, "grad_norm": 1.047778844833374, "learning_rate": 0.0002, "epoch": 4.154398563734291, "step": 57850}, {"loss": 0.6045, "grad_norm": 0.9804165363311768, "learning_rate": 0.0002, "epoch": 4.155116696588869, "step": 57860}, {"loss": 0.5815, "grad_norm": 1.187309980392456, "learning_rate": 0.0002, "epoch": 4.155834829443447, "step": 57870}, {"loss": 0.6304, "grad_norm": 0.9854836463928223, "learning_rate": 0.0002, "epoch": 4.156552962298025, "step": 57880}, {"loss": 0.6076, "grad_norm": 0.8494308590888977, "learning_rate": 0.0002, "epoch": 4.157271095152603, "step": 57890}, {"loss": 0.6033, "grad_norm": 0.9359684586524963, "learning_rate": 0.0002, "epoch": 4.157989228007182, "step": 57900}, {"loss": 0.5546, "grad_norm": 0.8971988558769226, "learning_rate": 0.0002, "epoch": 4.15870736086176, "step": 57910}, {"loss": 0.5934, "grad_norm": 0.8848021030426025, "learning_rate": 0.0002, "epoch": 4.159425493716338, "step": 57920}, {"loss": 0.6102, "grad_norm": 0.982877790927887, "learning_rate": 0.0002, "epoch": 4.160143626570916, "step": 57930}, {"loss": 0.6091, "grad_norm": 0.8668819069862366, "learning_rate": 0.0002, "epoch": 4.160861759425494, "step": 57940}, {"loss": 0.5969, "grad_norm": 1.06569504737854, "learning_rate": 0.0002, "epoch": 4.161579892280072, "step": 57950}, {"loss": 0.5799, "grad_norm": 1.165740728378296, "learning_rate": 0.0002, "epoch": 4.16229802513465, "step": 57960}, {"loss": 0.6038, "grad_norm": 1.0534512996673584, "learning_rate": 0.0002, "epoch": 4.163016157989228, "step": 57970}, {"loss": 0.594, "grad_norm": 0.8785330653190613, "learning_rate": 0.0002, "epoch": 4.163734290843806, "step": 57980}, {"loss": 0.5981, "grad_norm": 1.1244874000549316, "learning_rate": 0.0002, "epoch": 4.164452423698384, "step": 57990}, {"loss": 0.6456, "grad_norm": 0.8839399218559265, "learning_rate": 0.0002, "epoch": 4.165170556552963, "step": 58000}, {"loss": 0.5767, "grad_norm": 1.0603798627853394, "learning_rate": 0.0002, "epoch": 4.165888689407541, "step": 58010}, {"loss": 0.6334, "grad_norm": 0.9737853407859802, "learning_rate": 0.0002, "epoch": 4.166606822262119, "step": 58020}, {"loss": 0.5901, "grad_norm": 1.0650558471679688, "learning_rate": 0.0002, "epoch": 4.167324955116697, "step": 58030}, {"loss": 0.6549, "grad_norm": 0.7528959512710571, "learning_rate": 0.0002, "epoch": 4.168043087971275, "step": 58040}, {"loss": 0.5593, "grad_norm": 0.9286156892776489, "learning_rate": 0.0002, "epoch": 4.168761220825853, "step": 58050}, {"loss": 0.6093, "grad_norm": 1.0225880146026611, "learning_rate": 0.0002, "epoch": 4.169479353680431, "step": 58060}, {"loss": 0.5993, "grad_norm": 0.9990654587745667, "learning_rate": 0.0002, "epoch": 4.170197486535009, "step": 58070}, {"loss": 0.6002, "grad_norm": 1.052057147026062, "learning_rate": 0.0002, "epoch": 4.170915619389587, "step": 58080}, {"loss": 0.5911, "grad_norm": 0.7366801500320435, "learning_rate": 0.0002, "epoch": 4.1716337522441655, "step": 58090}, {"loss": 0.6273, "grad_norm": 1.0943711996078491, "learning_rate": 0.0002, "epoch": 4.1723518850987436, "step": 58100}, {"loss": 0.6095, "grad_norm": 1.1297656297683716, "learning_rate": 0.0002, "epoch": 4.1730700179533216, "step": 58110}, {"loss": 0.6123, "grad_norm": 0.7861461639404297, "learning_rate": 0.0002, "epoch": 4.1737881508078996, "step": 58120}, {"loss": 0.6188, "grad_norm": 0.8643335103988647, "learning_rate": 0.0002, "epoch": 4.174506283662478, "step": 58130}, {"loss": 0.6103, "grad_norm": 0.957288384437561, "learning_rate": 0.0002, "epoch": 4.175224416517056, "step": 58140}, {"loss": 0.5636, "grad_norm": 0.9175366759300232, "learning_rate": 0.0002, "epoch": 4.175942549371634, "step": 58150}, {"loss": 0.6288, "grad_norm": 1.129935622215271, "learning_rate": 0.0002, "epoch": 4.176660682226212, "step": 58160}, {"loss": 0.5969, "grad_norm": 0.9683087468147278, "learning_rate": 0.0002, "epoch": 4.17737881508079, "step": 58170}, {"loss": 0.6249, "grad_norm": 1.045171856880188, "learning_rate": 0.0002, "epoch": 4.1780969479353685, "step": 58180}, {"loss": 0.5611, "grad_norm": 0.9858742952346802, "learning_rate": 0.0002, "epoch": 4.1788150807899465, "step": 58190}, {"loss": 0.5946, "grad_norm": 0.8513413071632385, "learning_rate": 0.0002, "epoch": 4.1795332136445245, "step": 58200}, {"loss": 0.5928, "grad_norm": 0.9584265947341919, "learning_rate": 0.0002, "epoch": 4.1802513464991025, "step": 58210}, {"loss": 0.5864, "grad_norm": 0.8828920722007751, "learning_rate": 0.0002, "epoch": 4.1809694793536805, "step": 58220}, {"loss": 0.5745, "grad_norm": 0.9849961400032043, "learning_rate": 0.0002, "epoch": 4.1816876122082585, "step": 58230}, {"loss": 0.5355, "grad_norm": 1.0601637363433838, "learning_rate": 0.0002, "epoch": 4.1824057450628365, "step": 58240}, {"loss": 0.6063, "grad_norm": 1.2206604480743408, "learning_rate": 0.0002, "epoch": 4.1831238779174145, "step": 58250}, {"loss": 0.6176, "grad_norm": 1.1768009662628174, "learning_rate": 0.0002, "epoch": 4.1838420107719925, "step": 58260}, {"loss": 0.5572, "grad_norm": 0.9521295428276062, "learning_rate": 0.0002, "epoch": 4.184560143626571, "step": 58270}, {"loss": 0.5978, "grad_norm": 0.892971932888031, "learning_rate": 0.0002, "epoch": 4.185278276481149, "step": 58280}, {"loss": 0.5727, "grad_norm": 0.8712016940116882, "learning_rate": 0.0002, "epoch": 4.185996409335727, "step": 58290}, {"loss": 0.6124, "grad_norm": 1.0190843343734741, "learning_rate": 0.0002, "epoch": 4.186714542190305, "step": 58300}, {"loss": 0.6324, "grad_norm": 1.0149270296096802, "learning_rate": 0.0002, "epoch": 4.187432675044883, "step": 58310}, {"loss": 0.6337, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 4.188150807899461, "step": 58320}, {"loss": 0.5588, "grad_norm": 0.7892335653305054, "learning_rate": 0.0002, "epoch": 4.188868940754039, "step": 58330}, {"loss": 0.6132, "grad_norm": 0.9792808890342712, "learning_rate": 0.0002, "epoch": 4.189587073608617, "step": 58340}, {"loss": 0.5841, "grad_norm": 0.9946883320808411, "learning_rate": 0.0002, "epoch": 4.190305206463195, "step": 58350}, {"loss": 0.6043, "grad_norm": 1.0363789796829224, "learning_rate": 0.0002, "epoch": 4.191023339317773, "step": 58360}, {"loss": 0.5843, "grad_norm": 0.9285917282104492, "learning_rate": 0.0002, "epoch": 4.191741472172352, "step": 58370}, {"loss": 0.6042, "grad_norm": 0.9461679458618164, "learning_rate": 0.0002, "epoch": 4.19245960502693, "step": 58380}, {"loss": 0.5666, "grad_norm": 1.0344175100326538, "learning_rate": 0.0002, "epoch": 4.193177737881508, "step": 58390}, {"loss": 0.6032, "grad_norm": 0.9530242085456848, "learning_rate": 0.0002, "epoch": 4.193895870736086, "step": 58400}, {"loss": 0.5887, "grad_norm": 0.9171900749206543, "learning_rate": 0.0002, "epoch": 4.194614003590664, "step": 58410}, {"loss": 0.6116, "grad_norm": 0.8094898462295532, "learning_rate": 0.0002, "epoch": 4.195332136445242, "step": 58420}, {"loss": 0.5268, "grad_norm": 0.921981930732727, "learning_rate": 0.0002, "epoch": 4.19605026929982, "step": 58430}, {"loss": 0.551, "grad_norm": 0.9783532023429871, "learning_rate": 0.0002, "epoch": 4.196768402154398, "step": 58440}, {"loss": 0.5774, "grad_norm": 1.017805576324463, "learning_rate": 0.0002, "epoch": 4.197486535008976, "step": 58450}, {"loss": 0.6261, "grad_norm": 0.9244308471679688, "learning_rate": 0.0002, "epoch": 4.198204667863555, "step": 58460}, {"loss": 0.6247, "grad_norm": 0.9942585229873657, "learning_rate": 0.0002, "epoch": 4.198922800718133, "step": 58470}, {"loss": 0.5803, "grad_norm": 1.1045037508010864, "learning_rate": 0.0002, "epoch": 4.199640933572711, "step": 58480}, {"loss": 0.5846, "grad_norm": 0.9483149647712708, "learning_rate": 0.0002, "epoch": 4.200359066427289, "step": 58490}, {"loss": 0.5997, "grad_norm": 1.0807271003723145, "learning_rate": 0.0002, "epoch": 4.201077199281867, "step": 58500}, {"loss": 0.5474, "grad_norm": 0.7697445750236511, "learning_rate": 0.0002, "epoch": 4.201795332136445, "step": 58510}, {"loss": 0.5692, "grad_norm": 1.0761178731918335, "learning_rate": 0.0002, "epoch": 4.202513464991023, "step": 58520}, {"loss": 0.5667, "grad_norm": 0.9992024898529053, "learning_rate": 0.0002, "epoch": 4.203231597845601, "step": 58530}, {"loss": 0.5606, "grad_norm": 0.8741498589515686, "learning_rate": 0.0002, "epoch": 4.203949730700179, "step": 58540}, {"loss": 0.6012, "grad_norm": 0.8557528853416443, "learning_rate": 0.0002, "epoch": 4.204667863554757, "step": 58550}, {"loss": 0.5191, "grad_norm": 0.8853630423545837, "learning_rate": 0.0002, "epoch": 4.205385996409336, "step": 58560}, {"loss": 0.5806, "grad_norm": 0.9858933687210083, "learning_rate": 0.0002, "epoch": 4.206104129263914, "step": 58570}, {"loss": 0.5908, "grad_norm": 1.104732871055603, "learning_rate": 0.0002, "epoch": 4.206822262118492, "step": 58580}, {"loss": 0.5993, "grad_norm": 0.9345462322235107, "learning_rate": 0.0002, "epoch": 4.20754039497307, "step": 58590}, {"loss": 0.6101, "grad_norm": 0.9620407819747925, "learning_rate": 0.0002, "epoch": 4.208258527827648, "step": 58600}, {"loss": 0.5848, "grad_norm": 0.8546963334083557, "learning_rate": 0.0002, "epoch": 4.208976660682226, "step": 58610}, {"loss": 0.5747, "grad_norm": 0.8125145435333252, "learning_rate": 0.0002, "epoch": 4.209694793536804, "step": 58620}, {"loss": 0.604, "grad_norm": 0.8481138944625854, "learning_rate": 0.0002, "epoch": 4.210412926391382, "step": 58630}, {"loss": 0.5928, "grad_norm": 0.8884692788124084, "learning_rate": 0.0002, "epoch": 4.21113105924596, "step": 58640}, {"loss": 0.5612, "grad_norm": 1.09279465675354, "learning_rate": 0.0002, "epoch": 4.211849192100539, "step": 58650}, {"loss": 0.644, "grad_norm": 0.9806583523750305, "learning_rate": 0.0002, "epoch": 4.212567324955117, "step": 58660}, {"loss": 0.5737, "grad_norm": 0.9510366916656494, "learning_rate": 0.0002, "epoch": 4.213285457809695, "step": 58670}, {"loss": 0.5996, "grad_norm": 0.7517459988594055, "learning_rate": 0.0002, "epoch": 4.214003590664273, "step": 58680}, {"loss": 0.6274, "grad_norm": 1.1134123802185059, "learning_rate": 0.0002, "epoch": 4.214721723518851, "step": 58690}, {"loss": 0.5842, "grad_norm": 0.8307328820228577, "learning_rate": 0.0002, "epoch": 4.215439856373429, "step": 58700}, {"loss": 0.5795, "grad_norm": 0.8211639523506165, "learning_rate": 0.0002, "epoch": 4.216157989228007, "step": 58710}, {"loss": 0.5613, "grad_norm": 1.0749584436416626, "learning_rate": 0.0002, "epoch": 4.216876122082585, "step": 58720}, {"loss": 0.5956, "grad_norm": 1.1394833326339722, "learning_rate": 0.0002, "epoch": 4.217594254937163, "step": 58730}, {"loss": 0.609, "grad_norm": 1.05130934715271, "learning_rate": 0.0002, "epoch": 4.218312387791742, "step": 58740}, {"loss": 0.6294, "grad_norm": 0.7949456572532654, "learning_rate": 0.0002, "epoch": 4.21903052064632, "step": 58750}, {"loss": 0.6148, "grad_norm": 0.906506359577179, "learning_rate": 0.0002, "epoch": 4.219748653500898, "step": 58760}, {"loss": 0.5778, "grad_norm": 0.8338989615440369, "learning_rate": 0.0002, "epoch": 4.220466786355476, "step": 58770}, {"loss": 0.5402, "grad_norm": 0.9325370788574219, "learning_rate": 0.0002, "epoch": 4.221184919210054, "step": 58780}, {"loss": 0.5657, "grad_norm": 1.0208096504211426, "learning_rate": 0.0002, "epoch": 4.221903052064632, "step": 58790}, {"loss": 0.6523, "grad_norm": 1.0075920820236206, "learning_rate": 0.0002, "epoch": 4.22262118491921, "step": 58800}, {"loss": 0.5545, "grad_norm": 0.9858701229095459, "learning_rate": 0.0002, "epoch": 4.223339317773788, "step": 58810}, {"loss": 0.6343, "grad_norm": 1.0010110139846802, "learning_rate": 0.0002, "epoch": 4.224057450628366, "step": 58820}, {"loss": 0.5991, "grad_norm": 0.9360540509223938, "learning_rate": 0.0002, "epoch": 4.224775583482945, "step": 58830}, {"loss": 0.5887, "grad_norm": 0.9021786451339722, "learning_rate": 0.0002, "epoch": 4.225493716337523, "step": 58840}, {"loss": 0.6132, "grad_norm": 1.1778476238250732, "learning_rate": 0.0002, "epoch": 4.226211849192101, "step": 58850}, {"loss": 0.5956, "grad_norm": 1.0061023235321045, "learning_rate": 0.0002, "epoch": 4.226929982046679, "step": 58860}, {"loss": 0.5846, "grad_norm": 0.8839752674102783, "learning_rate": 0.0002, "epoch": 4.227648114901257, "step": 58870}, {"loss": 0.6129, "grad_norm": 1.0078870058059692, "learning_rate": 0.0002, "epoch": 4.228366247755835, "step": 58880}, {"loss": 0.6403, "grad_norm": 0.8926451206207275, "learning_rate": 0.0002, "epoch": 4.229084380610413, "step": 58890}, {"loss": 0.5987, "grad_norm": 1.4018772840499878, "learning_rate": 0.0002, "epoch": 4.229802513464991, "step": 58900}, {"loss": 0.5925, "grad_norm": 0.9911289215087891, "learning_rate": 0.0002, "epoch": 4.230520646319569, "step": 58910}, {"loss": 0.5846, "grad_norm": 0.9374576807022095, "learning_rate": 0.0002, "epoch": 4.231238779174147, "step": 58920}, {"loss": 0.5856, "grad_norm": 1.179650068283081, "learning_rate": 0.0002, "epoch": 4.231956912028726, "step": 58930}, {"loss": 0.601, "grad_norm": 0.9434911012649536, "learning_rate": 0.0002, "epoch": 4.232675044883304, "step": 58940}, {"loss": 0.6137, "grad_norm": 1.0061911344528198, "learning_rate": 0.0002, "epoch": 4.233393177737882, "step": 58950}, {"loss": 0.5847, "grad_norm": 0.9663233757019043, "learning_rate": 0.0002, "epoch": 4.23411131059246, "step": 58960}, {"loss": 0.5748, "grad_norm": 0.8897581696510315, "learning_rate": 0.0002, "epoch": 4.234829443447038, "step": 58970}, {"loss": 0.5586, "grad_norm": 0.873281717300415, "learning_rate": 0.0002, "epoch": 4.235547576301616, "step": 58980}, {"loss": 0.6027, "grad_norm": 0.9146949052810669, "learning_rate": 0.0002, "epoch": 4.236265709156194, "step": 58990}, {"loss": 0.6356, "grad_norm": 0.9381195306777954, "learning_rate": 0.0002, "epoch": 4.236983842010772, "step": 59000}, {"loss": 0.5641, "grad_norm": 0.9700697064399719, "learning_rate": 0.0002, "epoch": 4.23770197486535, "step": 59010}, {"loss": 0.6099, "grad_norm": 0.9050154685974121, "learning_rate": 0.0002, "epoch": 4.238420107719929, "step": 59020}, {"loss": 0.552, "grad_norm": 0.9901503324508667, "learning_rate": 0.0002, "epoch": 4.239138240574507, "step": 59030}, {"loss": 0.6333, "grad_norm": 0.9009594321250916, "learning_rate": 0.0002, "epoch": 4.239856373429085, "step": 59040}, {"loss": 0.6104, "grad_norm": 1.0924968719482422, "learning_rate": 0.0002, "epoch": 4.240574506283663, "step": 59050}, {"loss": 0.6269, "grad_norm": 0.9939947724342346, "learning_rate": 0.0002, "epoch": 4.241292639138241, "step": 59060}, {"loss": 0.6039, "grad_norm": 1.0577857494354248, "learning_rate": 0.0002, "epoch": 4.242010771992819, "step": 59070}, {"loss": 0.5992, "grad_norm": 1.0836747884750366, "learning_rate": 0.0002, "epoch": 4.242728904847397, "step": 59080}, {"loss": 0.6518, "grad_norm": 0.97043377161026, "learning_rate": 0.0002, "epoch": 4.243447037701975, "step": 59090}, {"loss": 0.5877, "grad_norm": 0.7711901664733887, "learning_rate": 0.0002, "epoch": 4.244165170556553, "step": 59100}, {"loss": 0.6017, "grad_norm": 1.0143170356750488, "learning_rate": 0.0002, "epoch": 4.244883303411131, "step": 59110}, {"loss": 0.6245, "grad_norm": 0.9151925444602966, "learning_rate": 0.0002, "epoch": 4.2456014362657095, "step": 59120}, {"loss": 0.6436, "grad_norm": 0.9252700209617615, "learning_rate": 0.0002, "epoch": 4.2463195691202875, "step": 59130}, {"loss": 0.5696, "grad_norm": 0.8429408073425293, "learning_rate": 0.0002, "epoch": 4.2470377019748655, "step": 59140}, {"loss": 0.5737, "grad_norm": 0.9645987153053284, "learning_rate": 0.0002, "epoch": 4.2477558348294435, "step": 59150}, {"loss": 0.6045, "grad_norm": 0.9949791431427002, "learning_rate": 0.0002, "epoch": 4.2484739676840215, "step": 59160}, {"loss": 0.6069, "grad_norm": 0.9128350615501404, "learning_rate": 0.0002, "epoch": 4.2491921005385995, "step": 59170}, {"loss": 0.596, "grad_norm": 0.7406911849975586, "learning_rate": 0.0002, "epoch": 4.2499102333931775, "step": 59180}, {"loss": 0.5796, "grad_norm": 1.0237419605255127, "learning_rate": 0.0002, "epoch": 4.2506283662477555, "step": 59190}, {"loss": 0.631, "grad_norm": 0.805459201335907, "learning_rate": 0.0002, "epoch": 4.2513464991023335, "step": 59200}, {"loss": 0.6104, "grad_norm": 0.8477254509925842, "learning_rate": 0.0002, "epoch": 4.252064631956912, "step": 59210}, {"loss": 0.5608, "grad_norm": 0.984023928642273, "learning_rate": 0.0002, "epoch": 4.25278276481149, "step": 59220}, {"loss": 0.6185, "grad_norm": 1.0667484998703003, "learning_rate": 0.0002, "epoch": 4.253500897666068, "step": 59230}, {"loss": 0.5596, "grad_norm": 0.7192284464836121, "learning_rate": 0.0002, "epoch": 4.254219030520646, "step": 59240}, {"loss": 0.5971, "grad_norm": 0.9557451009750366, "learning_rate": 0.0002, "epoch": 4.254937163375224, "step": 59250}, {"loss": 0.6012, "grad_norm": 0.9209784865379333, "learning_rate": 0.0002, "epoch": 4.255655296229802, "step": 59260}, {"loss": 0.67, "grad_norm": 0.9785363674163818, "learning_rate": 0.0002, "epoch": 4.25637342908438, "step": 59270}, {"loss": 0.6185, "grad_norm": 0.910214364528656, "learning_rate": 0.0002, "epoch": 4.257091561938958, "step": 59280}, {"loss": 0.6451, "grad_norm": 0.8945858478546143, "learning_rate": 0.0002, "epoch": 4.257809694793536, "step": 59290}, {"loss": 0.5876, "grad_norm": 1.0984420776367188, "learning_rate": 0.0002, "epoch": 4.258527827648114, "step": 59300}, {"loss": 0.5616, "grad_norm": 1.0256640911102295, "learning_rate": 0.0002, "epoch": 4.259245960502693, "step": 59310}, {"loss": 0.5825, "grad_norm": 0.978397786617279, "learning_rate": 0.0002, "epoch": 4.259964093357271, "step": 59320}, {"loss": 0.6043, "grad_norm": 0.7587000727653503, "learning_rate": 0.0002, "epoch": 4.260682226211849, "step": 59330}, {"loss": 0.5616, "grad_norm": 0.9384620785713196, "learning_rate": 0.0002, "epoch": 4.261400359066427, "step": 59340}, {"loss": 0.6669, "grad_norm": 0.893992006778717, "learning_rate": 0.0002, "epoch": 4.262118491921005, "step": 59350}, {"loss": 0.561, "grad_norm": 1.0231536626815796, "learning_rate": 0.0002, "epoch": 4.262836624775583, "step": 59360}, {"loss": 0.5912, "grad_norm": 0.9810128211975098, "learning_rate": 0.0002, "epoch": 4.263554757630161, "step": 59370}, {"loss": 0.5871, "grad_norm": 1.0868116617202759, "learning_rate": 0.0002, "epoch": 4.264272890484739, "step": 59380}, {"loss": 0.5986, "grad_norm": 1.1433676481246948, "learning_rate": 0.0002, "epoch": 4.264991023339318, "step": 59390}, {"loss": 0.6306, "grad_norm": 0.9836946725845337, "learning_rate": 0.0002, "epoch": 4.265709156193896, "step": 59400}, {"loss": 0.5854, "grad_norm": 0.9473603963851929, "learning_rate": 0.0002, "epoch": 4.266427289048474, "step": 59410}, {"loss": 0.6095, "grad_norm": 0.9066835641860962, "learning_rate": 0.0002, "epoch": 4.267145421903052, "step": 59420}, {"loss": 0.656, "grad_norm": 1.0534718036651611, "learning_rate": 0.0002, "epoch": 4.26786355475763, "step": 59430}, {"loss": 0.5624, "grad_norm": 1.0392775535583496, "learning_rate": 0.0002, "epoch": 4.268581687612208, "step": 59440}, {"loss": 0.5697, "grad_norm": 1.011472463607788, "learning_rate": 0.0002, "epoch": 4.269299820466786, "step": 59450}, {"loss": 0.5971, "grad_norm": 1.0704147815704346, "learning_rate": 0.0002, "epoch": 4.270017953321364, "step": 59460}, {"loss": 0.5719, "grad_norm": 0.9349238872528076, "learning_rate": 0.0002, "epoch": 4.270736086175942, "step": 59470}, {"loss": 0.5637, "grad_norm": 0.8745087385177612, "learning_rate": 0.0002, "epoch": 4.27145421903052, "step": 59480}, {"loss": 0.6246, "grad_norm": 0.8823763728141785, "learning_rate": 0.0002, "epoch": 4.272172351885099, "step": 59490}, {"loss": 0.6021, "grad_norm": 1.110912799835205, "learning_rate": 0.0002, "epoch": 4.272890484739677, "step": 59500}, {"loss": 0.5939, "grad_norm": 1.0000925064086914, "learning_rate": 0.0002, "epoch": 4.273608617594255, "step": 59510}, {"loss": 0.5531, "grad_norm": 1.1578227281570435, "learning_rate": 0.0002, "epoch": 4.274326750448833, "step": 59520}, {"loss": 0.6372, "grad_norm": 0.875720202922821, "learning_rate": 0.0002, "epoch": 4.275044883303411, "step": 59530}, {"loss": 0.5956, "grad_norm": 0.9562238454818726, "learning_rate": 0.0002, "epoch": 4.275763016157989, "step": 59540}, {"loss": 0.5996, "grad_norm": 0.8384222388267517, "learning_rate": 0.0002, "epoch": 4.276481149012567, "step": 59550}, {"loss": 0.6001, "grad_norm": 1.2719428539276123, "learning_rate": 0.0002, "epoch": 4.277199281867145, "step": 59560}, {"loss": 0.6286, "grad_norm": 1.0656434297561646, "learning_rate": 0.0002, "epoch": 4.277917414721723, "step": 59570}, {"loss": 0.5895, "grad_norm": 1.0766716003417969, "learning_rate": 0.0002, "epoch": 4.278635547576302, "step": 59580}, {"loss": 0.5831, "grad_norm": 0.8892807960510254, "learning_rate": 0.0002, "epoch": 4.27935368043088, "step": 59590}, {"loss": 0.5717, "grad_norm": 0.8956300020217896, "learning_rate": 0.0002, "epoch": 4.280071813285458, "step": 59600}, {"loss": 0.5965, "grad_norm": 0.9562926888465881, "learning_rate": 0.0002, "epoch": 4.280789946140036, "step": 59610}, {"loss": 0.5487, "grad_norm": 1.009141445159912, "learning_rate": 0.0002, "epoch": 4.281508078994614, "step": 59620}, {"loss": 0.6337, "grad_norm": 1.0546064376831055, "learning_rate": 0.0002, "epoch": 4.282226211849192, "step": 59630}, {"loss": 0.5771, "grad_norm": 0.8831254243850708, "learning_rate": 0.0002, "epoch": 4.28294434470377, "step": 59640}, {"loss": 0.6241, "grad_norm": 0.9560053944587708, "learning_rate": 0.0002, "epoch": 4.283662477558348, "step": 59650}, {"loss": 0.6012, "grad_norm": 1.030339241027832, "learning_rate": 0.0002, "epoch": 4.284380610412926, "step": 59660}, {"loss": 0.6174, "grad_norm": 1.00662100315094, "learning_rate": 0.0002, "epoch": 4.285098743267504, "step": 59670}, {"loss": 0.5802, "grad_norm": 1.0759116411209106, "learning_rate": 0.0002, "epoch": 4.285816876122083, "step": 59680}, {"loss": 0.6429, "grad_norm": 0.9985393285751343, "learning_rate": 0.0002, "epoch": 4.286535008976661, "step": 59690}, {"loss": 0.5992, "grad_norm": 0.9044474959373474, "learning_rate": 0.0002, "epoch": 4.287253141831239, "step": 59700}, {"loss": 0.6263, "grad_norm": 1.1224442720413208, "learning_rate": 0.0002, "epoch": 4.287971274685817, "step": 59710}, {"loss": 0.6118, "grad_norm": 0.8436414003372192, "learning_rate": 0.0002, "epoch": 4.288689407540395, "step": 59720}, {"loss": 0.5881, "grad_norm": 1.0695041418075562, "learning_rate": 0.0002, "epoch": 4.289407540394973, "step": 59730}, {"loss": 0.5994, "grad_norm": 0.8809951543807983, "learning_rate": 0.0002, "epoch": 4.290125673249551, "step": 59740}, {"loss": 0.6508, "grad_norm": 1.0213792324066162, "learning_rate": 0.0002, "epoch": 4.290843806104129, "step": 59750}, {"loss": 0.5851, "grad_norm": 0.9660196900367737, "learning_rate": 0.0002, "epoch": 4.291561938958707, "step": 59760}, {"loss": 0.6582, "grad_norm": 0.8005787134170532, "learning_rate": 0.0002, "epoch": 4.292280071813286, "step": 59770}, {"loss": 0.6504, "grad_norm": 1.0016109943389893, "learning_rate": 0.0002, "epoch": 4.292998204667864, "step": 59780}, {"loss": 0.5765, "grad_norm": 0.9112903475761414, "learning_rate": 0.0002, "epoch": 4.293716337522442, "step": 59790}, {"loss": 0.5925, "grad_norm": 0.9999852180480957, "learning_rate": 0.0002, "epoch": 4.29443447037702, "step": 59800}, {"loss": 0.636, "grad_norm": 0.9323953986167908, "learning_rate": 0.0002, "epoch": 4.295152603231598, "step": 59810}, {"loss": 0.5743, "grad_norm": 0.903037965297699, "learning_rate": 0.0002, "epoch": 4.295870736086176, "step": 59820}, {"loss": 0.6008, "grad_norm": 1.2462431192398071, "learning_rate": 0.0002, "epoch": 4.296588868940754, "step": 59830}, {"loss": 0.6126, "grad_norm": 1.2322230339050293, "learning_rate": 0.0002, "epoch": 4.297307001795332, "step": 59840}, {"loss": 0.6029, "grad_norm": 0.9584668278694153, "learning_rate": 0.0002, "epoch": 4.29802513464991, "step": 59850}, {"loss": 0.6179, "grad_norm": 0.9664767980575562, "learning_rate": 0.0002, "epoch": 4.298743267504488, "step": 59860}, {"loss": 0.5909, "grad_norm": 0.8860437273979187, "learning_rate": 0.0002, "epoch": 4.299461400359067, "step": 59870}, {"loss": 0.5708, "grad_norm": 1.0825127363204956, "learning_rate": 0.0002, "epoch": 4.300179533213645, "step": 59880}, {"loss": 0.6338, "grad_norm": 1.1312100887298584, "learning_rate": 0.0002, "epoch": 4.300897666068223, "step": 59890}, {"loss": 0.6362, "grad_norm": 0.8289751410484314, "learning_rate": 0.0002, "epoch": 4.301615798922801, "step": 59900}, {"loss": 0.6061, "grad_norm": 0.8990927934646606, "learning_rate": 0.0002, "epoch": 4.302333931777379, "step": 59910}, {"loss": 0.5993, "grad_norm": 0.9667525887489319, "learning_rate": 0.0002, "epoch": 4.303052064631957, "step": 59920}, {"loss": 0.5756, "grad_norm": 0.8656060695648193, "learning_rate": 0.0002, "epoch": 4.303770197486535, "step": 59930}, {"loss": 0.6271, "grad_norm": 0.8909396529197693, "learning_rate": 0.0002, "epoch": 4.304488330341113, "step": 59940}, {"loss": 0.5918, "grad_norm": 0.9533283114433289, "learning_rate": 0.0002, "epoch": 4.305206463195692, "step": 59950}, {"loss": 0.6146, "grad_norm": 0.9090739488601685, "learning_rate": 0.0002, "epoch": 4.30592459605027, "step": 59960}, {"loss": 0.5949, "grad_norm": 1.096656322479248, "learning_rate": 0.0002, "epoch": 4.306642728904848, "step": 59970}, {"loss": 0.582, "grad_norm": 1.0392465591430664, "learning_rate": 0.0002, "epoch": 4.307360861759426, "step": 59980}, {"loss": 0.6552, "grad_norm": 0.8733913898468018, "learning_rate": 0.0002, "epoch": 4.308078994614004, "step": 59990}, {"loss": 0.5771, "grad_norm": 0.8287094235420227, "learning_rate": 0.0002, "epoch": 4.308797127468582, "step": 60000}, {"loss": 0.6157, "grad_norm": 0.9267017245292664, "learning_rate": 0.0002, "epoch": 4.30951526032316, "step": 60010}, {"loss": 0.6402, "grad_norm": 0.9969515800476074, "learning_rate": 0.0002, "epoch": 4.310233393177738, "step": 60020}, {"loss": 0.541, "grad_norm": 1.0005015134811401, "learning_rate": 0.0002, "epoch": 4.310951526032316, "step": 60030}, {"loss": 0.6295, "grad_norm": 1.1215369701385498, "learning_rate": 0.0002, "epoch": 4.311669658886894, "step": 60040}, {"loss": 0.6225, "grad_norm": 1.0434890985488892, "learning_rate": 0.0002, "epoch": 4.312387791741473, "step": 60050}, {"loss": 0.5962, "grad_norm": 0.967989981174469, "learning_rate": 0.0002, "epoch": 4.313105924596051, "step": 60060}, {"loss": 0.5862, "grad_norm": 1.007599115371704, "learning_rate": 0.0002, "epoch": 4.313824057450629, "step": 60070}, {"loss": 0.6233, "grad_norm": 0.9356340765953064, "learning_rate": 0.0002, "epoch": 4.314542190305207, "step": 60080}, {"loss": 0.5642, "grad_norm": 0.9566757678985596, "learning_rate": 0.0002, "epoch": 4.315260323159785, "step": 60090}, {"loss": 0.6142, "grad_norm": 1.1066830158233643, "learning_rate": 0.0002, "epoch": 4.315978456014363, "step": 60100}, {"loss": 0.5432, "grad_norm": 0.9895772933959961, "learning_rate": 0.0002, "epoch": 4.316696588868941, "step": 60110}, {"loss": 0.5542, "grad_norm": 1.07423734664917, "learning_rate": 0.0002, "epoch": 4.317414721723519, "step": 60120}, {"loss": 0.5975, "grad_norm": 1.0777037143707275, "learning_rate": 0.0002, "epoch": 4.318132854578097, "step": 60130}, {"loss": 0.6168, "grad_norm": 1.1475656032562256, "learning_rate": 0.0002, "epoch": 4.3188509874326755, "step": 60140}, {"loss": 0.6038, "grad_norm": 1.0705864429473877, "learning_rate": 0.0002, "epoch": 4.3195691202872535, "step": 60150}, {"loss": 0.6032, "grad_norm": 0.8676854968070984, "learning_rate": 0.0002, "epoch": 4.3202872531418315, "step": 60160}, {"loss": 0.632, "grad_norm": 0.9488174319267273, "learning_rate": 0.0002, "epoch": 4.3210053859964095, "step": 60170}, {"loss": 0.6137, "grad_norm": 1.1171153783798218, "learning_rate": 0.0002, "epoch": 4.3217235188509875, "step": 60180}, {"loss": 0.6477, "grad_norm": 1.091435194015503, "learning_rate": 0.0002, "epoch": 4.3224416517055655, "step": 60190}, {"loss": 0.6105, "grad_norm": 0.880944013595581, "learning_rate": 0.0002, "epoch": 4.3231597845601435, "step": 60200}, {"loss": 0.5736, "grad_norm": 0.8458809852600098, "learning_rate": 0.0002, "epoch": 4.3238779174147215, "step": 60210}, {"loss": 0.6211, "grad_norm": 0.7900225520133972, "learning_rate": 0.0002, "epoch": 4.3245960502692995, "step": 60220}, {"loss": 0.6205, "grad_norm": 0.966742753982544, "learning_rate": 0.0002, "epoch": 4.3253141831238775, "step": 60230}, {"loss": 0.6178, "grad_norm": 0.8948110342025757, "learning_rate": 0.0002, "epoch": 4.326032315978456, "step": 60240}, {"loss": 0.6176, "grad_norm": 0.8598700165748596, "learning_rate": 0.0002, "epoch": 4.326750448833034, "step": 60250}, {"loss": 0.6373, "grad_norm": 1.127610206604004, "learning_rate": 0.0002, "epoch": 4.327468581687612, "step": 60260}, {"loss": 0.6081, "grad_norm": 0.8357340693473816, "learning_rate": 0.0002, "epoch": 4.32818671454219, "step": 60270}, {"loss": 0.5839, "grad_norm": 0.8771896362304688, "learning_rate": 0.0002, "epoch": 4.328904847396768, "step": 60280}, {"loss": 0.5959, "grad_norm": 0.9202101826667786, "learning_rate": 0.0002, "epoch": 4.329622980251346, "step": 60290}, {"loss": 0.6387, "grad_norm": 1.1427538394927979, "learning_rate": 0.0002, "epoch": 4.330341113105924, "step": 60300}, {"loss": 0.6306, "grad_norm": 0.8711863160133362, "learning_rate": 0.0002, "epoch": 4.331059245960502, "step": 60310}, {"loss": 0.6011, "grad_norm": 0.972723662853241, "learning_rate": 0.0002, "epoch": 4.33177737881508, "step": 60320}, {"loss": 0.5761, "grad_norm": 1.1496877670288086, "learning_rate": 0.0002, "epoch": 4.332495511669659, "step": 60330}, {"loss": 0.6472, "grad_norm": 1.008581519126892, "learning_rate": 0.0002, "epoch": 4.333213644524237, "step": 60340}, {"loss": 0.6479, "grad_norm": 1.0802706480026245, "learning_rate": 0.0002, "epoch": 4.333931777378815, "step": 60350}, {"loss": 0.6105, "grad_norm": 0.8394291996955872, "learning_rate": 0.0002, "epoch": 4.334649910233393, "step": 60360}, {"loss": 0.6241, "grad_norm": 0.8355905413627625, "learning_rate": 0.0002, "epoch": 4.335368043087971, "step": 60370}, {"loss": 0.6282, "grad_norm": 0.9583960175514221, "learning_rate": 0.0002, "epoch": 4.336086175942549, "step": 60380}, {"loss": 0.6436, "grad_norm": 1.138934850692749, "learning_rate": 0.0002, "epoch": 4.336804308797127, "step": 60390}, {"loss": 0.587, "grad_norm": 1.0334709882736206, "learning_rate": 0.0002, "epoch": 4.337522441651705, "step": 60400}, {"loss": 0.5596, "grad_norm": 0.729686439037323, "learning_rate": 0.0002, "epoch": 4.338240574506283, "step": 60410}, {"loss": 0.5863, "grad_norm": 0.8735929727554321, "learning_rate": 0.0002, "epoch": 4.338958707360861, "step": 60420}, {"loss": 0.5732, "grad_norm": 0.9617681503295898, "learning_rate": 0.0002, "epoch": 4.33967684021544, "step": 60430}, {"loss": 0.5865, "grad_norm": 0.9439655542373657, "learning_rate": 0.0002, "epoch": 4.340394973070018, "step": 60440}, {"loss": 0.5959, "grad_norm": 0.9275408387184143, "learning_rate": 0.0002, "epoch": 4.341113105924596, "step": 60450}, {"loss": 0.6295, "grad_norm": 1.0693308115005493, "learning_rate": 0.0002, "epoch": 4.341831238779174, "step": 60460}, {"loss": 0.6455, "grad_norm": 0.9234438538551331, "learning_rate": 0.0002, "epoch": 4.342549371633752, "step": 60470}, {"loss": 0.6308, "grad_norm": 1.1376168727874756, "learning_rate": 0.0002, "epoch": 4.34326750448833, "step": 60480}, {"loss": 0.623, "grad_norm": 0.9218108654022217, "learning_rate": 0.0002, "epoch": 4.343985637342908, "step": 60490}, {"loss": 0.6291, "grad_norm": 1.1467362642288208, "learning_rate": 0.0002, "epoch": 4.344703770197486, "step": 60500}, {"loss": 0.5757, "grad_norm": 0.9459165930747986, "learning_rate": 0.0002, "epoch": 4.345421903052064, "step": 60510}, {"loss": 0.5963, "grad_norm": 0.9460827708244324, "learning_rate": 0.0002, "epoch": 4.346140035906643, "step": 60520}, {"loss": 0.5822, "grad_norm": 1.0845041275024414, "learning_rate": 0.0002, "epoch": 4.346858168761221, "step": 60530}, {"loss": 0.6326, "grad_norm": 1.082675576210022, "learning_rate": 0.0002, "epoch": 4.347576301615799, "step": 60540}, {"loss": 0.5419, "grad_norm": 0.8443698883056641, "learning_rate": 0.0002, "epoch": 4.348294434470377, "step": 60550}, {"loss": 0.5634, "grad_norm": 1.018393874168396, "learning_rate": 0.0002, "epoch": 4.349012567324955, "step": 60560}, {"loss": 0.6447, "grad_norm": 0.8796373009681702, "learning_rate": 0.0002, "epoch": 4.349730700179533, "step": 60570}, {"loss": 0.6108, "grad_norm": 1.097942590713501, "learning_rate": 0.0002, "epoch": 4.350448833034111, "step": 60580}, {"loss": 0.6161, "grad_norm": 0.8750485181808472, "learning_rate": 0.0002, "epoch": 4.351166965888689, "step": 60590}, {"loss": 0.5849, "grad_norm": 1.0339995622634888, "learning_rate": 0.0002, "epoch": 4.351885098743267, "step": 60600}, {"loss": 0.6097, "grad_norm": 0.9077731966972351, "learning_rate": 0.0002, "epoch": 4.352603231597846, "step": 60610}, {"loss": 0.5657, "grad_norm": 1.051321029663086, "learning_rate": 0.0002, "epoch": 4.353321364452424, "step": 60620}, {"loss": 0.6089, "grad_norm": 1.0018669366836548, "learning_rate": 0.0002, "epoch": 4.354039497307002, "step": 60630}, {"loss": 0.5957, "grad_norm": 1.0349196195602417, "learning_rate": 0.0002, "epoch": 4.35475763016158, "step": 60640}, {"loss": 0.6212, "grad_norm": 1.009589672088623, "learning_rate": 0.0002, "epoch": 4.355475763016158, "step": 60650}, {"loss": 0.5542, "grad_norm": 1.0463480949401855, "learning_rate": 0.0002, "epoch": 4.356193895870736, "step": 60660}, {"loss": 0.5797, "grad_norm": 0.9815132021903992, "learning_rate": 0.0002, "epoch": 4.356912028725314, "step": 60670}, {"loss": 0.6089, "grad_norm": 1.0977262258529663, "learning_rate": 0.0002, "epoch": 4.357630161579892, "step": 60680}, {"loss": 0.6061, "grad_norm": 0.8450005054473877, "learning_rate": 0.0002, "epoch": 4.35834829443447, "step": 60690}, {"loss": 0.5913, "grad_norm": 1.0959078073501587, "learning_rate": 0.0002, "epoch": 4.359066427289049, "step": 60700}, {"loss": 0.5957, "grad_norm": 0.9155098795890808, "learning_rate": 0.0002, "epoch": 4.359784560143627, "step": 60710}, {"loss": 0.6084, "grad_norm": 0.9267987012863159, "learning_rate": 0.0002, "epoch": 4.360502692998205, "step": 60720}, {"loss": 0.5974, "grad_norm": 1.177472472190857, "learning_rate": 0.0002, "epoch": 4.361220825852783, "step": 60730}, {"loss": 0.5911, "grad_norm": 0.8615312576293945, "learning_rate": 0.0002, "epoch": 4.361938958707361, "step": 60740}, {"loss": 0.5819, "grad_norm": 1.0939710140228271, "learning_rate": 0.0002, "epoch": 4.362657091561939, "step": 60750}, {"loss": 0.6263, "grad_norm": 1.0928049087524414, "learning_rate": 0.0002, "epoch": 4.363375224416517, "step": 60760}, {"loss": 0.5772, "grad_norm": 1.0796833038330078, "learning_rate": 0.0002, "epoch": 4.364093357271095, "step": 60770}, {"loss": 0.5879, "grad_norm": 0.9768339991569519, "learning_rate": 0.0002, "epoch": 4.364811490125673, "step": 60780}, {"loss": 0.6335, "grad_norm": 0.9082722067832947, "learning_rate": 0.0002, "epoch": 4.365529622980251, "step": 60790}, {"loss": 0.6037, "grad_norm": 0.9614832997322083, "learning_rate": 0.0002, "epoch": 4.36624775583483, "step": 60800}, {"loss": 0.6185, "grad_norm": 0.8874651789665222, "learning_rate": 0.0002, "epoch": 4.366965888689408, "step": 60810}, {"loss": 0.6524, "grad_norm": 0.8810178637504578, "learning_rate": 0.0002, "epoch": 4.367684021543986, "step": 60820}, {"loss": 0.5908, "grad_norm": 1.0893806219100952, "learning_rate": 0.0002, "epoch": 4.368402154398564, "step": 60830}, {"loss": 0.5782, "grad_norm": 0.9042278528213501, "learning_rate": 0.0002, "epoch": 4.369120287253142, "step": 60840}, {"loss": 0.5798, "grad_norm": 1.0832217931747437, "learning_rate": 0.0002, "epoch": 4.36983842010772, "step": 60850}, {"loss": 0.6235, "grad_norm": 0.9431114792823792, "learning_rate": 0.0002, "epoch": 4.370556552962298, "step": 60860}, {"loss": 0.5869, "grad_norm": 1.031553030014038, "learning_rate": 0.0002, "epoch": 4.371274685816876, "step": 60870}, {"loss": 0.5839, "grad_norm": 0.8702824711799622, "learning_rate": 0.0002, "epoch": 4.371992818671454, "step": 60880}, {"loss": 0.6028, "grad_norm": 1.1109199523925781, "learning_rate": 0.0002, "epoch": 4.372710951526033, "step": 60890}, {"loss": 0.6423, "grad_norm": 0.8369361162185669, "learning_rate": 0.0002, "epoch": 4.373429084380611, "step": 60900}, {"loss": 0.6011, "grad_norm": 0.988915205001831, "learning_rate": 0.0002, "epoch": 4.374147217235189, "step": 60910}, {"loss": 0.6266, "grad_norm": 0.9365919232368469, "learning_rate": 0.0002, "epoch": 4.374865350089767, "step": 60920}, {"loss": 0.5786, "grad_norm": 0.9789398908615112, "learning_rate": 0.0002, "epoch": 4.375583482944345, "step": 60930}, {"loss": 0.6459, "grad_norm": 0.8786931037902832, "learning_rate": 0.0002, "epoch": 4.376301615798923, "step": 60940}, {"loss": 0.631, "grad_norm": 0.8891511559486389, "learning_rate": 0.0002, "epoch": 4.377019748653501, "step": 60950}, {"loss": 0.5909, "grad_norm": 0.9561707377433777, "learning_rate": 0.0002, "epoch": 4.377737881508079, "step": 60960}, {"loss": 0.5815, "grad_norm": 0.8674200177192688, "learning_rate": 0.0002, "epoch": 4.378456014362657, "step": 60970}, {"loss": 0.5664, "grad_norm": 0.9285916090011597, "learning_rate": 0.0002, "epoch": 4.379174147217235, "step": 60980}, {"loss": 0.5727, "grad_norm": 0.9185547232627869, "learning_rate": 0.0002, "epoch": 4.379892280071814, "step": 60990}, {"loss": 0.6296, "grad_norm": 1.081664800643921, "learning_rate": 0.0002, "epoch": 4.380610412926392, "step": 61000}, {"loss": 0.6346, "grad_norm": 1.0475854873657227, "learning_rate": 0.0002, "epoch": 4.38132854578097, "step": 61010}, {"loss": 0.6394, "grad_norm": 1.1519653797149658, "learning_rate": 0.0002, "epoch": 4.382046678635548, "step": 61020}, {"loss": 0.6437, "grad_norm": 0.8757607936859131, "learning_rate": 0.0002, "epoch": 4.382764811490126, "step": 61030}, {"loss": 0.6143, "grad_norm": 0.8707934021949768, "learning_rate": 0.0002, "epoch": 4.383482944344704, "step": 61040}, {"loss": 0.5782, "grad_norm": 1.1807516813278198, "learning_rate": 0.0002, "epoch": 4.384201077199282, "step": 61050}, {"loss": 0.5901, "grad_norm": 1.0674688816070557, "learning_rate": 0.0002, "epoch": 4.38491921005386, "step": 61060}, {"loss": 0.6247, "grad_norm": 0.9321209788322449, "learning_rate": 0.0002, "epoch": 4.385637342908438, "step": 61070}, {"loss": 0.5882, "grad_norm": 1.0786446332931519, "learning_rate": 0.0002, "epoch": 4.3863554757630165, "step": 61080}, {"loss": 0.5966, "grad_norm": 0.9733907580375671, "learning_rate": 0.0002, "epoch": 4.3870736086175945, "step": 61090}, {"loss": 0.5826, "grad_norm": 0.9476010203361511, "learning_rate": 0.0002, "epoch": 4.3877917414721725, "step": 61100}, {"loss": 0.6204, "grad_norm": 1.1321563720703125, "learning_rate": 0.0002, "epoch": 4.3885098743267505, "step": 61110}, {"loss": 0.5908, "grad_norm": 0.9379117488861084, "learning_rate": 0.0002, "epoch": 4.3892280071813286, "step": 61120}, {"loss": 0.586, "grad_norm": 0.8409728407859802, "learning_rate": 0.0002, "epoch": 4.3899461400359066, "step": 61130}, {"loss": 0.614, "grad_norm": 0.8309189081192017, "learning_rate": 0.0002, "epoch": 4.3906642728904846, "step": 61140}, {"loss": 0.6284, "grad_norm": 0.8922196626663208, "learning_rate": 0.0002, "epoch": 4.391382405745063, "step": 61150}, {"loss": 0.6358, "grad_norm": 0.8274614214897156, "learning_rate": 0.0002, "epoch": 4.392100538599641, "step": 61160}, {"loss": 0.5827, "grad_norm": 1.0928618907928467, "learning_rate": 0.0002, "epoch": 4.392818671454219, "step": 61170}, {"loss": 0.616, "grad_norm": 0.9771125316619873, "learning_rate": 0.0002, "epoch": 4.3935368043087974, "step": 61180}, {"loss": 0.6238, "grad_norm": 0.8844535946846008, "learning_rate": 0.0002, "epoch": 4.3942549371633755, "step": 61190}, {"loss": 0.5974, "grad_norm": 1.0498822927474976, "learning_rate": 0.0002, "epoch": 4.3949730700179535, "step": 61200}, {"loss": 0.596, "grad_norm": 0.9882155060768127, "learning_rate": 0.0002, "epoch": 4.3956912028725315, "step": 61210}, {"loss": 0.6385, "grad_norm": 1.090356707572937, "learning_rate": 0.0002, "epoch": 4.3964093357271095, "step": 61220}, {"loss": 0.6298, "grad_norm": 1.0908088684082031, "learning_rate": 0.0002, "epoch": 4.3971274685816875, "step": 61230}, {"loss": 0.6405, "grad_norm": 1.0013501644134521, "learning_rate": 0.0002, "epoch": 4.3978456014362655, "step": 61240}, {"loss": 0.5995, "grad_norm": 1.0916062593460083, "learning_rate": 0.0002, "epoch": 4.3985637342908435, "step": 61250}, {"loss": 0.5938, "grad_norm": 1.0817667245864868, "learning_rate": 0.0002, "epoch": 4.399281867145422, "step": 61260}, {"loss": 0.604, "grad_norm": 0.9745162129402161, "learning_rate": 0.0002, "epoch": 4.4, "step": 61270}, {"loss": 0.6028, "grad_norm": 1.0653400421142578, "learning_rate": 0.0002, "epoch": 4.400718132854578, "step": 61280}, {"loss": 0.6064, "grad_norm": 1.0082067251205444, "learning_rate": 0.0002, "epoch": 4.401436265709156, "step": 61290}, {"loss": 0.5719, "grad_norm": 0.7963659167289734, "learning_rate": 0.0002, "epoch": 4.402154398563734, "step": 61300}, {"loss": 0.6724, "grad_norm": 1.0428845882415771, "learning_rate": 0.0002, "epoch": 4.402872531418312, "step": 61310}, {"loss": 0.5991, "grad_norm": 0.9205707311630249, "learning_rate": 0.0002, "epoch": 4.40359066427289, "step": 61320}, {"loss": 0.6169, "grad_norm": 1.0103533267974854, "learning_rate": 0.0002, "epoch": 4.404308797127468, "step": 61330}, {"loss": 0.6284, "grad_norm": 1.113547682762146, "learning_rate": 0.0002, "epoch": 4.405026929982046, "step": 61340}, {"loss": 0.6071, "grad_norm": 1.137488842010498, "learning_rate": 0.0002, "epoch": 4.405745062836624, "step": 61350}, {"loss": 0.6303, "grad_norm": 1.1284101009368896, "learning_rate": 0.0002, "epoch": 4.406463195691203, "step": 61360}, {"loss": 0.5613, "grad_norm": 0.8010451197624207, "learning_rate": 0.0002, "epoch": 4.407181328545781, "step": 61370}, {"loss": 0.5963, "grad_norm": 0.8893977403640747, "learning_rate": 0.0002, "epoch": 4.407899461400359, "step": 61380}, {"loss": 0.6154, "grad_norm": 0.9098272323608398, "learning_rate": 0.0002, "epoch": 4.408617594254937, "step": 61390}, {"loss": 0.6091, "grad_norm": 1.0613329410552979, "learning_rate": 0.0002, "epoch": 4.409335727109515, "step": 61400}, {"loss": 0.6222, "grad_norm": 1.0070269107818604, "learning_rate": 0.0002, "epoch": 4.410053859964093, "step": 61410}, {"loss": 0.5894, "grad_norm": 0.8632227778434753, "learning_rate": 0.0002, "epoch": 4.410771992818671, "step": 61420}, {"loss": 0.6412, "grad_norm": 1.0183731317520142, "learning_rate": 0.0002, "epoch": 4.411490125673249, "step": 61430}, {"loss": 0.596, "grad_norm": 0.9049941897392273, "learning_rate": 0.0002, "epoch": 4.412208258527827, "step": 61440}, {"loss": 0.5991, "grad_norm": 1.0184082984924316, "learning_rate": 0.0002, "epoch": 4.412926391382406, "step": 61450}, {"loss": 0.5758, "grad_norm": 0.9994277358055115, "learning_rate": 0.0002, "epoch": 4.413644524236984, "step": 61460}, {"loss": 0.6009, "grad_norm": 1.0112420320510864, "learning_rate": 0.0002, "epoch": 4.414362657091562, "step": 61470}, {"loss": 0.584, "grad_norm": 0.9751759171485901, "learning_rate": 0.0002, "epoch": 4.41508078994614, "step": 61480}, {"loss": 0.6307, "grad_norm": 1.047135591506958, "learning_rate": 0.0002, "epoch": 4.415798922800718, "step": 61490}, {"loss": 0.6645, "grad_norm": 0.886282742023468, "learning_rate": 0.0002, "epoch": 4.416517055655296, "step": 61500}, {"loss": 0.6168, "grad_norm": 0.971964418888092, "learning_rate": 0.0002, "epoch": 4.417235188509874, "step": 61510}, {"loss": 0.5822, "grad_norm": 0.9603846073150635, "learning_rate": 0.0002, "epoch": 4.417953321364452, "step": 61520}, {"loss": 0.6349, "grad_norm": 1.060042142868042, "learning_rate": 0.0002, "epoch": 4.41867145421903, "step": 61530}, {"loss": 0.6223, "grad_norm": 1.1231369972229004, "learning_rate": 0.0002, "epoch": 4.419389587073608, "step": 61540}, {"loss": 0.6175, "grad_norm": 0.8269591331481934, "learning_rate": 0.0002, "epoch": 4.420107719928187, "step": 61550}, {"loss": 0.6285, "grad_norm": 1.0341241359710693, "learning_rate": 0.0002, "epoch": 4.420825852782765, "step": 61560}, {"loss": 0.6054, "grad_norm": 0.7276636958122253, "learning_rate": 0.0002, "epoch": 4.421543985637343, "step": 61570}, {"loss": 0.6321, "grad_norm": 1.0663669109344482, "learning_rate": 0.0002, "epoch": 4.422262118491921, "step": 61580}, {"loss": 0.5944, "grad_norm": 0.9764387011528015, "learning_rate": 0.0002, "epoch": 4.422980251346499, "step": 61590}, {"loss": 0.6065, "grad_norm": 1.0953258275985718, "learning_rate": 0.0002, "epoch": 4.423698384201077, "step": 61600}, {"loss": 0.5815, "grad_norm": 0.8877012729644775, "learning_rate": 0.0002, "epoch": 4.424416517055655, "step": 61610}, {"loss": 0.5798, "grad_norm": 0.8781440854072571, "learning_rate": 0.0002, "epoch": 4.425134649910233, "step": 61620}, {"loss": 0.6223, "grad_norm": 0.8333432674407959, "learning_rate": 0.0002, "epoch": 4.425852782764811, "step": 61630}, {"loss": 0.5949, "grad_norm": 0.9647989869117737, "learning_rate": 0.0002, "epoch": 4.42657091561939, "step": 61640}, {"loss": 0.6135, "grad_norm": 1.0801783800125122, "learning_rate": 0.0002, "epoch": 4.427289048473968, "step": 61650}, {"loss": 0.6065, "grad_norm": 0.8215882778167725, "learning_rate": 0.0002, "epoch": 4.428007181328546, "step": 61660}, {"loss": 0.5851, "grad_norm": 0.9853931665420532, "learning_rate": 0.0002, "epoch": 4.428725314183124, "step": 61670}, {"loss": 0.5942, "grad_norm": 0.8658010959625244, "learning_rate": 0.0002, "epoch": 4.429443447037702, "step": 61680}, {"loss": 0.6413, "grad_norm": 1.124064326286316, "learning_rate": 0.0002, "epoch": 4.43016157989228, "step": 61690}, {"loss": 0.6021, "grad_norm": 1.009340763092041, "learning_rate": 0.0002, "epoch": 4.430879712746858, "step": 61700}, {"loss": 0.6127, "grad_norm": 0.8705293536186218, "learning_rate": 0.0002, "epoch": 4.431597845601436, "step": 61710}, {"loss": 0.5971, "grad_norm": 1.1323511600494385, "learning_rate": 0.0002, "epoch": 4.432315978456014, "step": 61720}, {"loss": 0.5985, "grad_norm": 1.1203019618988037, "learning_rate": 0.0002, "epoch": 4.433034111310592, "step": 61730}, {"loss": 0.6178, "grad_norm": 1.1683770418167114, "learning_rate": 0.0002, "epoch": 4.433752244165171, "step": 61740}, {"loss": 0.6132, "grad_norm": 1.0735899209976196, "learning_rate": 0.0002, "epoch": 4.434470377019749, "step": 61750}, {"loss": 0.5664, "grad_norm": 1.142496109008789, "learning_rate": 0.0002, "epoch": 4.435188509874327, "step": 61760}, {"loss": 0.6276, "grad_norm": 1.1157732009887695, "learning_rate": 0.0002, "epoch": 4.435906642728905, "step": 61770}, {"loss": 0.6237, "grad_norm": 0.8845949172973633, "learning_rate": 0.0002, "epoch": 4.436624775583483, "step": 61780}, {"loss": 0.5964, "grad_norm": 1.1212759017944336, "learning_rate": 0.0002, "epoch": 4.437342908438061, "step": 61790}, {"loss": 0.6185, "grad_norm": 0.8832488656044006, "learning_rate": 0.0002, "epoch": 4.438061041292639, "step": 61800}, {"loss": 0.6264, "grad_norm": 0.9059590101242065, "learning_rate": 0.0002, "epoch": 4.438779174147217, "step": 61810}, {"loss": 0.6303, "grad_norm": 1.0625685453414917, "learning_rate": 0.0002, "epoch": 4.439497307001796, "step": 61820}, {"loss": 0.5795, "grad_norm": 0.9565598368644714, "learning_rate": 0.0002, "epoch": 4.440215439856374, "step": 61830}, {"loss": 0.6027, "grad_norm": 0.8975377082824707, "learning_rate": 0.0002, "epoch": 4.440933572710952, "step": 61840}, {"loss": 0.6334, "grad_norm": 1.0412718057632446, "learning_rate": 0.0002, "epoch": 4.44165170556553, "step": 61850}, {"loss": 0.6455, "grad_norm": 0.9923529624938965, "learning_rate": 0.0002, "epoch": 4.442369838420108, "step": 61860}, {"loss": 0.5931, "grad_norm": 1.3025734424591064, "learning_rate": 0.0002, "epoch": 4.443087971274686, "step": 61870}, {"loss": 0.5804, "grad_norm": 1.0031960010528564, "learning_rate": 0.0002, "epoch": 4.443806104129264, "step": 61880}, {"loss": 0.602, "grad_norm": 1.0974701642990112, "learning_rate": 0.0002, "epoch": 4.444524236983842, "step": 61890}, {"loss": 0.6078, "grad_norm": 1.1044024229049683, "learning_rate": 0.0002, "epoch": 4.44524236983842, "step": 61900}, {"loss": 0.6454, "grad_norm": 1.0782772302627563, "learning_rate": 0.0002, "epoch": 4.445960502692998, "step": 61910}, {"loss": 0.6453, "grad_norm": 1.006304383277893, "learning_rate": 0.0002, "epoch": 4.446678635547577, "step": 61920}, {"loss": 0.5449, "grad_norm": 0.9258833527565002, "learning_rate": 0.0002, "epoch": 4.447396768402155, "step": 61930}, {"loss": 0.5744, "grad_norm": 0.9888426065444946, "learning_rate": 0.0002, "epoch": 4.448114901256733, "step": 61940}, {"loss": 0.5853, "grad_norm": 0.9592963457107544, "learning_rate": 0.0002, "epoch": 4.448833034111311, "step": 61950}, {"loss": 0.6142, "grad_norm": 1.0527986288070679, "learning_rate": 0.0002, "epoch": 4.449551166965889, "step": 61960}, {"loss": 0.5829, "grad_norm": 0.8613291382789612, "learning_rate": 0.0002, "epoch": 4.450269299820467, "step": 61970}, {"loss": 0.6176, "grad_norm": 1.1083767414093018, "learning_rate": 0.0002, "epoch": 4.450987432675045, "step": 61980}, {"loss": 0.5768, "grad_norm": 0.772679328918457, "learning_rate": 0.0002, "epoch": 4.451705565529623, "step": 61990}, {"loss": 0.6348, "grad_norm": 0.9052274227142334, "learning_rate": 0.0002, "epoch": 4.452423698384201, "step": 62000}, {"loss": 0.6202, "grad_norm": 1.129667043685913, "learning_rate": 0.0002, "epoch": 4.45314183123878, "step": 62010}, {"loss": 0.6265, "grad_norm": 0.9994529485702515, "learning_rate": 0.0002, "epoch": 4.453859964093358, "step": 62020}, {"loss": 0.6249, "grad_norm": 0.982155978679657, "learning_rate": 0.0002, "epoch": 4.454578096947936, "step": 62030}, {"loss": 0.6255, "grad_norm": 0.9139904975891113, "learning_rate": 0.0002, "epoch": 4.455296229802514, "step": 62040}, {"loss": 0.6237, "grad_norm": 1.0877810716629028, "learning_rate": 0.0002, "epoch": 4.456014362657092, "step": 62050}, {"loss": 0.6105, "grad_norm": 1.0535308122634888, "learning_rate": 0.0002, "epoch": 4.45673249551167, "step": 62060}, {"loss": 0.6084, "grad_norm": 1.0225313901901245, "learning_rate": 0.0002, "epoch": 4.457450628366248, "step": 62070}, {"loss": 0.6239, "grad_norm": 0.8443132042884827, "learning_rate": 0.0002, "epoch": 4.458168761220826, "step": 62080}, {"loss": 0.5895, "grad_norm": 1.0426654815673828, "learning_rate": 0.0002, "epoch": 4.458886894075404, "step": 62090}, {"loss": 0.6022, "grad_norm": 1.1110700368881226, "learning_rate": 0.0002, "epoch": 4.459605026929982, "step": 62100}, {"loss": 0.6436, "grad_norm": 1.0200893878936768, "learning_rate": 0.0002, "epoch": 4.4603231597845605, "step": 62110}, {"loss": 0.628, "grad_norm": 0.9102830290794373, "learning_rate": 0.0002, "epoch": 4.4610412926391385, "step": 62120}, {"loss": 0.5894, "grad_norm": 1.1395094394683838, "learning_rate": 0.0002, "epoch": 4.4617594254937165, "step": 62130}, {"loss": 0.5765, "grad_norm": 1.1202316284179688, "learning_rate": 0.0002, "epoch": 4.4624775583482945, "step": 62140}, {"loss": 0.6238, "grad_norm": 1.142580509185791, "learning_rate": 0.0002, "epoch": 4.4631956912028725, "step": 62150}, {"loss": 0.6502, "grad_norm": 0.9843677878379822, "learning_rate": 0.0002, "epoch": 4.4639138240574505, "step": 62160}, {"loss": 0.6734, "grad_norm": 1.0351676940917969, "learning_rate": 0.0002, "epoch": 4.4646319569120285, "step": 62170}, {"loss": 0.6371, "grad_norm": 0.9365093111991882, "learning_rate": 0.0002, "epoch": 4.4653500897666065, "step": 62180}, {"loss": 0.5827, "grad_norm": 1.041193962097168, "learning_rate": 0.0002, "epoch": 4.4660682226211845, "step": 62190}, {"loss": 0.555, "grad_norm": 0.9686329960823059, "learning_rate": 0.0002, "epoch": 4.466786355475763, "step": 62200}, {"loss": 0.6405, "grad_norm": 1.028622031211853, "learning_rate": 0.0002, "epoch": 4.467504488330341, "step": 62210}, {"loss": 0.5928, "grad_norm": 0.9717516899108887, "learning_rate": 0.0002, "epoch": 4.468222621184919, "step": 62220}, {"loss": 0.6028, "grad_norm": 1.0467450618743896, "learning_rate": 0.0002, "epoch": 4.468940754039497, "step": 62230}, {"loss": 0.593, "grad_norm": 0.943717896938324, "learning_rate": 0.0002, "epoch": 4.469658886894075, "step": 62240}, {"loss": 0.5861, "grad_norm": 0.909429132938385, "learning_rate": 0.0002, "epoch": 4.470377019748653, "step": 62250}, {"loss": 0.6211, "grad_norm": 1.0294792652130127, "learning_rate": 0.0002, "epoch": 4.471095152603231, "step": 62260}, {"loss": 0.6215, "grad_norm": 1.1044281721115112, "learning_rate": 0.0002, "epoch": 4.471813285457809, "step": 62270}, {"loss": 0.6147, "grad_norm": 1.1555784940719604, "learning_rate": 0.0002, "epoch": 4.472531418312387, "step": 62280}, {"loss": 0.627, "grad_norm": 0.9441297650337219, "learning_rate": 0.0002, "epoch": 4.473249551166965, "step": 62290}, {"loss": 0.6205, "grad_norm": 0.9164380431175232, "learning_rate": 0.0002, "epoch": 4.473967684021544, "step": 62300}, {"loss": 0.6413, "grad_norm": 1.1139159202575684, "learning_rate": 0.0002, "epoch": 4.474685816876122, "step": 62310}, {"loss": 0.6013, "grad_norm": 1.0201882123947144, "learning_rate": 0.0002, "epoch": 4.4754039497307, "step": 62320}, {"loss": 0.6127, "grad_norm": 1.1471681594848633, "learning_rate": 0.0002, "epoch": 4.476122082585278, "step": 62330}, {"loss": 0.6322, "grad_norm": 1.0333549976348877, "learning_rate": 0.0002, "epoch": 4.476840215439856, "step": 62340}, {"loss": 0.654, "grad_norm": 0.8929767608642578, "learning_rate": 0.0002, "epoch": 4.477558348294434, "step": 62350}, {"loss": 0.6325, "grad_norm": 0.9465752840042114, "learning_rate": 0.0002, "epoch": 4.478276481149012, "step": 62360}, {"loss": 0.619, "grad_norm": 1.2155033349990845, "learning_rate": 0.0002, "epoch": 4.47899461400359, "step": 62370}, {"loss": 0.5538, "grad_norm": 0.7181217074394226, "learning_rate": 0.0002, "epoch": 4.479712746858169, "step": 62380}, {"loss": 0.6236, "grad_norm": 1.0052744150161743, "learning_rate": 0.0002, "epoch": 4.480430879712747, "step": 62390}, {"loss": 0.6443, "grad_norm": 0.8522219061851501, "learning_rate": 0.0002, "epoch": 4.481149012567325, "step": 62400}, {"loss": 0.6073, "grad_norm": 0.8844723105430603, "learning_rate": 0.0002, "epoch": 4.481867145421903, "step": 62410}, {"loss": 0.6193, "grad_norm": 0.9542465209960938, "learning_rate": 0.0002, "epoch": 4.482585278276481, "step": 62420}, {"loss": 0.6099, "grad_norm": 0.8963674306869507, "learning_rate": 0.0002, "epoch": 4.483303411131059, "step": 62430}, {"loss": 0.5826, "grad_norm": 0.8105363845825195, "learning_rate": 0.0002, "epoch": 4.484021543985637, "step": 62440}, {"loss": 0.6688, "grad_norm": 0.9618421196937561, "learning_rate": 0.0002, "epoch": 4.484739676840215, "step": 62450}, {"loss": 0.6042, "grad_norm": 1.1931076049804688, "learning_rate": 0.0002, "epoch": 4.485457809694793, "step": 62460}, {"loss": 0.5869, "grad_norm": 0.7406999468803406, "learning_rate": 0.0002, "epoch": 4.486175942549371, "step": 62470}, {"loss": 0.604, "grad_norm": 0.7698216438293457, "learning_rate": 0.0002, "epoch": 4.48689407540395, "step": 62480}, {"loss": 0.6062, "grad_norm": 0.862271249294281, "learning_rate": 0.0002, "epoch": 4.487612208258528, "step": 62490}, {"loss": 0.645, "grad_norm": 1.0025171041488647, "learning_rate": 0.0002, "epoch": 4.488330341113106, "step": 62500}, {"loss": 0.5727, "grad_norm": 0.8474493622779846, "learning_rate": 0.0002, "epoch": 4.489048473967684, "step": 62510}, {"loss": 0.6907, "grad_norm": 0.8965697884559631, "learning_rate": 0.0002, "epoch": 4.489766606822262, "step": 62520}, {"loss": 0.5846, "grad_norm": 1.1276488304138184, "learning_rate": 0.0002, "epoch": 4.49048473967684, "step": 62530}, {"loss": 0.6018, "grad_norm": 1.0253537893295288, "learning_rate": 0.0002, "epoch": 4.491202872531418, "step": 62540}, {"loss": 0.5831, "grad_norm": 1.1750596761703491, "learning_rate": 0.0002, "epoch": 4.491921005385996, "step": 62550}, {"loss": 0.6272, "grad_norm": 0.9951794147491455, "learning_rate": 0.0002, "epoch": 4.492639138240574, "step": 62560}, {"loss": 0.5931, "grad_norm": 1.2510017156600952, "learning_rate": 0.0002, "epoch": 4.493357271095153, "step": 62570}, {"loss": 0.6268, "grad_norm": 1.4066375494003296, "learning_rate": 0.0002, "epoch": 4.494075403949731, "step": 62580}, {"loss": 0.6274, "grad_norm": 0.988175094127655, "learning_rate": 0.0002, "epoch": 4.494793536804309, "step": 62590}, {"loss": 0.607, "grad_norm": 1.2049115896224976, "learning_rate": 0.0002, "epoch": 4.495511669658887, "step": 62600}, {"loss": 0.6384, "grad_norm": 0.962464451789856, "learning_rate": 0.0002, "epoch": 4.496229802513465, "step": 62610}, {"loss": 0.6436, "grad_norm": 0.9324793815612793, "learning_rate": 0.0002, "epoch": 4.496947935368043, "step": 62620}, {"loss": 0.6568, "grad_norm": 0.9174214005470276, "learning_rate": 0.0002, "epoch": 4.497666068222621, "step": 62630}, {"loss": 0.6146, "grad_norm": 0.9729902148246765, "learning_rate": 0.0002, "epoch": 4.498384201077199, "step": 62640}, {"loss": 0.6564, "grad_norm": 1.0190484523773193, "learning_rate": 0.0002, "epoch": 4.499102333931777, "step": 62650}, {"loss": 0.6571, "grad_norm": 1.1473679542541504, "learning_rate": 0.0002, "epoch": 4.499820466786355, "step": 62660}, {"loss": 0.6115, "grad_norm": 1.0160558223724365, "learning_rate": 0.0002, "epoch": 4.500538599640934, "step": 62670}, {"loss": 0.6206, "grad_norm": 0.8083887100219727, "learning_rate": 0.0002, "epoch": 4.501256732495512, "step": 62680}, {"loss": 0.6107, "grad_norm": 0.941933274269104, "learning_rate": 0.0002, "epoch": 4.50197486535009, "step": 62690}, {"loss": 0.6181, "grad_norm": 0.9962822794914246, "learning_rate": 0.0002, "epoch": 4.502692998204668, "step": 62700}, {"loss": 0.6364, "grad_norm": 0.8993943333625793, "learning_rate": 0.0002, "epoch": 4.503411131059246, "step": 62710}, {"loss": 0.6141, "grad_norm": 0.9438319206237793, "learning_rate": 0.0002, "epoch": 4.504129263913824, "step": 62720}, {"loss": 0.6453, "grad_norm": 0.7951892018318176, "learning_rate": 0.0002, "epoch": 4.504847396768402, "step": 62730}, {"loss": 0.616, "grad_norm": 0.8875413537025452, "learning_rate": 0.0002, "epoch": 4.50556552962298, "step": 62740}, {"loss": 0.5702, "grad_norm": 0.993819534778595, "learning_rate": 0.0002, "epoch": 4.506283662477558, "step": 62750}, {"loss": 0.6427, "grad_norm": 0.9177559018135071, "learning_rate": 0.0002, "epoch": 4.507001795332137, "step": 62760}, {"loss": 0.6278, "grad_norm": 0.8632771968841553, "learning_rate": 0.0002, "epoch": 4.507719928186715, "step": 62770}, {"loss": 0.6665, "grad_norm": 0.943778395652771, "learning_rate": 0.0002, "epoch": 4.508438061041293, "step": 62780}, {"loss": 0.6068, "grad_norm": 0.8754997849464417, "learning_rate": 0.0002, "epoch": 4.509156193895871, "step": 62790}, {"loss": 0.6345, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 4.509874326750449, "step": 62800}, {"loss": 0.6057, "grad_norm": 1.1156457662582397, "learning_rate": 0.0002, "epoch": 4.510592459605027, "step": 62810}, {"loss": 0.5915, "grad_norm": 0.9178887009620667, "learning_rate": 0.0002, "epoch": 4.511310592459605, "step": 62820}, {"loss": 0.6081, "grad_norm": 0.9520689249038696, "learning_rate": 0.0002, "epoch": 4.512028725314183, "step": 62830}, {"loss": 0.6434, "grad_norm": 0.8880525231361389, "learning_rate": 0.0002, "epoch": 4.512746858168761, "step": 62840}, {"loss": 0.6895, "grad_norm": 0.9541497826576233, "learning_rate": 0.0002, "epoch": 4.513464991023339, "step": 62850}, {"loss": 0.6675, "grad_norm": 1.003766417503357, "learning_rate": 0.0002, "epoch": 4.514183123877918, "step": 62860}, {"loss": 0.6412, "grad_norm": 0.8844705820083618, "learning_rate": 0.0002, "epoch": 4.514901256732496, "step": 62870}, {"loss": 0.6289, "grad_norm": 1.1870828866958618, "learning_rate": 0.0002, "epoch": 4.515619389587074, "step": 62880}, {"loss": 0.6611, "grad_norm": 0.863487184047699, "learning_rate": 0.0002, "epoch": 4.516337522441652, "step": 62890}, {"loss": 0.59, "grad_norm": 0.997770369052887, "learning_rate": 0.0002, "epoch": 4.51705565529623, "step": 62900}, {"loss": 0.6476, "grad_norm": 0.9708612561225891, "learning_rate": 0.0002, "epoch": 4.517773788150808, "step": 62910}, {"loss": 0.6084, "grad_norm": 1.1381206512451172, "learning_rate": 0.0002, "epoch": 4.518491921005386, "step": 62920}, {"loss": 0.5739, "grad_norm": 1.0386693477630615, "learning_rate": 0.0002, "epoch": 4.519210053859964, "step": 62930}, {"loss": 0.6038, "grad_norm": 1.1711705923080444, "learning_rate": 0.0002, "epoch": 4.519928186714543, "step": 62940}, {"loss": 0.6276, "grad_norm": 0.8727447390556335, "learning_rate": 0.0002, "epoch": 4.520646319569121, "step": 62950}, {"loss": 0.6298, "grad_norm": 0.9215193390846252, "learning_rate": 0.0002, "epoch": 4.521364452423699, "step": 62960}, {"loss": 0.6199, "grad_norm": 1.005467176437378, "learning_rate": 0.0002, "epoch": 4.522082585278277, "step": 62970}, {"loss": 0.6324, "grad_norm": 0.8761187791824341, "learning_rate": 0.0002, "epoch": 4.522800718132855, "step": 62980}, {"loss": 0.6152, "grad_norm": 0.957848310470581, "learning_rate": 0.0002, "epoch": 4.523518850987433, "step": 62990}, {"loss": 0.5752, "grad_norm": 0.8634148836135864, "learning_rate": 0.0002, "epoch": 4.524236983842011, "step": 63000}, {"loss": 0.6127, "grad_norm": 0.9557477235794067, "learning_rate": 0.0002, "epoch": 4.524955116696589, "step": 63010}, {"loss": 0.5708, "grad_norm": 1.017720341682434, "learning_rate": 0.0002, "epoch": 4.525673249551167, "step": 63020}, {"loss": 0.6186, "grad_norm": 1.0281825065612793, "learning_rate": 0.0002, "epoch": 4.526391382405745, "step": 63030}, {"loss": 0.6221, "grad_norm": 1.253974437713623, "learning_rate": 0.0002, "epoch": 4.527109515260323, "step": 63040}, {"loss": 0.6381, "grad_norm": 0.8489068150520325, "learning_rate": 0.0002, "epoch": 4.527827648114902, "step": 63050}, {"loss": 0.6022, "grad_norm": 0.9681686162948608, "learning_rate": 0.0002, "epoch": 4.52854578096948, "step": 63060}, {"loss": 0.6166, "grad_norm": 1.10277259349823, "learning_rate": 0.0002, "epoch": 4.529263913824058, "step": 63070}, {"loss": 0.5838, "grad_norm": 0.9469163417816162, "learning_rate": 0.0002, "epoch": 4.529982046678636, "step": 63080}, {"loss": 0.6323, "grad_norm": 1.1228134632110596, "learning_rate": 0.0002, "epoch": 4.530700179533214, "step": 63090}, {"loss": 0.6143, "grad_norm": 0.9673212170600891, "learning_rate": 0.0002, "epoch": 4.531418312387792, "step": 63100}, {"loss": 0.713, "grad_norm": 1.0221107006072998, "learning_rate": 0.0002, "epoch": 4.53213644524237, "step": 63110}, {"loss": 0.6099, "grad_norm": 0.826372504234314, "learning_rate": 0.0002, "epoch": 4.532854578096948, "step": 63120}, {"loss": 0.6487, "grad_norm": 1.1805331707000732, "learning_rate": 0.0002, "epoch": 4.5335727109515265, "step": 63130}, {"loss": 0.6088, "grad_norm": 0.9645666480064392, "learning_rate": 0.0002, "epoch": 4.5342908438061045, "step": 63140}, {"loss": 0.6049, "grad_norm": 1.0838309526443481, "learning_rate": 0.0002, "epoch": 4.5350089766606825, "step": 63150}, {"loss": 0.5972, "grad_norm": 1.061414361000061, "learning_rate": 0.0002, "epoch": 4.5357271095152605, "step": 63160}, {"loss": 0.5706, "grad_norm": 0.841961145401001, "learning_rate": 0.0002, "epoch": 4.5364452423698385, "step": 63170}, {"loss": 0.6168, "grad_norm": 1.1220186948776245, "learning_rate": 0.0002, "epoch": 4.5371633752244165, "step": 63180}, {"loss": 0.6055, "grad_norm": 1.036441445350647, "learning_rate": 0.0002, "epoch": 4.5378815080789945, "step": 63190}, {"loss": 0.619, "grad_norm": 0.9089716076850891, "learning_rate": 0.0002, "epoch": 4.5385996409335725, "step": 63200}, {"loss": 0.6373, "grad_norm": 0.8699982762336731, "learning_rate": 0.0002, "epoch": 4.5393177737881505, "step": 63210}, {"loss": 0.6082, "grad_norm": 0.8489565253257751, "learning_rate": 0.0002, "epoch": 4.5400359066427285, "step": 63220}, {"loss": 0.5957, "grad_norm": 0.7778416275978088, "learning_rate": 0.0002, "epoch": 4.540754039497307, "step": 63230}, {"loss": 0.6109, "grad_norm": 1.0625852346420288, "learning_rate": 0.0002, "epoch": 4.541472172351885, "step": 63240}, {"loss": 0.6039, "grad_norm": 0.8515732884407043, "learning_rate": 0.0002, "epoch": 4.542190305206463, "step": 63250}, {"loss": 0.5827, "grad_norm": 0.7679561376571655, "learning_rate": 0.0002, "epoch": 4.542908438061041, "step": 63260}, {"loss": 0.5948, "grad_norm": 0.7358446717262268, "learning_rate": 0.0002, "epoch": 4.543626570915619, "step": 63270}, {"loss": 0.6265, "grad_norm": 1.0866128206253052, "learning_rate": 0.0002, "epoch": 4.544344703770197, "step": 63280}, {"loss": 0.6622, "grad_norm": 1.0870225429534912, "learning_rate": 0.0002, "epoch": 4.545062836624775, "step": 63290}, {"loss": 0.5859, "grad_norm": 0.951095461845398, "learning_rate": 0.0002, "epoch": 4.545780969479353, "step": 63300}, {"loss": 0.6252, "grad_norm": 1.0914306640625, "learning_rate": 0.0002, "epoch": 4.546499102333931, "step": 63310}, {"loss": 0.6504, "grad_norm": 0.8676106333732605, "learning_rate": 0.0002, "epoch": 4.54721723518851, "step": 63320}, {"loss": 0.6088, "grad_norm": 1.0129096508026123, "learning_rate": 0.0002, "epoch": 4.547935368043088, "step": 63330}, {"loss": 0.617, "grad_norm": 0.8710526823997498, "learning_rate": 0.0002, "epoch": 4.548653500897666, "step": 63340}, {"loss": 0.6336, "grad_norm": 0.7014815807342529, "learning_rate": 0.0002, "epoch": 4.549371633752244, "step": 63350}, {"loss": 0.5758, "grad_norm": 1.1546777486801147, "learning_rate": 0.0002, "epoch": 4.550089766606822, "step": 63360}, {"loss": 0.5976, "grad_norm": 0.7464957237243652, "learning_rate": 0.0002, "epoch": 4.5508078994614, "step": 63370}, {"loss": 0.6016, "grad_norm": 0.9976209998130798, "learning_rate": 0.0002, "epoch": 4.551526032315978, "step": 63380}, {"loss": 0.5784, "grad_norm": 0.9543681740760803, "learning_rate": 0.0002, "epoch": 4.552244165170556, "step": 63390}, {"loss": 0.5873, "grad_norm": 1.1498578786849976, "learning_rate": 0.0002, "epoch": 4.552962298025134, "step": 63400}, {"loss": 0.6445, "grad_norm": 1.0162293910980225, "learning_rate": 0.0002, "epoch": 4.553680430879712, "step": 63410}, {"loss": 0.5677, "grad_norm": 0.9015304446220398, "learning_rate": 0.0002, "epoch": 4.554398563734291, "step": 63420}, {"loss": 0.6257, "grad_norm": 1.1639831066131592, "learning_rate": 0.0002, "epoch": 4.555116696588869, "step": 63430}, {"loss": 0.6763, "grad_norm": 0.9494703412055969, "learning_rate": 0.0002, "epoch": 4.555834829443447, "step": 63440}, {"loss": 0.5955, "grad_norm": 1.0555956363677979, "learning_rate": 0.0002, "epoch": 4.556552962298025, "step": 63450}, {"loss": 0.6634, "grad_norm": 0.8513827919960022, "learning_rate": 0.0002, "epoch": 4.557271095152603, "step": 63460}, {"loss": 0.6507, "grad_norm": 1.0614275932312012, "learning_rate": 0.0002, "epoch": 4.557989228007181, "step": 63470}, {"loss": 0.5619, "grad_norm": 0.8341137766838074, "learning_rate": 0.0002, "epoch": 4.558707360861759, "step": 63480}, {"loss": 0.6147, "grad_norm": 1.2136222124099731, "learning_rate": 0.0002, "epoch": 4.559425493716337, "step": 63490}, {"loss": 0.6313, "grad_norm": 0.8806019425392151, "learning_rate": 0.0002, "epoch": 4.560143626570916, "step": 63500}, {"loss": 0.6012, "grad_norm": 1.2548854351043701, "learning_rate": 0.0002, "epoch": 4.560861759425494, "step": 63510}, {"loss": 0.5995, "grad_norm": 1.0162668228149414, "learning_rate": 0.0002, "epoch": 4.561579892280072, "step": 63520}, {"loss": 0.5895, "grad_norm": 1.0487624406814575, "learning_rate": 0.0002, "epoch": 4.56229802513465, "step": 63530}, {"loss": 0.5997, "grad_norm": 1.2505502700805664, "learning_rate": 0.0002, "epoch": 4.563016157989228, "step": 63540}, {"loss": 0.618, "grad_norm": 0.9930511713027954, "learning_rate": 0.0002, "epoch": 4.563734290843806, "step": 63550}, {"loss": 0.6695, "grad_norm": 0.8132568001747131, "learning_rate": 0.0002, "epoch": 4.564452423698384, "step": 63560}, {"loss": 0.6221, "grad_norm": 1.0129177570343018, "learning_rate": 0.0002, "epoch": 4.565170556552962, "step": 63570}, {"loss": 0.6463, "grad_norm": 0.9011693596839905, "learning_rate": 0.0002, "epoch": 4.56588868940754, "step": 63580}, {"loss": 0.6046, "grad_norm": 0.9161545634269714, "learning_rate": 0.0002, "epoch": 4.566606822262118, "step": 63590}, {"loss": 0.6413, "grad_norm": 0.8852348327636719, "learning_rate": 0.0002, "epoch": 4.567324955116696, "step": 63600}, {"loss": 0.6282, "grad_norm": 0.8579391837120056, "learning_rate": 0.0002, "epoch": 4.568043087971275, "step": 63610}, {"loss": 0.6041, "grad_norm": 0.9271050095558167, "learning_rate": 0.0002, "epoch": 4.568761220825853, "step": 63620}, {"loss": 0.6156, "grad_norm": 0.9881834983825684, "learning_rate": 0.0002, "epoch": 4.569479353680431, "step": 63630}, {"loss": 0.6164, "grad_norm": 1.0255686044692993, "learning_rate": 0.0002, "epoch": 4.570197486535009, "step": 63640}, {"loss": 0.6416, "grad_norm": 0.8758876919746399, "learning_rate": 0.0002, "epoch": 4.570915619389587, "step": 63650}, {"loss": 0.6787, "grad_norm": 1.0134185552597046, "learning_rate": 0.0002, "epoch": 4.571633752244165, "step": 63660}, {"loss": 0.6245, "grad_norm": 0.8535705208778381, "learning_rate": 0.0002, "epoch": 4.572351885098743, "step": 63670}, {"loss": 0.6282, "grad_norm": 0.9614834785461426, "learning_rate": 0.0002, "epoch": 4.573070017953321, "step": 63680}, {"loss": 0.6461, "grad_norm": 0.9004243612289429, "learning_rate": 0.0002, "epoch": 4.5737881508079, "step": 63690}, {"loss": 0.6172, "grad_norm": 0.9563080072402954, "learning_rate": 0.0002, "epoch": 4.574506283662478, "step": 63700}, {"loss": 0.6059, "grad_norm": 1.024857521057129, "learning_rate": 0.0002, "epoch": 4.575224416517056, "step": 63710}, {"loss": 0.6188, "grad_norm": 0.9345638155937195, "learning_rate": 0.0002, "epoch": 4.575942549371634, "step": 63720}, {"loss": 0.6814, "grad_norm": 1.27083158493042, "learning_rate": 0.0002, "epoch": 4.576660682226212, "step": 63730}, {"loss": 0.5987, "grad_norm": 1.0866559743881226, "learning_rate": 0.0002, "epoch": 4.57737881508079, "step": 63740}, {"loss": 0.5738, "grad_norm": 0.9253925681114197, "learning_rate": 0.0002, "epoch": 4.578096947935368, "step": 63750}, {"loss": 0.5981, "grad_norm": 0.8127399682998657, "learning_rate": 0.0002, "epoch": 4.578815080789946, "step": 63760}, {"loss": 0.6321, "grad_norm": 1.0453993082046509, "learning_rate": 0.0002, "epoch": 4.579533213644524, "step": 63770}, {"loss": 0.6423, "grad_norm": 1.2227544784545898, "learning_rate": 0.0002, "epoch": 4.580251346499102, "step": 63780}, {"loss": 0.6405, "grad_norm": 1.0207865238189697, "learning_rate": 0.0002, "epoch": 4.580969479353681, "step": 63790}, {"loss": 0.6268, "grad_norm": 1.030447244644165, "learning_rate": 0.0002, "epoch": 4.581687612208259, "step": 63800}, {"loss": 0.6014, "grad_norm": 1.0855677127838135, "learning_rate": 0.0002, "epoch": 4.582405745062837, "step": 63810}, {"loss": 0.6204, "grad_norm": 0.9572556018829346, "learning_rate": 0.0002, "epoch": 4.583123877917415, "step": 63820}, {"loss": 0.6094, "grad_norm": 0.9061040282249451, "learning_rate": 0.0002, "epoch": 4.583842010771993, "step": 63830}, {"loss": 0.6074, "grad_norm": 0.9267677068710327, "learning_rate": 0.0002, "epoch": 4.584560143626571, "step": 63840}, {"loss": 0.6525, "grad_norm": 1.070076823234558, "learning_rate": 0.0002, "epoch": 4.585278276481149, "step": 63850}, {"loss": 0.6074, "grad_norm": 1.045881748199463, "learning_rate": 0.0002, "epoch": 4.585996409335727, "step": 63860}, {"loss": 0.6106, "grad_norm": 0.9190576672554016, "learning_rate": 0.0002, "epoch": 4.586714542190305, "step": 63870}, {"loss": 0.6213, "grad_norm": 0.9263932704925537, "learning_rate": 0.0002, "epoch": 4.587432675044884, "step": 63880}, {"loss": 0.6077, "grad_norm": 1.0217589139938354, "learning_rate": 0.0002, "epoch": 4.588150807899462, "step": 63890}, {"loss": 0.5798, "grad_norm": 0.9200088381767273, "learning_rate": 0.0002, "epoch": 4.58886894075404, "step": 63900}, {"loss": 0.6311, "grad_norm": 0.9877251386642456, "learning_rate": 0.0002, "epoch": 4.589587073608618, "step": 63910}, {"loss": 0.5981, "grad_norm": 1.0059093236923218, "learning_rate": 0.0002, "epoch": 4.590305206463196, "step": 63920}, {"loss": 0.6265, "grad_norm": 1.2618095874786377, "learning_rate": 0.0002, "epoch": 4.591023339317774, "step": 63930}, {"loss": 0.583, "grad_norm": 1.1779268980026245, "learning_rate": 0.0002, "epoch": 4.591741472172352, "step": 63940}, {"loss": 0.6232, "grad_norm": 1.2339502573013306, "learning_rate": 0.0002, "epoch": 4.59245960502693, "step": 63950}, {"loss": 0.5985, "grad_norm": 0.7488788366317749, "learning_rate": 0.0002, "epoch": 4.593177737881508, "step": 63960}, {"loss": 0.5991, "grad_norm": 0.8366380929946899, "learning_rate": 0.0002, "epoch": 4.593895870736086, "step": 63970}, {"loss": 0.5864, "grad_norm": 1.0292677879333496, "learning_rate": 0.0002, "epoch": 4.594614003590665, "step": 63980}, {"loss": 0.666, "grad_norm": 0.7938551306724548, "learning_rate": 0.0002, "epoch": 4.595332136445243, "step": 63990}, {"loss": 0.6202, "grad_norm": 0.7958516478538513, "learning_rate": 0.0002, "epoch": 4.596050269299821, "step": 64000}, {"loss": 0.5868, "grad_norm": 0.9613908529281616, "learning_rate": 0.0002, "epoch": 4.596768402154399, "step": 64010}, {"loss": 0.6299, "grad_norm": 1.0253773927688599, "learning_rate": 0.0002, "epoch": 4.597486535008977, "step": 64020}, {"loss": 0.5964, "grad_norm": 1.0560888051986694, "learning_rate": 0.0002, "epoch": 4.598204667863555, "step": 64030}, {"loss": 0.6681, "grad_norm": 1.1093556880950928, "learning_rate": 0.0002, "epoch": 4.598922800718133, "step": 64040}, {"loss": 0.6097, "grad_norm": 0.8492098450660706, "learning_rate": 0.0002, "epoch": 4.599640933572711, "step": 64050}, {"loss": 0.6029, "grad_norm": 1.0070436000823975, "learning_rate": 0.0002, "epoch": 4.6003590664272895, "step": 64060}, {"loss": 0.6392, "grad_norm": 0.9774282574653625, "learning_rate": 0.0002, "epoch": 4.6010771992818675, "step": 64070}, {"loss": 0.6397, "grad_norm": 1.0744960308074951, "learning_rate": 0.0002, "epoch": 4.6017953321364455, "step": 64080}, {"loss": 0.6491, "grad_norm": 1.0101491212844849, "learning_rate": 0.0002, "epoch": 4.6025134649910235, "step": 64090}, {"loss": 0.594, "grad_norm": 1.2306591272354126, "learning_rate": 0.0002, "epoch": 4.6032315978456015, "step": 64100}, {"loss": 0.5783, "grad_norm": 0.9187033176422119, "learning_rate": 0.0002, "epoch": 4.6039497307001795, "step": 64110}, {"loss": 0.5982, "grad_norm": 0.9178676605224609, "learning_rate": 0.0002, "epoch": 4.6046678635547575, "step": 64120}, {"loss": 0.6074, "grad_norm": 1.006374716758728, "learning_rate": 0.0002, "epoch": 4.6053859964093355, "step": 64130}, {"loss": 0.6402, "grad_norm": 1.0774449110031128, "learning_rate": 0.0002, "epoch": 4.6061041292639135, "step": 64140}, {"loss": 0.6076, "grad_norm": 1.0360658168792725, "learning_rate": 0.0002, "epoch": 4.6068222621184916, "step": 64150}, {"loss": 0.6259, "grad_norm": 1.1061090230941772, "learning_rate": 0.0002, "epoch": 4.6075403949730696, "step": 64160}, {"loss": 0.6304, "grad_norm": 1.0320971012115479, "learning_rate": 0.0002, "epoch": 4.608258527827648, "step": 64170}, {"loss": 0.6182, "grad_norm": 0.8596988916397095, "learning_rate": 0.0002, "epoch": 4.6089766606822264, "step": 64180}, {"loss": 0.5646, "grad_norm": 1.1665741205215454, "learning_rate": 0.0002, "epoch": 4.6096947935368044, "step": 64190}, {"loss": 0.6219, "grad_norm": 0.857207715511322, "learning_rate": 0.0002, "epoch": 4.6104129263913824, "step": 64200}, {"loss": 0.6271, "grad_norm": 1.0088987350463867, "learning_rate": 0.0002, "epoch": 4.6111310592459605, "step": 64210}, {"loss": 0.6209, "grad_norm": 1.0985605716705322, "learning_rate": 0.0002, "epoch": 4.6118491921005385, "step": 64220}, {"loss": 0.6455, "grad_norm": 0.9504913687705994, "learning_rate": 0.0002, "epoch": 4.6125673249551165, "step": 64230}, {"loss": 0.6054, "grad_norm": 0.8415018916130066, "learning_rate": 0.0002, "epoch": 4.6132854578096945, "step": 64240}, {"loss": 0.5975, "grad_norm": 0.9857034087181091, "learning_rate": 0.0002, "epoch": 4.614003590664273, "step": 64250}, {"loss": 0.6347, "grad_norm": 1.0164235830307007, "learning_rate": 0.0002, "epoch": 4.614721723518851, "step": 64260}, {"loss": 0.5877, "grad_norm": 0.949481725692749, "learning_rate": 0.0002, "epoch": 4.615439856373429, "step": 64270}, {"loss": 0.5737, "grad_norm": 0.9526455998420715, "learning_rate": 0.0002, "epoch": 4.616157989228007, "step": 64280}, {"loss": 0.6134, "grad_norm": 1.1121242046356201, "learning_rate": 0.0002, "epoch": 4.616876122082585, "step": 64290}, {"loss": 0.6152, "grad_norm": 0.9598871469497681, "learning_rate": 0.0002, "epoch": 4.617594254937163, "step": 64300}, {"loss": 0.6405, "grad_norm": 1.0406304597854614, "learning_rate": 0.0002, "epoch": 4.618312387791741, "step": 64310}, {"loss": 0.5971, "grad_norm": 1.1816964149475098, "learning_rate": 0.0002, "epoch": 4.619030520646319, "step": 64320}, {"loss": 0.6483, "grad_norm": 0.9818326830863953, "learning_rate": 0.0002, "epoch": 4.619748653500897, "step": 64330}, {"loss": 0.6141, "grad_norm": 0.952017605304718, "learning_rate": 0.0002, "epoch": 4.620466786355475, "step": 64340}, {"loss": 0.6146, "grad_norm": 1.1263453960418701, "learning_rate": 0.0002, "epoch": 4.621184919210053, "step": 64350}, {"loss": 0.5973, "grad_norm": 1.1158473491668701, "learning_rate": 0.0002, "epoch": 4.621903052064632, "step": 64360}, {"loss": 0.6029, "grad_norm": 0.9056766033172607, "learning_rate": 0.0002, "epoch": 4.62262118491921, "step": 64370}, {"loss": 0.6488, "grad_norm": 0.8113203048706055, "learning_rate": 0.0002, "epoch": 4.623339317773788, "step": 64380}, {"loss": 0.6391, "grad_norm": 0.8646712899208069, "learning_rate": 0.0002, "epoch": 4.624057450628366, "step": 64390}, {"loss": 0.6191, "grad_norm": 1.0064425468444824, "learning_rate": 0.0002, "epoch": 4.624775583482944, "step": 64400}, {"loss": 0.5826, "grad_norm": 0.9867565631866455, "learning_rate": 0.0002, "epoch": 4.625493716337522, "step": 64410}, {"loss": 0.6409, "grad_norm": 1.018764615058899, "learning_rate": 0.0002, "epoch": 4.6262118491921, "step": 64420}, {"loss": 0.5992, "grad_norm": 1.0607863664627075, "learning_rate": 0.0002, "epoch": 4.626929982046678, "step": 64430}, {"loss": 0.6502, "grad_norm": 1.012825846672058, "learning_rate": 0.0002, "epoch": 4.627648114901257, "step": 64440}, {"loss": 0.6074, "grad_norm": 0.8441653847694397, "learning_rate": 0.0002, "epoch": 4.628366247755835, "step": 64450}, {"loss": 0.6462, "grad_norm": 0.9819194674491882, "learning_rate": 0.0002, "epoch": 4.629084380610413, "step": 64460}, {"loss": 0.5983, "grad_norm": 0.925519585609436, "learning_rate": 0.0002, "epoch": 4.629802513464991, "step": 64470}, {"loss": 0.5959, "grad_norm": 0.9409030079841614, "learning_rate": 0.0002, "epoch": 4.630520646319569, "step": 64480}, {"loss": 0.6265, "grad_norm": 1.148024559020996, "learning_rate": 0.0002, "epoch": 4.631238779174147, "step": 64490}, {"loss": 0.6556, "grad_norm": 0.8225533962249756, "learning_rate": 0.0002, "epoch": 4.631956912028725, "step": 64500}, {"loss": 0.5922, "grad_norm": 0.8806734681129456, "learning_rate": 0.0002, "epoch": 4.632675044883303, "step": 64510}, {"loss": 0.6202, "grad_norm": 0.9656694531440735, "learning_rate": 0.0002, "epoch": 4.633393177737881, "step": 64520}, {"loss": 0.6044, "grad_norm": 0.9977783560752869, "learning_rate": 0.0002, "epoch": 4.634111310592459, "step": 64530}, {"loss": 0.5741, "grad_norm": 0.9259420037269592, "learning_rate": 0.0002, "epoch": 4.634829443447038, "step": 64540}, {"loss": 0.5801, "grad_norm": 1.0215885639190674, "learning_rate": 0.0002, "epoch": 4.635547576301616, "step": 64550}, {"loss": 0.6492, "grad_norm": 1.1082557439804077, "learning_rate": 0.0002, "epoch": 4.636265709156194, "step": 64560}, {"loss": 0.6285, "grad_norm": 1.1183207035064697, "learning_rate": 0.0002, "epoch": 4.636983842010772, "step": 64570}, {"loss": 0.6216, "grad_norm": 0.9914339184761047, "learning_rate": 0.0002, "epoch": 4.63770197486535, "step": 64580}, {"loss": 0.6416, "grad_norm": 0.8065831661224365, "learning_rate": 0.0002, "epoch": 4.638420107719928, "step": 64590}, {"loss": 0.6078, "grad_norm": 1.1546721458435059, "learning_rate": 0.0002, "epoch": 4.639138240574506, "step": 64600}, {"loss": 0.6219, "grad_norm": 1.0395900011062622, "learning_rate": 0.0002, "epoch": 4.639856373429084, "step": 64610}, {"loss": 0.5939, "grad_norm": 0.9957455992698669, "learning_rate": 0.0002, "epoch": 4.640574506283663, "step": 64620}, {"loss": 0.6653, "grad_norm": 1.069557785987854, "learning_rate": 0.0002, "epoch": 4.641292639138241, "step": 64630}, {"loss": 0.6546, "grad_norm": 1.005236268043518, "learning_rate": 0.0002, "epoch": 4.642010771992819, "step": 64640}, {"loss": 0.6262, "grad_norm": 1.0216304063796997, "learning_rate": 0.0002, "epoch": 4.642728904847397, "step": 64650}, {"loss": 0.6756, "grad_norm": 0.8567317128181458, "learning_rate": 0.0002, "epoch": 4.643447037701975, "step": 64660}, {"loss": 0.5997, "grad_norm": 1.0386067628860474, "learning_rate": 0.0002, "epoch": 4.644165170556553, "step": 64670}, {"loss": 0.6471, "grad_norm": 0.9566055536270142, "learning_rate": 0.0002, "epoch": 4.644883303411131, "step": 64680}, {"loss": 0.6601, "grad_norm": 1.0990564823150635, "learning_rate": 0.0002, "epoch": 4.645601436265709, "step": 64690}, {"loss": 0.6418, "grad_norm": 0.9962695240974426, "learning_rate": 0.0002, "epoch": 4.646319569120287, "step": 64700}, {"loss": 0.6442, "grad_norm": 0.9041377305984497, "learning_rate": 0.0002, "epoch": 4.647037701974865, "step": 64710}, {"loss": 0.6276, "grad_norm": 0.8611233234405518, "learning_rate": 0.0002, "epoch": 4.647755834829443, "step": 64720}, {"loss": 0.6015, "grad_norm": 1.1569812297821045, "learning_rate": 0.0002, "epoch": 4.648473967684022, "step": 64730}, {"loss": 0.6169, "grad_norm": 0.7946197390556335, "learning_rate": 0.0002, "epoch": 4.6491921005386, "step": 64740}, {"loss": 0.668, "grad_norm": 0.9612061381340027, "learning_rate": 0.0002, "epoch": 4.649910233393178, "step": 64750}, {"loss": 0.6741, "grad_norm": 0.9669303297996521, "learning_rate": 0.0002, "epoch": 4.650628366247756, "step": 64760}, {"loss": 0.593, "grad_norm": 0.8117775321006775, "learning_rate": 0.0002, "epoch": 4.651346499102334, "step": 64770}, {"loss": 0.6915, "grad_norm": 1.2326241731643677, "learning_rate": 0.0002, "epoch": 4.652064631956912, "step": 64780}, {"loss": 0.6076, "grad_norm": 0.7494568228721619, "learning_rate": 0.0002, "epoch": 4.65278276481149, "step": 64790}, {"loss": 0.58, "grad_norm": 0.8145379424095154, "learning_rate": 0.0002, "epoch": 4.653500897666068, "step": 64800}, {"loss": 0.6351, "grad_norm": 1.0139610767364502, "learning_rate": 0.0002, "epoch": 4.654219030520647, "step": 64810}, {"loss": 0.6575, "grad_norm": 0.9887115359306335, "learning_rate": 0.0002, "epoch": 4.654937163375225, "step": 64820}, {"loss": 0.6338, "grad_norm": 0.9565147161483765, "learning_rate": 0.0002, "epoch": 4.655655296229803, "step": 64830}, {"loss": 0.6212, "grad_norm": 0.9022467136383057, "learning_rate": 0.0002, "epoch": 4.656373429084381, "step": 64840}, {"loss": 0.6395, "grad_norm": 1.075003981590271, "learning_rate": 0.0002, "epoch": 4.657091561938959, "step": 64850}, {"loss": 0.6191, "grad_norm": 0.8705733418464661, "learning_rate": 0.0002, "epoch": 4.657809694793537, "step": 64860}, {"loss": 0.5543, "grad_norm": 1.0826832056045532, "learning_rate": 0.0002, "epoch": 4.658527827648115, "step": 64870}, {"loss": 0.6363, "grad_norm": 1.1056268215179443, "learning_rate": 0.0002, "epoch": 4.659245960502693, "step": 64880}, {"loss": 0.6252, "grad_norm": 0.8664149641990662, "learning_rate": 0.0002, "epoch": 4.659964093357271, "step": 64890}, {"loss": 0.6126, "grad_norm": 0.9487230181694031, "learning_rate": 0.0002, "epoch": 4.660682226211849, "step": 64900}, {"loss": 0.5968, "grad_norm": 1.0357837677001953, "learning_rate": 0.0002, "epoch": 4.661400359066427, "step": 64910}, {"loss": 0.603, "grad_norm": 0.8620632290840149, "learning_rate": 0.0002, "epoch": 4.662118491921006, "step": 64920}, {"loss": 0.6113, "grad_norm": 1.108986735343933, "learning_rate": 0.0002, "epoch": 4.662836624775584, "step": 64930}, {"loss": 0.6115, "grad_norm": 0.8017674684524536, "learning_rate": 0.0002, "epoch": 4.663554757630162, "step": 64940}, {"loss": 0.6268, "grad_norm": 0.882347583770752, "learning_rate": 0.0002, "epoch": 4.66427289048474, "step": 64950}, {"loss": 0.657, "grad_norm": 0.9466867446899414, "learning_rate": 0.0002, "epoch": 4.664991023339318, "step": 64960}, {"loss": 0.645, "grad_norm": 1.1823636293411255, "learning_rate": 0.0002, "epoch": 4.665709156193896, "step": 64970}, {"loss": 0.5889, "grad_norm": 0.9535016417503357, "learning_rate": 0.0002, "epoch": 4.666427289048474, "step": 64980}, {"loss": 0.5986, "grad_norm": 0.9456726312637329, "learning_rate": 0.0002, "epoch": 4.667145421903052, "step": 64990}, {"loss": 0.6334, "grad_norm": 0.7761920690536499, "learning_rate": 0.0002, "epoch": 4.667863554757631, "step": 65000}, {"loss": 0.6645, "grad_norm": 1.060357689857483, "learning_rate": 0.0002, "epoch": 4.668581687612209, "step": 65010}, {"loss": 0.6369, "grad_norm": 0.9083862900733948, "learning_rate": 0.0002, "epoch": 4.669299820466787, "step": 65020}, {"loss": 0.5839, "grad_norm": 0.8745762705802917, "learning_rate": 0.0002, "epoch": 4.670017953321365, "step": 65030}, {"loss": 0.6517, "grad_norm": 0.8715422749519348, "learning_rate": 0.0002, "epoch": 4.670736086175943, "step": 65040}, {"loss": 0.6061, "grad_norm": 0.9407707452774048, "learning_rate": 0.0002, "epoch": 4.671454219030521, "step": 65050}, {"loss": 0.5928, "grad_norm": 0.8998945355415344, "learning_rate": 0.0002, "epoch": 4.672172351885099, "step": 65060}, {"loss": 0.6107, "grad_norm": 0.9147891998291016, "learning_rate": 0.0002, "epoch": 4.672890484739677, "step": 65070}, {"loss": 0.6215, "grad_norm": 1.116614580154419, "learning_rate": 0.0002, "epoch": 4.673608617594255, "step": 65080}, {"loss": 0.641, "grad_norm": 1.0764213800430298, "learning_rate": 0.0002, "epoch": 4.674326750448833, "step": 65090}, {"loss": 0.6353, "grad_norm": 0.9115945100784302, "learning_rate": 0.0002, "epoch": 4.6750448833034115, "step": 65100}, {"loss": 0.6506, "grad_norm": 1.001251459121704, "learning_rate": 0.0002, "epoch": 4.6757630161579895, "step": 65110}, {"loss": 0.6414, "grad_norm": 1.0330020189285278, "learning_rate": 0.0002, "epoch": 4.6764811490125675, "step": 65120}, {"loss": 0.6421, "grad_norm": 0.9083197116851807, "learning_rate": 0.0002, "epoch": 4.6771992818671455, "step": 65130}, {"loss": 0.5905, "grad_norm": 0.9298770427703857, "learning_rate": 0.0002, "epoch": 4.6779174147217235, "step": 65140}, {"loss": 0.633, "grad_norm": 1.0009549856185913, "learning_rate": 0.0002, "epoch": 4.6786355475763015, "step": 65150}, {"loss": 0.661, "grad_norm": 0.951389729976654, "learning_rate": 0.0002, "epoch": 4.6793536804308795, "step": 65160}, {"loss": 0.6282, "grad_norm": 1.151870608329773, "learning_rate": 0.0002, "epoch": 4.6800718132854575, "step": 65170}, {"loss": 0.5944, "grad_norm": 1.0074727535247803, "learning_rate": 0.0002, "epoch": 4.680789946140036, "step": 65180}, {"loss": 0.6539, "grad_norm": 1.0490152835845947, "learning_rate": 0.0002, "epoch": 4.681508078994614, "step": 65190}, {"loss": 0.6604, "grad_norm": 0.8967363834381104, "learning_rate": 0.0002, "epoch": 4.682226211849192, "step": 65200}, {"loss": 0.6582, "grad_norm": 1.2314889430999756, "learning_rate": 0.0002, "epoch": 4.68294434470377, "step": 65210}, {"loss": 0.6104, "grad_norm": 0.7764074802398682, "learning_rate": 0.0002, "epoch": 4.683662477558348, "step": 65220}, {"loss": 0.6401, "grad_norm": 1.0587822198867798, "learning_rate": 0.0002, "epoch": 4.684380610412926, "step": 65230}, {"loss": 0.556, "grad_norm": 0.916114091873169, "learning_rate": 0.0002, "epoch": 4.685098743267504, "step": 65240}, {"loss": 0.5912, "grad_norm": 0.9117472767829895, "learning_rate": 0.0002, "epoch": 4.685816876122082, "step": 65250}, {"loss": 0.6127, "grad_norm": 0.8369293212890625, "learning_rate": 0.0002, "epoch": 4.68653500897666, "step": 65260}, {"loss": 0.5715, "grad_norm": 0.9700121879577637, "learning_rate": 0.0002, "epoch": 4.687253141831238, "step": 65270}, {"loss": 0.6364, "grad_norm": 1.0008411407470703, "learning_rate": 0.0002, "epoch": 4.687971274685816, "step": 65280}, {"loss": 0.5816, "grad_norm": 0.9339549541473389, "learning_rate": 0.0002, "epoch": 4.688689407540395, "step": 65290}, {"loss": 0.6382, "grad_norm": 0.956701934337616, "learning_rate": 0.0002, "epoch": 4.689407540394973, "step": 65300}, {"loss": 0.6368, "grad_norm": 1.2042720317840576, "learning_rate": 0.0002, "epoch": 4.690125673249551, "step": 65310}, {"loss": 0.6138, "grad_norm": 0.8679144382476807, "learning_rate": 0.0002, "epoch": 4.690843806104129, "step": 65320}, {"loss": 0.6619, "grad_norm": 1.2320687770843506, "learning_rate": 0.0002, "epoch": 4.691561938958707, "step": 65330}, {"loss": 0.6212, "grad_norm": 0.8397238850593567, "learning_rate": 0.0002, "epoch": 4.692280071813285, "step": 65340}, {"loss": 0.578, "grad_norm": 0.7850362658500671, "learning_rate": 0.0002, "epoch": 4.692998204667863, "step": 65350}, {"loss": 0.632, "grad_norm": 0.9281290173530579, "learning_rate": 0.0002, "epoch": 4.693716337522441, "step": 65360}, {"loss": 0.6492, "grad_norm": 1.1506335735321045, "learning_rate": 0.0002, "epoch": 4.69443447037702, "step": 65370}, {"loss": 0.6503, "grad_norm": 1.0910584926605225, "learning_rate": 0.0002, "epoch": 4.695152603231598, "step": 65380}, {"loss": 0.66, "grad_norm": 0.8937386274337769, "learning_rate": 0.0002, "epoch": 4.695870736086176, "step": 65390}, {"loss": 0.6425, "grad_norm": 1.0163888931274414, "learning_rate": 0.0002, "epoch": 4.696588868940754, "step": 65400}, {"loss": 0.647, "grad_norm": 1.0290007591247559, "learning_rate": 0.0002, "epoch": 4.697307001795332, "step": 65410}, {"loss": 0.614, "grad_norm": 0.9046576023101807, "learning_rate": 0.0002, "epoch": 4.69802513464991, "step": 65420}, {"loss": 0.5844, "grad_norm": 1.0030237436294556, "learning_rate": 0.0002, "epoch": 4.698743267504488, "step": 65430}, {"loss": 0.6273, "grad_norm": 0.8196740746498108, "learning_rate": 0.0002, "epoch": 4.699461400359066, "step": 65440}, {"loss": 0.6273, "grad_norm": 0.9036651849746704, "learning_rate": 0.0002, "epoch": 4.700179533213644, "step": 65450}, {"loss": 0.6024, "grad_norm": 1.2080141305923462, "learning_rate": 0.0002, "epoch": 4.700897666068222, "step": 65460}, {"loss": 0.6461, "grad_norm": 0.8743635416030884, "learning_rate": 0.0002, "epoch": 4.7016157989228, "step": 65470}, {"loss": 0.6129, "grad_norm": 0.9566192030906677, "learning_rate": 0.0002, "epoch": 4.702333931777379, "step": 65480}, {"loss": 0.6721, "grad_norm": 1.0505144596099854, "learning_rate": 0.0002, "epoch": 4.703052064631957, "step": 65490}, {"loss": 0.6287, "grad_norm": 0.8797298073768616, "learning_rate": 0.0002, "epoch": 4.703770197486535, "step": 65500}, {"loss": 0.6515, "grad_norm": 0.9970770478248596, "learning_rate": 0.0002, "epoch": 4.704488330341113, "step": 65510}, {"loss": 0.6096, "grad_norm": 1.1743851900100708, "learning_rate": 0.0002, "epoch": 4.705206463195691, "step": 65520}, {"loss": 0.5755, "grad_norm": 0.9534381031990051, "learning_rate": 0.0002, "epoch": 4.705924596050269, "step": 65530}, {"loss": 0.6039, "grad_norm": 0.9735581278800964, "learning_rate": 0.0002, "epoch": 4.706642728904847, "step": 65540}, {"loss": 0.6217, "grad_norm": 1.185352087020874, "learning_rate": 0.0002, "epoch": 4.707360861759425, "step": 65550}, {"loss": 0.6398, "grad_norm": 0.9383901357650757, "learning_rate": 0.0002, "epoch": 4.708078994614004, "step": 65560}, {"loss": 0.6654, "grad_norm": 1.0194662809371948, "learning_rate": 0.0002, "epoch": 4.708797127468582, "step": 65570}, {"loss": 0.6008, "grad_norm": 0.8448300361633301, "learning_rate": 0.0002, "epoch": 4.70951526032316, "step": 65580}, {"loss": 0.6608, "grad_norm": 1.1930629014968872, "learning_rate": 0.0002, "epoch": 4.710233393177738, "step": 65590}, {"loss": 0.6082, "grad_norm": 1.0038636922836304, "learning_rate": 0.0002, "epoch": 4.710951526032316, "step": 65600}, {"loss": 0.6613, "grad_norm": 0.8206564784049988, "learning_rate": 0.0002, "epoch": 4.711669658886894, "step": 65610}, {"loss": 0.6142, "grad_norm": 1.0984861850738525, "learning_rate": 0.0002, "epoch": 4.712387791741472, "step": 65620}, {"loss": 0.6368, "grad_norm": 1.2891547679901123, "learning_rate": 0.0002, "epoch": 4.71310592459605, "step": 65630}, {"loss": 0.5857, "grad_norm": 0.927062451839447, "learning_rate": 0.0002, "epoch": 4.713824057450628, "step": 65640}, {"loss": 0.6187, "grad_norm": 0.8647334575653076, "learning_rate": 0.0002, "epoch": 4.714542190305206, "step": 65650}, {"loss": 0.6327, "grad_norm": 1.1017670631408691, "learning_rate": 0.0002, "epoch": 4.715260323159785, "step": 65660}, {"loss": 0.6398, "grad_norm": 0.9589072465896606, "learning_rate": 0.0002, "epoch": 4.715978456014363, "step": 65670}, {"loss": 0.6179, "grad_norm": 0.9496776461601257, "learning_rate": 0.0002, "epoch": 4.716696588868941, "step": 65680}, {"loss": 0.625, "grad_norm": 0.9266180396080017, "learning_rate": 0.0002, "epoch": 4.717414721723519, "step": 65690}, {"loss": 0.637, "grad_norm": 0.8699696063995361, "learning_rate": 0.0002, "epoch": 4.718132854578097, "step": 65700}, {"loss": 0.6402, "grad_norm": 1.0444015264511108, "learning_rate": 0.0002, "epoch": 4.718850987432675, "step": 65710}, {"loss": 0.6526, "grad_norm": 1.0100741386413574, "learning_rate": 0.0002, "epoch": 4.719569120287253, "step": 65720}, {"loss": 0.617, "grad_norm": 1.1442630290985107, "learning_rate": 0.0002, "epoch": 4.720287253141831, "step": 65730}, {"loss": 0.6214, "grad_norm": 0.8937877416610718, "learning_rate": 0.0002, "epoch": 4.721005385996409, "step": 65740}, {"loss": 0.625, "grad_norm": 1.0718764066696167, "learning_rate": 0.0002, "epoch": 4.721723518850988, "step": 65750}, {"loss": 0.6182, "grad_norm": 0.8838587999343872, "learning_rate": 0.0002, "epoch": 4.722441651705566, "step": 65760}, {"loss": 0.6254, "grad_norm": 1.1247940063476562, "learning_rate": 0.0002, "epoch": 4.723159784560144, "step": 65770}, {"loss": 0.5917, "grad_norm": 0.9491105675697327, "learning_rate": 0.0002, "epoch": 4.723877917414722, "step": 65780}, {"loss": 0.6178, "grad_norm": 1.0896921157836914, "learning_rate": 0.0002, "epoch": 4.7245960502693, "step": 65790}, {"loss": 0.5975, "grad_norm": 1.0097380876541138, "learning_rate": 0.0002, "epoch": 4.725314183123878, "step": 65800}, {"loss": 0.592, "grad_norm": 0.911763608455658, "learning_rate": 0.0002, "epoch": 4.726032315978456, "step": 65810}, {"loss": 0.6274, "grad_norm": 1.1295124292373657, "learning_rate": 0.0002, "epoch": 4.726750448833034, "step": 65820}, {"loss": 0.6004, "grad_norm": 0.7637538313865662, "learning_rate": 0.0002, "epoch": 4.727468581687612, "step": 65830}, {"loss": 0.6136, "grad_norm": 0.9255306720733643, "learning_rate": 0.0002, "epoch": 4.72818671454219, "step": 65840}, {"loss": 0.6013, "grad_norm": 0.9847530126571655, "learning_rate": 0.0002, "epoch": 4.728904847396769, "step": 65850}, {"loss": 0.6283, "grad_norm": 0.9036182761192322, "learning_rate": 0.0002, "epoch": 4.729622980251347, "step": 65860}, {"loss": 0.6374, "grad_norm": 0.8284199833869934, "learning_rate": 0.0002, "epoch": 4.730341113105925, "step": 65870}, {"loss": 0.6228, "grad_norm": 1.0142838954925537, "learning_rate": 0.0002, "epoch": 4.731059245960503, "step": 65880}, {"loss": 0.624, "grad_norm": 0.9389033913612366, "learning_rate": 0.0002, "epoch": 4.731777378815081, "step": 65890}, {"loss": 0.6414, "grad_norm": 0.8870056867599487, "learning_rate": 0.0002, "epoch": 4.732495511669659, "step": 65900}, {"loss": 0.6261, "grad_norm": 1.1211678981781006, "learning_rate": 0.0002, "epoch": 4.733213644524237, "step": 65910}, {"loss": 0.6065, "grad_norm": 0.7796614170074463, "learning_rate": 0.0002, "epoch": 4.733931777378815, "step": 65920}, {"loss": 0.6701, "grad_norm": 1.0360451936721802, "learning_rate": 0.0002, "epoch": 4.734649910233394, "step": 65930}, {"loss": 0.68, "grad_norm": 0.8383482098579407, "learning_rate": 0.0002, "epoch": 4.735368043087972, "step": 65940}, {"loss": 0.6014, "grad_norm": 0.7985122799873352, "learning_rate": 0.0002, "epoch": 4.73608617594255, "step": 65950}, {"loss": 0.6431, "grad_norm": 1.0314199924468994, "learning_rate": 0.0002, "epoch": 4.736804308797128, "step": 65960}, {"loss": 0.5894, "grad_norm": 0.9279016852378845, "learning_rate": 0.0002, "epoch": 4.737522441651706, "step": 65970}, {"loss": 0.6327, "grad_norm": 1.1046063899993896, "learning_rate": 0.0002, "epoch": 4.738240574506284, "step": 65980}, {"loss": 0.5778, "grad_norm": 0.9075793623924255, "learning_rate": 0.0002, "epoch": 4.738958707360862, "step": 65990}, {"loss": 0.5832, "grad_norm": 1.0945355892181396, "learning_rate": 0.0002, "epoch": 4.73967684021544, "step": 66000}, {"loss": 0.6256, "grad_norm": 0.8885519504547119, "learning_rate": 0.0002, "epoch": 4.740394973070018, "step": 66010}, {"loss": 0.6283, "grad_norm": 0.9312083125114441, "learning_rate": 0.0002, "epoch": 4.741113105924596, "step": 66020}, {"loss": 0.6328, "grad_norm": 1.1574538946151733, "learning_rate": 0.0002, "epoch": 4.741831238779174, "step": 66030}, {"loss": 0.6693, "grad_norm": 0.9346209168434143, "learning_rate": 0.0002, "epoch": 4.742549371633753, "step": 66040}, {"loss": 0.6252, "grad_norm": 0.8935149312019348, "learning_rate": 0.0002, "epoch": 4.743267504488331, "step": 66050}, {"loss": 0.6137, "grad_norm": 0.8958369493484497, "learning_rate": 0.0002, "epoch": 4.743985637342909, "step": 66060}, {"loss": 0.6088, "grad_norm": 0.9383506774902344, "learning_rate": 0.0002, "epoch": 4.744703770197487, "step": 66070}, {"loss": 0.6323, "grad_norm": 0.9868947863578796, "learning_rate": 0.0002, "epoch": 4.745421903052065, "step": 66080}, {"loss": 0.6426, "grad_norm": 1.3417645692825317, "learning_rate": 0.0002, "epoch": 4.746140035906643, "step": 66090}, {"loss": 0.5417, "grad_norm": 1.070693850517273, "learning_rate": 0.0002, "epoch": 4.746858168761221, "step": 66100}, {"loss": 0.6326, "grad_norm": 0.8841570019721985, "learning_rate": 0.0002, "epoch": 4.747576301615799, "step": 66110}, {"loss": 0.655, "grad_norm": 0.7963120341300964, "learning_rate": 0.0002, "epoch": 4.7482944344703775, "step": 66120}, {"loss": 0.6145, "grad_norm": 0.8145691156387329, "learning_rate": 0.0002, "epoch": 4.7490125673249555, "step": 66130}, {"loss": 0.6081, "grad_norm": 0.9074729681015015, "learning_rate": 0.0002, "epoch": 4.7497307001795335, "step": 66140}, {"loss": 0.5651, "grad_norm": 0.9129886627197266, "learning_rate": 0.0002, "epoch": 4.7504488330341115, "step": 66150}, {"loss": 0.6111, "grad_norm": 0.91527259349823, "learning_rate": 0.0002, "epoch": 4.7511669658886895, "step": 66160}, {"loss": 0.672, "grad_norm": 0.9569419622421265, "learning_rate": 0.0002, "epoch": 4.7518850987432675, "step": 66170}, {"loss": 0.597, "grad_norm": 0.8777104616165161, "learning_rate": 0.0002, "epoch": 4.7526032315978455, "step": 66180}, {"loss": 0.6433, "grad_norm": 0.9673085808753967, "learning_rate": 0.0002, "epoch": 4.7533213644524235, "step": 66190}, {"loss": 0.5783, "grad_norm": 1.0683966875076294, "learning_rate": 0.0002, "epoch": 4.7540394973070015, "step": 66200}, {"loss": 0.6356, "grad_norm": 1.1591907739639282, "learning_rate": 0.0002, "epoch": 4.7547576301615795, "step": 66210}, {"loss": 0.6482, "grad_norm": 1.1973309516906738, "learning_rate": 0.0002, "epoch": 4.755475763016158, "step": 66220}, {"loss": 0.5998, "grad_norm": 0.8472012281417847, "learning_rate": 0.0002, "epoch": 4.756193895870736, "step": 66230}, {"loss": 0.717, "grad_norm": 0.9896261692047119, "learning_rate": 0.0002, "epoch": 4.756912028725314, "step": 66240}, {"loss": 0.6368, "grad_norm": 0.8498432040214539, "learning_rate": 0.0002, "epoch": 4.757630161579892, "step": 66250}, {"loss": 0.5931, "grad_norm": 0.9624166488647461, "learning_rate": 0.0002, "epoch": 4.75834829443447, "step": 66260}, {"loss": 0.645, "grad_norm": 1.0951786041259766, "learning_rate": 0.0002, "epoch": 4.759066427289048, "step": 66270}, {"loss": 0.6092, "grad_norm": 0.9863157868385315, "learning_rate": 0.0002, "epoch": 4.759784560143626, "step": 66280}, {"loss": 0.6682, "grad_norm": 1.0062068700790405, "learning_rate": 0.0002, "epoch": 4.760502692998204, "step": 66290}, {"loss": 0.5704, "grad_norm": 0.8075495958328247, "learning_rate": 0.0002, "epoch": 4.761220825852782, "step": 66300}, {"loss": 0.6297, "grad_norm": 0.9617878198623657, "learning_rate": 0.0002, "epoch": 4.761938958707361, "step": 66310}, {"loss": 0.6141, "grad_norm": 1.097091555595398, "learning_rate": 0.0002, "epoch": 4.762657091561939, "step": 66320}, {"loss": 0.6152, "grad_norm": 1.2713453769683838, "learning_rate": 0.0002, "epoch": 4.763375224416517, "step": 66330}, {"loss": 0.6726, "grad_norm": 0.9473448991775513, "learning_rate": 0.0002, "epoch": 4.764093357271095, "step": 66340}, {"loss": 0.6032, "grad_norm": 1.0176854133605957, "learning_rate": 0.0002, "epoch": 4.764811490125673, "step": 66350}, {"loss": 0.6429, "grad_norm": 1.0486242771148682, "learning_rate": 0.0002, "epoch": 4.765529622980251, "step": 66360}, {"loss": 0.6875, "grad_norm": 1.249985694885254, "learning_rate": 0.0002, "epoch": 4.766247755834829, "step": 66370}, {"loss": 0.6086, "grad_norm": 1.283875584602356, "learning_rate": 0.0002, "epoch": 4.766965888689407, "step": 66380}, {"loss": 0.5997, "grad_norm": 1.0009022951126099, "learning_rate": 0.0002, "epoch": 4.767684021543985, "step": 66390}, {"loss": 0.5782, "grad_norm": 0.9718021750450134, "learning_rate": 0.0002, "epoch": 4.768402154398563, "step": 66400}, {"loss": 0.6292, "grad_norm": 1.0865732431411743, "learning_rate": 0.0002, "epoch": 4.769120287253142, "step": 66410}, {"loss": 0.6038, "grad_norm": 0.9273189306259155, "learning_rate": 0.0002, "epoch": 4.76983842010772, "step": 66420}, {"loss": 0.6244, "grad_norm": 1.067535638809204, "learning_rate": 0.0002, "epoch": 4.770556552962298, "step": 66430}, {"loss": 0.6434, "grad_norm": 1.0551011562347412, "learning_rate": 0.0002, "epoch": 4.771274685816876, "step": 66440}, {"loss": 0.6151, "grad_norm": 1.0336146354675293, "learning_rate": 0.0002, "epoch": 4.771992818671454, "step": 66450}, {"loss": 0.5955, "grad_norm": 0.8738380670547485, "learning_rate": 0.0002, "epoch": 4.772710951526032, "step": 66460}, {"loss": 0.6386, "grad_norm": 1.1048321723937988, "learning_rate": 0.0002, "epoch": 4.77342908438061, "step": 66470}, {"loss": 0.592, "grad_norm": 0.8471167683601379, "learning_rate": 0.0002, "epoch": 4.774147217235188, "step": 66480}, {"loss": 0.6139, "grad_norm": 1.2527031898498535, "learning_rate": 0.0002, "epoch": 4.774865350089767, "step": 66490}, {"loss": 0.579, "grad_norm": 1.0056052207946777, "learning_rate": 0.0002, "epoch": 4.775583482944345, "step": 66500}, {"loss": 0.6448, "grad_norm": 1.142456293106079, "learning_rate": 0.0002, "epoch": 4.776301615798923, "step": 66510}, {"loss": 0.6399, "grad_norm": 1.1813132762908936, "learning_rate": 0.0002, "epoch": 4.777019748653501, "step": 66520}, {"loss": 0.6575, "grad_norm": 0.8683654069900513, "learning_rate": 0.0002, "epoch": 4.777737881508079, "step": 66530}, {"loss": 0.6059, "grad_norm": 1.0577980279922485, "learning_rate": 0.0002, "epoch": 4.778456014362657, "step": 66540}, {"loss": 0.5923, "grad_norm": 1.077438473701477, "learning_rate": 0.0002, "epoch": 4.779174147217235, "step": 66550}, {"loss": 0.5744, "grad_norm": 1.0107938051223755, "learning_rate": 0.0002, "epoch": 4.779892280071813, "step": 66560}, {"loss": 0.6155, "grad_norm": 0.8071168065071106, "learning_rate": 0.0002, "epoch": 4.780610412926391, "step": 66570}, {"loss": 0.6126, "grad_norm": 0.8887564539909363, "learning_rate": 0.0002, "epoch": 4.781328545780969, "step": 66580}, {"loss": 0.6417, "grad_norm": 0.9823092222213745, "learning_rate": 0.0002, "epoch": 4.782046678635547, "step": 66590}, {"loss": 0.6108, "grad_norm": 0.9026784300804138, "learning_rate": 0.0002, "epoch": 4.782764811490126, "step": 66600}, {"loss": 0.6252, "grad_norm": 0.8912792205810547, "learning_rate": 0.0002, "epoch": 4.783482944344704, "step": 66610}, {"loss": 0.6285, "grad_norm": 1.0955979824066162, "learning_rate": 0.0002, "epoch": 4.784201077199282, "step": 66620}, {"loss": 0.6161, "grad_norm": 0.8614793419837952, "learning_rate": 0.0002, "epoch": 4.78491921005386, "step": 66630}, {"loss": 0.6343, "grad_norm": 0.7247269153594971, "learning_rate": 0.0002, "epoch": 4.785637342908438, "step": 66640}, {"loss": 0.5634, "grad_norm": 0.9685400724411011, "learning_rate": 0.0002, "epoch": 4.786355475763016, "step": 66650}, {"loss": 0.6419, "grad_norm": 0.9219905734062195, "learning_rate": 0.0002, "epoch": 4.787073608617594, "step": 66660}, {"loss": 0.6509, "grad_norm": 0.9217489361763, "learning_rate": 0.0002, "epoch": 4.787791741472172, "step": 66670}, {"loss": 0.6151, "grad_norm": 1.13791823387146, "learning_rate": 0.0002, "epoch": 4.788509874326751, "step": 66680}, {"loss": 0.6114, "grad_norm": 0.857542872428894, "learning_rate": 0.0002, "epoch": 4.789228007181329, "step": 66690}, {"loss": 0.6317, "grad_norm": 0.9886694550514221, "learning_rate": 0.0002, "epoch": 4.789946140035907, "step": 66700}, {"loss": 0.6436, "grad_norm": 0.987952470779419, "learning_rate": 0.0002, "epoch": 4.790664272890485, "step": 66710}, {"loss": 0.6284, "grad_norm": 1.051612377166748, "learning_rate": 0.0002, "epoch": 4.791382405745063, "step": 66720}, {"loss": 0.6207, "grad_norm": 0.9816454648971558, "learning_rate": 0.0002, "epoch": 4.792100538599641, "step": 66730}, {"loss": 0.6618, "grad_norm": 1.0953829288482666, "learning_rate": 0.0002, "epoch": 4.792818671454219, "step": 66740}, {"loss": 0.652, "grad_norm": 0.8720369935035706, "learning_rate": 0.0002, "epoch": 4.793536804308797, "step": 66750}, {"loss": 0.569, "grad_norm": 0.8910234570503235, "learning_rate": 0.0002, "epoch": 4.794254937163375, "step": 66760}, {"loss": 0.5814, "grad_norm": 0.8300510048866272, "learning_rate": 0.0002, "epoch": 4.794973070017953, "step": 66770}, {"loss": 0.591, "grad_norm": 0.9380533695220947, "learning_rate": 0.0002, "epoch": 4.795691202872531, "step": 66780}, {"loss": 0.6201, "grad_norm": 0.8361864686012268, "learning_rate": 0.0002, "epoch": 4.79640933572711, "step": 66790}, {"loss": 0.6192, "grad_norm": 1.051262617111206, "learning_rate": 0.0002, "epoch": 4.797127468581688, "step": 66800}, {"loss": 0.6408, "grad_norm": 1.1324400901794434, "learning_rate": 0.0002, "epoch": 4.797845601436266, "step": 66810}, {"loss": 0.6156, "grad_norm": 0.853903591632843, "learning_rate": 0.0002, "epoch": 4.798563734290844, "step": 66820}, {"loss": 0.5923, "grad_norm": 0.9949867725372314, "learning_rate": 0.0002, "epoch": 4.799281867145422, "step": 66830}, {"loss": 0.6453, "grad_norm": 0.9204033017158508, "learning_rate": 0.0002, "epoch": 4.8, "step": 66840}, {"loss": 0.6221, "grad_norm": 0.7461584806442261, "learning_rate": 0.0002, "epoch": 4.800718132854578, "step": 66850}, {"loss": 0.6019, "grad_norm": 1.1019874811172485, "learning_rate": 0.0002, "epoch": 4.801436265709156, "step": 66860}, {"loss": 0.6514, "grad_norm": 1.1695797443389893, "learning_rate": 0.0002, "epoch": 4.802154398563735, "step": 66870}, {"loss": 0.6105, "grad_norm": 1.0902758836746216, "learning_rate": 0.0002, "epoch": 4.802872531418313, "step": 66880}, {"loss": 0.6297, "grad_norm": 0.8778618574142456, "learning_rate": 0.0002, "epoch": 4.803590664272891, "step": 66890}, {"loss": 0.6608, "grad_norm": 0.905505359172821, "learning_rate": 0.0002, "epoch": 4.804308797127469, "step": 66900}, {"loss": 0.6386, "grad_norm": 1.0802056789398193, "learning_rate": 0.0002, "epoch": 4.805026929982047, "step": 66910}, {"loss": 0.5866, "grad_norm": 0.7899449467658997, "learning_rate": 0.0002, "epoch": 4.805745062836625, "step": 66920}, {"loss": 0.6169, "grad_norm": 1.1938519477844238, "learning_rate": 0.0002, "epoch": 4.806463195691203, "step": 66930}, {"loss": 0.5979, "grad_norm": 1.0213780403137207, "learning_rate": 0.0002, "epoch": 4.807181328545781, "step": 66940}, {"loss": 0.6518, "grad_norm": 0.9925506711006165, "learning_rate": 0.0002, "epoch": 4.807899461400359, "step": 66950}, {"loss": 0.6229, "grad_norm": 1.0174424648284912, "learning_rate": 0.0002, "epoch": 4.808617594254937, "step": 66960}, {"loss": 0.5932, "grad_norm": 1.0515072345733643, "learning_rate": 0.0002, "epoch": 4.809335727109516, "step": 66970}, {"loss": 0.6169, "grad_norm": 1.0161492824554443, "learning_rate": 0.0002, "epoch": 4.810053859964094, "step": 66980}, {"loss": 0.5804, "grad_norm": 0.8421840071678162, "learning_rate": 0.0002, "epoch": 4.810771992818672, "step": 66990}, {"loss": 0.6792, "grad_norm": 1.0493539571762085, "learning_rate": 0.0002, "epoch": 4.81149012567325, "step": 67000}, {"loss": 0.5906, "grad_norm": 1.1133309602737427, "learning_rate": 0.0002, "epoch": 4.812208258527828, "step": 67010}, {"loss": 0.5771, "grad_norm": 0.924017071723938, "learning_rate": 0.0002, "epoch": 4.812926391382406, "step": 67020}, {"loss": 0.625, "grad_norm": 1.0568689107894897, "learning_rate": 0.0002, "epoch": 4.813644524236984, "step": 67030}, {"loss": 0.6654, "grad_norm": 0.989414632320404, "learning_rate": 0.0002, "epoch": 4.814362657091562, "step": 67040}, {"loss": 0.6186, "grad_norm": 0.9256827235221863, "learning_rate": 0.0002, "epoch": 4.8150807899461405, "step": 67050}, {"loss": 0.637, "grad_norm": 0.9538901448249817, "learning_rate": 0.0002, "epoch": 4.8157989228007185, "step": 67060}, {"loss": 0.632, "grad_norm": 1.0373849868774414, "learning_rate": 0.0002, "epoch": 4.8165170556552965, "step": 67070}, {"loss": 0.5956, "grad_norm": 1.0019729137420654, "learning_rate": 0.0002, "epoch": 4.8172351885098745, "step": 67080}, {"loss": 0.636, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.8179533213644525, "step": 67090}, {"loss": 0.6106, "grad_norm": 1.0008453130722046, "learning_rate": 0.0002, "epoch": 4.8186714542190305, "step": 67100}, {"loss": 0.5841, "grad_norm": 1.0153851509094238, "learning_rate": 0.0002, "epoch": 4.8193895870736085, "step": 67110}, {"loss": 0.6012, "grad_norm": 1.0193161964416504, "learning_rate": 0.0002, "epoch": 4.8201077199281865, "step": 67120}, {"loss": 0.6602, "grad_norm": 1.0204501152038574, "learning_rate": 0.0002, "epoch": 4.8208258527827645, "step": 67130}, {"loss": 0.6235, "grad_norm": 0.9097670316696167, "learning_rate": 0.0002, "epoch": 4.8215439856373425, "step": 67140}, {"loss": 0.5836, "grad_norm": 0.9288716912269592, "learning_rate": 0.0002, "epoch": 4.8222621184919205, "step": 67150}, {"loss": 0.604, "grad_norm": 0.9975850582122803, "learning_rate": 0.0002, "epoch": 4.822980251346499, "step": 67160}, {"loss": 0.6877, "grad_norm": 0.8502511382102966, "learning_rate": 0.0002, "epoch": 4.823698384201077, "step": 67170}, {"loss": 0.6194, "grad_norm": 1.0129257440567017, "learning_rate": 0.0002, "epoch": 4.824416517055655, "step": 67180}, {"loss": 0.6294, "grad_norm": 1.0009492635726929, "learning_rate": 0.0002, "epoch": 4.825134649910233, "step": 67190}, {"loss": 0.5757, "grad_norm": 0.9273321032524109, "learning_rate": 0.0002, "epoch": 4.825852782764811, "step": 67200}, {"loss": 0.5749, "grad_norm": 1.0438604354858398, "learning_rate": 0.0002, "epoch": 4.8265709156193894, "step": 67210}, {"loss": 0.6273, "grad_norm": 1.119573712348938, "learning_rate": 0.0002, "epoch": 4.8272890484739674, "step": 67220}, {"loss": 0.6284, "grad_norm": 0.9607422351837158, "learning_rate": 0.0002, "epoch": 4.8280071813285454, "step": 67230}, {"loss": 0.6259, "grad_norm": 0.9614062905311584, "learning_rate": 0.0002, "epoch": 4.828725314183124, "step": 67240}, {"loss": 0.5709, "grad_norm": 1.1017652750015259, "learning_rate": 0.0002, "epoch": 4.829443447037702, "step": 67250}, {"loss": 0.6203, "grad_norm": 1.0521706342697144, "learning_rate": 0.0002, "epoch": 4.83016157989228, "step": 67260}, {"loss": 0.6266, "grad_norm": 0.7685959339141846, "learning_rate": 0.0002, "epoch": 4.830879712746858, "step": 67270}, {"loss": 0.5809, "grad_norm": 0.7894896268844604, "learning_rate": 0.0002, "epoch": 4.831597845601436, "step": 67280}, {"loss": 0.6349, "grad_norm": 1.0882996320724487, "learning_rate": 0.0002, "epoch": 4.832315978456014, "step": 67290}, {"loss": 0.6129, "grad_norm": 0.9215409755706787, "learning_rate": 0.0002, "epoch": 4.833034111310592, "step": 67300}, {"loss": 0.6142, "grad_norm": 0.8660635352134705, "learning_rate": 0.0002, "epoch": 4.83375224416517, "step": 67310}, {"loss": 0.6378, "grad_norm": 0.980879008769989, "learning_rate": 0.0002, "epoch": 4.834470377019748, "step": 67320}, {"loss": 0.6291, "grad_norm": 1.0356814861297607, "learning_rate": 0.0002, "epoch": 4.835188509874326, "step": 67330}, {"loss": 0.6271, "grad_norm": 1.0265507698059082, "learning_rate": 0.0002, "epoch": 4.835906642728904, "step": 67340}, {"loss": 0.6009, "grad_norm": 1.0659137964248657, "learning_rate": 0.0002, "epoch": 4.836624775583483, "step": 67350}, {"loss": 0.5946, "grad_norm": 0.9485231637954712, "learning_rate": 0.0002, "epoch": 4.837342908438061, "step": 67360}, {"loss": 0.6338, "grad_norm": 1.0950140953063965, "learning_rate": 0.0002, "epoch": 4.838061041292639, "step": 67370}, {"loss": 0.6314, "grad_norm": 0.8907382488250732, "learning_rate": 0.0002, "epoch": 4.838779174147217, "step": 67380}, {"loss": 0.6066, "grad_norm": 0.9777120351791382, "learning_rate": 0.0002, "epoch": 4.839497307001795, "step": 67390}, {"loss": 0.6258, "grad_norm": 0.8482252955436707, "learning_rate": 0.0002, "epoch": 4.840215439856373, "step": 67400}, {"loss": 0.603, "grad_norm": 0.8505899906158447, "learning_rate": 0.0002, "epoch": 4.840933572710951, "step": 67410}, {"loss": 0.609, "grad_norm": 0.8574482798576355, "learning_rate": 0.0002, "epoch": 4.841651705565529, "step": 67420}, {"loss": 0.6188, "grad_norm": 1.092310905456543, "learning_rate": 0.0002, "epoch": 4.842369838420108, "step": 67430}, {"loss": 0.619, "grad_norm": 0.9418560266494751, "learning_rate": 0.0002, "epoch": 4.843087971274686, "step": 67440}, {"loss": 0.6367, "grad_norm": 1.1310782432556152, "learning_rate": 0.0002, "epoch": 4.843806104129264, "step": 67450}, {"loss": 0.664, "grad_norm": 0.9993671774864197, "learning_rate": 0.0002, "epoch": 4.844524236983842, "step": 67460}, {"loss": 0.6247, "grad_norm": 0.8322528600692749, "learning_rate": 0.0002, "epoch": 4.84524236983842, "step": 67470}, {"loss": 0.5828, "grad_norm": 0.8488435745239258, "learning_rate": 0.0002, "epoch": 4.845960502692998, "step": 67480}, {"loss": 0.6023, "grad_norm": 0.8070611357688904, "learning_rate": 0.0002, "epoch": 4.846678635547576, "step": 67490}, {"loss": 0.6362, "grad_norm": 0.8200163245201111, "learning_rate": 0.0002, "epoch": 4.847396768402154, "step": 67500}, {"loss": 0.612, "grad_norm": 0.91901034116745, "learning_rate": 0.0002, "epoch": 4.848114901256732, "step": 67510}, {"loss": 0.6191, "grad_norm": 1.0938435792922974, "learning_rate": 0.0002, "epoch": 4.84883303411131, "step": 67520}, {"loss": 0.6736, "grad_norm": 0.7926174402236938, "learning_rate": 0.0002, "epoch": 4.849551166965889, "step": 67530}, {"loss": 0.6252, "grad_norm": 0.9914385676383972, "learning_rate": 0.0002, "epoch": 4.850269299820467, "step": 67540}, {"loss": 0.6278, "grad_norm": 1.033065915107727, "learning_rate": 0.0002, "epoch": 4.850987432675045, "step": 67550}, {"loss": 0.6334, "grad_norm": 0.9700239300727844, "learning_rate": 0.0002, "epoch": 4.851705565529623, "step": 67560}, {"loss": 0.6308, "grad_norm": 0.8550103902816772, "learning_rate": 0.0002, "epoch": 4.852423698384201, "step": 67570}, {"loss": 0.6194, "grad_norm": 1.0009654760360718, "learning_rate": 0.0002, "epoch": 4.853141831238779, "step": 67580}, {"loss": 0.5825, "grad_norm": 1.0766186714172363, "learning_rate": 0.0002, "epoch": 4.853859964093357, "step": 67590}, {"loss": 0.6216, "grad_norm": 0.9512220621109009, "learning_rate": 0.0002, "epoch": 4.854578096947935, "step": 67600}, {"loss": 0.6301, "grad_norm": 0.8434456586837769, "learning_rate": 0.0002, "epoch": 4.855296229802514, "step": 67610}, {"loss": 0.6416, "grad_norm": 1.0276665687561035, "learning_rate": 0.0002, "epoch": 4.856014362657092, "step": 67620}, {"loss": 0.6063, "grad_norm": 0.9758516550064087, "learning_rate": 0.0002, "epoch": 4.85673249551167, "step": 67630}, {"loss": 0.622, "grad_norm": 0.8988076448440552, "learning_rate": 0.0002, "epoch": 4.857450628366248, "step": 67640}, {"loss": 0.6516, "grad_norm": 1.0038257837295532, "learning_rate": 0.0002, "epoch": 4.858168761220826, "step": 67650}, {"loss": 0.6322, "grad_norm": 0.9973093867301941, "learning_rate": 0.0002, "epoch": 4.858886894075404, "step": 67660}, {"loss": 0.6065, "grad_norm": 0.9754974246025085, "learning_rate": 0.0002, "epoch": 4.859605026929982, "step": 67670}, {"loss": 0.6191, "grad_norm": 1.1829560995101929, "learning_rate": 0.0002, "epoch": 4.86032315978456, "step": 67680}, {"loss": 0.6267, "grad_norm": 1.1077659130096436, "learning_rate": 0.0002, "epoch": 4.861041292639138, "step": 67690}, {"loss": 0.6312, "grad_norm": 0.9862872958183289, "learning_rate": 0.0002, "epoch": 4.861759425493716, "step": 67700}, {"loss": 0.6281, "grad_norm": 0.9826052188873291, "learning_rate": 0.0002, "epoch": 4.862477558348294, "step": 67710}, {"loss": 0.6227, "grad_norm": 0.940082848072052, "learning_rate": 0.0002, "epoch": 4.863195691202873, "step": 67720}, {"loss": 0.6232, "grad_norm": 0.895434558391571, "learning_rate": 0.0002, "epoch": 4.863913824057451, "step": 67730}, {"loss": 0.6674, "grad_norm": 1.1194682121276855, "learning_rate": 0.0002, "epoch": 4.864631956912029, "step": 67740}, {"loss": 0.5981, "grad_norm": 0.9984544515609741, "learning_rate": 0.0002, "epoch": 4.865350089766607, "step": 67750}, {"loss": 0.6583, "grad_norm": 1.049224615097046, "learning_rate": 0.0002, "epoch": 4.866068222621185, "step": 67760}, {"loss": 0.583, "grad_norm": 1.009515643119812, "learning_rate": 0.0002, "epoch": 4.866786355475763, "step": 67770}, {"loss": 0.6466, "grad_norm": 1.0336902141571045, "learning_rate": 0.0002, "epoch": 4.867504488330341, "step": 67780}, {"loss": 0.6909, "grad_norm": 0.9310635924339294, "learning_rate": 0.0002, "epoch": 4.868222621184919, "step": 67790}, {"loss": 0.7267, "grad_norm": 0.934882640838623, "learning_rate": 0.0002, "epoch": 4.868940754039498, "step": 67800}, {"loss": 0.648, "grad_norm": 0.8663495779037476, "learning_rate": 0.0002, "epoch": 4.869658886894076, "step": 67810}, {"loss": 0.6275, "grad_norm": 1.0085018873214722, "learning_rate": 0.0002, "epoch": 4.870377019748654, "step": 67820}, {"loss": 0.6571, "grad_norm": 0.896507978439331, "learning_rate": 0.0002, "epoch": 4.871095152603232, "step": 67830}, {"loss": 0.6711, "grad_norm": 0.925809919834137, "learning_rate": 0.0002, "epoch": 4.87181328545781, "step": 67840}, {"loss": 0.5917, "grad_norm": 0.8044029474258423, "learning_rate": 0.0002, "epoch": 4.872531418312388, "step": 67850}, {"loss": 0.6671, "grad_norm": 1.0026800632476807, "learning_rate": 0.0002, "epoch": 4.873249551166966, "step": 67860}, {"loss": 0.6175, "grad_norm": 0.9577589631080627, "learning_rate": 0.0002, "epoch": 4.873967684021544, "step": 67870}, {"loss": 0.591, "grad_norm": 0.8225193619728088, "learning_rate": 0.0002, "epoch": 4.874685816876122, "step": 67880}, {"loss": 0.6, "grad_norm": 1.0019139051437378, "learning_rate": 0.0002, "epoch": 4.8754039497307, "step": 67890}, {"loss": 0.6521, "grad_norm": 0.9282827377319336, "learning_rate": 0.0002, "epoch": 4.876122082585278, "step": 67900}, {"loss": 0.6251, "grad_norm": 0.8204836249351501, "learning_rate": 0.0002, "epoch": 4.876840215439857, "step": 67910}, {"loss": 0.6345, "grad_norm": 0.907356321811676, "learning_rate": 0.0002, "epoch": 4.877558348294435, "step": 67920}, {"loss": 0.6438, "grad_norm": 1.12422776222229, "learning_rate": 0.0002, "epoch": 4.878276481149013, "step": 67930}, {"loss": 0.6727, "grad_norm": 0.8230205178260803, "learning_rate": 0.0002, "epoch": 4.878994614003591, "step": 67940}, {"loss": 0.6361, "grad_norm": 1.1588479280471802, "learning_rate": 0.0002, "epoch": 4.879712746858169, "step": 67950}, {"loss": 0.6489, "grad_norm": 1.1064553260803223, "learning_rate": 0.0002, "epoch": 4.880430879712747, "step": 67960}, {"loss": 0.5851, "grad_norm": 0.9311534762382507, "learning_rate": 0.0002, "epoch": 4.881149012567325, "step": 67970}, {"loss": 0.6238, "grad_norm": 0.7575639486312866, "learning_rate": 0.0002, "epoch": 4.881867145421903, "step": 67980}, {"loss": 0.5933, "grad_norm": 0.9201191067695618, "learning_rate": 0.0002, "epoch": 4.882585278276482, "step": 67990}, {"loss": 0.5806, "grad_norm": 0.8487658500671387, "learning_rate": 0.0002, "epoch": 4.88330341113106, "step": 68000}, {"loss": 0.598, "grad_norm": 0.9645208716392517, "learning_rate": 0.0002, "epoch": 4.884021543985638, "step": 68010}, {"loss": 0.6112, "grad_norm": 0.8594469428062439, "learning_rate": 0.0002, "epoch": 4.884739676840216, "step": 68020}, {"loss": 0.6115, "grad_norm": 0.9518412947654724, "learning_rate": 0.0002, "epoch": 4.885457809694794, "step": 68030}, {"loss": 0.6071, "grad_norm": 1.0934258699417114, "learning_rate": 0.0002, "epoch": 4.886175942549372, "step": 68040}, {"loss": 0.6265, "grad_norm": 0.988761842250824, "learning_rate": 0.0002, "epoch": 4.88689407540395, "step": 68050}, {"loss": 0.5981, "grad_norm": 0.7572013735771179, "learning_rate": 0.0002, "epoch": 4.887612208258528, "step": 68060}, {"loss": 0.6286, "grad_norm": 0.8801929950714111, "learning_rate": 0.0002, "epoch": 4.888330341113106, "step": 68070}, {"loss": 0.6503, "grad_norm": 1.0080658197402954, "learning_rate": 0.0002, "epoch": 4.889048473967684, "step": 68080}, {"loss": 0.6064, "grad_norm": 0.9588785171508789, "learning_rate": 0.0002, "epoch": 4.8897666068222625, "step": 68090}, {"loss": 0.6159, "grad_norm": 1.0994032621383667, "learning_rate": 0.0002, "epoch": 4.8904847396768405, "step": 68100}, {"loss": 0.6357, "grad_norm": 0.9851962924003601, "learning_rate": 0.0002, "epoch": 4.8912028725314185, "step": 68110}, {"loss": 0.5999, "grad_norm": 0.9566116333007812, "learning_rate": 0.0002, "epoch": 4.8919210053859965, "step": 68120}, {"loss": 0.6742, "grad_norm": 0.8708083033561707, "learning_rate": 0.0002, "epoch": 4.8926391382405745, "step": 68130}, {"loss": 0.6489, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 4.8933572710951525, "step": 68140}, {"loss": 0.6442, "grad_norm": 1.047988772392273, "learning_rate": 0.0002, "epoch": 4.8940754039497305, "step": 68150}, {"loss": 0.6176, "grad_norm": 0.8665831685066223, "learning_rate": 0.0002, "epoch": 4.8947935368043085, "step": 68160}, {"loss": 0.5721, "grad_norm": 0.9313908219337463, "learning_rate": 0.0002, "epoch": 4.8955116696588865, "step": 68170}, {"loss": 0.6073, "grad_norm": 0.9568582773208618, "learning_rate": 0.0002, "epoch": 4.896229802513465, "step": 68180}, {"loss": 0.6308, "grad_norm": 1.0427594184875488, "learning_rate": 0.0002, "epoch": 4.896947935368043, "step": 68190}, {"loss": 0.6357, "grad_norm": 0.9132021069526672, "learning_rate": 0.0002, "epoch": 4.897666068222621, "step": 68200}, {"loss": 0.6264, "grad_norm": 0.9597318768501282, "learning_rate": 0.0002, "epoch": 4.898384201077199, "step": 68210}, {"loss": 0.6025, "grad_norm": 1.0736947059631348, "learning_rate": 0.0002, "epoch": 4.899102333931777, "step": 68220}, {"loss": 0.5942, "grad_norm": 0.9318404793739319, "learning_rate": 0.0002, "epoch": 4.899820466786355, "step": 68230}, {"loss": 0.5991, "grad_norm": 0.8594326972961426, "learning_rate": 0.0002, "epoch": 4.900538599640933, "step": 68240}, {"loss": 0.6145, "grad_norm": 1.1437443494796753, "learning_rate": 0.0002, "epoch": 4.901256732495511, "step": 68250}, {"loss": 0.6414, "grad_norm": 1.1599408388137817, "learning_rate": 0.0002, "epoch": 4.901974865350089, "step": 68260}, {"loss": 0.6148, "grad_norm": 1.160628080368042, "learning_rate": 0.0002, "epoch": 4.902692998204667, "step": 68270}, {"loss": 0.613, "grad_norm": 1.0147801637649536, "learning_rate": 0.0002, "epoch": 4.903411131059246, "step": 68280}, {"loss": 0.6502, "grad_norm": 0.8622691631317139, "learning_rate": 0.0002, "epoch": 4.904129263913824, "step": 68290}, {"loss": 0.618, "grad_norm": 0.7179980874061584, "learning_rate": 0.0002, "epoch": 4.904847396768402, "step": 68300}, {"loss": 0.6388, "grad_norm": 1.1705092191696167, "learning_rate": 0.0002, "epoch": 4.90556552962298, "step": 68310}, {"loss": 0.6164, "grad_norm": 1.1687676906585693, "learning_rate": 0.0002, "epoch": 4.906283662477558, "step": 68320}, {"loss": 0.6791, "grad_norm": 1.1621531248092651, "learning_rate": 0.0002, "epoch": 4.907001795332136, "step": 68330}, {"loss": 0.6474, "grad_norm": 1.0241422653198242, "learning_rate": 0.0002, "epoch": 4.907719928186714, "step": 68340}, {"loss": 0.6225, "grad_norm": 0.943354070186615, "learning_rate": 0.0002, "epoch": 4.908438061041292, "step": 68350}, {"loss": 0.6596, "grad_norm": 0.8091703653335571, "learning_rate": 0.0002, "epoch": 4.909156193895871, "step": 68360}, {"loss": 0.6196, "grad_norm": 0.8871228694915771, "learning_rate": 0.0002, "epoch": 4.909874326750449, "step": 68370}, {"loss": 0.5714, "grad_norm": 1.0951069593429565, "learning_rate": 0.0002, "epoch": 4.910592459605027, "step": 68380}, {"loss": 0.6407, "grad_norm": 1.1355193853378296, "learning_rate": 0.0002, "epoch": 4.911310592459605, "step": 68390}, {"loss": 0.6369, "grad_norm": 1.0741122961044312, "learning_rate": 0.0002, "epoch": 4.912028725314183, "step": 68400}, {"loss": 0.6176, "grad_norm": 0.9285269975662231, "learning_rate": 0.0002, "epoch": 4.912746858168761, "step": 68410}, {"loss": 0.6433, "grad_norm": 1.080695390701294, "learning_rate": 0.0002, "epoch": 4.913464991023339, "step": 68420}, {"loss": 0.6505, "grad_norm": 0.921331524848938, "learning_rate": 0.0002, "epoch": 4.914183123877917, "step": 68430}, {"loss": 0.701, "grad_norm": 0.9763174057006836, "learning_rate": 0.0002, "epoch": 4.914901256732495, "step": 68440}, {"loss": 0.6429, "grad_norm": 1.1133354902267456, "learning_rate": 0.0002, "epoch": 4.915619389587073, "step": 68450}, {"loss": 0.6117, "grad_norm": 0.8373502492904663, "learning_rate": 0.0002, "epoch": 4.916337522441651, "step": 68460}, {"loss": 0.5993, "grad_norm": 0.9192346334457397, "learning_rate": 0.0002, "epoch": 4.91705565529623, "step": 68470}, {"loss": 0.626, "grad_norm": 1.0724657773971558, "learning_rate": 0.0002, "epoch": 4.917773788150808, "step": 68480}, {"loss": 0.6339, "grad_norm": 0.9209843873977661, "learning_rate": 0.0002, "epoch": 4.918491921005386, "step": 68490}, {"loss": 0.6427, "grad_norm": 0.9201577305793762, "learning_rate": 0.0002, "epoch": 4.919210053859964, "step": 68500}, {"loss": 0.6686, "grad_norm": 0.8086138963699341, "learning_rate": 0.0002, "epoch": 4.919928186714542, "step": 68510}, {"loss": 0.564, "grad_norm": 1.0917785167694092, "learning_rate": 0.0002, "epoch": 4.92064631956912, "step": 68520}, {"loss": 0.6177, "grad_norm": 0.9287897944450378, "learning_rate": 0.0002, "epoch": 4.921364452423698, "step": 68530}, {"loss": 0.6344, "grad_norm": 0.9830158948898315, "learning_rate": 0.0002, "epoch": 4.922082585278276, "step": 68540}, {"loss": 0.6583, "grad_norm": 0.8674678802490234, "learning_rate": 0.0002, "epoch": 4.922800718132855, "step": 68550}, {"loss": 0.6284, "grad_norm": 0.7996176481246948, "learning_rate": 0.0002, "epoch": 4.923518850987433, "step": 68560}, {"loss": 0.6089, "grad_norm": 1.1284033060073853, "learning_rate": 0.0002, "epoch": 4.924236983842011, "step": 68570}, {"loss": 0.6454, "grad_norm": 0.894339919090271, "learning_rate": 0.0002, "epoch": 4.924955116696589, "step": 68580}, {"loss": 0.6231, "grad_norm": 1.1140280961990356, "learning_rate": 0.0002, "epoch": 4.925673249551167, "step": 68590}, {"loss": 0.6318, "grad_norm": 0.9048344492912292, "learning_rate": 0.0002, "epoch": 4.926391382405745, "step": 68600}, {"loss": 0.5963, "grad_norm": 0.9380471706390381, "learning_rate": 0.0002, "epoch": 4.927109515260323, "step": 68610}, {"loss": 0.6384, "grad_norm": 0.8598429560661316, "learning_rate": 0.0002, "epoch": 4.927827648114901, "step": 68620}, {"loss": 0.6486, "grad_norm": 1.0813355445861816, "learning_rate": 0.0002, "epoch": 4.928545780969479, "step": 68630}, {"loss": 0.6367, "grad_norm": 0.979053795337677, "learning_rate": 0.0002, "epoch": 4.929263913824057, "step": 68640}, {"loss": 0.6084, "grad_norm": 0.8194574117660522, "learning_rate": 0.0002, "epoch": 4.929982046678636, "step": 68650}, {"loss": 0.6469, "grad_norm": 0.8593540787696838, "learning_rate": 0.0002, "epoch": 4.930700179533214, "step": 68660}, {"loss": 0.6465, "grad_norm": 1.0134016275405884, "learning_rate": 0.0002, "epoch": 4.931418312387792, "step": 68670}, {"loss": 0.6221, "grad_norm": 1.060586929321289, "learning_rate": 0.0002, "epoch": 4.93213644524237, "step": 68680}, {"loss": 0.5861, "grad_norm": 0.84132319688797, "learning_rate": 0.0002, "epoch": 4.932854578096948, "step": 68690}, {"loss": 0.6206, "grad_norm": 1.0767526626586914, "learning_rate": 0.0002, "epoch": 4.933572710951526, "step": 68700}, {"loss": 0.6294, "grad_norm": 0.8858519792556763, "learning_rate": 0.0002, "epoch": 4.934290843806104, "step": 68710}, {"loss": 0.6727, "grad_norm": 1.194031000137329, "learning_rate": 0.0002, "epoch": 4.935008976660682, "step": 68720}, {"loss": 0.6231, "grad_norm": 0.8270226120948792, "learning_rate": 0.0002, "epoch": 4.93572710951526, "step": 68730}, {"loss": 0.6538, "grad_norm": 1.0385973453521729, "learning_rate": 0.0002, "epoch": 4.936445242369839, "step": 68740}, {"loss": 0.623, "grad_norm": 0.9062243700027466, "learning_rate": 0.0002, "epoch": 4.937163375224417, "step": 68750}, {"loss": 0.6578, "grad_norm": 1.0526955127716064, "learning_rate": 0.0002, "epoch": 4.937881508078995, "step": 68760}, {"loss": 0.6425, "grad_norm": 0.930604100227356, "learning_rate": 0.0002, "epoch": 4.938599640933573, "step": 68770}, {"loss": 0.6228, "grad_norm": 0.9635265469551086, "learning_rate": 0.0002, "epoch": 4.939317773788151, "step": 68780}, {"loss": 0.6269, "grad_norm": 0.9825171232223511, "learning_rate": 0.0002, "epoch": 4.940035906642729, "step": 68790}, {"loss": 0.6063, "grad_norm": 0.9621182680130005, "learning_rate": 0.0002, "epoch": 4.940754039497307, "step": 68800}, {"loss": 0.6558, "grad_norm": 0.9655307531356812, "learning_rate": 0.0002, "epoch": 4.941472172351885, "step": 68810}, {"loss": 0.6441, "grad_norm": 1.2948180437088013, "learning_rate": 0.0002, "epoch": 4.942190305206463, "step": 68820}, {"loss": 0.6757, "grad_norm": 0.9206728339195251, "learning_rate": 0.0002, "epoch": 4.942908438061041, "step": 68830}, {"loss": 0.6554, "grad_norm": 1.0235631465911865, "learning_rate": 0.0002, "epoch": 4.94362657091562, "step": 68840}, {"loss": 0.6386, "grad_norm": 1.0542538166046143, "learning_rate": 0.0002, "epoch": 4.944344703770198, "step": 68850}, {"loss": 0.6359, "grad_norm": 0.9787087440490723, "learning_rate": 0.0002, "epoch": 4.945062836624776, "step": 68860}, {"loss": 0.659, "grad_norm": 0.9527219533920288, "learning_rate": 0.0002, "epoch": 4.945780969479354, "step": 68870}, {"loss": 0.6504, "grad_norm": 1.1525826454162598, "learning_rate": 0.0002, "epoch": 4.946499102333932, "step": 68880}, {"loss": 0.6345, "grad_norm": 0.8610072731971741, "learning_rate": 0.0002, "epoch": 4.94721723518851, "step": 68890}, {"loss": 0.6029, "grad_norm": 1.1403616666793823, "learning_rate": 0.0002, "epoch": 4.947935368043088, "step": 68900}, {"loss": 0.6476, "grad_norm": 1.10334312915802, "learning_rate": 0.0002, "epoch": 4.948653500897666, "step": 68910}, {"loss": 0.6123, "grad_norm": 0.8633760809898376, "learning_rate": 0.0002, "epoch": 4.949371633752245, "step": 68920}, {"loss": 0.6619, "grad_norm": 1.1291080713272095, "learning_rate": 0.0002, "epoch": 4.950089766606823, "step": 68930}, {"loss": 0.6003, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 4.950807899461401, "step": 68940}, {"loss": 0.6126, "grad_norm": 0.9207960963249207, "learning_rate": 0.0002, "epoch": 4.951526032315979, "step": 68950}, {"loss": 0.6031, "grad_norm": 0.9815934300422668, "learning_rate": 0.0002, "epoch": 4.952244165170557, "step": 68960}, {"loss": 0.6201, "grad_norm": 0.9725701808929443, "learning_rate": 0.0002, "epoch": 4.952962298025135, "step": 68970}, {"loss": 0.6251, "grad_norm": 0.844926655292511, "learning_rate": 0.0002, "epoch": 4.953680430879713, "step": 68980}, {"loss": 0.6446, "grad_norm": 0.9898511171340942, "learning_rate": 0.0002, "epoch": 4.954398563734291, "step": 68990}, {"loss": 0.629, "grad_norm": 1.1311410665512085, "learning_rate": 0.0002, "epoch": 4.955116696588869, "step": 69000}, {"loss": 0.6525, "grad_norm": 1.218610405921936, "learning_rate": 0.0002, "epoch": 4.955834829443447, "step": 69010}, {"loss": 0.6639, "grad_norm": 1.1536420583724976, "learning_rate": 0.0002, "epoch": 4.956552962298025, "step": 69020}, {"loss": 0.6375, "grad_norm": 1.1857786178588867, "learning_rate": 0.0002, "epoch": 4.957271095152604, "step": 69030}, {"loss": 0.6618, "grad_norm": 0.9969246983528137, "learning_rate": 0.0002, "epoch": 4.957989228007182, "step": 69040}, {"loss": 0.633, "grad_norm": 1.138635277748108, "learning_rate": 0.0002, "epoch": 4.95870736086176, "step": 69050}, {"loss": 0.6344, "grad_norm": 1.110474705696106, "learning_rate": 0.0002, "epoch": 4.959425493716338, "step": 69060}, {"loss": 0.687, "grad_norm": 1.0366318225860596, "learning_rate": 0.0002, "epoch": 4.960143626570916, "step": 69070}, {"loss": 0.6384, "grad_norm": 0.6927996277809143, "learning_rate": 0.0002, "epoch": 4.960861759425494, "step": 69080}, {"loss": 0.6337, "grad_norm": 1.0368026494979858, "learning_rate": 0.0002, "epoch": 4.961579892280072, "step": 69090}, {"loss": 0.6077, "grad_norm": 1.0638312101364136, "learning_rate": 0.0002, "epoch": 4.96229802513465, "step": 69100}, {"loss": 0.6403, "grad_norm": 1.0372415781021118, "learning_rate": 0.0002, "epoch": 4.9630161579892285, "step": 69110}, {"loss": 0.6347, "grad_norm": 0.8257387280464172, "learning_rate": 0.0002, "epoch": 4.9637342908438065, "step": 69120}, {"loss": 0.6405, "grad_norm": 1.0046974420547485, "learning_rate": 0.0002, "epoch": 4.9644524236983845, "step": 69130}, {"loss": 0.623, "grad_norm": 1.0139652490615845, "learning_rate": 0.0002, "epoch": 4.9651705565529625, "step": 69140}, {"loss": 0.5857, "grad_norm": 1.0214691162109375, "learning_rate": 0.0002, "epoch": 4.9658886894075405, "step": 69150}, {"loss": 0.624, "grad_norm": 1.1042424440383911, "learning_rate": 0.0002, "epoch": 4.9666068222621185, "step": 69160}, {"loss": 0.6475, "grad_norm": 0.8749067783355713, "learning_rate": 0.0002, "epoch": 4.9673249551166965, "step": 69170}, {"loss": 0.6734, "grad_norm": 0.9894024133682251, "learning_rate": 0.0002, "epoch": 4.9680430879712745, "step": 69180}, {"loss": 0.5894, "grad_norm": 1.0218034982681274, "learning_rate": 0.0002, "epoch": 4.9687612208258525, "step": 69190}, {"loss": 0.6423, "grad_norm": 0.9782929420471191, "learning_rate": 0.0002, "epoch": 4.9694793536804305, "step": 69200}, {"loss": 0.6455, "grad_norm": 0.9373409748077393, "learning_rate": 0.0002, "epoch": 4.9701974865350085, "step": 69210}, {"loss": 0.6105, "grad_norm": 1.0329546928405762, "learning_rate": 0.0002, "epoch": 4.970915619389587, "step": 69220}, {"loss": 0.6877, "grad_norm": 0.9746108055114746, "learning_rate": 0.0002, "epoch": 4.971633752244165, "step": 69230}, {"loss": 0.6342, "grad_norm": 0.9202073216438293, "learning_rate": 0.0002, "epoch": 4.972351885098743, "step": 69240}, {"loss": 0.6102, "grad_norm": 1.078032374382019, "learning_rate": 0.0002, "epoch": 4.973070017953321, "step": 69250}, {"loss": 0.6349, "grad_norm": 0.8860024809837341, "learning_rate": 0.0002, "epoch": 4.973788150807899, "step": 69260}, {"loss": 0.5971, "grad_norm": 0.915212094783783, "learning_rate": 0.0002, "epoch": 4.974506283662477, "step": 69270}, {"loss": 0.623, "grad_norm": 1.1192166805267334, "learning_rate": 0.0002, "epoch": 4.975224416517055, "step": 69280}, {"loss": 0.6347, "grad_norm": 0.8387445211410522, "learning_rate": 0.0002, "epoch": 4.975942549371633, "step": 69290}, {"loss": 0.6392, "grad_norm": 1.1210044622421265, "learning_rate": 0.0002, "epoch": 4.976660682226212, "step": 69300}, {"loss": 0.6565, "grad_norm": 1.0051207542419434, "learning_rate": 0.0002, "epoch": 4.97737881508079, "step": 69310}, {"loss": 0.5961, "grad_norm": 0.9248682856559753, "learning_rate": 0.0002, "epoch": 4.978096947935368, "step": 69320}, {"loss": 0.6067, "grad_norm": 0.8265128135681152, "learning_rate": 0.0002, "epoch": 4.978815080789946, "step": 69330}, {"loss": 0.6068, "grad_norm": 0.9432681798934937, "learning_rate": 0.0002, "epoch": 4.979533213644524, "step": 69340}, {"loss": 0.627, "grad_norm": 1.0135977268218994, "learning_rate": 0.0002, "epoch": 4.980251346499102, "step": 69350}, {"loss": 0.5882, "grad_norm": 0.9857245683670044, "learning_rate": 0.0002, "epoch": 4.98096947935368, "step": 69360}, {"loss": 0.6396, "grad_norm": 0.9215952157974243, "learning_rate": 0.0002, "epoch": 4.981687612208258, "step": 69370}, {"loss": 0.565, "grad_norm": 1.1518077850341797, "learning_rate": 0.0002, "epoch": 4.982405745062836, "step": 69380}, {"loss": 0.6022, "grad_norm": 0.8836095929145813, "learning_rate": 0.0002, "epoch": 4.983123877917414, "step": 69390}, {"loss": 0.6442, "grad_norm": 0.8082528710365295, "learning_rate": 0.0002, "epoch": 4.983842010771993, "step": 69400}, {"loss": 0.597, "grad_norm": 0.9295604825019836, "learning_rate": 0.0002, "epoch": 4.984560143626571, "step": 69410}, {"loss": 0.5811, "grad_norm": 1.002057433128357, "learning_rate": 0.0002, "epoch": 4.985278276481149, "step": 69420}, {"loss": 0.6275, "grad_norm": 0.8127216100692749, "learning_rate": 0.0002, "epoch": 4.985996409335727, "step": 69430}, {"loss": 0.6223, "grad_norm": 1.058138370513916, "learning_rate": 0.0002, "epoch": 4.986714542190305, "step": 69440}, {"loss": 0.6317, "grad_norm": 0.8451166749000549, "learning_rate": 0.0002, "epoch": 4.987432675044883, "step": 69450}, {"loss": 0.6135, "grad_norm": 0.9687268137931824, "learning_rate": 0.0002, "epoch": 4.988150807899461, "step": 69460}, {"loss": 0.5926, "grad_norm": 1.0342036485671997, "learning_rate": 0.0002, "epoch": 4.988868940754039, "step": 69470}, {"loss": 0.636, "grad_norm": 0.9042398929595947, "learning_rate": 0.0002, "epoch": 4.989587073608618, "step": 69480}, {"loss": 0.6193, "grad_norm": 1.0575438737869263, "learning_rate": 0.0002, "epoch": 4.990305206463196, "step": 69490}, {"loss": 0.5887, "grad_norm": 0.9364935159683228, "learning_rate": 0.0002, "epoch": 4.991023339317774, "step": 69500}, {"loss": 0.6532, "grad_norm": 1.0327378511428833, "learning_rate": 0.0002, "epoch": 4.991741472172352, "step": 69510}, {"loss": 0.6397, "grad_norm": 0.815592885017395, "learning_rate": 0.0002, "epoch": 4.99245960502693, "step": 69520}, {"loss": 0.6776, "grad_norm": 1.0813369750976562, "learning_rate": 0.0002, "epoch": 4.993177737881508, "step": 69530}, {"loss": 0.6964, "grad_norm": 1.0277023315429688, "learning_rate": 0.0002, "epoch": 4.993895870736086, "step": 69540}, {"loss": 0.6369, "grad_norm": 1.0291162729263306, "learning_rate": 0.0002, "epoch": 4.994614003590664, "step": 69550}, {"loss": 0.5842, "grad_norm": 0.8435685634613037, "learning_rate": 0.0002, "epoch": 4.995332136445242, "step": 69560}, {"loss": 0.6146, "grad_norm": 1.1972291469573975, "learning_rate": 0.0002, "epoch": 4.99605026929982, "step": 69570}, {"loss": 0.5977, "grad_norm": 0.8114907741546631, "learning_rate": 0.0002, "epoch": 4.996768402154398, "step": 69580}, {"loss": 0.6137, "grad_norm": 0.8296133875846863, "learning_rate": 0.0002, "epoch": 4.997486535008977, "step": 69590}, {"loss": 0.6273, "grad_norm": 1.1728706359863281, "learning_rate": 0.0002, "epoch": 4.998204667863555, "step": 69600}, {"loss": 0.6579, "grad_norm": 0.9586578607559204, "learning_rate": 0.0002, "epoch": 4.998922800718133, "step": 69610}, {"loss": 0.612, "grad_norm": 0.9725151062011719, "learning_rate": 0.0002, "epoch": 4.999640933572711, "step": 69620}]} +{"epoch": 6.0, "step": 83550, "epoch_duration": 14995.9871134758, "total_accumulated_duration": 93799.76734352112, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}, {"eval_loss": 1.1079821586608887, "eval_runtime": 55.1897, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 3.0, "step": 41775}, {"loss": 0.6472, "grad_norm": 0.8630077838897705, "learning_rate": 0.0002, "epoch": 3.000359066427289, "step": 41780}, {"loss": 0.5843, "grad_norm": 0.8901806473731995, "learning_rate": 0.0002, "epoch": 3.001077199281867, "step": 41790}, {"loss": 0.5789, "grad_norm": 0.8291767835617065, "learning_rate": 0.0002, "epoch": 3.0017953321364454, "step": 41800}, {"loss": 0.6049, "grad_norm": 0.792519211769104, "learning_rate": 0.0002, "epoch": 3.0025134649910235, "step": 41810}, {"loss": 0.6131, "grad_norm": 1.1330063343048096, "learning_rate": 0.0002, "epoch": 3.0032315978456015, "step": 41820}, {"loss": 0.6225, "grad_norm": 0.9401350617408752, "learning_rate": 0.0002, "epoch": 3.0039497307001795, "step": 41830}, {"loss": 0.5924, "grad_norm": 0.8065463304519653, "learning_rate": 0.0002, "epoch": 3.0046678635547575, "step": 41840}, {"loss": 0.6161, "grad_norm": 0.8309979438781738, "learning_rate": 0.0002, "epoch": 3.005385996409336, "step": 41850}, {"loss": 0.6099, "grad_norm": 0.7432689070701599, "learning_rate": 0.0002, "epoch": 3.006104129263914, "step": 41860}, {"loss": 0.5901, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 3.006822262118492, "step": 41870}, {"loss": 0.6211, "grad_norm": 1.4364255666732788, "learning_rate": 0.0002, "epoch": 3.00754039497307, "step": 41880}, {"loss": 0.5988, "grad_norm": 0.9023072123527527, "learning_rate": 0.0002, "epoch": 3.008258527827648, "step": 41890}, {"loss": 0.6296, "grad_norm": 0.7790587544441223, "learning_rate": 0.0002, "epoch": 3.0089766606822264, "step": 41900}, {"loss": 0.5908, "grad_norm": 0.9163706302642822, "learning_rate": 0.0002, "epoch": 3.0096947935368044, "step": 41910}, {"loss": 0.6216, "grad_norm": 0.8147963285446167, "learning_rate": 0.0002, "epoch": 3.0104129263913824, "step": 41920}, {"loss": 0.6546, "grad_norm": 0.8432748913764954, "learning_rate": 0.0002, "epoch": 3.0111310592459604, "step": 41930}, {"loss": 0.5815, "grad_norm": 0.9216182231903076, "learning_rate": 0.0002, "epoch": 3.011849192100539, "step": 41940}, {"loss": 0.6336, "grad_norm": 0.62154221534729, "learning_rate": 0.0002, "epoch": 3.012567324955117, "step": 41950}, {"loss": 0.5868, "grad_norm": 0.8902392387390137, "learning_rate": 0.0002, "epoch": 3.013285457809695, "step": 41960}, {"loss": 0.6205, "grad_norm": 0.9601083993911743, "learning_rate": 0.0002, "epoch": 3.014003590664273, "step": 41970}, {"loss": 0.6001, "grad_norm": 0.8938809037208557, "learning_rate": 0.0002, "epoch": 3.014721723518851, "step": 41980}, {"loss": 0.6215, "grad_norm": 1.0621999502182007, "learning_rate": 0.0002, "epoch": 3.0154398563734293, "step": 41990}, {"loss": 0.6453, "grad_norm": 0.7310585379600525, "learning_rate": 0.0002, "epoch": 3.0161579892280073, "step": 42000}, {"loss": 0.5674, "grad_norm": 0.8475853800773621, "learning_rate": 0.0002, "epoch": 3.0168761220825853, "step": 42010}, {"loss": 0.605, "grad_norm": 0.8509864807128906, "learning_rate": 0.0002, "epoch": 3.0175942549371633, "step": 42020}, {"loss": 0.6487, "grad_norm": 0.7461876273155212, "learning_rate": 0.0002, "epoch": 3.0183123877917413, "step": 42030}, {"loss": 0.6136, "grad_norm": 0.7734265327453613, "learning_rate": 0.0002, "epoch": 3.0190305206463197, "step": 42040}, {"loss": 0.6073, "grad_norm": 0.9056455492973328, "learning_rate": 0.0002, "epoch": 3.0197486535008977, "step": 42050}, {"loss": 0.6015, "grad_norm": 0.9183889031410217, "learning_rate": 0.0002, "epoch": 3.0204667863554757, "step": 42060}, {"loss": 0.6502, "grad_norm": 1.0777326822280884, "learning_rate": 0.0002, "epoch": 3.0211849192100537, "step": 42070}, {"loss": 0.6775, "grad_norm": 0.9217308163642883, "learning_rate": 0.0002, "epoch": 3.021903052064632, "step": 42080}, {"loss": 0.6157, "grad_norm": 0.8220202326774597, "learning_rate": 0.0002, "epoch": 3.02262118491921, "step": 42090}, {"loss": 0.5786, "grad_norm": 0.8454978466033936, "learning_rate": 0.0002, "epoch": 3.023339317773788, "step": 42100}, {"loss": 0.5653, "grad_norm": 0.8116370439529419, "learning_rate": 0.0002, "epoch": 3.024057450628366, "step": 42110}, {"loss": 0.6307, "grad_norm": 0.8064935207366943, "learning_rate": 0.0002, "epoch": 3.024775583482944, "step": 42120}, {"loss": 0.6567, "grad_norm": 0.9718650579452515, "learning_rate": 0.0002, "epoch": 3.0254937163375226, "step": 42130}, {"loss": 0.5936, "grad_norm": 0.8817588090896606, "learning_rate": 0.0002, "epoch": 3.0262118491921006, "step": 42140}, {"loss": 0.5625, "grad_norm": 0.7757318615913391, "learning_rate": 0.0002, "epoch": 3.0269299820466786, "step": 42150}, {"loss": 0.5704, "grad_norm": 0.7500545382499695, "learning_rate": 0.0002, "epoch": 3.0276481149012566, "step": 42160}, {"loss": 0.5635, "grad_norm": 0.72913658618927, "learning_rate": 0.0002, "epoch": 3.0283662477558346, "step": 42170}, {"loss": 0.6354, "grad_norm": 0.7641891837120056, "learning_rate": 0.0002, "epoch": 3.029084380610413, "step": 42180}, {"loss": 0.621, "grad_norm": 0.7682021856307983, "learning_rate": 0.0002, "epoch": 3.029802513464991, "step": 42190}, {"loss": 0.6377, "grad_norm": 0.8145958781242371, "learning_rate": 0.0002, "epoch": 3.030520646319569, "step": 42200}, {"loss": 0.6008, "grad_norm": 1.0546396970748901, "learning_rate": 0.0002, "epoch": 3.031238779174147, "step": 42210}, {"loss": 0.6177, "grad_norm": 0.8222804665565491, "learning_rate": 0.0002, "epoch": 3.0319569120287255, "step": 42220}, {"loss": 0.6264, "grad_norm": 0.8245829343795776, "learning_rate": 0.0002, "epoch": 3.0326750448833035, "step": 42230}, {"loss": 0.5828, "grad_norm": 0.9059963822364807, "learning_rate": 0.0002, "epoch": 3.0333931777378815, "step": 42240}, {"loss": 0.6373, "grad_norm": 1.026747465133667, "learning_rate": 0.0002, "epoch": 3.0341113105924595, "step": 42250}, {"loss": 0.636, "grad_norm": 0.9108404517173767, "learning_rate": 0.0002, "epoch": 3.0348294434470375, "step": 42260}, {"loss": 0.589, "grad_norm": 0.9828516840934753, "learning_rate": 0.0002, "epoch": 3.035547576301616, "step": 42270}, {"loss": 0.6558, "grad_norm": 0.9664266705513, "learning_rate": 0.0002, "epoch": 3.036265709156194, "step": 42280}, {"loss": 0.6157, "grad_norm": 0.7577654719352722, "learning_rate": 0.0002, "epoch": 3.036983842010772, "step": 42290}, {"loss": 0.5849, "grad_norm": 0.8331853151321411, "learning_rate": 0.0002, "epoch": 3.03770197486535, "step": 42300}, {"loss": 0.6335, "grad_norm": 0.8017228245735168, "learning_rate": 0.0002, "epoch": 3.038420107719928, "step": 42310}, {"loss": 0.6148, "grad_norm": 1.0316718816757202, "learning_rate": 0.0002, "epoch": 3.0391382405745064, "step": 42320}, {"loss": 0.5934, "grad_norm": 0.9379803538322449, "learning_rate": 0.0002, "epoch": 3.0398563734290844, "step": 42330}, {"loss": 0.6358, "grad_norm": 0.7554476857185364, "learning_rate": 0.0002, "epoch": 3.0405745062836624, "step": 42340}, {"loss": 0.5951, "grad_norm": 0.7377917766571045, "learning_rate": 0.0002, "epoch": 3.0412926391382404, "step": 42350}, {"loss": 0.5769, "grad_norm": 1.0655276775360107, "learning_rate": 0.0002, "epoch": 3.042010771992819, "step": 42360}, {"loss": 0.5892, "grad_norm": 0.7748511433601379, "learning_rate": 0.0002, "epoch": 3.042728904847397, "step": 42370}, {"loss": 0.6512, "grad_norm": 0.848649799823761, "learning_rate": 0.0002, "epoch": 3.043447037701975, "step": 42380}, {"loss": 0.6411, "grad_norm": 0.7754636406898499, "learning_rate": 0.0002, "epoch": 3.044165170556553, "step": 42390}, {"loss": 0.6665, "grad_norm": 0.8173656463623047, "learning_rate": 0.0002, "epoch": 3.044883303411131, "step": 42400}, {"loss": 0.5877, "grad_norm": 0.7881983518600464, "learning_rate": 0.0002, "epoch": 3.0456014362657093, "step": 42410}, {"loss": 0.5832, "grad_norm": 0.971072256565094, "learning_rate": 0.0002, "epoch": 3.0463195691202873, "step": 42420}, {"loss": 0.6303, "grad_norm": 0.8400143384933472, "learning_rate": 0.0002, "epoch": 3.0470377019748653, "step": 42430}, {"loss": 0.6557, "grad_norm": 1.0028647184371948, "learning_rate": 0.0002, "epoch": 3.0477558348294433, "step": 42440}, {"loss": 0.5949, "grad_norm": 0.9728034734725952, "learning_rate": 0.0002, "epoch": 3.0484739676840213, "step": 42450}, {"loss": 0.6222, "grad_norm": 0.937633752822876, "learning_rate": 0.0002, "epoch": 3.0491921005386, "step": 42460}, {"loss": 0.6254, "grad_norm": 1.0265642404556274, "learning_rate": 0.0002, "epoch": 3.049910233393178, "step": 42470}, {"loss": 0.6078, "grad_norm": 0.9733216762542725, "learning_rate": 0.0002, "epoch": 3.050628366247756, "step": 42480}, {"loss": 0.5766, "grad_norm": 0.7039174437522888, "learning_rate": 0.0002, "epoch": 3.051346499102334, "step": 42490}, {"loss": 0.6422, "grad_norm": 0.7515231370925903, "learning_rate": 0.0002, "epoch": 3.0520646319569122, "step": 42500}, {"loss": 0.5517, "grad_norm": 0.9115300178527832, "learning_rate": 0.0002, "epoch": 3.0527827648114902, "step": 42510}, {"loss": 0.6738, "grad_norm": 0.7403655648231506, "learning_rate": 0.0002, "epoch": 3.0535008976660682, "step": 42520}, {"loss": 0.5528, "grad_norm": 0.7826810479164124, "learning_rate": 0.0002, "epoch": 3.0542190305206462, "step": 42530}, {"loss": 0.6513, "grad_norm": 0.8007349371910095, "learning_rate": 0.0002, "epoch": 3.0549371633752243, "step": 42540}, {"loss": 0.6118, "grad_norm": 0.7975959777832031, "learning_rate": 0.0002, "epoch": 3.0556552962298027, "step": 42550}, {"loss": 0.6157, "grad_norm": 0.9665228128433228, "learning_rate": 0.0002, "epoch": 3.0563734290843807, "step": 42560}, {"loss": 0.6095, "grad_norm": 0.8386123180389404, "learning_rate": 0.0002, "epoch": 3.0570915619389587, "step": 42570}, {"loss": 0.64, "grad_norm": 0.7437782287597656, "learning_rate": 0.0002, "epoch": 3.0578096947935367, "step": 42580}, {"loss": 0.6399, "grad_norm": 0.8360698223114014, "learning_rate": 0.0002, "epoch": 3.0585278276481147, "step": 42590}, {"loss": 0.6259, "grad_norm": 0.8982073664665222, "learning_rate": 0.0002, "epoch": 3.059245960502693, "step": 42600}, {"loss": 0.6235, "grad_norm": 0.9425758719444275, "learning_rate": 0.0002, "epoch": 3.059964093357271, "step": 42610}, {"loss": 0.631, "grad_norm": 0.8567131161689758, "learning_rate": 0.0002, "epoch": 3.060682226211849, "step": 42620}, {"loss": 0.609, "grad_norm": 0.9322942495346069, "learning_rate": 0.0002, "epoch": 3.061400359066427, "step": 42630}, {"loss": 0.6384, "grad_norm": 0.8283235430717468, "learning_rate": 0.0002, "epoch": 3.0621184919210056, "step": 42640}, {"loss": 0.6345, "grad_norm": 0.8457967638969421, "learning_rate": 0.0002, "epoch": 3.0628366247755836, "step": 42650}, {"loss": 0.631, "grad_norm": 0.8205100893974304, "learning_rate": 0.0002, "epoch": 3.0635547576301616, "step": 42660}, {"loss": 0.6094, "grad_norm": 0.8385181427001953, "learning_rate": 0.0002, "epoch": 3.0642728904847396, "step": 42670}, {"loss": 0.6169, "grad_norm": 1.2959390878677368, "learning_rate": 0.0002, "epoch": 3.0649910233393176, "step": 42680}, {"loss": 0.6531, "grad_norm": 0.7150540351867676, "learning_rate": 0.0002, "epoch": 3.065709156193896, "step": 42690}, {"loss": 0.6456, "grad_norm": 0.6647360920906067, "learning_rate": 0.0002, "epoch": 3.066427289048474, "step": 42700}, {"loss": 0.6151, "grad_norm": 0.9148316979408264, "learning_rate": 0.0002, "epoch": 3.067145421903052, "step": 42710}, {"loss": 0.6298, "grad_norm": 0.8606209754943848, "learning_rate": 0.0002, "epoch": 3.06786355475763, "step": 42720}, {"loss": 0.636, "grad_norm": 1.4255632162094116, "learning_rate": 0.0002, "epoch": 3.068581687612208, "step": 42730}, {"loss": 0.6363, "grad_norm": 0.9131710529327393, "learning_rate": 0.0002, "epoch": 3.0692998204667865, "step": 42740}, {"loss": 0.6432, "grad_norm": 0.9560360908508301, "learning_rate": 0.0002, "epoch": 3.0700179533213645, "step": 42750}, {"loss": 0.6259, "grad_norm": 0.9278100728988647, "learning_rate": 0.0002, "epoch": 3.0707360861759425, "step": 42760}, {"loss": 0.6001, "grad_norm": 0.7258471846580505, "learning_rate": 0.0002, "epoch": 3.0714542190305205, "step": 42770}, {"loss": 0.6447, "grad_norm": 1.1537690162658691, "learning_rate": 0.0002, "epoch": 3.072172351885099, "step": 42780}, {"loss": 0.6237, "grad_norm": 0.8562588691711426, "learning_rate": 0.0002, "epoch": 3.072890484739677, "step": 42790}, {"loss": 0.645, "grad_norm": 1.0271626710891724, "learning_rate": 0.0002, "epoch": 3.073608617594255, "step": 42800}, {"loss": 0.6782, "grad_norm": 0.85148024559021, "learning_rate": 0.0002, "epoch": 3.074326750448833, "step": 42810}, {"loss": 0.5905, "grad_norm": 0.805772602558136, "learning_rate": 0.0002, "epoch": 3.075044883303411, "step": 42820}, {"loss": 0.623, "grad_norm": 0.8057122230529785, "learning_rate": 0.0002, "epoch": 3.0757630161579894, "step": 42830}, {"loss": 0.6391, "grad_norm": 0.7997274994850159, "learning_rate": 0.0002, "epoch": 3.0764811490125674, "step": 42840}, {"loss": 0.5965, "grad_norm": 0.8739321231842041, "learning_rate": 0.0002, "epoch": 3.0771992818671454, "step": 42850}, {"loss": 0.6027, "grad_norm": 0.833951473236084, "learning_rate": 0.0002, "epoch": 3.0779174147217234, "step": 42860}, {"loss": 0.6251, "grad_norm": 0.8813839554786682, "learning_rate": 0.0002, "epoch": 3.0786355475763014, "step": 42870}, {"loss": 0.6485, "grad_norm": 0.9020521640777588, "learning_rate": 0.0002, "epoch": 3.07935368043088, "step": 42880}, {"loss": 0.5719, "grad_norm": 0.888148844242096, "learning_rate": 0.0002, "epoch": 3.080071813285458, "step": 42890}, {"loss": 0.6715, "grad_norm": 0.8110589385032654, "learning_rate": 0.0002, "epoch": 3.080789946140036, "step": 42900}, {"loss": 0.5931, "grad_norm": 0.818738579750061, "learning_rate": 0.0002, "epoch": 3.081508078994614, "step": 42910}, {"loss": 0.6723, "grad_norm": 0.9607479572296143, "learning_rate": 0.0002, "epoch": 3.082226211849192, "step": 42920}, {"loss": 0.6045, "grad_norm": 0.8162698745727539, "learning_rate": 0.0002, "epoch": 3.0829443447037703, "step": 42930}, {"loss": 0.5975, "grad_norm": 0.8170801997184753, "learning_rate": 0.0002, "epoch": 3.0836624775583483, "step": 42940}, {"loss": 0.5748, "grad_norm": 0.9250763654708862, "learning_rate": 0.0002, "epoch": 3.0843806104129263, "step": 42950}, {"loss": 0.6651, "grad_norm": 0.898097813129425, "learning_rate": 0.0002, "epoch": 3.0850987432675043, "step": 42960}, {"loss": 0.6573, "grad_norm": 0.9398433566093445, "learning_rate": 0.0002, "epoch": 3.0858168761220828, "step": 42970}, {"loss": 0.6243, "grad_norm": 1.052808165550232, "learning_rate": 0.0002, "epoch": 3.0865350089766608, "step": 42980}, {"loss": 0.6622, "grad_norm": 0.8974723219871521, "learning_rate": 0.0002, "epoch": 3.087253141831239, "step": 42990}, {"loss": 0.6135, "grad_norm": 0.7517408728599548, "learning_rate": 0.0002, "epoch": 3.087971274685817, "step": 43000}, {"loss": 0.6185, "grad_norm": 0.8054485321044922, "learning_rate": 0.0002, "epoch": 3.088689407540395, "step": 43010}, {"loss": 0.6199, "grad_norm": 0.9896154999732971, "learning_rate": 0.0002, "epoch": 3.0894075403949732, "step": 43020}, {"loss": 0.6308, "grad_norm": 0.7887356281280518, "learning_rate": 0.0002, "epoch": 3.0901256732495512, "step": 43030}, {"loss": 0.6173, "grad_norm": 1.0119125843048096, "learning_rate": 0.0002, "epoch": 3.0908438061041292, "step": 43040}, {"loss": 0.6294, "grad_norm": 0.8753892779350281, "learning_rate": 0.0002, "epoch": 3.0915619389587072, "step": 43050}, {"loss": 0.6068, "grad_norm": 0.8322654962539673, "learning_rate": 0.0002, "epoch": 3.0922800718132857, "step": 43060}, {"loss": 0.6237, "grad_norm": 1.0605992078781128, "learning_rate": 0.0002, "epoch": 3.0929982046678637, "step": 43070}, {"loss": 0.6507, "grad_norm": 0.8783912062644958, "learning_rate": 0.0002, "epoch": 3.0937163375224417, "step": 43080}, {"loss": 0.6023, "grad_norm": 0.8839107751846313, "learning_rate": 0.0002, "epoch": 3.0944344703770197, "step": 43090}, {"loss": 0.6588, "grad_norm": 1.1655086278915405, "learning_rate": 0.0002, "epoch": 3.0951526032315977, "step": 43100}, {"loss": 0.6367, "grad_norm": 0.7051523327827454, "learning_rate": 0.0002, "epoch": 3.095870736086176, "step": 43110}, {"loss": 0.5941, "grad_norm": 0.7793807983398438, "learning_rate": 0.0002, "epoch": 3.096588868940754, "step": 43120}, {"loss": 0.6073, "grad_norm": 0.8352194428443909, "learning_rate": 0.0002, "epoch": 3.097307001795332, "step": 43130}, {"loss": 0.6087, "grad_norm": 0.9684847593307495, "learning_rate": 0.0002, "epoch": 3.09802513464991, "step": 43140}, {"loss": 0.6347, "grad_norm": 1.1106340885162354, "learning_rate": 0.0002, "epoch": 3.098743267504488, "step": 43150}, {"loss": 0.6395, "grad_norm": 0.7814911603927612, "learning_rate": 0.0002, "epoch": 3.0994614003590666, "step": 43160}, {"loss": 0.637, "grad_norm": 0.7923110723495483, "learning_rate": 0.0002, "epoch": 3.1001795332136446, "step": 43170}, {"loss": 0.6218, "grad_norm": 0.87022864818573, "learning_rate": 0.0002, "epoch": 3.1008976660682226, "step": 43180}, {"loss": 0.6246, "grad_norm": 0.9352855682373047, "learning_rate": 0.0002, "epoch": 3.1016157989228006, "step": 43190}, {"loss": 0.5943, "grad_norm": 0.8548445105552673, "learning_rate": 0.0002, "epoch": 3.1023339317773786, "step": 43200}, {"loss": 0.6106, "grad_norm": 0.9576025009155273, "learning_rate": 0.0002, "epoch": 3.103052064631957, "step": 43210}, {"loss": 0.6222, "grad_norm": 0.7430430054664612, "learning_rate": 0.0002, "epoch": 3.103770197486535, "step": 43220}, {"loss": 0.6223, "grad_norm": 0.9619144797325134, "learning_rate": 0.0002, "epoch": 3.104488330341113, "step": 43230}, {"loss": 0.6171, "grad_norm": 0.8622338771820068, "learning_rate": 0.0002, "epoch": 3.105206463195691, "step": 43240}, {"loss": 0.6336, "grad_norm": 0.853489339351654, "learning_rate": 0.0002, "epoch": 3.1059245960502695, "step": 43250}, {"loss": 0.635, "grad_norm": 0.9253206849098206, "learning_rate": 0.0002, "epoch": 3.1066427289048475, "step": 43260}, {"loss": 0.68, "grad_norm": 0.9700671434402466, "learning_rate": 0.0002, "epoch": 3.1073608617594255, "step": 43270}, {"loss": 0.6284, "grad_norm": 1.0550731420516968, "learning_rate": 0.0002, "epoch": 3.1080789946140035, "step": 43280}, {"loss": 0.6389, "grad_norm": 0.939452052116394, "learning_rate": 0.0002, "epoch": 3.1087971274685815, "step": 43290}, {"loss": 0.621, "grad_norm": 0.8855276107788086, "learning_rate": 0.0002, "epoch": 3.10951526032316, "step": 43300}, {"loss": 0.5814, "grad_norm": 0.92197185754776, "learning_rate": 0.0002, "epoch": 3.110233393177738, "step": 43310}, {"loss": 0.6341, "grad_norm": 0.8825578689575195, "learning_rate": 0.0002, "epoch": 3.110951526032316, "step": 43320}, {"loss": 0.6412, "grad_norm": 0.9964608550071716, "learning_rate": 0.0002, "epoch": 3.111669658886894, "step": 43330}, {"loss": 0.6074, "grad_norm": 0.9070520401000977, "learning_rate": 0.0002, "epoch": 3.1123877917414724, "step": 43340}, {"loss": 0.6503, "grad_norm": 0.9699633717536926, "learning_rate": 0.0002, "epoch": 3.1131059245960504, "step": 43350}, {"loss": 0.6545, "grad_norm": 0.7384091019630432, "learning_rate": 0.0002, "epoch": 3.1138240574506284, "step": 43360}, {"loss": 0.6644, "grad_norm": 0.9445326328277588, "learning_rate": 0.0002, "epoch": 3.1145421903052064, "step": 43370}, {"loss": 0.6088, "grad_norm": 0.8906524181365967, "learning_rate": 0.0002, "epoch": 3.1152603231597844, "step": 43380}, {"loss": 0.6213, "grad_norm": 0.8850129246711731, "learning_rate": 0.0002, "epoch": 3.115978456014363, "step": 43390}, {"loss": 0.6156, "grad_norm": 0.7091860771179199, "learning_rate": 0.0002, "epoch": 3.116696588868941, "step": 43400}, {"loss": 0.6056, "grad_norm": 0.8992764949798584, "learning_rate": 0.0002, "epoch": 3.117414721723519, "step": 43410}, {"loss": 0.6336, "grad_norm": 0.9166698455810547, "learning_rate": 0.0002, "epoch": 3.118132854578097, "step": 43420}, {"loss": 0.7011, "grad_norm": 1.1195749044418335, "learning_rate": 0.0002, "epoch": 3.118850987432675, "step": 43430}, {"loss": 0.6409, "grad_norm": 0.9414069652557373, "learning_rate": 0.0002, "epoch": 3.1195691202872533, "step": 43440}, {"loss": 0.6533, "grad_norm": 0.7641217112541199, "learning_rate": 0.0002, "epoch": 3.1202872531418313, "step": 43450}, {"loss": 0.6613, "grad_norm": 1.2659285068511963, "learning_rate": 0.0002, "epoch": 3.1210053859964093, "step": 43460}, {"loss": 0.631, "grad_norm": 0.9968213438987732, "learning_rate": 0.0002, "epoch": 3.1217235188509873, "step": 43470}, {"loss": 0.5833, "grad_norm": 0.8819042444229126, "learning_rate": 0.0002, "epoch": 3.1224416517055653, "step": 43480}, {"loss": 0.6819, "grad_norm": 0.9124775528907776, "learning_rate": 0.0002, "epoch": 3.1231597845601438, "step": 43490}, {"loss": 0.675, "grad_norm": 0.868354082107544, "learning_rate": 0.0002, "epoch": 3.1238779174147218, "step": 43500}, {"loss": 0.6348, "grad_norm": 0.7367526292800903, "learning_rate": 0.0002, "epoch": 3.1245960502692998, "step": 43510}, {"loss": 0.6068, "grad_norm": 0.7553679943084717, "learning_rate": 0.0002, "epoch": 3.1253141831238778, "step": 43520}, {"loss": 0.6346, "grad_norm": 0.7970008850097656, "learning_rate": 0.0002, "epoch": 3.126032315978456, "step": 43530}, {"loss": 0.6357, "grad_norm": 0.9117488861083984, "learning_rate": 0.0002, "epoch": 3.126750448833034, "step": 43540}, {"loss": 0.6609, "grad_norm": 0.8004103899002075, "learning_rate": 0.0002, "epoch": 3.127468581687612, "step": 43550}, {"loss": 0.596, "grad_norm": 0.736518919467926, "learning_rate": 0.0002, "epoch": 3.12818671454219, "step": 43560}, {"loss": 0.5945, "grad_norm": 0.8568395376205444, "learning_rate": 0.0002, "epoch": 3.128904847396768, "step": 43570}, {"loss": 0.665, "grad_norm": 0.9344052672386169, "learning_rate": 0.0002, "epoch": 3.1296229802513467, "step": 43580}, {"loss": 0.6403, "grad_norm": 0.7986525297164917, "learning_rate": 0.0002, "epoch": 3.1303411131059247, "step": 43590}, {"loss": 0.61, "grad_norm": 0.8283242583274841, "learning_rate": 0.0002, "epoch": 3.1310592459605027, "step": 43600}, {"loss": 0.6003, "grad_norm": 0.6534292101860046, "learning_rate": 0.0002, "epoch": 3.1317773788150807, "step": 43610}, {"loss": 0.6994, "grad_norm": 0.9585428833961487, "learning_rate": 0.0002, "epoch": 3.132495511669659, "step": 43620}, {"loss": 0.6007, "grad_norm": 0.8299157023429871, "learning_rate": 0.0002, "epoch": 3.133213644524237, "step": 43630}, {"loss": 0.6169, "grad_norm": 0.9050052762031555, "learning_rate": 0.0002, "epoch": 3.133931777378815, "step": 43640}, {"loss": 0.6217, "grad_norm": 1.0457062721252441, "learning_rate": 0.0002, "epoch": 3.134649910233393, "step": 43650}, {"loss": 0.6147, "grad_norm": 0.907691240310669, "learning_rate": 0.0002, "epoch": 3.135368043087971, "step": 43660}, {"loss": 0.5808, "grad_norm": 0.8868935108184814, "learning_rate": 0.0002, "epoch": 3.1360861759425496, "step": 43670}, {"loss": 0.6427, "grad_norm": 0.8585456609725952, "learning_rate": 0.0002, "epoch": 3.1368043087971276, "step": 43680}, {"loss": 0.6242, "grad_norm": 1.0402741432189941, "learning_rate": 0.0002, "epoch": 3.1375224416517056, "step": 43690}, {"loss": 0.641, "grad_norm": 1.0866798162460327, "learning_rate": 0.0002, "epoch": 3.1382405745062836, "step": 43700}, {"loss": 0.6082, "grad_norm": 0.7637296915054321, "learning_rate": 0.0002, "epoch": 3.1389587073608616, "step": 43710}, {"loss": 0.6256, "grad_norm": 0.755235493183136, "learning_rate": 0.0002, "epoch": 3.13967684021544, "step": 43720}, {"loss": 0.6441, "grad_norm": 0.7258853316307068, "learning_rate": 0.0002, "epoch": 3.140394973070018, "step": 43730}, {"loss": 0.5891, "grad_norm": 1.0425268411636353, "learning_rate": 0.0002, "epoch": 3.141113105924596, "step": 43740}, {"loss": 0.6527, "grad_norm": 0.9171959757804871, "learning_rate": 0.0002, "epoch": 3.141831238779174, "step": 43750}, {"loss": 0.6365, "grad_norm": 0.8900150656700134, "learning_rate": 0.0002, "epoch": 3.142549371633752, "step": 43760}, {"loss": 0.6324, "grad_norm": 0.9879246354103088, "learning_rate": 0.0002, "epoch": 3.1432675044883305, "step": 43770}, {"loss": 0.6624, "grad_norm": 0.7853389382362366, "learning_rate": 0.0002, "epoch": 3.1439856373429085, "step": 43780}, {"loss": 0.6259, "grad_norm": 1.0245232582092285, "learning_rate": 0.0002, "epoch": 3.1447037701974865, "step": 43790}, {"loss": 0.6278, "grad_norm": 0.8486390113830566, "learning_rate": 0.0002, "epoch": 3.1454219030520645, "step": 43800}, {"loss": 0.6175, "grad_norm": 0.8536406755447388, "learning_rate": 0.0002, "epoch": 3.146140035906643, "step": 43810}, {"loss": 0.5901, "grad_norm": 0.9653734564781189, "learning_rate": 0.0002, "epoch": 3.146858168761221, "step": 43820}, {"loss": 0.6041, "grad_norm": 0.8292608857154846, "learning_rate": 0.0002, "epoch": 3.147576301615799, "step": 43830}, {"loss": 0.6688, "grad_norm": 1.147524118423462, "learning_rate": 0.0002, "epoch": 3.148294434470377, "step": 43840}, {"loss": 0.6155, "grad_norm": 0.9317546486854553, "learning_rate": 0.0002, "epoch": 3.149012567324955, "step": 43850}, {"loss": 0.6305, "grad_norm": 0.8651045560836792, "learning_rate": 0.0002, "epoch": 3.1497307001795334, "step": 43860}, {"loss": 0.5985, "grad_norm": 0.8718969225883484, "learning_rate": 0.0002, "epoch": 3.1504488330341114, "step": 43870}, {"loss": 0.6206, "grad_norm": 1.0140702724456787, "learning_rate": 0.0002, "epoch": 3.1511669658886894, "step": 43880}, {"loss": 0.5941, "grad_norm": 0.75941401720047, "learning_rate": 0.0002, "epoch": 3.1518850987432674, "step": 43890}, {"loss": 0.5957, "grad_norm": 0.6618940234184265, "learning_rate": 0.0002, "epoch": 3.152603231597846, "step": 43900}, {"loss": 0.6262, "grad_norm": 1.0013338327407837, "learning_rate": 0.0002, "epoch": 3.153321364452424, "step": 43910}, {"loss": 0.6263, "grad_norm": 0.8735299706459045, "learning_rate": 0.0002, "epoch": 3.154039497307002, "step": 43920}, {"loss": 0.627, "grad_norm": 1.141914963722229, "learning_rate": 0.0002, "epoch": 3.15475763016158, "step": 43930}, {"loss": 0.6604, "grad_norm": 1.0916038751602173, "learning_rate": 0.0002, "epoch": 3.155475763016158, "step": 43940}, {"loss": 0.6228, "grad_norm": 0.7042547464370728, "learning_rate": 0.0002, "epoch": 3.1561938958707363, "step": 43950}, {"loss": 0.6069, "grad_norm": 0.9885236620903015, "learning_rate": 0.0002, "epoch": 3.1569120287253143, "step": 43960}, {"loss": 0.5973, "grad_norm": 0.8083009719848633, "learning_rate": 0.0002, "epoch": 3.1576301615798923, "step": 43970}, {"loss": 0.6416, "grad_norm": 1.082627296447754, "learning_rate": 0.0002, "epoch": 3.1583482944344703, "step": 43980}, {"loss": 0.624, "grad_norm": 0.9293290376663208, "learning_rate": 0.0002, "epoch": 3.1590664272890483, "step": 43990}, {"loss": 0.5665, "grad_norm": 0.861003041267395, "learning_rate": 0.0002, "epoch": 3.1597845601436267, "step": 44000}, {"loss": 0.6221, "grad_norm": 0.9565994143486023, "learning_rate": 0.0002, "epoch": 3.1605026929982047, "step": 44010}, {"loss": 0.7038, "grad_norm": 0.9609305262565613, "learning_rate": 0.0002, "epoch": 3.1612208258527827, "step": 44020}, {"loss": 0.6064, "grad_norm": 0.847830593585968, "learning_rate": 0.0002, "epoch": 3.1619389587073607, "step": 44030}, {"loss": 0.6299, "grad_norm": 0.852357804775238, "learning_rate": 0.0002, "epoch": 3.1626570915619387, "step": 44040}, {"loss": 0.5943, "grad_norm": 0.8634562492370605, "learning_rate": 0.0002, "epoch": 3.163375224416517, "step": 44050}, {"loss": 0.6011, "grad_norm": 1.0259950160980225, "learning_rate": 0.0002, "epoch": 3.164093357271095, "step": 44060}, {"loss": 0.7039, "grad_norm": 0.9615250825881958, "learning_rate": 0.0002, "epoch": 3.164811490125673, "step": 44070}, {"loss": 0.6179, "grad_norm": 0.9892165660858154, "learning_rate": 0.0002, "epoch": 3.165529622980251, "step": 44080}, {"loss": 0.6295, "grad_norm": 0.8827354907989502, "learning_rate": 0.0002, "epoch": 3.1662477558348296, "step": 44090}, {"loss": 0.6131, "grad_norm": 0.9258168339729309, "learning_rate": 0.0002, "epoch": 3.1669658886894076, "step": 44100}, {"loss": 0.5746, "grad_norm": 0.7983399033546448, "learning_rate": 0.0002, "epoch": 3.1676840215439857, "step": 44110}, {"loss": 0.6075, "grad_norm": 0.9917809963226318, "learning_rate": 0.0002, "epoch": 3.1684021543985637, "step": 44120}, {"loss": 0.6474, "grad_norm": 1.058927297592163, "learning_rate": 0.0002, "epoch": 3.1691202872531417, "step": 44130}, {"loss": 0.6211, "grad_norm": 1.0095895528793335, "learning_rate": 0.0002, "epoch": 3.16983842010772, "step": 44140}, {"loss": 0.6586, "grad_norm": 0.9032495617866516, "learning_rate": 0.0002, "epoch": 3.170556552962298, "step": 44150}, {"loss": 0.6356, "grad_norm": 0.9391272664070129, "learning_rate": 0.0002, "epoch": 3.171274685816876, "step": 44160}, {"loss": 0.6324, "grad_norm": 0.990755558013916, "learning_rate": 0.0002, "epoch": 3.171992818671454, "step": 44170}, {"loss": 0.5647, "grad_norm": 0.9310759902000427, "learning_rate": 0.0002, "epoch": 3.172710951526032, "step": 44180}, {"loss": 0.6802, "grad_norm": 0.7698856592178345, "learning_rate": 0.0002, "epoch": 3.1734290843806106, "step": 44190}, {"loss": 0.6109, "grad_norm": 0.7735867500305176, "learning_rate": 0.0002, "epoch": 3.1741472172351886, "step": 44200}, {"loss": 0.6252, "grad_norm": 1.1447525024414062, "learning_rate": 0.0002, "epoch": 3.1748653500897666, "step": 44210}, {"loss": 0.6268, "grad_norm": 0.8667060136795044, "learning_rate": 0.0002, "epoch": 3.1755834829443446, "step": 44220}, {"loss": 0.6066, "grad_norm": 0.8596829771995544, "learning_rate": 0.0002, "epoch": 3.176301615798923, "step": 44230}, {"loss": 0.6142, "grad_norm": 0.8607654571533203, "learning_rate": 0.0002, "epoch": 3.177019748653501, "step": 44240}, {"loss": 0.6358, "grad_norm": 0.9346948266029358, "learning_rate": 0.0002, "epoch": 3.177737881508079, "step": 44250}, {"loss": 0.6099, "grad_norm": 0.852344810962677, "learning_rate": 0.0002, "epoch": 3.178456014362657, "step": 44260}, {"loss": 0.5759, "grad_norm": 0.9260450005531311, "learning_rate": 0.0002, "epoch": 3.179174147217235, "step": 44270}, {"loss": 0.6419, "grad_norm": 0.924053430557251, "learning_rate": 0.0002, "epoch": 3.1798922800718135, "step": 44280}, {"loss": 0.6456, "grad_norm": 1.001965045928955, "learning_rate": 0.0002, "epoch": 3.1806104129263915, "step": 44290}, {"loss": 0.6211, "grad_norm": 0.943215012550354, "learning_rate": 0.0002, "epoch": 3.1813285457809695, "step": 44300}, {"loss": 0.6261, "grad_norm": 1.006977915763855, "learning_rate": 0.0002, "epoch": 3.1820466786355475, "step": 44310}, {"loss": 0.6684, "grad_norm": 0.9768950343132019, "learning_rate": 0.0002, "epoch": 3.1827648114901255, "step": 44320}, {"loss": 0.6334, "grad_norm": 0.9297489523887634, "learning_rate": 0.0002, "epoch": 3.183482944344704, "step": 44330}, {"loss": 0.6291, "grad_norm": 0.9110919237136841, "learning_rate": 0.0002, "epoch": 3.184201077199282, "step": 44340}, {"loss": 0.6389, "grad_norm": 0.9821381568908691, "learning_rate": 0.0002, "epoch": 3.18491921005386, "step": 44350}, {"loss": 0.6342, "grad_norm": 0.8451243042945862, "learning_rate": 0.0002, "epoch": 3.185637342908438, "step": 44360}, {"loss": 0.6709, "grad_norm": 0.9676638245582581, "learning_rate": 0.0002, "epoch": 3.1863554757630164, "step": 44370}, {"loss": 0.6506, "grad_norm": 0.9826035499572754, "learning_rate": 0.0002, "epoch": 3.1870736086175944, "step": 44380}, {"loss": 0.6425, "grad_norm": 0.9453121423721313, "learning_rate": 0.0002, "epoch": 3.1877917414721724, "step": 44390}, {"loss": 0.6481, "grad_norm": 0.7766330242156982, "learning_rate": 0.0002, "epoch": 3.1885098743267504, "step": 44400}, {"loss": 0.6369, "grad_norm": 0.9302349090576172, "learning_rate": 0.0002, "epoch": 3.1892280071813284, "step": 44410}, {"loss": 0.5586, "grad_norm": 0.8335331082344055, "learning_rate": 0.0002, "epoch": 3.189946140035907, "step": 44420}, {"loss": 0.673, "grad_norm": 0.6722736358642578, "learning_rate": 0.0002, "epoch": 3.190664272890485, "step": 44430}, {"loss": 0.6809, "grad_norm": 0.9047536849975586, "learning_rate": 0.0002, "epoch": 3.191382405745063, "step": 44440}, {"loss": 0.6085, "grad_norm": 0.9653822183609009, "learning_rate": 0.0002, "epoch": 3.192100538599641, "step": 44450}, {"loss": 0.6071, "grad_norm": 0.7750703692436218, "learning_rate": 0.0002, "epoch": 3.192818671454219, "step": 44460}, {"loss": 0.6323, "grad_norm": 0.7767539024353027, "learning_rate": 0.0002, "epoch": 3.1935368043087973, "step": 44470}, {"loss": 0.6471, "grad_norm": 0.8597778081893921, "learning_rate": 0.0002, "epoch": 3.1942549371633753, "step": 44480}, {"loss": 0.6804, "grad_norm": 1.1711493730545044, "learning_rate": 0.0002, "epoch": 3.1949730700179533, "step": 44490}, {"loss": 0.5917, "grad_norm": 0.9025220274925232, "learning_rate": 0.0002, "epoch": 3.1956912028725313, "step": 44500}, {"loss": 0.6445, "grad_norm": 0.8084979057312012, "learning_rate": 0.0002, "epoch": 3.1964093357271093, "step": 44510}, {"loss": 0.5943, "grad_norm": 0.8475074172019958, "learning_rate": 0.0002, "epoch": 3.1971274685816877, "step": 44520}, {"loss": 0.5959, "grad_norm": 0.9915644526481628, "learning_rate": 0.0002, "epoch": 3.1978456014362657, "step": 44530}, {"loss": 0.627, "grad_norm": 0.992231547832489, "learning_rate": 0.0002, "epoch": 3.1985637342908437, "step": 44540}, {"loss": 0.625, "grad_norm": 0.9804556369781494, "learning_rate": 0.0002, "epoch": 3.1992818671454217, "step": 44550}, {"loss": 0.6534, "grad_norm": 1.045558214187622, "learning_rate": 0.0002, "epoch": 3.2, "step": 44560}, {"loss": 0.6201, "grad_norm": 1.0880261659622192, "learning_rate": 0.0002, "epoch": 3.200718132854578, "step": 44570}, {"loss": 0.6471, "grad_norm": 0.9511138200759888, "learning_rate": 0.0002, "epoch": 3.201436265709156, "step": 44580}, {"loss": 0.5961, "grad_norm": 0.9115344882011414, "learning_rate": 0.0002, "epoch": 3.202154398563734, "step": 44590}, {"loss": 0.6504, "grad_norm": 1.0738362073898315, "learning_rate": 0.0002, "epoch": 3.202872531418312, "step": 44600}, {"loss": 0.6324, "grad_norm": 0.8209697604179382, "learning_rate": 0.0002, "epoch": 3.2035906642728906, "step": 44610}, {"loss": 0.6445, "grad_norm": 0.9220197796821594, "learning_rate": 0.0002, "epoch": 3.2043087971274686, "step": 44620}, {"loss": 0.5798, "grad_norm": 0.8859700560569763, "learning_rate": 0.0002, "epoch": 3.2050269299820466, "step": 44630}, {"loss": 0.6185, "grad_norm": 0.9772757291793823, "learning_rate": 0.0002, "epoch": 3.2057450628366246, "step": 44640}, {"loss": 0.6528, "grad_norm": 0.9385574460029602, "learning_rate": 0.0002, "epoch": 3.206463195691203, "step": 44650}, {"loss": 0.6098, "grad_norm": 0.839958906173706, "learning_rate": 0.0002, "epoch": 3.207181328545781, "step": 44660}, {"loss": 0.6803, "grad_norm": 0.860478401184082, "learning_rate": 0.0002, "epoch": 3.207899461400359, "step": 44670}, {"loss": 0.683, "grad_norm": 0.846886396408081, "learning_rate": 0.0002, "epoch": 3.208617594254937, "step": 44680}, {"loss": 0.6312, "grad_norm": 0.8591006398200989, "learning_rate": 0.0002, "epoch": 3.209335727109515, "step": 44690}, {"loss": 0.6173, "grad_norm": 0.9236023426055908, "learning_rate": 0.0002, "epoch": 3.2100538599640935, "step": 44700}, {"loss": 0.6471, "grad_norm": 0.7348999977111816, "learning_rate": 0.0002, "epoch": 3.2107719928186715, "step": 44710}, {"loss": 0.6239, "grad_norm": 1.0041730403900146, "learning_rate": 0.0002, "epoch": 3.2114901256732495, "step": 44720}, {"loss": 0.6612, "grad_norm": 0.8382687568664551, "learning_rate": 0.0002, "epoch": 3.2122082585278275, "step": 44730}, {"loss": 0.6026, "grad_norm": 0.8253511190414429, "learning_rate": 0.0002, "epoch": 3.2129263913824055, "step": 44740}, {"loss": 0.6129, "grad_norm": 0.9589242935180664, "learning_rate": 0.0002, "epoch": 3.213644524236984, "step": 44750}, {"loss": 0.6476, "grad_norm": 0.8938157558441162, "learning_rate": 0.0002, "epoch": 3.214362657091562, "step": 44760}, {"loss": 0.6811, "grad_norm": 1.0085135698318481, "learning_rate": 0.0002, "epoch": 3.21508078994614, "step": 44770}, {"loss": 0.646, "grad_norm": 0.8647134304046631, "learning_rate": 0.0002, "epoch": 3.215798922800718, "step": 44780}, {"loss": 0.6169, "grad_norm": 1.09453284740448, "learning_rate": 0.0002, "epoch": 3.216517055655296, "step": 44790}, {"loss": 0.6156, "grad_norm": 0.8710666298866272, "learning_rate": 0.0002, "epoch": 3.2172351885098744, "step": 44800}, {"loss": 0.662, "grad_norm": 0.8080880641937256, "learning_rate": 0.0002, "epoch": 3.2179533213644524, "step": 44810}, {"loss": 0.6039, "grad_norm": 1.0440675020217896, "learning_rate": 0.0002, "epoch": 3.2186714542190304, "step": 44820}, {"loss": 0.6629, "grad_norm": 1.1036376953125, "learning_rate": 0.0002, "epoch": 3.2193895870736084, "step": 44830}, {"loss": 0.6474, "grad_norm": 0.8783546686172485, "learning_rate": 0.0002, "epoch": 3.220107719928187, "step": 44840}, {"loss": 0.6286, "grad_norm": 0.7816855907440186, "learning_rate": 0.0002, "epoch": 3.220825852782765, "step": 44850}, {"loss": 0.622, "grad_norm": 1.0099157094955444, "learning_rate": 0.0002, "epoch": 3.221543985637343, "step": 44860}, {"loss": 0.6668, "grad_norm": 1.054928183555603, "learning_rate": 0.0002, "epoch": 3.222262118491921, "step": 44870}, {"loss": 0.6104, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 3.222980251346499, "step": 44880}, {"loss": 0.686, "grad_norm": 0.9730798602104187, "learning_rate": 0.0002, "epoch": 3.2236983842010773, "step": 44890}, {"loss": 0.6533, "grad_norm": 0.7911382913589478, "learning_rate": 0.0002, "epoch": 3.2244165170556554, "step": 44900}, {"loss": 0.6466, "grad_norm": 0.9574400782585144, "learning_rate": 0.0002, "epoch": 3.2251346499102334, "step": 44910}, {"loss": 0.693, "grad_norm": 0.8101068139076233, "learning_rate": 0.0002, "epoch": 3.2258527827648114, "step": 44920}, {"loss": 0.6605, "grad_norm": 0.754146933555603, "learning_rate": 0.0002, "epoch": 3.22657091561939, "step": 44930}, {"loss": 0.6317, "grad_norm": 0.7471939921379089, "learning_rate": 0.0002, "epoch": 3.227289048473968, "step": 44940}, {"loss": 0.6378, "grad_norm": 1.0040855407714844, "learning_rate": 0.0002, "epoch": 3.228007181328546, "step": 44950}, {"loss": 0.6496, "grad_norm": 1.0016074180603027, "learning_rate": 0.0002, "epoch": 3.228725314183124, "step": 44960}, {"loss": 0.6, "grad_norm": 1.0432976484298706, "learning_rate": 0.0002, "epoch": 3.229443447037702, "step": 44970}, {"loss": 0.635, "grad_norm": 0.8517055511474609, "learning_rate": 0.0002, "epoch": 3.2301615798922803, "step": 44980}, {"loss": 0.6168, "grad_norm": 0.9174178242683411, "learning_rate": 0.0002, "epoch": 3.2308797127468583, "step": 44990}, {"loss": 0.6325, "grad_norm": 0.9733774065971375, "learning_rate": 0.0002, "epoch": 3.2315978456014363, "step": 45000}, {"loss": 0.6743, "grad_norm": 0.9074714779853821, "learning_rate": 0.0002, "epoch": 3.2323159784560143, "step": 45010}, {"loss": 0.6372, "grad_norm": 0.8802759051322937, "learning_rate": 0.0002, "epoch": 3.2330341113105923, "step": 45020}, {"loss": 0.6189, "grad_norm": 1.0620871782302856, "learning_rate": 0.0002, "epoch": 3.2337522441651707, "step": 45030}, {"loss": 0.6201, "grad_norm": 0.8069542050361633, "learning_rate": 0.0002, "epoch": 3.2344703770197487, "step": 45040}, {"loss": 0.618, "grad_norm": 0.9139137864112854, "learning_rate": 0.0002, "epoch": 3.2351885098743267, "step": 45050}, {"loss": 0.6389, "grad_norm": 0.8936411142349243, "learning_rate": 0.0002, "epoch": 3.2359066427289047, "step": 45060}, {"loss": 0.6602, "grad_norm": 0.9098079204559326, "learning_rate": 0.0002, "epoch": 3.2366247755834827, "step": 45070}, {"loss": 0.6423, "grad_norm": 1.062953233718872, "learning_rate": 0.0002, "epoch": 3.237342908438061, "step": 45080}, {"loss": 0.6527, "grad_norm": 0.8656470775604248, "learning_rate": 0.0002, "epoch": 3.238061041292639, "step": 45090}, {"loss": 0.6362, "grad_norm": 0.9299449920654297, "learning_rate": 0.0002, "epoch": 3.238779174147217, "step": 45100}, {"loss": 0.6469, "grad_norm": 1.0102022886276245, "learning_rate": 0.0002, "epoch": 3.239497307001795, "step": 45110}, {"loss": 0.5984, "grad_norm": 0.8074561953544617, "learning_rate": 0.0002, "epoch": 3.2402154398563736, "step": 45120}, {"loss": 0.6196, "grad_norm": 1.044105887413025, "learning_rate": 0.0002, "epoch": 3.2409335727109516, "step": 45130}, {"loss": 0.6471, "grad_norm": 0.8742762207984924, "learning_rate": 0.0002, "epoch": 3.2416517055655296, "step": 45140}, {"loss": 0.648, "grad_norm": 0.8240015506744385, "learning_rate": 0.0002, "epoch": 3.2423698384201076, "step": 45150}, {"loss": 0.6599, "grad_norm": 0.8438951373100281, "learning_rate": 0.0002, "epoch": 3.2430879712746856, "step": 45160}, {"loss": 0.6406, "grad_norm": 1.02358877658844, "learning_rate": 0.0002, "epoch": 3.243806104129264, "step": 45170}, {"loss": 0.6581, "grad_norm": 0.8824774026870728, "learning_rate": 0.0002, "epoch": 3.244524236983842, "step": 45180}, {"loss": 0.658, "grad_norm": 0.971015989780426, "learning_rate": 0.0002, "epoch": 3.24524236983842, "step": 45190}, {"loss": 0.6473, "grad_norm": 0.9282383918762207, "learning_rate": 0.0002, "epoch": 3.245960502692998, "step": 45200}, {"loss": 0.6376, "grad_norm": 0.7908362746238708, "learning_rate": 0.0002, "epoch": 3.2466786355475765, "step": 45210}, {"loss": 0.6765, "grad_norm": 1.0721662044525146, "learning_rate": 0.0002, "epoch": 3.2473967684021545, "step": 45220}, {"loss": 0.7102, "grad_norm": 0.9516810774803162, "learning_rate": 0.0002, "epoch": 3.2481149012567325, "step": 45230}, {"loss": 0.6332, "grad_norm": 0.7914131283760071, "learning_rate": 0.0002, "epoch": 3.2488330341113105, "step": 45240}, {"loss": 0.6018, "grad_norm": 0.8492292761802673, "learning_rate": 0.0002, "epoch": 3.2495511669658885, "step": 45250}, {"loss": 0.6272, "grad_norm": 0.8880114555358887, "learning_rate": 0.0002, "epoch": 3.250269299820467, "step": 45260}, {"loss": 0.6394, "grad_norm": 0.7808310985565186, "learning_rate": 0.0002, "epoch": 3.250987432675045, "step": 45270}, {"loss": 0.6161, "grad_norm": 0.8566828966140747, "learning_rate": 0.0002, "epoch": 3.251705565529623, "step": 45280}, {"loss": 0.6408, "grad_norm": 0.7929658889770508, "learning_rate": 0.0002, "epoch": 3.252423698384201, "step": 45290}, {"loss": 0.6182, "grad_norm": 0.678207516670227, "learning_rate": 0.0002, "epoch": 3.253141831238779, "step": 45300}, {"loss": 0.6315, "grad_norm": 0.9963029623031616, "learning_rate": 0.0002, "epoch": 3.2538599640933574, "step": 45310}, {"loss": 0.6496, "grad_norm": 0.835304856300354, "learning_rate": 0.0002, "epoch": 3.2545780969479354, "step": 45320}, {"loss": 0.6099, "grad_norm": 0.7281617522239685, "learning_rate": 0.0002, "epoch": 3.2552962298025134, "step": 45330}, {"loss": 0.6224, "grad_norm": 1.244890570640564, "learning_rate": 0.0002, "epoch": 3.2560143626570914, "step": 45340}, {"loss": 0.6317, "grad_norm": 0.8372750282287598, "learning_rate": 0.0002, "epoch": 3.2567324955116694, "step": 45350}, {"loss": 0.604, "grad_norm": 1.0029667615890503, "learning_rate": 0.0002, "epoch": 3.257450628366248, "step": 45360}, {"loss": 0.596, "grad_norm": 0.8561908602714539, "learning_rate": 0.0002, "epoch": 3.258168761220826, "step": 45370}, {"loss": 0.6185, "grad_norm": 1.0058085918426514, "learning_rate": 0.0002, "epoch": 3.258886894075404, "step": 45380}, {"loss": 0.6415, "grad_norm": 0.7768221497535706, "learning_rate": 0.0002, "epoch": 3.259605026929982, "step": 45390}, {"loss": 0.635, "grad_norm": 0.8443793058395386, "learning_rate": 0.0002, "epoch": 3.2603231597845603, "step": 45400}, {"loss": 0.6579, "grad_norm": 1.0140392780303955, "learning_rate": 0.0002, "epoch": 3.2610412926391383, "step": 45410}, {"loss": 0.6434, "grad_norm": 0.8397058248519897, "learning_rate": 0.0002, "epoch": 3.2617594254937163, "step": 45420}, {"loss": 0.6361, "grad_norm": 0.9717063903808594, "learning_rate": 0.0002, "epoch": 3.2624775583482943, "step": 45430}, {"loss": 0.6837, "grad_norm": 1.0279473066329956, "learning_rate": 0.0002, "epoch": 3.2631956912028723, "step": 45440}, {"loss": 0.6274, "grad_norm": 1.207457184791565, "learning_rate": 0.0002, "epoch": 3.263913824057451, "step": 45450}, {"loss": 0.681, "grad_norm": 0.8121998906135559, "learning_rate": 0.0002, "epoch": 3.264631956912029, "step": 45460}, {"loss": 0.6202, "grad_norm": 1.037733554840088, "learning_rate": 0.0002, "epoch": 3.265350089766607, "step": 45470}, {"loss": 0.6146, "grad_norm": 0.9305754899978638, "learning_rate": 0.0002, "epoch": 3.266068222621185, "step": 45480}, {"loss": 0.6186, "grad_norm": 0.9733602404594421, "learning_rate": 0.0002, "epoch": 3.2667863554757632, "step": 45490}, {"loss": 0.6713, "grad_norm": 0.8345039486885071, "learning_rate": 0.0002, "epoch": 3.2675044883303412, "step": 45500}, {"loss": 0.6315, "grad_norm": 0.8601692318916321, "learning_rate": 0.0002, "epoch": 3.2682226211849192, "step": 45510}, {"loss": 0.5953, "grad_norm": 0.7921277284622192, "learning_rate": 0.0002, "epoch": 3.2689407540394972, "step": 45520}, {"loss": 0.6781, "grad_norm": 0.8324153423309326, "learning_rate": 0.0002, "epoch": 3.2696588868940752, "step": 45530}, {"loss": 0.6413, "grad_norm": 0.85141521692276, "learning_rate": 0.0002, "epoch": 3.2703770197486537, "step": 45540}, {"loss": 0.654, "grad_norm": 0.9399608373641968, "learning_rate": 0.0002, "epoch": 3.2710951526032317, "step": 45550}, {"loss": 0.6364, "grad_norm": 0.9829166531562805, "learning_rate": 0.0002, "epoch": 3.2718132854578097, "step": 45560}, {"loss": 0.627, "grad_norm": 0.9936266541481018, "learning_rate": 0.0002, "epoch": 3.2725314183123877, "step": 45570}, {"loss": 0.6465, "grad_norm": 1.036165714263916, "learning_rate": 0.0002, "epoch": 3.2732495511669657, "step": 45580}, {"loss": 0.6216, "grad_norm": 0.8988680243492126, "learning_rate": 0.0002, "epoch": 3.273967684021544, "step": 45590}, {"loss": 0.6368, "grad_norm": 0.9173405766487122, "learning_rate": 0.0002, "epoch": 3.274685816876122, "step": 45600}, {"loss": 0.6455, "grad_norm": 0.9967324733734131, "learning_rate": 0.0002, "epoch": 3.2754039497307, "step": 45610}, {"loss": 0.6236, "grad_norm": 0.9097777009010315, "learning_rate": 0.0002, "epoch": 3.276122082585278, "step": 45620}, {"loss": 0.632, "grad_norm": 1.0559430122375488, "learning_rate": 0.0002, "epoch": 3.276840215439856, "step": 45630}, {"loss": 0.5999, "grad_norm": 0.9583360552787781, "learning_rate": 0.0002, "epoch": 3.2775583482944346, "step": 45640}, {"loss": 0.6329, "grad_norm": 0.7630334496498108, "learning_rate": 0.0002, "epoch": 3.2782764811490126, "step": 45650}, {"loss": 0.6873, "grad_norm": 0.9955230355262756, "learning_rate": 0.0002, "epoch": 3.2789946140035906, "step": 45660}, {"loss": 0.6216, "grad_norm": 0.8685793876647949, "learning_rate": 0.0002, "epoch": 3.2797127468581686, "step": 45670}, {"loss": 0.6243, "grad_norm": 0.919913113117218, "learning_rate": 0.0002, "epoch": 3.280430879712747, "step": 45680}, {"loss": 0.6334, "grad_norm": 0.826144814491272, "learning_rate": 0.0002, "epoch": 3.281149012567325, "step": 45690}, {"loss": 0.6359, "grad_norm": 0.9750179052352905, "learning_rate": 0.0002, "epoch": 3.281867145421903, "step": 45700}, {"loss": 0.6589, "grad_norm": 0.7931897640228271, "learning_rate": 0.0002, "epoch": 3.282585278276481, "step": 45710}, {"loss": 0.6785, "grad_norm": 1.0380089282989502, "learning_rate": 0.0002, "epoch": 3.283303411131059, "step": 45720}, {"loss": 0.6219, "grad_norm": 0.8220566511154175, "learning_rate": 0.0002, "epoch": 3.2840215439856375, "step": 45730}, {"loss": 0.5737, "grad_norm": 0.9688239693641663, "learning_rate": 0.0002, "epoch": 3.2847396768402155, "step": 45740}, {"loss": 0.603, "grad_norm": 0.8760311603546143, "learning_rate": 0.0002, "epoch": 3.2854578096947935, "step": 45750}, {"loss": 0.6134, "grad_norm": 0.8103382587432861, "learning_rate": 0.0002, "epoch": 3.2861759425493715, "step": 45760}, {"loss": 0.6475, "grad_norm": 0.8835865259170532, "learning_rate": 0.0002, "epoch": 3.28689407540395, "step": 45770}, {"loss": 0.6423, "grad_norm": 0.9021160006523132, "learning_rate": 0.0002, "epoch": 3.287612208258528, "step": 45780}, {"loss": 0.6693, "grad_norm": 0.8182386159896851, "learning_rate": 0.0002, "epoch": 3.288330341113106, "step": 45790}, {"loss": 0.6408, "grad_norm": 0.8555024862289429, "learning_rate": 0.0002, "epoch": 3.289048473967684, "step": 45800}, {"loss": 0.6839, "grad_norm": 1.0982348918914795, "learning_rate": 0.0002, "epoch": 3.289766606822262, "step": 45810}, {"loss": 0.6323, "grad_norm": 1.06246817111969, "learning_rate": 0.0002, "epoch": 3.2904847396768404, "step": 45820}, {"loss": 0.5924, "grad_norm": 1.1727149486541748, "learning_rate": 0.0002, "epoch": 3.2912028725314184, "step": 45830}, {"loss": 0.624, "grad_norm": 0.8224700093269348, "learning_rate": 0.0002, "epoch": 3.2919210053859964, "step": 45840}, {"loss": 0.6445, "grad_norm": 0.8195698261260986, "learning_rate": 0.0002, "epoch": 3.2926391382405744, "step": 45850}, {"loss": 0.6106, "grad_norm": 0.8424476981163025, "learning_rate": 0.0002, "epoch": 3.2933572710951524, "step": 45860}, {"loss": 0.6705, "grad_norm": 0.9804632067680359, "learning_rate": 0.0002, "epoch": 3.294075403949731, "step": 45870}, {"loss": 0.6538, "grad_norm": 0.8701804876327515, "learning_rate": 0.0002, "epoch": 3.294793536804309, "step": 45880}, {"loss": 0.6264, "grad_norm": 0.8876864910125732, "learning_rate": 0.0002, "epoch": 3.295511669658887, "step": 45890}, {"loss": 0.6401, "grad_norm": 1.0105448961257935, "learning_rate": 0.0002, "epoch": 3.296229802513465, "step": 45900}, {"loss": 0.687, "grad_norm": 0.847017228603363, "learning_rate": 0.0002, "epoch": 3.296947935368043, "step": 45910}, {"loss": 0.6433, "grad_norm": 0.7610297799110413, "learning_rate": 0.0002, "epoch": 3.2976660682226213, "step": 45920}, {"loss": 0.6499, "grad_norm": 0.7272670269012451, "learning_rate": 0.0002, "epoch": 3.2983842010771993, "step": 45930}, {"loss": 0.6366, "grad_norm": 0.8243510127067566, "learning_rate": 0.0002, "epoch": 3.2991023339317773, "step": 45940}, {"loss": 0.6498, "grad_norm": 1.0113074779510498, "learning_rate": 0.0002, "epoch": 3.2998204667863553, "step": 45950}, {"loss": 0.6639, "grad_norm": 0.8578087687492371, "learning_rate": 0.0002, "epoch": 3.3005385996409338, "step": 45960}, {"loss": 0.6137, "grad_norm": 0.9511606097221375, "learning_rate": 0.0002, "epoch": 3.3012567324955118, "step": 45970}, {"loss": 0.6115, "grad_norm": 0.8612566590309143, "learning_rate": 0.0002, "epoch": 3.3019748653500898, "step": 45980}, {"loss": 0.6799, "grad_norm": 0.8702331185340881, "learning_rate": 0.0002, "epoch": 3.3026929982046678, "step": 45990}, {"loss": 0.6429, "grad_norm": 1.0229583978652954, "learning_rate": 0.0002, "epoch": 3.3034111310592458, "step": 46000}, {"loss": 0.6054, "grad_norm": 1.1775577068328857, "learning_rate": 0.0002, "epoch": 3.304129263913824, "step": 46010}, {"loss": 0.6958, "grad_norm": 0.9922171831130981, "learning_rate": 0.0002, "epoch": 3.3048473967684022, "step": 46020}, {"loss": 0.6642, "grad_norm": 0.8246880769729614, "learning_rate": 0.0002, "epoch": 3.3055655296229802, "step": 46030}, {"loss": 0.678, "grad_norm": 0.9351653456687927, "learning_rate": 0.0002, "epoch": 3.3062836624775582, "step": 46040}, {"loss": 0.649, "grad_norm": 0.9617429375648499, "learning_rate": 0.0002, "epoch": 3.3070017953321367, "step": 46050}, {"loss": 0.6314, "grad_norm": 0.9753885269165039, "learning_rate": 0.0002, "epoch": 3.3077199281867147, "step": 46060}, {"loss": 0.6434, "grad_norm": 0.8532425165176392, "learning_rate": 0.0002, "epoch": 3.3084380610412927, "step": 46070}, {"loss": 0.6312, "grad_norm": 0.9722012877464294, "learning_rate": 0.0002, "epoch": 3.3091561938958707, "step": 46080}, {"loss": 0.6629, "grad_norm": 0.8950021266937256, "learning_rate": 0.0002, "epoch": 3.3098743267504487, "step": 46090}, {"loss": 0.6278, "grad_norm": 0.8536333441734314, "learning_rate": 0.0002, "epoch": 3.3105924596050267, "step": 46100}, {"loss": 0.6359, "grad_norm": 0.9423946738243103, "learning_rate": 0.0002, "epoch": 3.311310592459605, "step": 46110}, {"loss": 0.6647, "grad_norm": 0.8573169112205505, "learning_rate": 0.0002, "epoch": 3.312028725314183, "step": 46120}, {"loss": 0.6127, "grad_norm": 1.0122376680374146, "learning_rate": 0.0002, "epoch": 3.312746858168761, "step": 46130}, {"loss": 0.6782, "grad_norm": 0.7492560744285583, "learning_rate": 0.0002, "epoch": 3.313464991023339, "step": 46140}, {"loss": 0.6315, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 3.3141831238779176, "step": 46150}, {"loss": 0.6051, "grad_norm": 1.1191970109939575, "learning_rate": 0.0002, "epoch": 3.3149012567324956, "step": 46160}, {"loss": 0.6247, "grad_norm": 0.9847373962402344, "learning_rate": 0.0002, "epoch": 3.3156193895870736, "step": 46170}, {"loss": 0.661, "grad_norm": 0.7315911054611206, "learning_rate": 0.0002, "epoch": 3.3163375224416516, "step": 46180}, {"loss": 0.6017, "grad_norm": 0.8267890214920044, "learning_rate": 0.0002, "epoch": 3.3170556552962296, "step": 46190}, {"loss": 0.6202, "grad_norm": 0.8898099064826965, "learning_rate": 0.0002, "epoch": 3.317773788150808, "step": 46200}, {"loss": 0.651, "grad_norm": 0.8525369167327881, "learning_rate": 0.0002, "epoch": 3.318491921005386, "step": 46210}, {"loss": 0.6705, "grad_norm": 0.8074760437011719, "learning_rate": 0.0002, "epoch": 3.319210053859964, "step": 46220}, {"loss": 0.641, "grad_norm": 0.8473616242408752, "learning_rate": 0.0002, "epoch": 3.319928186714542, "step": 46230}, {"loss": 0.6092, "grad_norm": 0.8678314089775085, "learning_rate": 0.0002, "epoch": 3.3206463195691205, "step": 46240}, {"loss": 0.655, "grad_norm": 0.8718782067298889, "learning_rate": 0.0002, "epoch": 3.3213644524236985, "step": 46250}, {"loss": 0.6266, "grad_norm": 0.9384858012199402, "learning_rate": 0.0002, "epoch": 3.3220825852782765, "step": 46260}, {"loss": 0.6393, "grad_norm": 0.9295032620429993, "learning_rate": 0.0002, "epoch": 3.3228007181328545, "step": 46270}, {"loss": 0.6824, "grad_norm": 0.9472482800483704, "learning_rate": 0.0002, "epoch": 3.3235188509874325, "step": 46280}, {"loss": 0.6177, "grad_norm": 0.7970638275146484, "learning_rate": 0.0002, "epoch": 3.324236983842011, "step": 46290}, {"loss": 0.6431, "grad_norm": 0.9508723020553589, "learning_rate": 0.0002, "epoch": 3.324955116696589, "step": 46300}, {"loss": 0.6126, "grad_norm": 0.9153636693954468, "learning_rate": 0.0002, "epoch": 3.325673249551167, "step": 46310}, {"loss": 0.6042, "grad_norm": 0.7890323400497437, "learning_rate": 0.0002, "epoch": 3.326391382405745, "step": 46320}, {"loss": 0.6525, "grad_norm": 0.8711825609207153, "learning_rate": 0.0002, "epoch": 3.3271095152603234, "step": 46330}, {"loss": 0.6253, "grad_norm": 0.9938926696777344, "learning_rate": 0.0002, "epoch": 3.3278276481149014, "step": 46340}, {"loss": 0.6227, "grad_norm": 0.8497524857521057, "learning_rate": 0.0002, "epoch": 3.3285457809694794, "step": 46350}, {"loss": 0.6472, "grad_norm": 0.9191650748252869, "learning_rate": 0.0002, "epoch": 3.3292639138240574, "step": 46360}, {"loss": 0.6385, "grad_norm": 0.8974085450172424, "learning_rate": 0.0002, "epoch": 3.3299820466786354, "step": 46370}, {"loss": 0.618, "grad_norm": 0.9928934574127197, "learning_rate": 0.0002, "epoch": 3.3307001795332134, "step": 46380}, {"loss": 0.6254, "grad_norm": 0.9011030197143555, "learning_rate": 0.0002, "epoch": 3.331418312387792, "step": 46390}, {"loss": 0.6146, "grad_norm": 0.898594856262207, "learning_rate": 0.0002, "epoch": 3.33213644524237, "step": 46400}, {"loss": 0.6321, "grad_norm": 0.7506672143936157, "learning_rate": 0.0002, "epoch": 3.332854578096948, "step": 46410}, {"loss": 0.6329, "grad_norm": 0.9239172339439392, "learning_rate": 0.0002, "epoch": 3.333572710951526, "step": 46420}, {"loss": 0.6278, "grad_norm": 1.0749682188034058, "learning_rate": 0.0002, "epoch": 3.3342908438061043, "step": 46430}, {"loss": 0.6568, "grad_norm": 0.9262617230415344, "learning_rate": 0.0002, "epoch": 3.3350089766606823, "step": 46440}, {"loss": 0.6034, "grad_norm": 0.8681274056434631, "learning_rate": 0.0002, "epoch": 3.3357271095152603, "step": 46450}, {"loss": 0.6261, "grad_norm": 0.9558620452880859, "learning_rate": 0.0002, "epoch": 3.3364452423698383, "step": 46460}, {"loss": 0.6087, "grad_norm": 0.8907097578048706, "learning_rate": 0.0002, "epoch": 3.3371633752244163, "step": 46470}, {"loss": 0.6356, "grad_norm": 1.0941565036773682, "learning_rate": 0.0002, "epoch": 3.3378815080789948, "step": 46480}, {"loss": 0.6536, "grad_norm": 0.8971590995788574, "learning_rate": 0.0002, "epoch": 3.3385996409335728, "step": 46490}, {"loss": 0.6252, "grad_norm": 1.0315606594085693, "learning_rate": 0.0002, "epoch": 3.3393177737881508, "step": 46500}, {"loss": 0.5819, "grad_norm": 0.7717124223709106, "learning_rate": 0.0002, "epoch": 3.3400359066427288, "step": 46510}, {"loss": 0.612, "grad_norm": 0.8060970902442932, "learning_rate": 0.0002, "epoch": 3.340754039497307, "step": 46520}, {"loss": 0.7036, "grad_norm": 0.969510018825531, "learning_rate": 0.0002, "epoch": 3.341472172351885, "step": 46530}, {"loss": 0.6163, "grad_norm": 0.8837248682975769, "learning_rate": 0.0002, "epoch": 3.342190305206463, "step": 46540}, {"loss": 0.6762, "grad_norm": 0.9561076164245605, "learning_rate": 0.0002, "epoch": 3.342908438061041, "step": 46550}, {"loss": 0.687, "grad_norm": 0.8529208898544312, "learning_rate": 0.0002, "epoch": 3.343626570915619, "step": 46560}, {"loss": 0.611, "grad_norm": 1.1300519704818726, "learning_rate": 0.0002, "epoch": 3.3443447037701977, "step": 46570}, {"loss": 0.6088, "grad_norm": 0.8330956101417542, "learning_rate": 0.0002, "epoch": 3.3450628366247757, "step": 46580}, {"loss": 0.6725, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 3.3457809694793537, "step": 46590}, {"loss": 0.6667, "grad_norm": 1.0470821857452393, "learning_rate": 0.0002, "epoch": 3.3464991023339317, "step": 46600}, {"loss": 0.6408, "grad_norm": 0.9933704137802124, "learning_rate": 0.0002, "epoch": 3.34721723518851, "step": 46610}, {"loss": 0.6416, "grad_norm": 0.8130798935890198, "learning_rate": 0.0002, "epoch": 3.347935368043088, "step": 46620}, {"loss": 0.6576, "grad_norm": 0.9746946692466736, "learning_rate": 0.0002, "epoch": 3.348653500897666, "step": 46630}, {"loss": 0.6254, "grad_norm": 0.8607267141342163, "learning_rate": 0.0002, "epoch": 3.349371633752244, "step": 46640}, {"loss": 0.6639, "grad_norm": 0.800335705280304, "learning_rate": 0.0002, "epoch": 3.350089766606822, "step": 46650}, {"loss": 0.6749, "grad_norm": 1.0083239078521729, "learning_rate": 0.0002, "epoch": 3.3508078994614, "step": 46660}, {"loss": 0.6606, "grad_norm": 1.0774433612823486, "learning_rate": 0.0002, "epoch": 3.3515260323159786, "step": 46670}, {"loss": 0.6408, "grad_norm": 0.9378824234008789, "learning_rate": 0.0002, "epoch": 3.3522441651705566, "step": 46680}, {"loss": 0.5879, "grad_norm": 0.8490564227104187, "learning_rate": 0.0002, "epoch": 3.3529622980251346, "step": 46690}, {"loss": 0.6364, "grad_norm": 1.0415582656860352, "learning_rate": 0.0002, "epoch": 3.3536804308797126, "step": 46700}, {"loss": 0.5813, "grad_norm": 0.8514367938041687, "learning_rate": 0.0002, "epoch": 3.354398563734291, "step": 46710}, {"loss": 0.6847, "grad_norm": 0.7691360712051392, "learning_rate": 0.0002, "epoch": 3.355116696588869, "step": 46720}, {"loss": 0.6295, "grad_norm": 0.8345438241958618, "learning_rate": 0.0002, "epoch": 3.355834829443447, "step": 46730}, {"loss": 0.6093, "grad_norm": 1.023492693901062, "learning_rate": 0.0002, "epoch": 3.356552962298025, "step": 46740}, {"loss": 0.5997, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 3.357271095152603, "step": 46750}, {"loss": 0.6379, "grad_norm": 0.9029248356819153, "learning_rate": 0.0002, "epoch": 3.3579892280071815, "step": 46760}, {"loss": 0.6551, "grad_norm": 0.9109513759613037, "learning_rate": 0.0002, "epoch": 3.3587073608617595, "step": 46770}, {"loss": 0.6616, "grad_norm": 0.7757390141487122, "learning_rate": 0.0002, "epoch": 3.3594254937163375, "step": 46780}, {"loss": 0.6088, "grad_norm": 0.794035792350769, "learning_rate": 0.0002, "epoch": 3.3601436265709155, "step": 46790}, {"loss": 0.6405, "grad_norm": 0.8211429715156555, "learning_rate": 0.0002, "epoch": 3.360861759425494, "step": 46800}, {"loss": 0.6359, "grad_norm": 0.8620322346687317, "learning_rate": 0.0002, "epoch": 3.361579892280072, "step": 46810}, {"loss": 0.6357, "grad_norm": 0.9392538070678711, "learning_rate": 0.0002, "epoch": 3.36229802513465, "step": 46820}, {"loss": 0.6225, "grad_norm": 0.8297873139381409, "learning_rate": 0.0002, "epoch": 3.363016157989228, "step": 46830}, {"loss": 0.639, "grad_norm": 0.9158190488815308, "learning_rate": 0.0002, "epoch": 3.363734290843806, "step": 46840}, {"loss": 0.6168, "grad_norm": 1.1449424028396606, "learning_rate": 0.0002, "epoch": 3.3644524236983844, "step": 46850}, {"loss": 0.6413, "grad_norm": 0.8718444108963013, "learning_rate": 0.0002, "epoch": 3.3651705565529624, "step": 46860}, {"loss": 0.624, "grad_norm": 0.7744014263153076, "learning_rate": 0.0002, "epoch": 3.3658886894075404, "step": 46870}, {"loss": 0.6238, "grad_norm": 0.8392460942268372, "learning_rate": 0.0002, "epoch": 3.3666068222621184, "step": 46880}, {"loss": 0.6753, "grad_norm": 1.0424989461898804, "learning_rate": 0.0002, "epoch": 3.367324955116697, "step": 46890}, {"loss": 0.6038, "grad_norm": 1.4696359634399414, "learning_rate": 0.0002, "epoch": 3.368043087971275, "step": 46900}, {"loss": 0.6525, "grad_norm": 0.9298201203346252, "learning_rate": 0.0002, "epoch": 3.368761220825853, "step": 46910}, {"loss": 0.6351, "grad_norm": 0.8965262770652771, "learning_rate": 0.0002, "epoch": 3.369479353680431, "step": 46920}, {"loss": 0.6505, "grad_norm": 0.9395381808280945, "learning_rate": 0.0002, "epoch": 3.370197486535009, "step": 46930}, {"loss": 0.6161, "grad_norm": 0.9069047570228577, "learning_rate": 0.0002, "epoch": 3.370915619389587, "step": 46940}, {"loss": 0.6576, "grad_norm": 0.9208605885505676, "learning_rate": 0.0002, "epoch": 3.3716337522441653, "step": 46950}, {"loss": 0.6456, "grad_norm": 0.9493077397346497, "learning_rate": 0.0002, "epoch": 3.3723518850987433, "step": 46960}, {"loss": 0.6609, "grad_norm": 1.0804208517074585, "learning_rate": 0.0002, "epoch": 3.3730700179533213, "step": 46970}, {"loss": 0.6267, "grad_norm": 0.9465714693069458, "learning_rate": 0.0002, "epoch": 3.3737881508078993, "step": 46980}, {"loss": 0.6633, "grad_norm": 0.9189882278442383, "learning_rate": 0.0002, "epoch": 3.3745062836624777, "step": 46990}, {"loss": 0.6518, "grad_norm": 1.0199357271194458, "learning_rate": 0.0002, "epoch": 3.3752244165170557, "step": 47000}, {"loss": 0.6645, "grad_norm": 0.8999426960945129, "learning_rate": 0.0002, "epoch": 3.3759425493716337, "step": 47010}, {"loss": 0.637, "grad_norm": 0.8923690319061279, "learning_rate": 0.0002, "epoch": 3.3766606822262117, "step": 47020}, {"loss": 0.6543, "grad_norm": 0.7459347248077393, "learning_rate": 0.0002, "epoch": 3.3773788150807897, "step": 47030}, {"loss": 0.6269, "grad_norm": 0.7702858448028564, "learning_rate": 0.0002, "epoch": 3.378096947935368, "step": 47040}, {"loss": 0.6399, "grad_norm": 0.8296625018119812, "learning_rate": 0.0002, "epoch": 3.378815080789946, "step": 47050}, {"loss": 0.6552, "grad_norm": 1.2952555418014526, "learning_rate": 0.0002, "epoch": 3.379533213644524, "step": 47060}, {"loss": 0.6264, "grad_norm": 0.7778869271278381, "learning_rate": 0.0002, "epoch": 3.380251346499102, "step": 47070}, {"loss": 0.6906, "grad_norm": 0.9151549339294434, "learning_rate": 0.0002, "epoch": 3.3809694793536806, "step": 47080}, {"loss": 0.6443, "grad_norm": 0.7883925437927246, "learning_rate": 0.0002, "epoch": 3.3816876122082586, "step": 47090}, {"loss": 0.6124, "grad_norm": 0.9602295756340027, "learning_rate": 0.0002, "epoch": 3.3824057450628366, "step": 47100}, {"loss": 0.651, "grad_norm": 0.7953121066093445, "learning_rate": 0.0002, "epoch": 3.3831238779174146, "step": 47110}, {"loss": 0.638, "grad_norm": 1.110148549079895, "learning_rate": 0.0002, "epoch": 3.3838420107719926, "step": 47120}, {"loss": 0.6386, "grad_norm": 0.9359608888626099, "learning_rate": 0.0002, "epoch": 3.384560143626571, "step": 47130}, {"loss": 0.6075, "grad_norm": 0.7877762317657471, "learning_rate": 0.0002, "epoch": 3.385278276481149, "step": 47140}, {"loss": 0.6657, "grad_norm": 0.8586933016777039, "learning_rate": 0.0002, "epoch": 3.385996409335727, "step": 47150}, {"loss": 0.6438, "grad_norm": 0.8920878767967224, "learning_rate": 0.0002, "epoch": 3.386714542190305, "step": 47160}, {"loss": 0.6584, "grad_norm": 0.9692603349685669, "learning_rate": 0.0002, "epoch": 3.3874326750448835, "step": 47170}, {"loss": 0.6643, "grad_norm": 0.9038610458374023, "learning_rate": 0.0002, "epoch": 3.3881508078994615, "step": 47180}, {"loss": 0.6002, "grad_norm": 1.6299188137054443, "learning_rate": 0.0002, "epoch": 3.3888689407540395, "step": 47190}, {"loss": 0.6423, "grad_norm": 0.9704291820526123, "learning_rate": 0.0002, "epoch": 3.3895870736086176, "step": 47200}, {"loss": 0.6808, "grad_norm": 0.9503401517868042, "learning_rate": 0.0002, "epoch": 3.3903052064631956, "step": 47210}, {"loss": 0.6871, "grad_norm": 1.0051378011703491, "learning_rate": 0.0002, "epoch": 3.3910233393177736, "step": 47220}, {"loss": 0.6207, "grad_norm": 0.7336357235908508, "learning_rate": 0.0002, "epoch": 3.391741472172352, "step": 47230}, {"loss": 0.6688, "grad_norm": 0.9847398996353149, "learning_rate": 0.0002, "epoch": 3.39245960502693, "step": 47240}, {"loss": 0.6305, "grad_norm": 0.8100917339324951, "learning_rate": 0.0002, "epoch": 3.393177737881508, "step": 47250}, {"loss": 0.6418, "grad_norm": 0.9752838611602783, "learning_rate": 0.0002, "epoch": 3.393895870736086, "step": 47260}, {"loss": 0.6237, "grad_norm": 0.9400623440742493, "learning_rate": 0.0002, "epoch": 3.3946140035906645, "step": 47270}, {"loss": 0.6321, "grad_norm": 0.7310057878494263, "learning_rate": 0.0002, "epoch": 3.3953321364452425, "step": 47280}, {"loss": 0.6209, "grad_norm": 0.8898789286613464, "learning_rate": 0.0002, "epoch": 3.3960502692998205, "step": 47290}, {"loss": 0.6496, "grad_norm": 1.0157585144042969, "learning_rate": 0.0002, "epoch": 3.3967684021543985, "step": 47300}, {"loss": 0.6497, "grad_norm": 0.9108527898788452, "learning_rate": 0.0002, "epoch": 3.3974865350089765, "step": 47310}, {"loss": 0.5928, "grad_norm": 0.9796249270439148, "learning_rate": 0.0002, "epoch": 3.398204667863555, "step": 47320}, {"loss": 0.6169, "grad_norm": 0.8176435232162476, "learning_rate": 0.0002, "epoch": 3.398922800718133, "step": 47330}, {"loss": 0.6279, "grad_norm": 0.9981188178062439, "learning_rate": 0.0002, "epoch": 3.399640933572711, "step": 47340}, {"loss": 0.6657, "grad_norm": 0.9774404764175415, "learning_rate": 0.0002, "epoch": 3.400359066427289, "step": 47350}, {"loss": 0.68, "grad_norm": 0.8624991774559021, "learning_rate": 0.0002, "epoch": 3.4010771992818674, "step": 47360}, {"loss": 0.6597, "grad_norm": 0.9191665053367615, "learning_rate": 0.0002, "epoch": 3.4017953321364454, "step": 47370}, {"loss": 0.6249, "grad_norm": 0.7971290946006775, "learning_rate": 0.0002, "epoch": 3.4025134649910234, "step": 47380}, {"loss": 0.617, "grad_norm": 0.8336732983589172, "learning_rate": 0.0002, "epoch": 3.4032315978456014, "step": 47390}, {"loss": 0.6435, "grad_norm": 0.7730334401130676, "learning_rate": 0.0002, "epoch": 3.4039497307001794, "step": 47400}, {"loss": 0.6348, "grad_norm": 0.8559145927429199, "learning_rate": 0.0002, "epoch": 3.404667863554758, "step": 47410}, {"loss": 0.6466, "grad_norm": 1.0261447429656982, "learning_rate": 0.0002, "epoch": 3.405385996409336, "step": 47420}, {"loss": 0.6556, "grad_norm": 0.9931781888008118, "learning_rate": 0.0002, "epoch": 3.406104129263914, "step": 47430}, {"loss": 0.6226, "grad_norm": 0.8971807360649109, "learning_rate": 0.0002, "epoch": 3.406822262118492, "step": 47440}, {"loss": 0.656, "grad_norm": 0.8886999487876892, "learning_rate": 0.0002, "epoch": 3.4075403949730703, "step": 47450}, {"loss": 0.6256, "grad_norm": 0.9551735520362854, "learning_rate": 0.0002, "epoch": 3.4082585278276483, "step": 47460}, {"loss": 0.6646, "grad_norm": 0.9066859483718872, "learning_rate": 0.0002, "epoch": 3.4089766606822263, "step": 47470}, {"loss": 0.6655, "grad_norm": 0.9192125201225281, "learning_rate": 0.0002, "epoch": 3.4096947935368043, "step": 47480}, {"loss": 0.6197, "grad_norm": 0.9332839250564575, "learning_rate": 0.0002, "epoch": 3.4104129263913823, "step": 47490}, {"loss": 0.6134, "grad_norm": 0.745563805103302, "learning_rate": 0.0002, "epoch": 3.4111310592459603, "step": 47500}, {"loss": 0.6206, "grad_norm": 0.6843905448913574, "learning_rate": 0.0002, "epoch": 3.4118491921005387, "step": 47510}, {"loss": 0.6742, "grad_norm": 0.8063111305236816, "learning_rate": 0.0002, "epoch": 3.4125673249551167, "step": 47520}, {"loss": 0.6138, "grad_norm": 0.9666593670845032, "learning_rate": 0.0002, "epoch": 3.4132854578096947, "step": 47530}, {"loss": 0.635, "grad_norm": 0.8112747073173523, "learning_rate": 0.0002, "epoch": 3.4140035906642727, "step": 47540}, {"loss": 0.6225, "grad_norm": 0.820807933807373, "learning_rate": 0.0002, "epoch": 3.414721723518851, "step": 47550}, {"loss": 0.6262, "grad_norm": 0.8476285338401794, "learning_rate": 0.0002, "epoch": 3.415439856373429, "step": 47560}, {"loss": 0.6134, "grad_norm": 1.0232552289962769, "learning_rate": 0.0002, "epoch": 3.416157989228007, "step": 47570}, {"loss": 0.604, "grad_norm": 0.8749372363090515, "learning_rate": 0.0002, "epoch": 3.416876122082585, "step": 47580}, {"loss": 0.6463, "grad_norm": 0.8117937445640564, "learning_rate": 0.0002, "epoch": 3.417594254937163, "step": 47590}, {"loss": 0.623, "grad_norm": 0.9010460376739502, "learning_rate": 0.0002, "epoch": 3.4183123877917416, "step": 47600}, {"loss": 0.6676, "grad_norm": 0.8955527544021606, "learning_rate": 0.0002, "epoch": 3.4190305206463196, "step": 47610}, {"loss": 0.6424, "grad_norm": 0.884186327457428, "learning_rate": 0.0002, "epoch": 3.4197486535008976, "step": 47620}, {"loss": 0.6377, "grad_norm": 0.8995241522789001, "learning_rate": 0.0002, "epoch": 3.4204667863554756, "step": 47630}, {"loss": 0.651, "grad_norm": 1.0627013444900513, "learning_rate": 0.0002, "epoch": 3.421184919210054, "step": 47640}, {"loss": 0.6338, "grad_norm": 0.8619979619979858, "learning_rate": 0.0002, "epoch": 3.421903052064632, "step": 47650}, {"loss": 0.6483, "grad_norm": 0.9682498574256897, "learning_rate": 0.0002, "epoch": 3.42262118491921, "step": 47660}, {"loss": 0.6006, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "epoch": 3.423339317773788, "step": 47670}, {"loss": 0.6088, "grad_norm": 0.7986962795257568, "learning_rate": 0.0002, "epoch": 3.424057450628366, "step": 47680}, {"loss": 0.6056, "grad_norm": 0.8255957961082458, "learning_rate": 0.0002, "epoch": 3.4247755834829445, "step": 47690}, {"loss": 0.663, "grad_norm": 0.9139757752418518, "learning_rate": 0.0002, "epoch": 3.4254937163375225, "step": 47700}, {"loss": 0.61, "grad_norm": 0.8086292743682861, "learning_rate": 0.0002, "epoch": 3.4262118491921005, "step": 47710}, {"loss": 0.6604, "grad_norm": 0.8852273225784302, "learning_rate": 0.0002, "epoch": 3.4269299820466785, "step": 47720}, {"loss": 0.6168, "grad_norm": 0.7568784356117249, "learning_rate": 0.0002, "epoch": 3.427648114901257, "step": 47730}, {"loss": 0.6559, "grad_norm": 0.8933039903640747, "learning_rate": 0.0002, "epoch": 3.428366247755835, "step": 47740}, {"loss": 0.6406, "grad_norm": 0.8101669549942017, "learning_rate": 0.0002, "epoch": 3.429084380610413, "step": 47750}, {"loss": 0.6287, "grad_norm": 0.7021054625511169, "learning_rate": 0.0002, "epoch": 3.429802513464991, "step": 47760}, {"loss": 0.6159, "grad_norm": 0.8282538652420044, "learning_rate": 0.0002, "epoch": 3.430520646319569, "step": 47770}, {"loss": 0.6439, "grad_norm": 0.8168348670005798, "learning_rate": 0.0002, "epoch": 3.431238779174147, "step": 47780}, {"loss": 0.6265, "grad_norm": 0.9504001140594482, "learning_rate": 0.0002, "epoch": 3.4319569120287254, "step": 47790}, {"loss": 0.6688, "grad_norm": 0.7500190734863281, "learning_rate": 0.0002, "epoch": 3.4326750448833034, "step": 47800}, {"loss": 0.6818, "grad_norm": 0.8645710945129395, "learning_rate": 0.0002, "epoch": 3.4333931777378814, "step": 47810}, {"loss": 0.6268, "grad_norm": 0.8088704943656921, "learning_rate": 0.0002, "epoch": 3.4341113105924594, "step": 47820}, {"loss": 0.6795, "grad_norm": 0.9981673955917358, "learning_rate": 0.0002, "epoch": 3.434829443447038, "step": 47830}, {"loss": 0.6615, "grad_norm": 0.9363315105438232, "learning_rate": 0.0002, "epoch": 3.435547576301616, "step": 47840}, {"loss": 0.6028, "grad_norm": 0.8471030592918396, "learning_rate": 0.0002, "epoch": 3.436265709156194, "step": 47850}, {"loss": 0.6658, "grad_norm": 0.9447668790817261, "learning_rate": 0.0002, "epoch": 3.436983842010772, "step": 47860}, {"loss": 0.6511, "grad_norm": 0.9494127631187439, "learning_rate": 0.0002, "epoch": 3.43770197486535, "step": 47870}, {"loss": 0.6134, "grad_norm": 0.8340432643890381, "learning_rate": 0.0002, "epoch": 3.4384201077199283, "step": 47880}, {"loss": 0.6731, "grad_norm": 0.8466387987136841, "learning_rate": 0.0002, "epoch": 3.4391382405745063, "step": 47890}, {"loss": 0.6552, "grad_norm": 0.9498962759971619, "learning_rate": 0.0002, "epoch": 3.4398563734290843, "step": 47900}, {"loss": 0.6593, "grad_norm": 0.8490501046180725, "learning_rate": 0.0002, "epoch": 3.4405745062836623, "step": 47910}, {"loss": 0.6038, "grad_norm": 0.9506490230560303, "learning_rate": 0.0002, "epoch": 3.441292639138241, "step": 47920}, {"loss": 0.6317, "grad_norm": 0.7944257855415344, "learning_rate": 0.0002, "epoch": 3.442010771992819, "step": 47930}, {"loss": 0.6193, "grad_norm": 0.9725518226623535, "learning_rate": 0.0002, "epoch": 3.442728904847397, "step": 47940}, {"loss": 0.635, "grad_norm": 0.7823024392127991, "learning_rate": 0.0002, "epoch": 3.443447037701975, "step": 47950}, {"loss": 0.6221, "grad_norm": 0.810565173625946, "learning_rate": 0.0002, "epoch": 3.444165170556553, "step": 47960}, {"loss": 0.6519, "grad_norm": 0.9809024333953857, "learning_rate": 0.0002, "epoch": 3.4448833034111312, "step": 47970}, {"loss": 0.6441, "grad_norm": 0.8818578720092773, "learning_rate": 0.0002, "epoch": 3.4456014362657092, "step": 47980}, {"loss": 0.6452, "grad_norm": 0.9843092560768127, "learning_rate": 0.0002, "epoch": 3.4463195691202873, "step": 47990}, {"loss": 0.6076, "grad_norm": 0.916313886642456, "learning_rate": 0.0002, "epoch": 3.4470377019748653, "step": 48000}, {"loss": 0.6399, "grad_norm": 0.908442497253418, "learning_rate": 0.0002, "epoch": 3.4477558348294433, "step": 48010}, {"loss": 0.6263, "grad_norm": 0.9880178570747375, "learning_rate": 0.0002, "epoch": 3.4484739676840217, "step": 48020}, {"loss": 0.6802, "grad_norm": 0.9276854991912842, "learning_rate": 0.0002, "epoch": 3.4491921005385997, "step": 48030}, {"loss": 0.6522, "grad_norm": 1.0879448652267456, "learning_rate": 0.0002, "epoch": 3.4499102333931777, "step": 48040}, {"loss": 0.6362, "grad_norm": 0.7430389523506165, "learning_rate": 0.0002, "epoch": 3.4506283662477557, "step": 48050}, {"loss": 0.6064, "grad_norm": 1.0880072116851807, "learning_rate": 0.0002, "epoch": 3.4513464991023337, "step": 48060}, {"loss": 0.6152, "grad_norm": 1.0424141883850098, "learning_rate": 0.0002, "epoch": 3.452064631956912, "step": 48070}, {"loss": 0.6485, "grad_norm": 0.926330029964447, "learning_rate": 0.0002, "epoch": 3.45278276481149, "step": 48080}, {"loss": 0.6261, "grad_norm": 0.8911219239234924, "learning_rate": 0.0002, "epoch": 3.453500897666068, "step": 48090}, {"loss": 0.6883, "grad_norm": 0.8727201223373413, "learning_rate": 0.0002, "epoch": 3.454219030520646, "step": 48100}, {"loss": 0.6473, "grad_norm": 0.8573940396308899, "learning_rate": 0.0002, "epoch": 3.4549371633752246, "step": 48110}, {"loss": 0.6645, "grad_norm": 1.0427064895629883, "learning_rate": 0.0002, "epoch": 3.4556552962298026, "step": 48120}, {"loss": 0.6489, "grad_norm": 0.8688231706619263, "learning_rate": 0.0002, "epoch": 3.4563734290843806, "step": 48130}, {"loss": 0.5947, "grad_norm": 0.8856009244918823, "learning_rate": 0.0002, "epoch": 3.4570915619389586, "step": 48140}, {"loss": 0.6482, "grad_norm": 0.9535353183746338, "learning_rate": 0.0002, "epoch": 3.4578096947935366, "step": 48150}, {"loss": 0.6435, "grad_norm": 0.9466010928153992, "learning_rate": 0.0002, "epoch": 3.458527827648115, "step": 48160}, {"loss": 0.6231, "grad_norm": 0.9783535599708557, "learning_rate": 0.0002, "epoch": 3.459245960502693, "step": 48170}, {"loss": 0.6926, "grad_norm": 0.8010456562042236, "learning_rate": 0.0002, "epoch": 3.459964093357271, "step": 48180}, {"loss": 0.6141, "grad_norm": 0.8928955793380737, "learning_rate": 0.0002, "epoch": 3.460682226211849, "step": 48190}, {"loss": 0.6699, "grad_norm": 0.7565838694572449, "learning_rate": 0.0002, "epoch": 3.4614003590664275, "step": 48200}, {"loss": 0.6218, "grad_norm": 1.0044180154800415, "learning_rate": 0.0002, "epoch": 3.4621184919210055, "step": 48210}, {"loss": 0.6182, "grad_norm": 0.8161038160324097, "learning_rate": 0.0002, "epoch": 3.4628366247755835, "step": 48220}, {"loss": 0.6869, "grad_norm": 1.1000211238861084, "learning_rate": 0.0002, "epoch": 3.4635547576301615, "step": 48230}, {"loss": 0.7141, "grad_norm": 0.7942240238189697, "learning_rate": 0.0002, "epoch": 3.4642728904847395, "step": 48240}, {"loss": 0.6247, "grad_norm": 0.7546432018280029, "learning_rate": 0.0002, "epoch": 3.464991023339318, "step": 48250}, {"loss": 0.6319, "grad_norm": 0.7705255150794983, "learning_rate": 0.0002, "epoch": 3.465709156193896, "step": 48260}, {"loss": 0.6414, "grad_norm": 0.7958067059516907, "learning_rate": 0.0002, "epoch": 3.466427289048474, "step": 48270}, {"loss": 0.6526, "grad_norm": 0.9199120402336121, "learning_rate": 0.0002, "epoch": 3.467145421903052, "step": 48280}, {"loss": 0.6476, "grad_norm": 1.118672251701355, "learning_rate": 0.0002, "epoch": 3.46786355475763, "step": 48290}, {"loss": 0.6543, "grad_norm": 0.9161015748977661, "learning_rate": 0.0002, "epoch": 3.4685816876122084, "step": 48300}, {"loss": 0.6767, "grad_norm": 1.1086218357086182, "learning_rate": 0.0002, "epoch": 3.4692998204667864, "step": 48310}, {"loss": 0.5917, "grad_norm": 1.0123368501663208, "learning_rate": 0.0002, "epoch": 3.4700179533213644, "step": 48320}, {"loss": 0.6277, "grad_norm": 0.7380602359771729, "learning_rate": 0.0002, "epoch": 3.4707360861759424, "step": 48330}, {"loss": 0.6407, "grad_norm": 0.8967105150222778, "learning_rate": 0.0002, "epoch": 3.4714542190305204, "step": 48340}, {"loss": 0.6526, "grad_norm": 1.0134044885635376, "learning_rate": 0.0002, "epoch": 3.472172351885099, "step": 48350}, {"loss": 0.6436, "grad_norm": 1.080815076828003, "learning_rate": 0.0002, "epoch": 3.472890484739677, "step": 48360}, {"loss": 0.6644, "grad_norm": 1.151721477508545, "learning_rate": 0.0002, "epoch": 3.473608617594255, "step": 48370}, {"loss": 0.6612, "grad_norm": 0.9436505436897278, "learning_rate": 0.0002, "epoch": 3.474326750448833, "step": 48380}, {"loss": 0.6503, "grad_norm": 0.9154609441757202, "learning_rate": 0.0002, "epoch": 3.4750448833034113, "step": 48390}, {"loss": 0.6151, "grad_norm": 0.8943037986755371, "learning_rate": 0.0002, "epoch": 3.4757630161579893, "step": 48400}, {"loss": 0.6316, "grad_norm": 0.936988115310669, "learning_rate": 0.0002, "epoch": 3.4764811490125673, "step": 48410}, {"loss": 0.6638, "grad_norm": 0.826960027217865, "learning_rate": 0.0002, "epoch": 3.4771992818671453, "step": 48420}, {"loss": 0.6242, "grad_norm": 1.0487587451934814, "learning_rate": 0.0002, "epoch": 3.4779174147217233, "step": 48430}, {"loss": 0.6302, "grad_norm": 0.729163646697998, "learning_rate": 0.0002, "epoch": 3.478635547576302, "step": 48440}, {"loss": 0.6115, "grad_norm": 0.8156948089599609, "learning_rate": 0.0002, "epoch": 3.47935368043088, "step": 48450}, {"loss": 0.6455, "grad_norm": 0.8004332184791565, "learning_rate": 0.0002, "epoch": 3.480071813285458, "step": 48460}, {"loss": 0.621, "grad_norm": 0.9632692337036133, "learning_rate": 0.0002, "epoch": 3.480789946140036, "step": 48470}, {"loss": 0.6214, "grad_norm": 1.0950212478637695, "learning_rate": 0.0002, "epoch": 3.4815080789946142, "step": 48480}, {"loss": 0.6659, "grad_norm": 0.8574318885803223, "learning_rate": 0.0002, "epoch": 3.4822262118491922, "step": 48490}, {"loss": 0.6969, "grad_norm": 0.8552606701850891, "learning_rate": 0.0002, "epoch": 3.4829443447037702, "step": 48500}, {"loss": 0.6253, "grad_norm": 0.9698445200920105, "learning_rate": 0.0002, "epoch": 3.4836624775583482, "step": 48510}, {"loss": 0.6844, "grad_norm": 0.9427815675735474, "learning_rate": 0.0002, "epoch": 3.4843806104129262, "step": 48520}, {"loss": 0.6722, "grad_norm": 0.7902070879936218, "learning_rate": 0.0002, "epoch": 3.4850987432675042, "step": 48530}, {"loss": 0.6708, "grad_norm": 1.0300066471099854, "learning_rate": 0.0002, "epoch": 3.4858168761220827, "step": 48540}, {"loss": 0.6113, "grad_norm": 1.1688778400421143, "learning_rate": 0.0002, "epoch": 3.4865350089766607, "step": 48550}, {"loss": 0.5956, "grad_norm": 1.0012071132659912, "learning_rate": 0.0002, "epoch": 3.4872531418312387, "step": 48560}, {"loss": 0.6536, "grad_norm": 1.112094759941101, "learning_rate": 0.0002, "epoch": 3.4879712746858167, "step": 48570}, {"loss": 0.6625, "grad_norm": 0.8547284603118896, "learning_rate": 0.0002, "epoch": 3.488689407540395, "step": 48580}, {"loss": 0.6488, "grad_norm": 0.8827278017997742, "learning_rate": 0.0002, "epoch": 3.489407540394973, "step": 48590}, {"loss": 0.6437, "grad_norm": 0.9255490303039551, "learning_rate": 0.0002, "epoch": 3.490125673249551, "step": 48600}, {"loss": 0.6089, "grad_norm": 0.8000030517578125, "learning_rate": 0.0002, "epoch": 3.490843806104129, "step": 48610}, {"loss": 0.647, "grad_norm": 0.9327391386032104, "learning_rate": 0.0002, "epoch": 3.491561938958707, "step": 48620}, {"loss": 0.6678, "grad_norm": 0.9004138708114624, "learning_rate": 0.0002, "epoch": 3.4922800718132856, "step": 48630}, {"loss": 0.6145, "grad_norm": 0.9886971116065979, "learning_rate": 0.0002, "epoch": 3.4929982046678636, "step": 48640}, {"loss": 0.6309, "grad_norm": 0.9890487194061279, "learning_rate": 0.0002, "epoch": 3.4937163375224416, "step": 48650}, {"loss": 0.655, "grad_norm": 0.7024438977241516, "learning_rate": 0.0002, "epoch": 3.4944344703770196, "step": 48660}, {"loss": 0.6313, "grad_norm": 0.8397303223609924, "learning_rate": 0.0002, "epoch": 3.495152603231598, "step": 48670}, {"loss": 0.6429, "grad_norm": 0.9120950698852539, "learning_rate": 0.0002, "epoch": 3.495870736086176, "step": 48680}, {"loss": 0.631, "grad_norm": 1.057299017906189, "learning_rate": 0.0002, "epoch": 3.496588868940754, "step": 48690}, {"loss": 0.6459, "grad_norm": 0.821325957775116, "learning_rate": 0.0002, "epoch": 3.497307001795332, "step": 48700}, {"loss": 0.6174, "grad_norm": 1.0029970407485962, "learning_rate": 0.0002, "epoch": 3.49802513464991, "step": 48710}, {"loss": 0.6374, "grad_norm": 0.9483712911605835, "learning_rate": 0.0002, "epoch": 3.4987432675044885, "step": 48720}, {"loss": 0.6472, "grad_norm": 0.9637855291366577, "learning_rate": 0.0002, "epoch": 3.4994614003590665, "step": 48730}, {"loss": 0.6639, "grad_norm": 0.6848894357681274, "learning_rate": 0.0002, "epoch": 3.5001795332136445, "step": 48740}, {"loss": 0.6129, "grad_norm": 0.7848573327064514, "learning_rate": 0.0002, "epoch": 3.5008976660682225, "step": 48750}, {"loss": 0.6306, "grad_norm": 1.0341308116912842, "learning_rate": 0.0002, "epoch": 3.501615798922801, "step": 48760}, {"loss": 0.6063, "grad_norm": 0.8858218193054199, "learning_rate": 0.0002, "epoch": 3.502333931777379, "step": 48770}, {"loss": 0.6729, "grad_norm": 0.8366939425468445, "learning_rate": 0.0002, "epoch": 3.503052064631957, "step": 48780}, {"loss": 0.6736, "grad_norm": 0.7926092147827148, "learning_rate": 0.0002, "epoch": 3.503770197486535, "step": 48790}, {"loss": 0.6279, "grad_norm": 0.8503843545913696, "learning_rate": 0.0002, "epoch": 3.504488330341113, "step": 48800}, {"loss": 0.6162, "grad_norm": 0.8867869973182678, "learning_rate": 0.0002, "epoch": 3.505206463195691, "step": 48810}, {"loss": 0.6987, "grad_norm": 1.0336930751800537, "learning_rate": 0.0002, "epoch": 3.5059245960502694, "step": 48820}, {"loss": 0.6333, "grad_norm": 0.8564051985740662, "learning_rate": 0.0002, "epoch": 3.5066427289048474, "step": 48830}, {"loss": 0.6574, "grad_norm": 0.9202605485916138, "learning_rate": 0.0002, "epoch": 3.5073608617594254, "step": 48840}, {"loss": 0.6457, "grad_norm": 0.8838639855384827, "learning_rate": 0.0002, "epoch": 3.508078994614004, "step": 48850}, {"loss": 0.631, "grad_norm": 0.8975196480751038, "learning_rate": 0.0002, "epoch": 3.508797127468582, "step": 48860}, {"loss": 0.6335, "grad_norm": 0.8842370510101318, "learning_rate": 0.0002, "epoch": 3.50951526032316, "step": 48870}, {"loss": 0.6569, "grad_norm": 0.9195886254310608, "learning_rate": 0.0002, "epoch": 3.510233393177738, "step": 48880}, {"loss": 0.6647, "grad_norm": 0.986130952835083, "learning_rate": 0.0002, "epoch": 3.510951526032316, "step": 48890}, {"loss": 0.6676, "grad_norm": 0.8119593858718872, "learning_rate": 0.0002, "epoch": 3.511669658886894, "step": 48900}, {"loss": 0.653, "grad_norm": 0.9027136564254761, "learning_rate": 0.0002, "epoch": 3.5123877917414723, "step": 48910}, {"loss": 0.6731, "grad_norm": 0.8560537099838257, "learning_rate": 0.0002, "epoch": 3.5131059245960503, "step": 48920}, {"loss": 0.7032, "grad_norm": 0.7073559165000916, "learning_rate": 0.0002, "epoch": 3.5138240574506283, "step": 48930}, {"loss": 0.6738, "grad_norm": 0.8753304481506348, "learning_rate": 0.0002, "epoch": 3.5145421903052063, "step": 48940}, {"loss": 0.6366, "grad_norm": 0.9151145815849304, "learning_rate": 0.0002, "epoch": 3.5152603231597848, "step": 48950}, {"loss": 0.6135, "grad_norm": 0.7794315814971924, "learning_rate": 0.0002, "epoch": 3.5159784560143628, "step": 48960}, {"loss": 0.658, "grad_norm": 0.9226023554801941, "learning_rate": 0.0002, "epoch": 3.5166965888689408, "step": 48970}, {"loss": 0.6473, "grad_norm": 0.8442051410675049, "learning_rate": 0.0002, "epoch": 3.5174147217235188, "step": 48980}, {"loss": 0.6267, "grad_norm": 0.9769423007965088, "learning_rate": 0.0002, "epoch": 3.5181328545780968, "step": 48990}, {"loss": 0.6333, "grad_norm": 0.740347146987915, "learning_rate": 0.0002, "epoch": 3.5188509874326748, "step": 49000}, {"loss": 0.6652, "grad_norm": 0.8963457345962524, "learning_rate": 0.0002, "epoch": 3.519569120287253, "step": 49010}, {"loss": 0.6782, "grad_norm": 0.8410176634788513, "learning_rate": 0.0002, "epoch": 3.520287253141831, "step": 49020}, {"loss": 0.6496, "grad_norm": 1.0486022233963013, "learning_rate": 0.0002, "epoch": 3.521005385996409, "step": 49030}, {"loss": 0.6275, "grad_norm": 0.95393967628479, "learning_rate": 0.0002, "epoch": 3.5217235188509877, "step": 49040}, {"loss": 0.6328, "grad_norm": 0.8261157274246216, "learning_rate": 0.0002, "epoch": 3.5224416517055657, "step": 49050}, {"loss": 0.6441, "grad_norm": 0.9321704506874084, "learning_rate": 0.0002, "epoch": 3.5231597845601437, "step": 49060}, {"loss": 0.6202, "grad_norm": 1.2596088647842407, "learning_rate": 0.0002, "epoch": 3.5238779174147217, "step": 49070}, {"loss": 0.6596, "grad_norm": 0.8584637641906738, "learning_rate": 0.0002, "epoch": 3.5245960502692997, "step": 49080}, {"loss": 0.6708, "grad_norm": 0.850520670413971, "learning_rate": 0.0002, "epoch": 3.5253141831238777, "step": 49090}, {"loss": 0.6543, "grad_norm": 0.8915920257568359, "learning_rate": 0.0002, "epoch": 3.526032315978456, "step": 49100}, {"loss": 0.6558, "grad_norm": 0.9070239067077637, "learning_rate": 0.0002, "epoch": 3.526750448833034, "step": 49110}, {"loss": 0.6128, "grad_norm": 0.699878990650177, "learning_rate": 0.0002, "epoch": 3.527468581687612, "step": 49120}, {"loss": 0.6454, "grad_norm": 0.9003779888153076, "learning_rate": 0.0002, "epoch": 3.5281867145421906, "step": 49130}, {"loss": 0.6177, "grad_norm": 0.7886711955070496, "learning_rate": 0.0002, "epoch": 3.5289048473967686, "step": 49140}, {"loss": 0.6499, "grad_norm": 0.7368922233581543, "learning_rate": 0.0002, "epoch": 3.5296229802513466, "step": 49150}, {"loss": 0.6382, "grad_norm": 0.8585197329521179, "learning_rate": 0.0002, "epoch": 3.5303411131059246, "step": 49160}, {"loss": 0.6761, "grad_norm": 1.0205435752868652, "learning_rate": 0.0002, "epoch": 3.5310592459605026, "step": 49170}, {"loss": 0.6544, "grad_norm": 0.8756650686264038, "learning_rate": 0.0002, "epoch": 3.5317773788150806, "step": 49180}, {"loss": 0.6592, "grad_norm": 1.0278643369674683, "learning_rate": 0.0002, "epoch": 3.532495511669659, "step": 49190}, {"loss": 0.6682, "grad_norm": 0.8641911745071411, "learning_rate": 0.0002, "epoch": 3.533213644524237, "step": 49200}, {"loss": 0.6531, "grad_norm": 0.8730159401893616, "learning_rate": 0.0002, "epoch": 3.533931777378815, "step": 49210}, {"loss": 0.636, "grad_norm": 0.918637216091156, "learning_rate": 0.0002, "epoch": 3.534649910233393, "step": 49220}, {"loss": 0.6815, "grad_norm": 1.0467222929000854, "learning_rate": 0.0002, "epoch": 3.5353680430879715, "step": 49230}, {"loss": 0.6554, "grad_norm": 1.005009412765503, "learning_rate": 0.0002, "epoch": 3.5360861759425495, "step": 49240}, {"loss": 0.649, "grad_norm": 0.9775063395500183, "learning_rate": 0.0002, "epoch": 3.5368043087971275, "step": 49250}, {"loss": 0.6527, "grad_norm": 0.8198322057723999, "learning_rate": 0.0002, "epoch": 3.5375224416517055, "step": 49260}, {"loss": 0.664, "grad_norm": 0.8184829354286194, "learning_rate": 0.0002, "epoch": 3.5382405745062835, "step": 49270}, {"loss": 0.6493, "grad_norm": 0.9520270824432373, "learning_rate": 0.0002, "epoch": 3.5389587073608615, "step": 49280}, {"loss": 0.5935, "grad_norm": 0.7816803455352783, "learning_rate": 0.0002, "epoch": 3.53967684021544, "step": 49290}, {"loss": 0.6424, "grad_norm": 0.6915702819824219, "learning_rate": 0.0002, "epoch": 3.540394973070018, "step": 49300}, {"loss": 0.6447, "grad_norm": 0.8282375931739807, "learning_rate": 0.0002, "epoch": 3.541113105924596, "step": 49310}, {"loss": 0.6164, "grad_norm": 1.0797513723373413, "learning_rate": 0.0002, "epoch": 3.5418312387791744, "step": 49320}, {"loss": 0.6836, "grad_norm": 0.868671715259552, "learning_rate": 0.0002, "epoch": 3.5425493716337524, "step": 49330}, {"loss": 0.6453, "grad_norm": 0.8534455895423889, "learning_rate": 0.0002, "epoch": 3.5432675044883304, "step": 49340}, {"loss": 0.6706, "grad_norm": 0.816411554813385, "learning_rate": 0.0002, "epoch": 3.5439856373429084, "step": 49350}, {"loss": 0.6101, "grad_norm": 0.7813423275947571, "learning_rate": 0.0002, "epoch": 3.5447037701974864, "step": 49360}, {"loss": 0.6617, "grad_norm": 0.8002013564109802, "learning_rate": 0.0002, "epoch": 3.5454219030520644, "step": 49370}, {"loss": 0.6667, "grad_norm": 0.9740113615989685, "learning_rate": 0.0002, "epoch": 3.546140035906643, "step": 49380}, {"loss": 0.6938, "grad_norm": 0.9046127200126648, "learning_rate": 0.0002, "epoch": 3.546858168761221, "step": 49390}, {"loss": 0.6444, "grad_norm": 0.8635150194168091, "learning_rate": 0.0002, "epoch": 3.547576301615799, "step": 49400}, {"loss": 0.6273, "grad_norm": 0.9488558769226074, "learning_rate": 0.0002, "epoch": 3.5482944344703773, "step": 49410}, {"loss": 0.6542, "grad_norm": 0.9637090563774109, "learning_rate": 0.0002, "epoch": 3.5490125673249553, "step": 49420}, {"loss": 0.6468, "grad_norm": 1.042245626449585, "learning_rate": 0.0002, "epoch": 3.5497307001795333, "step": 49430}, {"loss": 0.6999, "grad_norm": 0.9076175689697266, "learning_rate": 0.0002, "epoch": 3.5504488330341113, "step": 49440}, {"loss": 0.6192, "grad_norm": 0.8480596542358398, "learning_rate": 0.0002, "epoch": 3.5511669658886893, "step": 49450}, {"loss": 0.6835, "grad_norm": 0.8483007550239563, "learning_rate": 0.0002, "epoch": 3.5518850987432673, "step": 49460}, {"loss": 0.6607, "grad_norm": 0.7855815887451172, "learning_rate": 0.0002, "epoch": 3.5526032315978457, "step": 49470}, {"loss": 0.6364, "grad_norm": 0.8435823917388916, "learning_rate": 0.0002, "epoch": 3.5533213644524237, "step": 49480}, {"loss": 0.6674, "grad_norm": 0.8613026142120361, "learning_rate": 0.0002, "epoch": 3.5540394973070017, "step": 49490}, {"loss": 0.6651, "grad_norm": 0.9654812812805176, "learning_rate": 0.0002, "epoch": 3.5547576301615798, "step": 49500}, {"loss": 0.6471, "grad_norm": 0.8888838887214661, "learning_rate": 0.0002, "epoch": 3.555475763016158, "step": 49510}, {"loss": 0.622, "grad_norm": 0.7718146443367004, "learning_rate": 0.0002, "epoch": 3.556193895870736, "step": 49520}, {"loss": 0.6297, "grad_norm": 0.9487382173538208, "learning_rate": 0.0002, "epoch": 3.556912028725314, "step": 49530}, {"loss": 0.6516, "grad_norm": 0.9256559610366821, "learning_rate": 0.0002, "epoch": 3.557630161579892, "step": 49540}, {"loss": 0.6461, "grad_norm": 0.8879945874214172, "learning_rate": 0.0002, "epoch": 3.55834829443447, "step": 49550}, {"loss": 0.6367, "grad_norm": 0.8498744368553162, "learning_rate": 0.0002, "epoch": 3.559066427289048, "step": 49560}, {"loss": 0.6274, "grad_norm": 0.9550948143005371, "learning_rate": 0.0002, "epoch": 3.5597845601436267, "step": 49570}, {"loss": 0.635, "grad_norm": 0.8386164903640747, "learning_rate": 0.0002, "epoch": 3.5605026929982047, "step": 49580}, {"loss": 0.6495, "grad_norm": 0.925573468208313, "learning_rate": 0.0002, "epoch": 3.5612208258527827, "step": 49590}, {"loss": 0.676, "grad_norm": 0.8867112398147583, "learning_rate": 0.0002, "epoch": 3.561938958707361, "step": 49600}, {"loss": 0.6156, "grad_norm": 0.7638537883758545, "learning_rate": 0.0002, "epoch": 3.562657091561939, "step": 49610}, {"loss": 0.6597, "grad_norm": 0.9491845965385437, "learning_rate": 0.0002, "epoch": 3.563375224416517, "step": 49620}, {"loss": 0.6237, "grad_norm": 0.8384189605712891, "learning_rate": 0.0002, "epoch": 3.564093357271095, "step": 49630}, {"loss": 0.6102, "grad_norm": 0.8850575089454651, "learning_rate": 0.0002, "epoch": 3.564811490125673, "step": 49640}, {"loss": 0.6517, "grad_norm": 1.020916223526001, "learning_rate": 0.0002, "epoch": 3.565529622980251, "step": 49650}, {"loss": 0.6569, "grad_norm": 0.9298280477523804, "learning_rate": 0.0002, "epoch": 3.5662477558348296, "step": 49660}, {"loss": 0.6094, "grad_norm": 0.9795742034912109, "learning_rate": 0.0002, "epoch": 3.5669658886894076, "step": 49670}, {"loss": 0.6147, "grad_norm": 0.9401193261146545, "learning_rate": 0.0002, "epoch": 3.5676840215439856, "step": 49680}, {"loss": 0.622, "grad_norm": 1.0383585691452026, "learning_rate": 0.0002, "epoch": 3.568402154398564, "step": 49690}, {"loss": 0.6304, "grad_norm": 0.8370866179466248, "learning_rate": 0.0002, "epoch": 3.569120287253142, "step": 49700}, {"loss": 0.6356, "grad_norm": 0.8207486271858215, "learning_rate": 0.0002, "epoch": 3.56983842010772, "step": 49710}, {"loss": 0.6328, "grad_norm": 0.8551223278045654, "learning_rate": 0.0002, "epoch": 3.570556552962298, "step": 49720}, {"loss": 0.621, "grad_norm": 0.8041176199913025, "learning_rate": 0.0002, "epoch": 3.571274685816876, "step": 49730}, {"loss": 0.5818, "grad_norm": 0.9862527847290039, "learning_rate": 0.0002, "epoch": 3.571992818671454, "step": 49740}, {"loss": 0.6448, "grad_norm": 0.7557165622711182, "learning_rate": 0.0002, "epoch": 3.5727109515260325, "step": 49750}, {"loss": 0.6484, "grad_norm": 1.0908563137054443, "learning_rate": 0.0002, "epoch": 3.5734290843806105, "step": 49760}, {"loss": 0.6497, "grad_norm": 0.7245369553565979, "learning_rate": 0.0002, "epoch": 3.5741472172351885, "step": 49770}, {"loss": 0.6315, "grad_norm": 0.7851184010505676, "learning_rate": 0.0002, "epoch": 3.5748653500897665, "step": 49780}, {"loss": 0.6245, "grad_norm": 0.9443599581718445, "learning_rate": 0.0002, "epoch": 3.575583482944345, "step": 49790}, {"loss": 0.6481, "grad_norm": 1.021196961402893, "learning_rate": 0.0002, "epoch": 3.576301615798923, "step": 49800}, {"loss": 0.6368, "grad_norm": 0.9099196195602417, "learning_rate": 0.0002, "epoch": 3.577019748653501, "step": 49810}, {"loss": 0.6372, "grad_norm": 0.9397716522216797, "learning_rate": 0.0002, "epoch": 3.577737881508079, "step": 49820}, {"loss": 0.6208, "grad_norm": 0.9214922785758972, "learning_rate": 0.0002, "epoch": 3.578456014362657, "step": 49830}, {"loss": 0.6219, "grad_norm": 1.0053879022598267, "learning_rate": 0.0002, "epoch": 3.579174147217235, "step": 49840}, {"loss": 0.6283, "grad_norm": 0.9415460228919983, "learning_rate": 0.0002, "epoch": 3.5798922800718134, "step": 49850}, {"loss": 0.6759, "grad_norm": 1.0807833671569824, "learning_rate": 0.0002, "epoch": 3.5806104129263914, "step": 49860}, {"loss": 0.6404, "grad_norm": 1.0070871114730835, "learning_rate": 0.0002, "epoch": 3.5813285457809694, "step": 49870}, {"loss": 0.6411, "grad_norm": 0.9707024693489075, "learning_rate": 0.0002, "epoch": 3.582046678635548, "step": 49880}, {"loss": 0.6852, "grad_norm": 0.9979593753814697, "learning_rate": 0.0002, "epoch": 3.582764811490126, "step": 49890}, {"loss": 0.6519, "grad_norm": 0.7238648533821106, "learning_rate": 0.0002, "epoch": 3.583482944344704, "step": 49900}, {"loss": 0.6452, "grad_norm": 0.8168631792068481, "learning_rate": 0.0002, "epoch": 3.584201077199282, "step": 49910}, {"loss": 0.6174, "grad_norm": 0.8156409859657288, "learning_rate": 0.0002, "epoch": 3.58491921005386, "step": 49920}, {"loss": 0.6248, "grad_norm": 0.9256414175033569, "learning_rate": 0.0002, "epoch": 3.585637342908438, "step": 49930}, {"loss": 0.6077, "grad_norm": 1.0090070962905884, "learning_rate": 0.0002, "epoch": 3.5863554757630163, "step": 49940}, {"loss": 0.6016, "grad_norm": 0.8257701992988586, "learning_rate": 0.0002, "epoch": 3.5870736086175943, "step": 49950}, {"loss": 0.6996, "grad_norm": 0.9189013242721558, "learning_rate": 0.0002, "epoch": 3.5877917414721723, "step": 49960}, {"loss": 0.661, "grad_norm": 0.8497788310050964, "learning_rate": 0.0002, "epoch": 3.5885098743267507, "step": 49970}, {"loss": 0.6335, "grad_norm": 0.9596505761146545, "learning_rate": 0.0002, "epoch": 3.5892280071813287, "step": 49980}, {"loss": 0.697, "grad_norm": 0.8773331642150879, "learning_rate": 0.0002, "epoch": 3.5899461400359067, "step": 49990}, {"loss": 0.6259, "grad_norm": 0.8952302932739258, "learning_rate": 0.0002, "epoch": 3.5906642728904847, "step": 50000}, {"loss": 0.6152, "grad_norm": 0.7713809609413147, "learning_rate": 0.0002, "epoch": 3.5913824057450627, "step": 50010}, {"loss": 0.6127, "grad_norm": 1.0151346921920776, "learning_rate": 0.0002, "epoch": 3.5921005385996407, "step": 50020}, {"loss": 0.6093, "grad_norm": 0.8793733716011047, "learning_rate": 0.0002, "epoch": 3.592818671454219, "step": 50030}, {"loss": 0.5986, "grad_norm": 0.8881325721740723, "learning_rate": 0.0002, "epoch": 3.593536804308797, "step": 50040}, {"loss": 0.6351, "grad_norm": 0.9346749782562256, "learning_rate": 0.0002, "epoch": 3.594254937163375, "step": 50050}, {"loss": 0.6501, "grad_norm": 0.8705052137374878, "learning_rate": 0.0002, "epoch": 3.594973070017953, "step": 50060}, {"loss": 0.6753, "grad_norm": 1.039197564125061, "learning_rate": 0.0002, "epoch": 3.5956912028725316, "step": 50070}, {"loss": 0.6565, "grad_norm": 0.7053273320198059, "learning_rate": 0.0002, "epoch": 3.5964093357271096, "step": 50080}, {"loss": 0.6546, "grad_norm": 0.8268665671348572, "learning_rate": 0.0002, "epoch": 3.5971274685816876, "step": 50090}, {"loss": 0.6637, "grad_norm": 0.8921764492988586, "learning_rate": 0.0002, "epoch": 3.5978456014362656, "step": 50100}, {"loss": 0.6827, "grad_norm": 0.9756084680557251, "learning_rate": 0.0002, "epoch": 3.5985637342908436, "step": 50110}, {"loss": 0.6746, "grad_norm": 0.9275530576705933, "learning_rate": 0.0002, "epoch": 3.5992818671454216, "step": 50120}, {"loss": 0.6709, "grad_norm": 0.9030009508132935, "learning_rate": 0.0002, "epoch": 3.6, "step": 50130}, {"loss": 0.6344, "grad_norm": 0.7805638909339905, "learning_rate": 0.0002, "epoch": 3.600718132854578, "step": 50140}, {"loss": 0.6437, "grad_norm": 0.7627325057983398, "learning_rate": 0.0002, "epoch": 3.601436265709156, "step": 50150}, {"loss": 0.6523, "grad_norm": 0.7809714078903198, "learning_rate": 0.0002, "epoch": 3.6021543985637345, "step": 50160}, {"loss": 0.6578, "grad_norm": 0.7910378575325012, "learning_rate": 0.0002, "epoch": 3.6028725314183125, "step": 50170}, {"loss": 0.6522, "grad_norm": 1.004438042640686, "learning_rate": 0.0002, "epoch": 3.6035906642728905, "step": 50180}, {"loss": 0.6657, "grad_norm": 0.825969934463501, "learning_rate": 0.0002, "epoch": 3.6043087971274685, "step": 50190}, {"loss": 0.6788, "grad_norm": 0.8866565227508545, "learning_rate": 0.0002, "epoch": 3.6050269299820465, "step": 50200}, {"loss": 0.6643, "grad_norm": 0.8920543193817139, "learning_rate": 0.0002, "epoch": 3.6057450628366245, "step": 50210}, {"loss": 0.668, "grad_norm": 1.106584906578064, "learning_rate": 0.0002, "epoch": 3.606463195691203, "step": 50220}, {"loss": 0.6878, "grad_norm": 0.916607677936554, "learning_rate": 0.0002, "epoch": 3.607181328545781, "step": 50230}, {"loss": 0.6084, "grad_norm": 0.8014767169952393, "learning_rate": 0.0002, "epoch": 3.607899461400359, "step": 50240}, {"loss": 0.6718, "grad_norm": 0.9556822776794434, "learning_rate": 0.0002, "epoch": 3.608617594254937, "step": 50250}, {"loss": 0.6896, "grad_norm": 0.9630016684532166, "learning_rate": 0.0002, "epoch": 3.6093357271095154, "step": 50260}, {"loss": 0.692, "grad_norm": 0.9862125515937805, "learning_rate": 0.0002, "epoch": 3.6100538599640934, "step": 50270}, {"loss": 0.5981, "grad_norm": 1.0043333768844604, "learning_rate": 0.0002, "epoch": 3.6107719928186714, "step": 50280}, {"loss": 0.6243, "grad_norm": 0.9255319833755493, "learning_rate": 0.0002, "epoch": 3.6114901256732495, "step": 50290}, {"loss": 0.6374, "grad_norm": 1.012023687362671, "learning_rate": 0.0002, "epoch": 3.6122082585278275, "step": 50300}, {"loss": 0.6896, "grad_norm": 1.0701122283935547, "learning_rate": 0.0002, "epoch": 3.612926391382406, "step": 50310}, {"loss": 0.6474, "grad_norm": 0.8270810842514038, "learning_rate": 0.0002, "epoch": 3.613644524236984, "step": 50320}, {"loss": 0.6667, "grad_norm": 0.8881328105926514, "learning_rate": 0.0002, "epoch": 3.614362657091562, "step": 50330}, {"loss": 0.6517, "grad_norm": 0.9536844491958618, "learning_rate": 0.0002, "epoch": 3.61508078994614, "step": 50340}, {"loss": 0.62, "grad_norm": 0.8044326305389404, "learning_rate": 0.0002, "epoch": 3.6157989228007184, "step": 50350}, {"loss": 0.6259, "grad_norm": 0.834591805934906, "learning_rate": 0.0002, "epoch": 3.6165170556552964, "step": 50360}, {"loss": 0.7173, "grad_norm": 0.903752863407135, "learning_rate": 0.0002, "epoch": 3.6172351885098744, "step": 50370}, {"loss": 0.6305, "grad_norm": 0.9148632884025574, "learning_rate": 0.0002, "epoch": 3.6179533213644524, "step": 50380}, {"loss": 0.6624, "grad_norm": 0.9280176162719727, "learning_rate": 0.0002, "epoch": 3.6186714542190304, "step": 50390}, {"loss": 0.6457, "grad_norm": 0.9524136781692505, "learning_rate": 0.0002, "epoch": 3.6193895870736084, "step": 50400}, {"loss": 0.6918, "grad_norm": 1.1751197576522827, "learning_rate": 0.0002, "epoch": 3.620107719928187, "step": 50410}, {"loss": 0.6161, "grad_norm": 1.032279133796692, "learning_rate": 0.0002, "epoch": 3.620825852782765, "step": 50420}, {"loss": 0.6347, "grad_norm": 0.790741503238678, "learning_rate": 0.0002, "epoch": 3.621543985637343, "step": 50430}, {"loss": 0.695, "grad_norm": 0.9584221243858337, "learning_rate": 0.0002, "epoch": 3.6222621184919213, "step": 50440}, {"loss": 0.6393, "grad_norm": 0.7792508006095886, "learning_rate": 0.0002, "epoch": 3.6229802513464993, "step": 50450}, {"loss": 0.6398, "grad_norm": 0.8273448944091797, "learning_rate": 0.0002, "epoch": 3.6236983842010773, "step": 50460}, {"loss": 0.6436, "grad_norm": 0.8001132607460022, "learning_rate": 0.0002, "epoch": 3.6244165170556553, "step": 50470}, {"loss": 0.6499, "grad_norm": 1.077109694480896, "learning_rate": 0.0002, "epoch": 3.6251346499102333, "step": 50480}, {"loss": 0.6587, "grad_norm": 1.111274003982544, "learning_rate": 0.0002, "epoch": 3.6258527827648113, "step": 50490}, {"loss": 0.6842, "grad_norm": 0.7757347822189331, "learning_rate": 0.0002, "epoch": 3.6265709156193897, "step": 50500}, {"loss": 0.6887, "grad_norm": 0.9217049479484558, "learning_rate": 0.0002, "epoch": 3.6272890484739677, "step": 50510}, {"loss": 0.6903, "grad_norm": 0.9362251162528992, "learning_rate": 0.0002, "epoch": 3.6280071813285457, "step": 50520}, {"loss": 0.625, "grad_norm": 0.9435479044914246, "learning_rate": 0.0002, "epoch": 3.6287253141831237, "step": 50530}, {"loss": 0.5869, "grad_norm": 0.7748915553092957, "learning_rate": 0.0002, "epoch": 3.629443447037702, "step": 50540}, {"loss": 0.637, "grad_norm": 0.8238945007324219, "learning_rate": 0.0002, "epoch": 3.63016157989228, "step": 50550}, {"loss": 0.6251, "grad_norm": 0.8421505093574524, "learning_rate": 0.0002, "epoch": 3.630879712746858, "step": 50560}, {"loss": 0.6544, "grad_norm": 1.0272293090820312, "learning_rate": 0.0002, "epoch": 3.631597845601436, "step": 50570}, {"loss": 0.6467, "grad_norm": 0.7643818259239197, "learning_rate": 0.0002, "epoch": 3.632315978456014, "step": 50580}, {"loss": 0.6716, "grad_norm": 0.9756225347518921, "learning_rate": 0.0002, "epoch": 3.6330341113105926, "step": 50590}, {"loss": 0.6534, "grad_norm": 0.9311570525169373, "learning_rate": 0.0002, "epoch": 3.6337522441651706, "step": 50600}, {"loss": 0.6465, "grad_norm": 0.8829827904701233, "learning_rate": 0.0002, "epoch": 3.6344703770197486, "step": 50610}, {"loss": 0.626, "grad_norm": 0.9473454356193542, "learning_rate": 0.0002, "epoch": 3.6351885098743266, "step": 50620}, {"loss": 0.713, "grad_norm": 1.1023668050765991, "learning_rate": 0.0002, "epoch": 3.635906642728905, "step": 50630}, {"loss": 0.6287, "grad_norm": 0.8490299582481384, "learning_rate": 0.0002, "epoch": 3.636624775583483, "step": 50640}, {"loss": 0.6373, "grad_norm": 1.1129392385482788, "learning_rate": 0.0002, "epoch": 3.637342908438061, "step": 50650}, {"loss": 0.7351, "grad_norm": 1.0334501266479492, "learning_rate": 0.0002, "epoch": 3.638061041292639, "step": 50660}, {"loss": 0.69, "grad_norm": 0.8397296667098999, "learning_rate": 0.0002, "epoch": 3.638779174147217, "step": 50670}, {"loss": 0.6075, "grad_norm": 0.7984256744384766, "learning_rate": 0.0002, "epoch": 3.639497307001795, "step": 50680}, {"loss": 0.651, "grad_norm": 1.1182054281234741, "learning_rate": 0.0002, "epoch": 3.6402154398563735, "step": 50690}, {"loss": 0.6511, "grad_norm": 0.8743279576301575, "learning_rate": 0.0002, "epoch": 3.6409335727109515, "step": 50700}, {"loss": 0.6894, "grad_norm": 0.9101628661155701, "learning_rate": 0.0002, "epoch": 3.6416517055655295, "step": 50710}, {"loss": 0.6591, "grad_norm": 0.8866934180259705, "learning_rate": 0.0002, "epoch": 3.642369838420108, "step": 50720}, {"loss": 0.6483, "grad_norm": 0.863945484161377, "learning_rate": 0.0002, "epoch": 3.643087971274686, "step": 50730}, {"loss": 0.6443, "grad_norm": 1.0845744609832764, "learning_rate": 0.0002, "epoch": 3.643806104129264, "step": 50740}, {"loss": 0.6611, "grad_norm": 0.8610911965370178, "learning_rate": 0.0002, "epoch": 3.644524236983842, "step": 50750}, {"loss": 0.6617, "grad_norm": 0.8502625226974487, "learning_rate": 0.0002, "epoch": 3.64524236983842, "step": 50760}, {"loss": 0.6283, "grad_norm": 0.847372829914093, "learning_rate": 0.0002, "epoch": 3.645960502692998, "step": 50770}, {"loss": 0.5724, "grad_norm": 0.8649292588233948, "learning_rate": 0.0002, "epoch": 3.6466786355475764, "step": 50780}, {"loss": 0.6253, "grad_norm": 0.8742905855178833, "learning_rate": 0.0002, "epoch": 3.6473967684021544, "step": 50790}, {"loss": 0.68, "grad_norm": 0.9546048641204834, "learning_rate": 0.0002, "epoch": 3.6481149012567324, "step": 50800}, {"loss": 0.6212, "grad_norm": 0.7893161773681641, "learning_rate": 0.0002, "epoch": 3.6488330341113104, "step": 50810}, {"loss": 0.6328, "grad_norm": 0.9350247979164124, "learning_rate": 0.0002, "epoch": 3.649551166965889, "step": 50820}, {"loss": 0.6893, "grad_norm": 0.772149384021759, "learning_rate": 0.0002, "epoch": 3.650269299820467, "step": 50830}, {"loss": 0.6107, "grad_norm": 0.8281718492507935, "learning_rate": 0.0002, "epoch": 3.650987432675045, "step": 50840}, {"loss": 0.6136, "grad_norm": 0.8063850402832031, "learning_rate": 0.0002, "epoch": 3.651705565529623, "step": 50850}, {"loss": 0.6416, "grad_norm": 0.8101351261138916, "learning_rate": 0.0002, "epoch": 3.652423698384201, "step": 50860}, {"loss": 0.6636, "grad_norm": 0.8747833371162415, "learning_rate": 0.0002, "epoch": 3.6531418312387793, "step": 50870}, {"loss": 0.6575, "grad_norm": 0.9634656310081482, "learning_rate": 0.0002, "epoch": 3.6538599640933573, "step": 50880}, {"loss": 0.6227, "grad_norm": 1.1646045446395874, "learning_rate": 0.0002, "epoch": 3.6545780969479353, "step": 50890}, {"loss": 0.6628, "grad_norm": 0.8538454174995422, "learning_rate": 0.0002, "epoch": 3.6552962298025133, "step": 50900}, {"loss": 0.6488, "grad_norm": 0.7639184594154358, "learning_rate": 0.0002, "epoch": 3.656014362657092, "step": 50910}, {"loss": 0.6495, "grad_norm": 0.8750212788581848, "learning_rate": 0.0002, "epoch": 3.65673249551167, "step": 50920}, {"loss": 0.6601, "grad_norm": 0.9161198735237122, "learning_rate": 0.0002, "epoch": 3.657450628366248, "step": 50930}, {"loss": 0.6809, "grad_norm": 0.7987924814224243, "learning_rate": 0.0002, "epoch": 3.658168761220826, "step": 50940}, {"loss": 0.6228, "grad_norm": 0.8939290642738342, "learning_rate": 0.0002, "epoch": 3.658886894075404, "step": 50950}, {"loss": 0.687, "grad_norm": 0.9803797602653503, "learning_rate": 0.0002, "epoch": 3.659605026929982, "step": 50960}, {"loss": 0.6368, "grad_norm": 1.2423512935638428, "learning_rate": 0.0002, "epoch": 3.6603231597845602, "step": 50970}, {"loss": 0.6477, "grad_norm": 1.0023225545883179, "learning_rate": 0.0002, "epoch": 3.6610412926391382, "step": 50980}, {"loss": 0.6659, "grad_norm": 0.9066677689552307, "learning_rate": 0.0002, "epoch": 3.6617594254937162, "step": 50990}, {"loss": 0.6348, "grad_norm": 0.8906226754188538, "learning_rate": 0.0002, "epoch": 3.6624775583482947, "step": 51000}, {"loss": 0.5967, "grad_norm": 0.7449954152107239, "learning_rate": 0.0002, "epoch": 3.6631956912028727, "step": 51010}, {"loss": 0.6167, "grad_norm": 0.812612771987915, "learning_rate": 0.0002, "epoch": 3.6639138240574507, "step": 51020}, {"loss": 0.6414, "grad_norm": 0.861818253993988, "learning_rate": 0.0002, "epoch": 3.6646319569120287, "step": 51030}, {"loss": 0.6418, "grad_norm": 0.849726676940918, "learning_rate": 0.0002, "epoch": 3.6653500897666067, "step": 51040}, {"loss": 0.6613, "grad_norm": 0.9738494753837585, "learning_rate": 0.0002, "epoch": 3.6660682226211847, "step": 51050}, {"loss": 0.6094, "grad_norm": 0.928989827632904, "learning_rate": 0.0002, "epoch": 3.666786355475763, "step": 51060}, {"loss": 0.623, "grad_norm": 0.9725563526153564, "learning_rate": 0.0002, "epoch": 3.667504488330341, "step": 51070}, {"loss": 0.5967, "grad_norm": 0.9366095066070557, "learning_rate": 0.0002, "epoch": 3.668222621184919, "step": 51080}, {"loss": 0.6175, "grad_norm": 0.8012986779212952, "learning_rate": 0.0002, "epoch": 3.668940754039497, "step": 51090}, {"loss": 0.6428, "grad_norm": 1.0646892786026, "learning_rate": 0.0002, "epoch": 3.6696588868940756, "step": 51100}, {"loss": 0.6333, "grad_norm": 0.7245157361030579, "learning_rate": 0.0002, "epoch": 3.6703770197486536, "step": 51110}, {"loss": 0.6618, "grad_norm": 0.6938936114311218, "learning_rate": 0.0002, "epoch": 3.6710951526032316, "step": 51120}, {"loss": 0.6511, "grad_norm": 0.8461366295814514, "learning_rate": 0.0002, "epoch": 3.6718132854578096, "step": 51130}, {"loss": 0.6168, "grad_norm": 0.8392583131790161, "learning_rate": 0.0002, "epoch": 3.6725314183123876, "step": 51140}, {"loss": 0.6616, "grad_norm": 0.7245259284973145, "learning_rate": 0.0002, "epoch": 3.673249551166966, "step": 51150}, {"loss": 0.6165, "grad_norm": 1.0742167234420776, "learning_rate": 0.0002, "epoch": 3.673967684021544, "step": 51160}, {"loss": 0.6805, "grad_norm": 0.9553889036178589, "learning_rate": 0.0002, "epoch": 3.674685816876122, "step": 51170}, {"loss": 0.6065, "grad_norm": 0.8713715672492981, "learning_rate": 0.0002, "epoch": 3.6754039497307, "step": 51180}, {"loss": 0.599, "grad_norm": 0.7499800324440002, "learning_rate": 0.0002, "epoch": 3.6761220825852785, "step": 51190}, {"loss": 0.7143, "grad_norm": 1.1118139028549194, "learning_rate": 0.0002, "epoch": 3.6768402154398565, "step": 51200}, {"loss": 0.6694, "grad_norm": 0.8146613836288452, "learning_rate": 0.0002, "epoch": 3.6775583482944345, "step": 51210}, {"loss": 0.6528, "grad_norm": 0.9331285357475281, "learning_rate": 0.0002, "epoch": 3.6782764811490125, "step": 51220}, {"loss": 0.6429, "grad_norm": 1.0497597455978394, "learning_rate": 0.0002, "epoch": 3.6789946140035905, "step": 51230}, {"loss": 0.6404, "grad_norm": 0.879814863204956, "learning_rate": 0.0002, "epoch": 3.6797127468581685, "step": 51240}, {"loss": 0.6617, "grad_norm": 0.9896606802940369, "learning_rate": 0.0002, "epoch": 3.680430879712747, "step": 51250}, {"loss": 0.6461, "grad_norm": 0.928236186504364, "learning_rate": 0.0002, "epoch": 3.681149012567325, "step": 51260}, {"loss": 0.6516, "grad_norm": 0.8436732292175293, "learning_rate": 0.0002, "epoch": 3.681867145421903, "step": 51270}, {"loss": 0.6428, "grad_norm": 0.93634432554245, "learning_rate": 0.0002, "epoch": 3.6825852782764814, "step": 51280}, {"loss": 0.6081, "grad_norm": 0.8477143049240112, "learning_rate": 0.0002, "epoch": 3.6833034111310594, "step": 51290}, {"loss": 0.6536, "grad_norm": 0.8720934987068176, "learning_rate": 0.0002, "epoch": 3.6840215439856374, "step": 51300}, {"loss": 0.6523, "grad_norm": 0.7322931289672852, "learning_rate": 0.0002, "epoch": 3.6847396768402154, "step": 51310}, {"loss": 0.6475, "grad_norm": 1.0064427852630615, "learning_rate": 0.0002, "epoch": 3.6854578096947934, "step": 51320}, {"loss": 0.681, "grad_norm": 1.0197817087173462, "learning_rate": 0.0002, "epoch": 3.6861759425493714, "step": 51330}, {"loss": 0.5904, "grad_norm": 0.8764060139656067, "learning_rate": 0.0002, "epoch": 3.68689407540395, "step": 51340}, {"loss": 0.625, "grad_norm": 0.9763964414596558, "learning_rate": 0.0002, "epoch": 3.687612208258528, "step": 51350}, {"loss": 0.6299, "grad_norm": 0.8389105200767517, "learning_rate": 0.0002, "epoch": 3.688330341113106, "step": 51360}, {"loss": 0.6885, "grad_norm": 0.9215750694274902, "learning_rate": 0.0002, "epoch": 3.689048473967684, "step": 51370}, {"loss": 0.6325, "grad_norm": 0.8444913625717163, "learning_rate": 0.0002, "epoch": 3.6897666068222623, "step": 51380}, {"loss": 0.657, "grad_norm": 0.9635153412818909, "learning_rate": 0.0002, "epoch": 3.6904847396768403, "step": 51390}, {"loss": 0.7045, "grad_norm": 1.0397378206253052, "learning_rate": 0.0002, "epoch": 3.6912028725314183, "step": 51400}, {"loss": 0.6635, "grad_norm": 0.9154748320579529, "learning_rate": 0.0002, "epoch": 3.6919210053859963, "step": 51410}, {"loss": 0.6757, "grad_norm": 0.906445324420929, "learning_rate": 0.0002, "epoch": 3.6926391382405743, "step": 51420}, {"loss": 0.6533, "grad_norm": 0.9237992763519287, "learning_rate": 0.0002, "epoch": 3.6933572710951523, "step": 51430}, {"loss": 0.6257, "grad_norm": 0.8796338438987732, "learning_rate": 0.0002, "epoch": 3.6940754039497308, "step": 51440}, {"loss": 0.7063, "grad_norm": 0.8613203763961792, "learning_rate": 0.0002, "epoch": 3.6947935368043088, "step": 51450}, {"loss": 0.6455, "grad_norm": 0.7957607507705688, "learning_rate": 0.0002, "epoch": 3.6955116696588868, "step": 51460}, {"loss": 0.6328, "grad_norm": 0.9183711409568787, "learning_rate": 0.0002, "epoch": 3.6962298025134652, "step": 51470}, {"loss": 0.6289, "grad_norm": 1.0108308792114258, "learning_rate": 0.0002, "epoch": 3.6969479353680432, "step": 51480}, {"loss": 0.668, "grad_norm": 0.7768247127532959, "learning_rate": 0.0002, "epoch": 3.6976660682226212, "step": 51490}, {"loss": 0.6483, "grad_norm": 1.0051485300064087, "learning_rate": 0.0002, "epoch": 3.6983842010771992, "step": 51500}, {"loss": 0.6268, "grad_norm": 0.82451993227005, "learning_rate": 0.0002, "epoch": 3.6991023339317772, "step": 51510}, {"loss": 0.6258, "grad_norm": 0.9542286992073059, "learning_rate": 0.0002, "epoch": 3.6998204667863552, "step": 51520}, {"loss": 0.6415, "grad_norm": 0.693890392780304, "learning_rate": 0.0002, "epoch": 3.7005385996409337, "step": 51530}, {"loss": 0.6445, "grad_norm": 0.9068924784660339, "learning_rate": 0.0002, "epoch": 3.7012567324955117, "step": 51540}, {"loss": 0.6386, "grad_norm": 0.8694922924041748, "learning_rate": 0.0002, "epoch": 3.7019748653500897, "step": 51550}, {"loss": 0.6563, "grad_norm": 0.941081702709198, "learning_rate": 0.0002, "epoch": 3.702692998204668, "step": 51560}, {"loss": 0.6068, "grad_norm": 0.7385984659194946, "learning_rate": 0.0002, "epoch": 3.703411131059246, "step": 51570}, {"loss": 0.6243, "grad_norm": 1.0399216413497925, "learning_rate": 0.0002, "epoch": 3.704129263913824, "step": 51580}, {"loss": 0.6776, "grad_norm": 0.9802294969558716, "learning_rate": 0.0002, "epoch": 3.704847396768402, "step": 51590}, {"loss": 0.6243, "grad_norm": 1.0409669876098633, "learning_rate": 0.0002, "epoch": 3.70556552962298, "step": 51600}, {"loss": 0.6812, "grad_norm": 0.8972786068916321, "learning_rate": 0.0002, "epoch": 3.706283662477558, "step": 51610}, {"loss": 0.5993, "grad_norm": 1.1916245222091675, "learning_rate": 0.0002, "epoch": 3.7070017953321366, "step": 51620}, {"loss": 0.6566, "grad_norm": 0.9545385241508484, "learning_rate": 0.0002, "epoch": 3.7077199281867146, "step": 51630}, {"loss": 0.6497, "grad_norm": 1.0773427486419678, "learning_rate": 0.0002, "epoch": 3.7084380610412926, "step": 51640}, {"loss": 0.6768, "grad_norm": 1.0856024026870728, "learning_rate": 0.0002, "epoch": 3.7091561938958706, "step": 51650}, {"loss": 0.6404, "grad_norm": 0.7678500413894653, "learning_rate": 0.0002, "epoch": 3.709874326750449, "step": 51660}, {"loss": 0.6571, "grad_norm": 0.7276270985603333, "learning_rate": 0.0002, "epoch": 3.710592459605027, "step": 51670}, {"loss": 0.6498, "grad_norm": 0.8859017491340637, "learning_rate": 0.0002, "epoch": 3.711310592459605, "step": 51680}, {"loss": 0.6602, "grad_norm": 0.9037614464759827, "learning_rate": 0.0002, "epoch": 3.712028725314183, "step": 51690}, {"loss": 0.685, "grad_norm": 0.9223412275314331, "learning_rate": 0.0002, "epoch": 3.712746858168761, "step": 51700}, {"loss": 0.647, "grad_norm": 0.8812923431396484, "learning_rate": 0.0002, "epoch": 3.713464991023339, "step": 51710}, {"loss": 0.6546, "grad_norm": 0.8242456912994385, "learning_rate": 0.0002, "epoch": 3.7141831238779175, "step": 51720}, {"loss": 0.6462, "grad_norm": 0.8368834257125854, "learning_rate": 0.0002, "epoch": 3.7149012567324955, "step": 51730}, {"loss": 0.6432, "grad_norm": 0.8624704480171204, "learning_rate": 0.0002, "epoch": 3.7156193895870735, "step": 51740}, {"loss": 0.6367, "grad_norm": 0.9138273596763611, "learning_rate": 0.0002, "epoch": 3.716337522441652, "step": 51750}, {"loss": 0.6717, "grad_norm": 0.8088571429252625, "learning_rate": 0.0002, "epoch": 3.71705565529623, "step": 51760}, {"loss": 0.658, "grad_norm": 0.882808268070221, "learning_rate": 0.0002, "epoch": 3.717773788150808, "step": 51770}, {"loss": 0.6686, "grad_norm": 0.9368035197257996, "learning_rate": 0.0002, "epoch": 3.718491921005386, "step": 51780}, {"loss": 0.6482, "grad_norm": 0.8341794013977051, "learning_rate": 0.0002, "epoch": 3.719210053859964, "step": 51790}, {"loss": 0.6486, "grad_norm": 0.8692073225975037, "learning_rate": 0.0002, "epoch": 3.719928186714542, "step": 51800}, {"loss": 0.6591, "grad_norm": 0.7566918730735779, "learning_rate": 0.0002, "epoch": 3.7206463195691204, "step": 51810}, {"loss": 0.707, "grad_norm": 1.113138198852539, "learning_rate": 0.0002, "epoch": 3.7213644524236984, "step": 51820}, {"loss": 0.6683, "grad_norm": 0.8793158531188965, "learning_rate": 0.0002, "epoch": 3.7220825852782764, "step": 51830}, {"loss": 0.6343, "grad_norm": 0.8856439590454102, "learning_rate": 0.0002, "epoch": 3.722800718132855, "step": 51840}, {"loss": 0.6238, "grad_norm": 1.0182029008865356, "learning_rate": 0.0002, "epoch": 3.723518850987433, "step": 51850}, {"loss": 0.6743, "grad_norm": 1.1177181005477905, "learning_rate": 0.0002, "epoch": 3.724236983842011, "step": 51860}, {"loss": 0.6477, "grad_norm": 0.6600990295410156, "learning_rate": 0.0002, "epoch": 3.724955116696589, "step": 51870}, {"loss": 0.6532, "grad_norm": 1.0563536882400513, "learning_rate": 0.0002, "epoch": 3.725673249551167, "step": 51880}, {"loss": 0.6648, "grad_norm": 1.1067734956741333, "learning_rate": 0.0002, "epoch": 3.726391382405745, "step": 51890}, {"loss": 0.6547, "grad_norm": 1.0204616785049438, "learning_rate": 0.0002, "epoch": 3.7271095152603233, "step": 51900}, {"loss": 0.685, "grad_norm": 0.8647155165672302, "learning_rate": 0.0002, "epoch": 3.7278276481149013, "step": 51910}, {"loss": 0.739, "grad_norm": 1.0754971504211426, "learning_rate": 0.0002, "epoch": 3.7285457809694793, "step": 51920}, {"loss": 0.6535, "grad_norm": 1.0448992252349854, "learning_rate": 0.0002, "epoch": 3.7292639138240573, "step": 51930}, {"loss": 0.6802, "grad_norm": 0.963434100151062, "learning_rate": 0.0002, "epoch": 3.7299820466786358, "step": 51940}, {"loss": 0.6367, "grad_norm": 0.8112701773643494, "learning_rate": 0.0002, "epoch": 3.7307001795332138, "step": 51950}, {"loss": 0.6785, "grad_norm": 0.7975119948387146, "learning_rate": 0.0002, "epoch": 3.7314183123877918, "step": 51960}, {"loss": 0.6748, "grad_norm": 0.7953376173973083, "learning_rate": 0.0002, "epoch": 3.7321364452423698, "step": 51970}, {"loss": 0.6464, "grad_norm": 0.9519981741905212, "learning_rate": 0.0002, "epoch": 3.7328545780969478, "step": 51980}, {"loss": 0.6247, "grad_norm": 0.8705791234970093, "learning_rate": 0.0002, "epoch": 3.7335727109515258, "step": 51990}, {"loss": 0.6876, "grad_norm": 0.870205283164978, "learning_rate": 0.0002, "epoch": 3.734290843806104, "step": 52000}, {"loss": 0.6681, "grad_norm": 0.9558930993080139, "learning_rate": 0.0002, "epoch": 3.735008976660682, "step": 52010}, {"loss": 0.6772, "grad_norm": 0.9330434799194336, "learning_rate": 0.0002, "epoch": 3.73572710951526, "step": 52020}, {"loss": 0.6365, "grad_norm": 0.783620297908783, "learning_rate": 0.0002, "epoch": 3.7364452423698387, "step": 52030}, {"loss": 0.6275, "grad_norm": 0.7575166821479797, "learning_rate": 0.0002, "epoch": 3.7371633752244167, "step": 52040}, {"loss": 0.6859, "grad_norm": 1.0592705011367798, "learning_rate": 0.0002, "epoch": 3.7378815080789947, "step": 52050}, {"loss": 0.6704, "grad_norm": 0.9309433102607727, "learning_rate": 0.0002, "epoch": 3.7385996409335727, "step": 52060}, {"loss": 0.6607, "grad_norm": 0.972861647605896, "learning_rate": 0.0002, "epoch": 3.7393177737881507, "step": 52070}, {"loss": 0.6267, "grad_norm": 0.9318740963935852, "learning_rate": 0.0002, "epoch": 3.7400359066427287, "step": 52080}, {"loss": 0.6404, "grad_norm": 0.7938477396965027, "learning_rate": 0.0002, "epoch": 3.740754039497307, "step": 52090}, {"loss": 0.6451, "grad_norm": 1.1515966653823853, "learning_rate": 0.0002, "epoch": 3.741472172351885, "step": 52100}, {"loss": 0.6179, "grad_norm": 1.076869010925293, "learning_rate": 0.0002, "epoch": 3.742190305206463, "step": 52110}, {"loss": 0.6477, "grad_norm": 0.8516066670417786, "learning_rate": 0.0002, "epoch": 3.7429084380610416, "step": 52120}, {"loss": 0.6741, "grad_norm": 0.6853429079055786, "learning_rate": 0.0002, "epoch": 3.7436265709156196, "step": 52130}, {"loss": 0.6392, "grad_norm": 0.8179695010185242, "learning_rate": 0.0002, "epoch": 3.7443447037701976, "step": 52140}, {"loss": 0.6692, "grad_norm": 0.8395232558250427, "learning_rate": 0.0002, "epoch": 3.7450628366247756, "step": 52150}, {"loss": 0.6902, "grad_norm": 1.0178003311157227, "learning_rate": 0.0002, "epoch": 3.7457809694793536, "step": 52160}, {"loss": 0.6726, "grad_norm": 1.1801023483276367, "learning_rate": 0.0002, "epoch": 3.7464991023339316, "step": 52170}, {"loss": 0.6334, "grad_norm": 0.8215751647949219, "learning_rate": 0.0002, "epoch": 3.74721723518851, "step": 52180}, {"loss": 0.5992, "grad_norm": 1.17083740234375, "learning_rate": 0.0002, "epoch": 3.747935368043088, "step": 52190}, {"loss": 0.6219, "grad_norm": 0.9230290651321411, "learning_rate": 0.0002, "epoch": 3.748653500897666, "step": 52200}, {"loss": 0.6503, "grad_norm": 0.8431521058082581, "learning_rate": 0.0002, "epoch": 3.749371633752244, "step": 52210}, {"loss": 0.6983, "grad_norm": 0.9690840244293213, "learning_rate": 0.0002, "epoch": 3.7500897666068225, "step": 52220}, {"loss": 0.6204, "grad_norm": 1.0022395849227905, "learning_rate": 0.0002, "epoch": 3.7508078994614005, "step": 52230}, {"loss": 0.6683, "grad_norm": 1.0489065647125244, "learning_rate": 0.0002, "epoch": 3.7515260323159785, "step": 52240}, {"loss": 0.6439, "grad_norm": 0.7880696058273315, "learning_rate": 0.0002, "epoch": 3.7522441651705565, "step": 52250}, {"loss": 0.6933, "grad_norm": 1.0255829095840454, "learning_rate": 0.0002, "epoch": 3.7529622980251345, "step": 52260}, {"loss": 0.6631, "grad_norm": 0.8470141291618347, "learning_rate": 0.0002, "epoch": 3.7536804308797125, "step": 52270}, {"loss": 0.5956, "grad_norm": 0.9040523171424866, "learning_rate": 0.0002, "epoch": 3.754398563734291, "step": 52280}, {"loss": 0.6759, "grad_norm": 0.9564392566680908, "learning_rate": 0.0002, "epoch": 3.755116696588869, "step": 52290}, {"loss": 0.6717, "grad_norm": 0.907857358455658, "learning_rate": 0.0002, "epoch": 3.755834829443447, "step": 52300}, {"loss": 0.6821, "grad_norm": 0.8929873704910278, "learning_rate": 0.0002, "epoch": 3.7565529622980254, "step": 52310}, {"loss": 0.655, "grad_norm": 0.854434072971344, "learning_rate": 0.0002, "epoch": 3.7572710951526034, "step": 52320}, {"loss": 0.6668, "grad_norm": 0.8744779229164124, "learning_rate": 0.0002, "epoch": 3.7579892280071814, "step": 52330}, {"loss": 0.6628, "grad_norm": 0.9022667407989502, "learning_rate": 0.0002, "epoch": 3.7587073608617594, "step": 52340}, {"loss": 0.6275, "grad_norm": 0.8884857892990112, "learning_rate": 0.0002, "epoch": 3.7594254937163374, "step": 52350}, {"loss": 0.6585, "grad_norm": 1.0228430032730103, "learning_rate": 0.0002, "epoch": 3.7601436265709154, "step": 52360}, {"loss": 0.6092, "grad_norm": 0.8593528270721436, "learning_rate": 0.0002, "epoch": 3.760861759425494, "step": 52370}, {"loss": 0.664, "grad_norm": 0.9435563087463379, "learning_rate": 0.0002, "epoch": 3.761579892280072, "step": 52380}, {"loss": 0.6326, "grad_norm": 0.7545679807662964, "learning_rate": 0.0002, "epoch": 3.76229802513465, "step": 52390}, {"loss": 0.6628, "grad_norm": 0.9411585927009583, "learning_rate": 0.0002, "epoch": 3.7630161579892283, "step": 52400}, {"loss": 0.62, "grad_norm": 0.9764377474784851, "learning_rate": 0.0002, "epoch": 3.7637342908438063, "step": 52410}, {"loss": 0.671, "grad_norm": 1.0718384981155396, "learning_rate": 0.0002, "epoch": 3.7644524236983843, "step": 52420}, {"loss": 0.6654, "grad_norm": 0.8765230774879456, "learning_rate": 0.0002, "epoch": 3.7651705565529623, "step": 52430}, {"loss": 0.6602, "grad_norm": 0.9275036454200745, "learning_rate": 0.0002, "epoch": 3.7658886894075403, "step": 52440}, {"loss": 0.6098, "grad_norm": 0.967410147190094, "learning_rate": 0.0002, "epoch": 3.7666068222621183, "step": 52450}, {"loss": 0.6195, "grad_norm": 0.7738949060440063, "learning_rate": 0.0002, "epoch": 3.7673249551166967, "step": 52460}, {"loss": 0.6054, "grad_norm": 1.0828070640563965, "learning_rate": 0.0002, "epoch": 3.7680430879712747, "step": 52470}, {"loss": 0.6208, "grad_norm": 0.9570213556289673, "learning_rate": 0.0002, "epoch": 3.7687612208258527, "step": 52480}, {"loss": 0.6703, "grad_norm": 1.0688215494155884, "learning_rate": 0.0002, "epoch": 3.7694793536804307, "step": 52490}, {"loss": 0.5993, "grad_norm": 0.7970073223114014, "learning_rate": 0.0002, "epoch": 3.770197486535009, "step": 52500}, {"loss": 0.6537, "grad_norm": 0.7132976651191711, "learning_rate": 0.0002, "epoch": 3.770915619389587, "step": 52510}, {"loss": 0.6571, "grad_norm": 1.152268648147583, "learning_rate": 0.0002, "epoch": 3.771633752244165, "step": 52520}, {"loss": 0.6548, "grad_norm": 0.8645235896110535, "learning_rate": 0.0002, "epoch": 3.772351885098743, "step": 52530}, {"loss": 0.6918, "grad_norm": 0.7725570201873779, "learning_rate": 0.0002, "epoch": 3.773070017953321, "step": 52540}, {"loss": 0.6796, "grad_norm": 0.9718102812767029, "learning_rate": 0.0002, "epoch": 3.773788150807899, "step": 52550}, {"loss": 0.6298, "grad_norm": 0.7568017840385437, "learning_rate": 0.0002, "epoch": 3.7745062836624776, "step": 52560}, {"loss": 0.6652, "grad_norm": 0.9578912854194641, "learning_rate": 0.0002, "epoch": 3.7752244165170556, "step": 52570}, {"loss": 0.6417, "grad_norm": 0.8657314777374268, "learning_rate": 0.0002, "epoch": 3.7759425493716336, "step": 52580}, {"loss": 0.6552, "grad_norm": 0.7564393281936646, "learning_rate": 0.0002, "epoch": 3.776660682226212, "step": 52590}, {"loss": 0.69, "grad_norm": 0.7631160616874695, "learning_rate": 0.0002, "epoch": 3.77737881508079, "step": 52600}, {"loss": 0.6427, "grad_norm": 1.1852056980133057, "learning_rate": 0.0002, "epoch": 3.778096947935368, "step": 52610}, {"loss": 0.6369, "grad_norm": 1.0620790719985962, "learning_rate": 0.0002, "epoch": 3.778815080789946, "step": 52620}, {"loss": 0.6782, "grad_norm": 0.8677777647972107, "learning_rate": 0.0002, "epoch": 3.779533213644524, "step": 52630}, {"loss": 0.6249, "grad_norm": 0.9913218021392822, "learning_rate": 0.0002, "epoch": 3.780251346499102, "step": 52640}, {"loss": 0.625, "grad_norm": 0.9868429899215698, "learning_rate": 0.0002, "epoch": 3.7809694793536806, "step": 52650}, {"loss": 0.6252, "grad_norm": 0.8791782259941101, "learning_rate": 0.0002, "epoch": 3.7816876122082586, "step": 52660}, {"loss": 0.6675, "grad_norm": 0.9503955245018005, "learning_rate": 0.0002, "epoch": 3.7824057450628366, "step": 52670}, {"loss": 0.6406, "grad_norm": 0.8647131323814392, "learning_rate": 0.0002, "epoch": 3.7831238779174146, "step": 52680}, {"loss": 0.6654, "grad_norm": 0.9819629788398743, "learning_rate": 0.0002, "epoch": 3.783842010771993, "step": 52690}, {"loss": 0.593, "grad_norm": 0.8548610210418701, "learning_rate": 0.0002, "epoch": 3.784560143626571, "step": 52700}, {"loss": 0.6614, "grad_norm": 0.8706230521202087, "learning_rate": 0.0002, "epoch": 3.785278276481149, "step": 52710}, {"loss": 0.6326, "grad_norm": 1.0032461881637573, "learning_rate": 0.0002, "epoch": 3.785996409335727, "step": 52720}, {"loss": 0.6172, "grad_norm": 1.0578246116638184, "learning_rate": 0.0002, "epoch": 3.786714542190305, "step": 52730}, {"loss": 0.6392, "grad_norm": 0.9854007363319397, "learning_rate": 0.0002, "epoch": 3.7874326750448835, "step": 52740}, {"loss": 0.6462, "grad_norm": 0.8389187455177307, "learning_rate": 0.0002, "epoch": 3.7881508078994615, "step": 52750}, {"loss": 0.6515, "grad_norm": 0.9192399978637695, "learning_rate": 0.0002, "epoch": 3.7888689407540395, "step": 52760}, {"loss": 0.6436, "grad_norm": 0.9518283605575562, "learning_rate": 0.0002, "epoch": 3.7895870736086175, "step": 52770}, {"loss": 0.6548, "grad_norm": 1.1296825408935547, "learning_rate": 0.0002, "epoch": 3.790305206463196, "step": 52780}, {"loss": 0.6073, "grad_norm": 1.0589144229888916, "learning_rate": 0.0002, "epoch": 3.791023339317774, "step": 52790}, {"loss": 0.6593, "grad_norm": 0.8954343199729919, "learning_rate": 0.0002, "epoch": 3.791741472172352, "step": 52800}, {"loss": 0.6678, "grad_norm": 0.8283370733261108, "learning_rate": 0.0002, "epoch": 3.79245960502693, "step": 52810}, {"loss": 0.6865, "grad_norm": 0.910642683506012, "learning_rate": 0.0002, "epoch": 3.793177737881508, "step": 52820}, {"loss": 0.6672, "grad_norm": 0.9255108833312988, "learning_rate": 0.0002, "epoch": 3.793895870736086, "step": 52830}, {"loss": 0.6836, "grad_norm": 0.8773723244667053, "learning_rate": 0.0002, "epoch": 3.7946140035906644, "step": 52840}, {"loss": 0.6815, "grad_norm": 0.8454240560531616, "learning_rate": 0.0002, "epoch": 3.7953321364452424, "step": 52850}, {"loss": 0.6594, "grad_norm": 0.7636052966117859, "learning_rate": 0.0002, "epoch": 3.7960502692998204, "step": 52860}, {"loss": 0.6663, "grad_norm": 0.9358382821083069, "learning_rate": 0.0002, "epoch": 3.796768402154399, "step": 52870}, {"loss": 0.6761, "grad_norm": 0.9662801623344421, "learning_rate": 0.0002, "epoch": 3.797486535008977, "step": 52880}, {"loss": 0.6749, "grad_norm": 0.995907187461853, "learning_rate": 0.0002, "epoch": 3.798204667863555, "step": 52890}, {"loss": 0.6715, "grad_norm": 0.8700127005577087, "learning_rate": 0.0002, "epoch": 3.798922800718133, "step": 52900}, {"loss": 0.6554, "grad_norm": 0.8987792134284973, "learning_rate": 0.0002, "epoch": 3.799640933572711, "step": 52910}, {"loss": 0.6655, "grad_norm": 0.9753904938697815, "learning_rate": 0.0002, "epoch": 3.800359066427289, "step": 52920}, {"loss": 0.6536, "grad_norm": 0.7873555421829224, "learning_rate": 0.0002, "epoch": 3.8010771992818673, "step": 52930}, {"loss": 0.6233, "grad_norm": 0.8177929520606995, "learning_rate": 0.0002, "epoch": 3.8017953321364453, "step": 52940}, {"loss": 0.6508, "grad_norm": 0.8865532279014587, "learning_rate": 0.0002, "epoch": 3.8025134649910233, "step": 52950}, {"loss": 0.6922, "grad_norm": 0.9113775491714478, "learning_rate": 0.0002, "epoch": 3.8032315978456013, "step": 52960}, {"loss": 0.6382, "grad_norm": 0.9424585700035095, "learning_rate": 0.0002, "epoch": 3.8039497307001797, "step": 52970}, {"loss": 0.6694, "grad_norm": 0.8347237706184387, "learning_rate": 0.0002, "epoch": 3.8046678635547577, "step": 52980}, {"loss": 0.643, "grad_norm": 0.826863169670105, "learning_rate": 0.0002, "epoch": 3.8053859964093357, "step": 52990}, {"loss": 0.639, "grad_norm": 0.7313310503959656, "learning_rate": 0.0002, "epoch": 3.8061041292639137, "step": 53000}, {"loss": 0.6831, "grad_norm": 0.8352667093276978, "learning_rate": 0.0002, "epoch": 3.8068222621184917, "step": 53010}, {"loss": 0.6265, "grad_norm": 0.748461127281189, "learning_rate": 0.0002, "epoch": 3.80754039497307, "step": 53020}, {"loss": 0.6433, "grad_norm": 0.943256139755249, "learning_rate": 0.0002, "epoch": 3.808258527827648, "step": 53030}, {"loss": 0.6702, "grad_norm": 1.0448410511016846, "learning_rate": 0.0002, "epoch": 3.808976660682226, "step": 53040}, {"loss": 0.6901, "grad_norm": 0.9047636985778809, "learning_rate": 0.0002, "epoch": 3.809694793536804, "step": 53050}, {"loss": 0.6774, "grad_norm": 0.8594381213188171, "learning_rate": 0.0002, "epoch": 3.8104129263913826, "step": 53060}, {"loss": 0.6664, "grad_norm": 0.7593536972999573, "learning_rate": 0.0002, "epoch": 3.8111310592459606, "step": 53070}, {"loss": 0.6651, "grad_norm": 0.7189019918441772, "learning_rate": 0.0002, "epoch": 3.8118491921005386, "step": 53080}, {"loss": 0.6657, "grad_norm": 0.8569809198379517, "learning_rate": 0.0002, "epoch": 3.8125673249551166, "step": 53090}, {"loss": 0.6689, "grad_norm": 0.923378050327301, "learning_rate": 0.0002, "epoch": 3.8132854578096946, "step": 53100}, {"loss": 0.6168, "grad_norm": 0.9088824391365051, "learning_rate": 0.0002, "epoch": 3.8140035906642726, "step": 53110}, {"loss": 0.6514, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 3.814721723518851, "step": 53120}, {"loss": 0.6182, "grad_norm": 0.8389552235603333, "learning_rate": 0.0002, "epoch": 3.815439856373429, "step": 53130}, {"loss": 0.6779, "grad_norm": 0.7940975427627563, "learning_rate": 0.0002, "epoch": 3.816157989228007, "step": 53140}, {"loss": 0.6825, "grad_norm": 0.8389907479286194, "learning_rate": 0.0002, "epoch": 3.8168761220825855, "step": 53150}, {"loss": 0.6763, "grad_norm": 0.774206280708313, "learning_rate": 0.0002, "epoch": 3.8175942549371635, "step": 53160}, {"loss": 0.7011, "grad_norm": 1.189447283744812, "learning_rate": 0.0002, "epoch": 3.8183123877917415, "step": 53170}, {"loss": 0.6206, "grad_norm": 0.9875882863998413, "learning_rate": 0.0002, "epoch": 3.8190305206463195, "step": 53180}, {"loss": 0.6254, "grad_norm": 0.9205945134162903, "learning_rate": 0.0002, "epoch": 3.8197486535008975, "step": 53190}, {"loss": 0.5845, "grad_norm": 0.8312796354293823, "learning_rate": 0.0002, "epoch": 3.8204667863554755, "step": 53200}, {"loss": 0.6415, "grad_norm": 0.9755756855010986, "learning_rate": 0.0002, "epoch": 3.821184919210054, "step": 53210}, {"loss": 0.6657, "grad_norm": 1.0722965002059937, "learning_rate": 0.0002, "epoch": 3.821903052064632, "step": 53220}, {"loss": 0.6547, "grad_norm": 0.7720510959625244, "learning_rate": 0.0002, "epoch": 3.82262118491921, "step": 53230}, {"loss": 0.6383, "grad_norm": 1.020147681236267, "learning_rate": 0.0002, "epoch": 3.823339317773788, "step": 53240}, {"loss": 0.6491, "grad_norm": 0.8241816759109497, "learning_rate": 0.0002, "epoch": 3.8240574506283664, "step": 53250}, {"loss": 0.6914, "grad_norm": 0.8939895629882812, "learning_rate": 0.0002, "epoch": 3.8247755834829444, "step": 53260}, {"loss": 0.6725, "grad_norm": 1.010852336883545, "learning_rate": 0.0002, "epoch": 3.8254937163375224, "step": 53270}, {"loss": 0.6841, "grad_norm": 0.8201420307159424, "learning_rate": 0.0002, "epoch": 3.8262118491921004, "step": 53280}, {"loss": 0.6739, "grad_norm": 0.8797973990440369, "learning_rate": 0.0002, "epoch": 3.8269299820466784, "step": 53290}, {"loss": 0.658, "grad_norm": 0.9034950137138367, "learning_rate": 0.0002, "epoch": 3.827648114901257, "step": 53300}, {"loss": 0.6314, "grad_norm": 0.926802933216095, "learning_rate": 0.0002, "epoch": 3.828366247755835, "step": 53310}, {"loss": 0.6526, "grad_norm": 1.0205509662628174, "learning_rate": 0.0002, "epoch": 3.829084380610413, "step": 53320}, {"loss": 0.6596, "grad_norm": 0.9524099230766296, "learning_rate": 0.0002, "epoch": 3.829802513464991, "step": 53330}, {"loss": 0.6796, "grad_norm": 0.9692625999450684, "learning_rate": 0.0002, "epoch": 3.8305206463195693, "step": 53340}, {"loss": 0.628, "grad_norm": 0.7255275845527649, "learning_rate": 0.0002, "epoch": 3.8312387791741473, "step": 53350}, {"loss": 0.6104, "grad_norm": 0.7199059724807739, "learning_rate": 0.0002, "epoch": 3.8319569120287253, "step": 53360}, {"loss": 0.6703, "grad_norm": 1.004464864730835, "learning_rate": 0.0002, "epoch": 3.8326750448833034, "step": 53370}, {"loss": 0.7032, "grad_norm": 0.9092583060264587, "learning_rate": 0.0002, "epoch": 3.8333931777378814, "step": 53380}, {"loss": 0.6811, "grad_norm": 0.945091724395752, "learning_rate": 0.0002, "epoch": 3.8341113105924594, "step": 53390}, {"loss": 0.611, "grad_norm": 0.7980135679244995, "learning_rate": 0.0002, "epoch": 3.834829443447038, "step": 53400}, {"loss": 0.6604, "grad_norm": 0.7812868356704712, "learning_rate": 0.0002, "epoch": 3.835547576301616, "step": 53410}, {"loss": 0.6104, "grad_norm": 0.8957077860832214, "learning_rate": 0.0002, "epoch": 3.836265709156194, "step": 53420}, {"loss": 0.6754, "grad_norm": 0.9119600653648376, "learning_rate": 0.0002, "epoch": 3.8369838420107722, "step": 53430}, {"loss": 0.7346, "grad_norm": 0.8208187222480774, "learning_rate": 0.0002, "epoch": 3.8377019748653503, "step": 53440}, {"loss": 0.6549, "grad_norm": 0.7930439114570618, "learning_rate": 0.0002, "epoch": 3.8384201077199283, "step": 53450}, {"loss": 0.6192, "grad_norm": 0.8937777280807495, "learning_rate": 0.0002, "epoch": 3.8391382405745063, "step": 53460}, {"loss": 0.5954, "grad_norm": 0.7583796977996826, "learning_rate": 0.0002, "epoch": 3.8398563734290843, "step": 53470}, {"loss": 0.6217, "grad_norm": 1.0735969543457031, "learning_rate": 0.0002, "epoch": 3.8405745062836623, "step": 53480}, {"loss": 0.6472, "grad_norm": 1.1106033325195312, "learning_rate": 0.0002, "epoch": 3.8412926391382407, "step": 53490}, {"loss": 0.6813, "grad_norm": 1.092631220817566, "learning_rate": 0.0002, "epoch": 3.8420107719928187, "step": 53500}, {"loss": 0.6437, "grad_norm": 0.9961787462234497, "learning_rate": 0.0002, "epoch": 3.8427289048473967, "step": 53510}, {"loss": 0.6382, "grad_norm": 0.833831250667572, "learning_rate": 0.0002, "epoch": 3.8434470377019747, "step": 53520}, {"loss": 0.6403, "grad_norm": 1.0000009536743164, "learning_rate": 0.0002, "epoch": 3.844165170556553, "step": 53530}, {"loss": 0.6824, "grad_norm": 0.9784213304519653, "learning_rate": 0.0002, "epoch": 3.844883303411131, "step": 53540}, {"loss": 0.6816, "grad_norm": 0.8582558035850525, "learning_rate": 0.0002, "epoch": 3.845601436265709, "step": 53550}, {"loss": 0.5944, "grad_norm": 0.8267415761947632, "learning_rate": 0.0002, "epoch": 3.846319569120287, "step": 53560}, {"loss": 0.6562, "grad_norm": 0.8783000111579895, "learning_rate": 0.0002, "epoch": 3.847037701974865, "step": 53570}, {"loss": 0.6795, "grad_norm": 0.9866999983787537, "learning_rate": 0.0002, "epoch": 3.8477558348294436, "step": 53580}, {"loss": 0.7222, "grad_norm": 0.8459296226501465, "learning_rate": 0.0002, "epoch": 3.8484739676840216, "step": 53590}, {"loss": 0.6748, "grad_norm": 0.9804834723472595, "learning_rate": 0.0002, "epoch": 3.8491921005385996, "step": 53600}, {"loss": 0.6115, "grad_norm": 0.951074481010437, "learning_rate": 0.0002, "epoch": 3.8499102333931776, "step": 53610}, {"loss": 0.5914, "grad_norm": 0.8020104169845581, "learning_rate": 0.0002, "epoch": 3.850628366247756, "step": 53620}, {"loss": 0.6237, "grad_norm": 0.9296963214874268, "learning_rate": 0.0002, "epoch": 3.851346499102334, "step": 53630}, {"loss": 0.6384, "grad_norm": 0.8983652591705322, "learning_rate": 0.0002, "epoch": 3.852064631956912, "step": 53640}, {"loss": 0.6855, "grad_norm": 1.031858205795288, "learning_rate": 0.0002, "epoch": 3.85278276481149, "step": 53650}, {"loss": 0.622, "grad_norm": 0.8943952918052673, "learning_rate": 0.0002, "epoch": 3.853500897666068, "step": 53660}, {"loss": 0.6745, "grad_norm": 1.0072312355041504, "learning_rate": 0.0002, "epoch": 3.854219030520646, "step": 53670}, {"loss": 0.677, "grad_norm": 1.0604884624481201, "learning_rate": 0.0002, "epoch": 3.8549371633752245, "step": 53680}, {"loss": 0.5873, "grad_norm": 0.834223210811615, "learning_rate": 0.0002, "epoch": 3.8556552962298025, "step": 53690}, {"loss": 0.665, "grad_norm": 0.9872867465019226, "learning_rate": 0.0002, "epoch": 3.8563734290843805, "step": 53700}, {"loss": 0.6689, "grad_norm": 0.7999459505081177, "learning_rate": 0.0002, "epoch": 3.857091561938959, "step": 53710}, {"loss": 0.6744, "grad_norm": 0.717722475528717, "learning_rate": 0.0002, "epoch": 3.857809694793537, "step": 53720}, {"loss": 0.6348, "grad_norm": 1.0675442218780518, "learning_rate": 0.0002, "epoch": 3.858527827648115, "step": 53730}, {"loss": 0.6141, "grad_norm": 0.9789777398109436, "learning_rate": 0.0002, "epoch": 3.859245960502693, "step": 53740}, {"loss": 0.6455, "grad_norm": 0.9318669438362122, "learning_rate": 0.0002, "epoch": 3.859964093357271, "step": 53750}, {"loss": 0.6587, "grad_norm": 0.9848631024360657, "learning_rate": 0.0002, "epoch": 3.860682226211849, "step": 53760}, {"loss": 0.6202, "grad_norm": 0.8754391670227051, "learning_rate": 0.0002, "epoch": 3.8614003590664274, "step": 53770}, {"loss": 0.6411, "grad_norm": 0.9024585485458374, "learning_rate": 0.0002, "epoch": 3.8621184919210054, "step": 53780}, {"loss": 0.6643, "grad_norm": 0.8974794745445251, "learning_rate": 0.0002, "epoch": 3.8628366247755834, "step": 53790}, {"loss": 0.6729, "grad_norm": 0.8342790603637695, "learning_rate": 0.0002, "epoch": 3.8635547576301614, "step": 53800}, {"loss": 0.6322, "grad_norm": 0.8177682757377625, "learning_rate": 0.0002, "epoch": 3.86427289048474, "step": 53810}, {"loss": 0.6525, "grad_norm": 1.0259089469909668, "learning_rate": 0.0002, "epoch": 3.864991023339318, "step": 53820}, {"loss": 0.6508, "grad_norm": 1.042290210723877, "learning_rate": 0.0002, "epoch": 3.865709156193896, "step": 53830}, {"loss": 0.6963, "grad_norm": 0.7316540479660034, "learning_rate": 0.0002, "epoch": 3.866427289048474, "step": 53840}, {"loss": 0.6491, "grad_norm": 0.9384970664978027, "learning_rate": 0.0002, "epoch": 3.867145421903052, "step": 53850}, {"loss": 0.6689, "grad_norm": 0.9273143410682678, "learning_rate": 0.0002, "epoch": 3.86786355475763, "step": 53860}, {"loss": 0.6443, "grad_norm": 1.1183570623397827, "learning_rate": 0.0002, "epoch": 3.8685816876122083, "step": 53870}, {"loss": 0.6712, "grad_norm": 0.9455275535583496, "learning_rate": 0.0002, "epoch": 3.8692998204667863, "step": 53880}, {"loss": 0.6662, "grad_norm": 0.8702114820480347, "learning_rate": 0.0002, "epoch": 3.8700179533213643, "step": 53890}, {"loss": 0.7032, "grad_norm": 0.8751053214073181, "learning_rate": 0.0002, "epoch": 3.870736086175943, "step": 53900}, {"loss": 0.6398, "grad_norm": 0.9793110489845276, "learning_rate": 0.0002, "epoch": 3.871454219030521, "step": 53910}, {"loss": 0.6577, "grad_norm": 0.9705014824867249, "learning_rate": 0.0002, "epoch": 3.872172351885099, "step": 53920}, {"loss": 0.751, "grad_norm": 1.051504373550415, "learning_rate": 0.0002, "epoch": 3.872890484739677, "step": 53930}, {"loss": 0.6606, "grad_norm": 0.8590622544288635, "learning_rate": 0.0002, "epoch": 3.873608617594255, "step": 53940}, {"loss": 0.6495, "grad_norm": 0.7828099727630615, "learning_rate": 0.0002, "epoch": 3.874326750448833, "step": 53950}, {"loss": 0.6294, "grad_norm": 0.86341792345047, "learning_rate": 0.0002, "epoch": 3.8750448833034112, "step": 53960}, {"loss": 0.6677, "grad_norm": 1.114670991897583, "learning_rate": 0.0002, "epoch": 3.8757630161579892, "step": 53970}, {"loss": 0.6533, "grad_norm": 0.8559519052505493, "learning_rate": 0.0002, "epoch": 3.8764811490125672, "step": 53980}, {"loss": 0.6517, "grad_norm": 1.0518953800201416, "learning_rate": 0.0002, "epoch": 3.8771992818671457, "step": 53990}, {"loss": 0.6359, "grad_norm": 0.7157500982284546, "learning_rate": 0.0002, "epoch": 3.8779174147217237, "step": 54000}, {"loss": 0.6847, "grad_norm": 0.8390372395515442, "learning_rate": 0.0002, "epoch": 3.8786355475763017, "step": 54010}, {"loss": 0.6376, "grad_norm": 0.8486756086349487, "learning_rate": 0.0002, "epoch": 3.8793536804308797, "step": 54020}, {"loss": 0.6184, "grad_norm": 0.8361587524414062, "learning_rate": 0.0002, "epoch": 3.8800718132854577, "step": 54030}, {"loss": 0.6552, "grad_norm": 0.9490554928779602, "learning_rate": 0.0002, "epoch": 3.8807899461400357, "step": 54040}, {"loss": 0.6653, "grad_norm": 1.0311323404312134, "learning_rate": 0.0002, "epoch": 3.881508078994614, "step": 54050}, {"loss": 0.6484, "grad_norm": 0.84800124168396, "learning_rate": 0.0002, "epoch": 3.882226211849192, "step": 54060}, {"loss": 0.6995, "grad_norm": 0.8940879702568054, "learning_rate": 0.0002, "epoch": 3.88294434470377, "step": 54070}, {"loss": 0.6157, "grad_norm": 0.985542356967926, "learning_rate": 0.0002, "epoch": 3.883662477558348, "step": 54080}, {"loss": 0.6221, "grad_norm": 0.8846475481987, "learning_rate": 0.0002, "epoch": 3.8843806104129266, "step": 54090}, {"loss": 0.6656, "grad_norm": 0.9186338186264038, "learning_rate": 0.0002, "epoch": 3.8850987432675046, "step": 54100}, {"loss": 0.6367, "grad_norm": 1.106598973274231, "learning_rate": 0.0002, "epoch": 3.8858168761220826, "step": 54110}, {"loss": 0.6311, "grad_norm": 0.8167300224304199, "learning_rate": 0.0002, "epoch": 3.8865350089766606, "step": 54120}, {"loss": 0.694, "grad_norm": 0.9153622984886169, "learning_rate": 0.0002, "epoch": 3.8872531418312386, "step": 54130}, {"loss": 0.6669, "grad_norm": 0.8464475274085999, "learning_rate": 0.0002, "epoch": 3.8879712746858166, "step": 54140}, {"loss": 0.6658, "grad_norm": 0.8889452815055847, "learning_rate": 0.0002, "epoch": 3.888689407540395, "step": 54150}, {"loss": 0.6291, "grad_norm": 0.7861065864562988, "learning_rate": 0.0002, "epoch": 3.889407540394973, "step": 54160}, {"loss": 0.6315, "grad_norm": 0.882674515247345, "learning_rate": 0.0002, "epoch": 3.890125673249551, "step": 54170}, {"loss": 0.6223, "grad_norm": 0.8503835201263428, "learning_rate": 0.0002, "epoch": 3.8908438061041295, "step": 54180}, {"loss": 0.6176, "grad_norm": 0.888455331325531, "learning_rate": 0.0002, "epoch": 3.8915619389587075, "step": 54190}, {"loss": 0.6985, "grad_norm": 1.0473699569702148, "learning_rate": 0.0002, "epoch": 3.8922800718132855, "step": 54200}, {"loss": 0.6513, "grad_norm": 0.9548208713531494, "learning_rate": 0.0002, "epoch": 3.8929982046678635, "step": 54210}, {"loss": 0.6089, "grad_norm": 0.9158754944801331, "learning_rate": 0.0002, "epoch": 3.8937163375224415, "step": 54220}, {"loss": 0.6352, "grad_norm": 0.9001154899597168, "learning_rate": 0.0002, "epoch": 3.8944344703770195, "step": 54230}, {"loss": 0.6657, "grad_norm": 0.9736626148223877, "learning_rate": 0.0002, "epoch": 3.895152603231598, "step": 54240}, {"loss": 0.7248, "grad_norm": 0.8809846043586731, "learning_rate": 0.0002, "epoch": 3.895870736086176, "step": 54250}, {"loss": 0.6364, "grad_norm": 0.887583315372467, "learning_rate": 0.0002, "epoch": 3.896588868940754, "step": 54260}, {"loss": 0.6252, "grad_norm": 0.8395712971687317, "learning_rate": 0.0002, "epoch": 3.8973070017953324, "step": 54270}, {"loss": 0.681, "grad_norm": 0.8391315937042236, "learning_rate": 0.0002, "epoch": 3.8980251346499104, "step": 54280}, {"loss": 0.6352, "grad_norm": 0.8210049271583557, "learning_rate": 0.0002, "epoch": 3.8987432675044884, "step": 54290}, {"loss": 0.6484, "grad_norm": 1.1364530324935913, "learning_rate": 0.0002, "epoch": 3.8994614003590664, "step": 54300}, {"loss": 0.6383, "grad_norm": 0.7712056636810303, "learning_rate": 0.0002, "epoch": 3.9001795332136444, "step": 54310}, {"loss": 0.6516, "grad_norm": 0.9466049671173096, "learning_rate": 0.0002, "epoch": 3.9008976660682224, "step": 54320}, {"loss": 0.6938, "grad_norm": 1.0367140769958496, "learning_rate": 0.0002, "epoch": 3.901615798922801, "step": 54330}, {"loss": 0.672, "grad_norm": 1.0168321132659912, "learning_rate": 0.0002, "epoch": 3.902333931777379, "step": 54340}, {"loss": 0.6306, "grad_norm": 0.7830407619476318, "learning_rate": 0.0002, "epoch": 3.903052064631957, "step": 54350}, {"loss": 0.7198, "grad_norm": 0.9649789333343506, "learning_rate": 0.0002, "epoch": 3.903770197486535, "step": 54360}, {"loss": 0.6644, "grad_norm": 0.681077778339386, "learning_rate": 0.0002, "epoch": 3.9044883303411133, "step": 54370}, {"loss": 0.6677, "grad_norm": 0.8970136046409607, "learning_rate": 0.0002, "epoch": 3.9052064631956913, "step": 54380}, {"loss": 0.6581, "grad_norm": 0.9155173301696777, "learning_rate": 0.0002, "epoch": 3.9059245960502693, "step": 54390}, {"loss": 0.6711, "grad_norm": 1.0447794198989868, "learning_rate": 0.0002, "epoch": 3.9066427289048473, "step": 54400}, {"loss": 0.6883, "grad_norm": 0.7823813557624817, "learning_rate": 0.0002, "epoch": 3.9073608617594253, "step": 54410}, {"loss": 0.6688, "grad_norm": 0.9289445877075195, "learning_rate": 0.0002, "epoch": 3.9080789946140033, "step": 54420}, {"loss": 0.7024, "grad_norm": 0.9983111619949341, "learning_rate": 0.0002, "epoch": 3.9087971274685818, "step": 54430}, {"loss": 0.6687, "grad_norm": 0.7952495813369751, "learning_rate": 0.0002, "epoch": 3.9095152603231598, "step": 54440}, {"loss": 0.6118, "grad_norm": 0.8045601844787598, "learning_rate": 0.0002, "epoch": 3.9102333931777378, "step": 54450}, {"loss": 0.6388, "grad_norm": 0.936585009098053, "learning_rate": 0.0002, "epoch": 3.910951526032316, "step": 54460}, {"loss": 0.6217, "grad_norm": 0.745793879032135, "learning_rate": 0.0002, "epoch": 3.911669658886894, "step": 54470}, {"loss": 0.6814, "grad_norm": 0.9137616157531738, "learning_rate": 0.0002, "epoch": 3.912387791741472, "step": 54480}, {"loss": 0.6792, "grad_norm": 0.826316237449646, "learning_rate": 0.0002, "epoch": 3.9131059245960502, "step": 54490}, {"loss": 0.6914, "grad_norm": 0.94313645362854, "learning_rate": 0.0002, "epoch": 3.9138240574506282, "step": 54500}, {"loss": 0.62, "grad_norm": 1.045893907546997, "learning_rate": 0.0002, "epoch": 3.9145421903052062, "step": 54510}, {"loss": 0.5841, "grad_norm": 0.9122704863548279, "learning_rate": 0.0002, "epoch": 3.9152603231597847, "step": 54520}, {"loss": 0.7029, "grad_norm": 1.0999689102172852, "learning_rate": 0.0002, "epoch": 3.9159784560143627, "step": 54530}, {"loss": 0.6387, "grad_norm": 0.9281555414199829, "learning_rate": 0.0002, "epoch": 3.9166965888689407, "step": 54540}, {"loss": 0.6227, "grad_norm": 1.1439622640609741, "learning_rate": 0.0002, "epoch": 3.917414721723519, "step": 54550}, {"loss": 0.6733, "grad_norm": 0.9375617504119873, "learning_rate": 0.0002, "epoch": 3.918132854578097, "step": 54560}, {"loss": 0.6503, "grad_norm": 0.92906653881073, "learning_rate": 0.0002, "epoch": 3.918850987432675, "step": 54570}, {"loss": 0.6361, "grad_norm": 1.0840893983840942, "learning_rate": 0.0002, "epoch": 3.919569120287253, "step": 54580}, {"loss": 0.6476, "grad_norm": 0.8145509362220764, "learning_rate": 0.0002, "epoch": 3.920287253141831, "step": 54590}, {"loss": 0.6826, "grad_norm": 0.973737895488739, "learning_rate": 0.0002, "epoch": 3.921005385996409, "step": 54600}, {"loss": 0.6822, "grad_norm": 0.9302353858947754, "learning_rate": 0.0002, "epoch": 3.9217235188509876, "step": 54610}, {"loss": 0.6522, "grad_norm": 0.9167897701263428, "learning_rate": 0.0002, "epoch": 3.9224416517055656, "step": 54620}, {"loss": 0.6783, "grad_norm": 0.8096851706504822, "learning_rate": 0.0002, "epoch": 3.9231597845601436, "step": 54630}, {"loss": 0.6369, "grad_norm": 0.8006368279457092, "learning_rate": 0.0002, "epoch": 3.9238779174147216, "step": 54640}, {"loss": 0.6533, "grad_norm": 0.7800863981246948, "learning_rate": 0.0002, "epoch": 3.9245960502693, "step": 54650}, {"loss": 0.6518, "grad_norm": 1.0331560373306274, "learning_rate": 0.0002, "epoch": 3.925314183123878, "step": 54660}, {"loss": 0.6764, "grad_norm": 1.0057517290115356, "learning_rate": 0.0002, "epoch": 3.926032315978456, "step": 54670}, {"loss": 0.6636, "grad_norm": 0.8920564651489258, "learning_rate": 0.0002, "epoch": 3.926750448833034, "step": 54680}, {"loss": 0.6432, "grad_norm": 0.7704599499702454, "learning_rate": 0.0002, "epoch": 3.927468581687612, "step": 54690}, {"loss": 0.6532, "grad_norm": 0.827032208442688, "learning_rate": 0.0002, "epoch": 3.92818671454219, "step": 54700}, {"loss": 0.7083, "grad_norm": 1.0019268989562988, "learning_rate": 0.0002, "epoch": 3.9289048473967685, "step": 54710}, {"loss": 0.6026, "grad_norm": 0.862033486366272, "learning_rate": 0.0002, "epoch": 3.9296229802513465, "step": 54720}, {"loss": 0.599, "grad_norm": 0.8965592980384827, "learning_rate": 0.0002, "epoch": 3.9303411131059245, "step": 54730}, {"loss": 0.6739, "grad_norm": 0.7689077854156494, "learning_rate": 0.0002, "epoch": 3.931059245960503, "step": 54740}, {"loss": 0.6401, "grad_norm": 0.846276581287384, "learning_rate": 0.0002, "epoch": 3.931777378815081, "step": 54750}, {"loss": 0.6942, "grad_norm": 0.8932713866233826, "learning_rate": 0.0002, "epoch": 3.932495511669659, "step": 54760}, {"loss": 0.6697, "grad_norm": 0.9711386561393738, "learning_rate": 0.0002, "epoch": 3.933213644524237, "step": 54770}, {"loss": 0.6672, "grad_norm": 0.9290250539779663, "learning_rate": 0.0002, "epoch": 3.933931777378815, "step": 54780}, {"loss": 0.6365, "grad_norm": 1.0897367000579834, "learning_rate": 0.0002, "epoch": 3.934649910233393, "step": 54790}, {"loss": 0.6647, "grad_norm": 0.8451842665672302, "learning_rate": 0.0002, "epoch": 3.9353680430879714, "step": 54800}, {"loss": 0.6705, "grad_norm": 0.8400090336799622, "learning_rate": 0.0002, "epoch": 3.9360861759425494, "step": 54810}, {"loss": 0.6577, "grad_norm": 0.951383650302887, "learning_rate": 0.0002, "epoch": 3.9368043087971274, "step": 54820}, {"loss": 0.654, "grad_norm": 0.848838210105896, "learning_rate": 0.0002, "epoch": 3.937522441651706, "step": 54830}, {"loss": 0.6852, "grad_norm": 0.735763669013977, "learning_rate": 0.0002, "epoch": 3.938240574506284, "step": 54840}, {"loss": 0.6574, "grad_norm": 0.979037344455719, "learning_rate": 0.0002, "epoch": 3.938958707360862, "step": 54850}, {"loss": 0.5851, "grad_norm": 0.933674693107605, "learning_rate": 0.0002, "epoch": 3.93967684021544, "step": 54860}, {"loss": 0.6931, "grad_norm": 0.835593044757843, "learning_rate": 0.0002, "epoch": 3.940394973070018, "step": 54870}, {"loss": 0.6967, "grad_norm": 1.0034281015396118, "learning_rate": 0.0002, "epoch": 3.941113105924596, "step": 54880}, {"loss": 0.6442, "grad_norm": 0.9732975959777832, "learning_rate": 0.0002, "epoch": 3.9418312387791743, "step": 54890}, {"loss": 0.6657, "grad_norm": 0.9666336178779602, "learning_rate": 0.0002, "epoch": 3.9425493716337523, "step": 54900}, {"loss": 0.6521, "grad_norm": 0.755310595035553, "learning_rate": 0.0002, "epoch": 3.9432675044883303, "step": 54910}, {"loss": 0.6562, "grad_norm": 0.8732092976570129, "learning_rate": 0.0002, "epoch": 3.9439856373429083, "step": 54920}, {"loss": 0.6486, "grad_norm": 1.139453649520874, "learning_rate": 0.0002, "epoch": 3.9447037701974867, "step": 54930}, {"loss": 0.6609, "grad_norm": 0.9044837951660156, "learning_rate": 0.0002, "epoch": 3.9454219030520647, "step": 54940}, {"loss": 0.6344, "grad_norm": 1.0496679544448853, "learning_rate": 0.0002, "epoch": 3.9461400359066428, "step": 54950}, {"loss": 0.6471, "grad_norm": 1.0099035501480103, "learning_rate": 0.0002, "epoch": 3.9468581687612208, "step": 54960}, {"loss": 0.6143, "grad_norm": 1.0694963932037354, "learning_rate": 0.0002, "epoch": 3.9475763016157988, "step": 54970}, {"loss": 0.6209, "grad_norm": 1.0012997388839722, "learning_rate": 0.0002, "epoch": 3.9482944344703768, "step": 54980}, {"loss": 0.7379, "grad_norm": 0.8910513520240784, "learning_rate": 0.0002, "epoch": 3.949012567324955, "step": 54990}, {"loss": 0.7184, "grad_norm": 1.0267579555511475, "learning_rate": 0.0002, "epoch": 3.949730700179533, "step": 55000}, {"loss": 0.6844, "grad_norm": 0.9786432385444641, "learning_rate": 0.0002, "epoch": 3.950448833034111, "step": 55010}, {"loss": 0.6499, "grad_norm": 0.8703538775444031, "learning_rate": 0.0002, "epoch": 3.9511669658886897, "step": 55020}, {"loss": 0.5989, "grad_norm": 0.8970484137535095, "learning_rate": 0.0002, "epoch": 3.9518850987432677, "step": 55030}, {"loss": 0.659, "grad_norm": 0.8781577944755554, "learning_rate": 0.0002, "epoch": 3.9526032315978457, "step": 55040}, {"loss": 0.6944, "grad_norm": 0.8040280938148499, "learning_rate": 0.0002, "epoch": 3.9533213644524237, "step": 55050}, {"loss": 0.6359, "grad_norm": 0.851926326751709, "learning_rate": 0.0002, "epoch": 3.9540394973070017, "step": 55060}, {"loss": 0.6806, "grad_norm": 0.8597240447998047, "learning_rate": 0.0002, "epoch": 3.9547576301615797, "step": 55070}, {"loss": 0.6499, "grad_norm": 0.9461944699287415, "learning_rate": 0.0002, "epoch": 3.955475763016158, "step": 55080}, {"loss": 0.6222, "grad_norm": 0.7576611042022705, "learning_rate": 0.0002, "epoch": 3.956193895870736, "step": 55090}, {"loss": 0.6735, "grad_norm": 0.9484710693359375, "learning_rate": 0.0002, "epoch": 3.956912028725314, "step": 55100}, {"loss": 0.6586, "grad_norm": 0.9487117528915405, "learning_rate": 0.0002, "epoch": 3.957630161579892, "step": 55110}, {"loss": 0.6632, "grad_norm": 0.870090663433075, "learning_rate": 0.0002, "epoch": 3.9583482944344706, "step": 55120}, {"loss": 0.6786, "grad_norm": 0.8496458530426025, "learning_rate": 0.0002, "epoch": 3.9590664272890486, "step": 55130}, {"loss": 0.6631, "grad_norm": 1.0121779441833496, "learning_rate": 0.0002, "epoch": 3.9597845601436266, "step": 55140}, {"loss": 0.7005, "grad_norm": 0.8912323713302612, "learning_rate": 0.0002, "epoch": 3.9605026929982046, "step": 55150}, {"loss": 0.6398, "grad_norm": 0.8398444652557373, "learning_rate": 0.0002, "epoch": 3.9612208258527826, "step": 55160}, {"loss": 0.6183, "grad_norm": 0.8046348690986633, "learning_rate": 0.0002, "epoch": 3.961938958707361, "step": 55170}, {"loss": 0.6357, "grad_norm": 1.0369254350662231, "learning_rate": 0.0002, "epoch": 3.962657091561939, "step": 55180}, {"loss": 0.6053, "grad_norm": 1.172431230545044, "learning_rate": 0.0002, "epoch": 3.963375224416517, "step": 55190}, {"loss": 0.643, "grad_norm": 0.8093554377555847, "learning_rate": 0.0002, "epoch": 3.964093357271095, "step": 55200}, {"loss": 0.6416, "grad_norm": 0.8851078748703003, "learning_rate": 0.0002, "epoch": 3.9648114901256735, "step": 55210}, {"loss": 0.6516, "grad_norm": 0.7494266033172607, "learning_rate": 0.0002, "epoch": 3.9655296229802515, "step": 55220}, {"loss": 0.629, "grad_norm": 0.9556898474693298, "learning_rate": 0.0002, "epoch": 3.9662477558348295, "step": 55230}, {"loss": 0.6481, "grad_norm": 1.016017198562622, "learning_rate": 0.0002, "epoch": 3.9669658886894075, "step": 55240}, {"loss": 0.7185, "grad_norm": 0.8425998091697693, "learning_rate": 0.0002, "epoch": 3.9676840215439855, "step": 55250}, {"loss": 0.6609, "grad_norm": 0.717673122882843, "learning_rate": 0.0002, "epoch": 3.9684021543985635, "step": 55260}, {"loss": 0.6453, "grad_norm": 0.8366572856903076, "learning_rate": 0.0002, "epoch": 3.969120287253142, "step": 55270}, {"loss": 0.6841, "grad_norm": 0.8981583118438721, "learning_rate": 0.0002, "epoch": 3.96983842010772, "step": 55280}, {"loss": 0.6351, "grad_norm": 0.8868781328201294, "learning_rate": 0.0002, "epoch": 3.970556552962298, "step": 55290}, {"loss": 0.6755, "grad_norm": 1.0632785558700562, "learning_rate": 0.0002, "epoch": 3.9712746858168764, "step": 55300}, {"loss": 0.6433, "grad_norm": 0.8813109993934631, "learning_rate": 0.0002, "epoch": 3.9719928186714544, "step": 55310}, {"loss": 0.5699, "grad_norm": 0.8225542306900024, "learning_rate": 0.0002, "epoch": 3.9727109515260324, "step": 55320}, {"loss": 0.6591, "grad_norm": 1.1391420364379883, "learning_rate": 0.0002, "epoch": 3.9734290843806104, "step": 55330}, {"loss": 0.6551, "grad_norm": 1.0371832847595215, "learning_rate": 0.0002, "epoch": 3.9741472172351884, "step": 55340}, {"loss": 0.7538, "grad_norm": 1.0542186498641968, "learning_rate": 0.0002, "epoch": 3.9748653500897664, "step": 55350}, {"loss": 0.6799, "grad_norm": 1.0178009271621704, "learning_rate": 0.0002, "epoch": 3.975583482944345, "step": 55360}, {"loss": 0.6394, "grad_norm": 0.7927802205085754, "learning_rate": 0.0002, "epoch": 3.976301615798923, "step": 55370}, {"loss": 0.6632, "grad_norm": 0.9350495934486389, "learning_rate": 0.0002, "epoch": 3.977019748653501, "step": 55380}, {"loss": 0.6889, "grad_norm": 1.0240116119384766, "learning_rate": 0.0002, "epoch": 3.977737881508079, "step": 55390}, {"loss": 0.6756, "grad_norm": 1.0279067754745483, "learning_rate": 0.0002, "epoch": 3.9784560143626573, "step": 55400}, {"loss": 0.6979, "grad_norm": 1.1228227615356445, "learning_rate": 0.0002, "epoch": 3.9791741472172353, "step": 55410}, {"loss": 0.6595, "grad_norm": 0.9500134587287903, "learning_rate": 0.0002, "epoch": 3.9798922800718133, "step": 55420}, {"loss": 0.6875, "grad_norm": 0.9229732155799866, "learning_rate": 0.0002, "epoch": 3.9806104129263913, "step": 55430}, {"loss": 0.6742, "grad_norm": 0.7946729063987732, "learning_rate": 0.0002, "epoch": 3.9813285457809693, "step": 55440}, {"loss": 0.6643, "grad_norm": 0.9987489581108093, "learning_rate": 0.0002, "epoch": 3.9820466786355477, "step": 55450}, {"loss": 0.6642, "grad_norm": 0.9670467972755432, "learning_rate": 0.0002, "epoch": 3.9827648114901257, "step": 55460}, {"loss": 0.6603, "grad_norm": 0.835028350353241, "learning_rate": 0.0002, "epoch": 3.9834829443447037, "step": 55470}, {"loss": 0.6198, "grad_norm": 0.8678702712059021, "learning_rate": 0.0002, "epoch": 3.9842010771992817, "step": 55480}, {"loss": 0.6581, "grad_norm": 0.8581197261810303, "learning_rate": 0.0002, "epoch": 3.98491921005386, "step": 55490}, {"loss": 0.614, "grad_norm": 0.779848039150238, "learning_rate": 0.0002, "epoch": 3.985637342908438, "step": 55500}, {"loss": 0.634, "grad_norm": 0.8827589154243469, "learning_rate": 0.0002, "epoch": 3.986355475763016, "step": 55510}, {"loss": 0.624, "grad_norm": 1.0108301639556885, "learning_rate": 0.0002, "epoch": 3.987073608617594, "step": 55520}, {"loss": 0.6553, "grad_norm": 0.8506004214286804, "learning_rate": 0.0002, "epoch": 3.987791741472172, "step": 55530}, {"loss": 0.6229, "grad_norm": 1.0297727584838867, "learning_rate": 0.0002, "epoch": 3.98850987432675, "step": 55540}, {"loss": 0.6551, "grad_norm": 0.8579224944114685, "learning_rate": 0.0002, "epoch": 3.9892280071813286, "step": 55550}, {"loss": 0.6491, "grad_norm": 0.8503788113594055, "learning_rate": 0.0002, "epoch": 3.9899461400359066, "step": 55560}, {"loss": 0.6941, "grad_norm": 1.1144801378250122, "learning_rate": 0.0002, "epoch": 3.9906642728904846, "step": 55570}, {"loss": 0.6956, "grad_norm": 0.8418305516242981, "learning_rate": 0.0002, "epoch": 3.991382405745063, "step": 55580}, {"loss": 0.6226, "grad_norm": 1.0065871477127075, "learning_rate": 0.0002, "epoch": 3.992100538599641, "step": 55590}, {"loss": 0.6775, "grad_norm": 0.8160259127616882, "learning_rate": 0.0002, "epoch": 3.992818671454219, "step": 55600}, {"loss": 0.624, "grad_norm": 0.8678009510040283, "learning_rate": 0.0002, "epoch": 3.993536804308797, "step": 55610}, {"loss": 0.6552, "grad_norm": 0.863465428352356, "learning_rate": 0.0002, "epoch": 3.994254937163375, "step": 55620}, {"loss": 0.6764, "grad_norm": 0.9242135286331177, "learning_rate": 0.0002, "epoch": 3.994973070017953, "step": 55630}, {"loss": 0.6774, "grad_norm": 1.0285470485687256, "learning_rate": 0.0002, "epoch": 3.9956912028725315, "step": 55640}, {"loss": 0.6882, "grad_norm": 0.8953320384025574, "learning_rate": 0.0002, "epoch": 3.9964093357271095, "step": 55650}, {"loss": 0.6935, "grad_norm": 0.915892481803894, "learning_rate": 0.0002, "epoch": 3.9971274685816875, "step": 55660}, {"loss": 0.641, "grad_norm": 0.8235118985176086, "learning_rate": 0.0002, "epoch": 3.9978456014362656, "step": 55670}, {"loss": 0.6417, "grad_norm": 1.0178656578063965, "learning_rate": 0.0002, "epoch": 3.998563734290844, "step": 55680}, {"loss": 0.6635, "grad_norm": 0.9926803708076477, "learning_rate": 0.0002, "epoch": 3.999281867145422, "step": 55690}, {"loss": 0.6476, "grad_norm": 0.9213629961013794, "learning_rate": 0.0002, "epoch": 4.0, "step": 55700}, {"eval_loss": 1.1152480840682983, "eval_runtime": 55.2237, "eval_samples_per_second": 13.273, "eval_steps_per_second": 1.666, "epoch": 4.0, "step": 55700}, {"loss": 0.6085, "grad_norm": 1.0820496082305908, "learning_rate": 0.0002, "epoch": 4.000718132854578, "step": 55710}, {"loss": 0.5506, "grad_norm": 0.9036441445350647, "learning_rate": 0.0002, "epoch": 4.001436265709156, "step": 55720}, {"loss": 0.5924, "grad_norm": 1.102754831314087, "learning_rate": 0.0002, "epoch": 4.002154398563734, "step": 55730}, {"loss": 0.6192, "grad_norm": 0.98259437084198, "learning_rate": 0.0002, "epoch": 4.002872531418312, "step": 55740}, {"loss": 0.567, "grad_norm": 1.1935845613479614, "learning_rate": 0.0002, "epoch": 4.003590664272891, "step": 55750}, {"loss": 0.6205, "grad_norm": 0.9925830960273743, "learning_rate": 0.0002, "epoch": 4.004308797127469, "step": 55760}, {"loss": 0.5545, "grad_norm": 1.075087070465088, "learning_rate": 0.0002, "epoch": 4.005026929982047, "step": 55770}, {"loss": 0.5591, "grad_norm": 0.8746396899223328, "learning_rate": 0.0002, "epoch": 4.005745062836625, "step": 55780}, {"loss": 0.5745, "grad_norm": 0.7635995745658875, "learning_rate": 0.0002, "epoch": 4.006463195691203, "step": 55790}, {"loss": 0.599, "grad_norm": 0.9064885377883911, "learning_rate": 0.0002, "epoch": 4.007181328545781, "step": 55800}, {"loss": 0.5668, "grad_norm": 1.018478274345398, "learning_rate": 0.0002, "epoch": 4.007899461400359, "step": 55810}, {"loss": 0.5573, "grad_norm": 0.9797589778900146, "learning_rate": 0.0002, "epoch": 4.008617594254937, "step": 55820}, {"loss": 0.5784, "grad_norm": 0.7867457866668701, "learning_rate": 0.0002, "epoch": 4.009335727109515, "step": 55830}, {"loss": 0.5607, "grad_norm": 0.9998070597648621, "learning_rate": 0.0002, "epoch": 4.010053859964093, "step": 55840}, {"loss": 0.5655, "grad_norm": 0.8656311631202698, "learning_rate": 0.0002, "epoch": 4.010771992818672, "step": 55850}, {"loss": 0.533, "grad_norm": 0.945469081401825, "learning_rate": 0.0002, "epoch": 4.01149012567325, "step": 55860}, {"loss": 0.625, "grad_norm": 0.8809926509857178, "learning_rate": 0.0002, "epoch": 4.012208258527828, "step": 55870}, {"loss": 0.5795, "grad_norm": 0.8047897219657898, "learning_rate": 0.0002, "epoch": 4.012926391382406, "step": 55880}, {"loss": 0.5322, "grad_norm": 1.0563900470733643, "learning_rate": 0.0002, "epoch": 4.013644524236984, "step": 55890}, {"loss": 0.5597, "grad_norm": 0.8578300476074219, "learning_rate": 0.0002, "epoch": 4.014362657091562, "step": 55900}, {"loss": 0.5634, "grad_norm": 1.0304765701293945, "learning_rate": 0.0002, "epoch": 4.01508078994614, "step": 55910}, {"loss": 0.558, "grad_norm": 0.8087666034698486, "learning_rate": 0.0002, "epoch": 4.015798922800718, "step": 55920}, {"loss": 0.5557, "grad_norm": 1.0192348957061768, "learning_rate": 0.0002, "epoch": 4.016517055655296, "step": 55930}, {"loss": 0.6269, "grad_norm": 1.061194658279419, "learning_rate": 0.0002, "epoch": 4.017235188509875, "step": 55940}, {"loss": 0.5812, "grad_norm": 0.93668133020401, "learning_rate": 0.0002, "epoch": 4.017953321364453, "step": 55950}, {"loss": 0.6104, "grad_norm": 1.1569286584854126, "learning_rate": 0.0002, "epoch": 4.018671454219031, "step": 55960}, {"loss": 0.5832, "grad_norm": 0.9853817224502563, "learning_rate": 0.0002, "epoch": 4.019389587073609, "step": 55970}, {"loss": 0.6154, "grad_norm": 0.851109504699707, "learning_rate": 0.0002, "epoch": 4.020107719928187, "step": 55980}, {"loss": 0.5993, "grad_norm": 1.053525447845459, "learning_rate": 0.0002, "epoch": 4.020825852782765, "step": 55990}, {"loss": 0.571, "grad_norm": 0.8307225704193115, "learning_rate": 0.0002, "epoch": 4.021543985637343, "step": 56000}, {"loss": 0.5419, "grad_norm": 1.2741150856018066, "learning_rate": 0.0002, "epoch": 4.022262118491921, "step": 56010}, {"loss": 0.6001, "grad_norm": 0.9708344340324402, "learning_rate": 0.0002, "epoch": 4.022980251346499, "step": 56020}, {"loss": 0.5989, "grad_norm": 1.265034556388855, "learning_rate": 0.0002, "epoch": 4.023698384201078, "step": 56030}, {"loss": 0.5852, "grad_norm": 0.9364367723464966, "learning_rate": 0.0002, "epoch": 4.024416517055656, "step": 56040}, {"loss": 0.6108, "grad_norm": 0.8643592000007629, "learning_rate": 0.0002, "epoch": 4.025134649910234, "step": 56050}, {"loss": 0.6074, "grad_norm": 0.9742133021354675, "learning_rate": 0.0002, "epoch": 4.025852782764812, "step": 56060}, {"loss": 0.5699, "grad_norm": 1.1793473958969116, "learning_rate": 0.0002, "epoch": 4.02657091561939, "step": 56070}, {"loss": 0.5911, "grad_norm": 0.9641149044036865, "learning_rate": 0.0002, "epoch": 4.027289048473968, "step": 56080}, {"loss": 0.6083, "grad_norm": 0.9426136016845703, "learning_rate": 0.0002, "epoch": 4.028007181328546, "step": 56090}, {"loss": 0.5692, "grad_norm": 0.9211869835853577, "learning_rate": 0.0002, "epoch": 4.028725314183124, "step": 56100}, {"loss": 0.6109, "grad_norm": 1.1576565504074097, "learning_rate": 0.0002, "epoch": 4.029443447037702, "step": 56110}, {"loss": 0.5684, "grad_norm": 1.0014013051986694, "learning_rate": 0.0002, "epoch": 4.03016157989228, "step": 56120}, {"loss": 0.6017, "grad_norm": 0.9307010769844055, "learning_rate": 0.0002, "epoch": 4.0308797127468585, "step": 56130}, {"loss": 0.5582, "grad_norm": 0.8290148377418518, "learning_rate": 0.0002, "epoch": 4.0315978456014365, "step": 56140}, {"loss": 0.5921, "grad_norm": 1.0648446083068848, "learning_rate": 0.0002, "epoch": 4.0323159784560145, "step": 56150}, {"loss": 0.6116, "grad_norm": 1.1545547246932983, "learning_rate": 0.0002, "epoch": 4.0330341113105925, "step": 56160}, {"loss": 0.6301, "grad_norm": 0.9643545150756836, "learning_rate": 0.0002, "epoch": 4.0337522441651705, "step": 56170}, {"loss": 0.5655, "grad_norm": 0.8913900256156921, "learning_rate": 0.0002, "epoch": 4.0344703770197485, "step": 56180}, {"loss": 0.5897, "grad_norm": 0.9445754289627075, "learning_rate": 0.0002, "epoch": 4.0351885098743265, "step": 56190}, {"loss": 0.6204, "grad_norm": 0.9353124499320984, "learning_rate": 0.0002, "epoch": 4.0359066427289045, "step": 56200}, {"loss": 0.6017, "grad_norm": 1.1780431270599365, "learning_rate": 0.0002, "epoch": 4.0366247755834825, "step": 56210}, {"loss": 0.5767, "grad_norm": 0.9208880662918091, "learning_rate": 0.0002, "epoch": 4.037342908438061, "step": 56220}, {"loss": 0.5367, "grad_norm": 0.9475517272949219, "learning_rate": 0.0002, "epoch": 4.038061041292639, "step": 56230}, {"loss": 0.576, "grad_norm": 0.7478583455085754, "learning_rate": 0.0002, "epoch": 4.038779174147217, "step": 56240}, {"loss": 0.5616, "grad_norm": 1.0026403665542603, "learning_rate": 0.0002, "epoch": 4.039497307001795, "step": 56250}, {"loss": 0.6031, "grad_norm": 0.9664973020553589, "learning_rate": 0.0002, "epoch": 4.040215439856373, "step": 56260}, {"loss": 0.5764, "grad_norm": 1.0655616521835327, "learning_rate": 0.0002, "epoch": 4.040933572710951, "step": 56270}, {"loss": 0.5862, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.041651705565529, "step": 56280}, {"loss": 0.5828, "grad_norm": 0.7982191443443298, "learning_rate": 0.0002, "epoch": 4.042369838420107, "step": 56290}, {"loss": 0.5637, "grad_norm": 0.8304495215415955, "learning_rate": 0.0002, "epoch": 4.043087971274685, "step": 56300}, {"loss": 0.5974, "grad_norm": 0.95123291015625, "learning_rate": 0.0002, "epoch": 4.043806104129264, "step": 56310}, {"loss": 0.617, "grad_norm": 0.9504102468490601, "learning_rate": 0.0002, "epoch": 4.044524236983842, "step": 56320}, {"loss": 0.6143, "grad_norm": 0.7432710528373718, "learning_rate": 0.0002, "epoch": 4.04524236983842, "step": 56330}, {"loss": 0.6157, "grad_norm": 0.9327874183654785, "learning_rate": 0.0002, "epoch": 4.045960502692998, "step": 56340}, {"loss": 0.591, "grad_norm": 0.9161670804023743, "learning_rate": 0.0002, "epoch": 4.046678635547576, "step": 56350}, {"loss": 0.6111, "grad_norm": 0.9371771812438965, "learning_rate": 0.0002, "epoch": 4.047396768402154, "step": 56360}, {"loss": 0.6101, "grad_norm": 1.0332437753677368, "learning_rate": 0.0002, "epoch": 4.048114901256732, "step": 56370}, {"loss": 0.5451, "grad_norm": 0.7346320748329163, "learning_rate": 0.0002, "epoch": 4.04883303411131, "step": 56380}, {"loss": 0.6416, "grad_norm": 0.8247857689857483, "learning_rate": 0.0002, "epoch": 4.049551166965888, "step": 56390}, {"loss": 0.6208, "grad_norm": 0.925325334072113, "learning_rate": 0.0002, "epoch": 4.050269299820466, "step": 56400}, {"loss": 0.558, "grad_norm": 0.7344088554382324, "learning_rate": 0.0002, "epoch": 4.050987432675045, "step": 56410}, {"loss": 0.5978, "grad_norm": 0.9204918146133423, "learning_rate": 0.0002, "epoch": 4.051705565529623, "step": 56420}, {"loss": 0.5788, "grad_norm": 0.8273472785949707, "learning_rate": 0.0002, "epoch": 4.052423698384201, "step": 56430}, {"loss": 0.5551, "grad_norm": 0.9524998068809509, "learning_rate": 0.0002, "epoch": 4.053141831238779, "step": 56440}, {"loss": 0.5836, "grad_norm": 0.9168205857276917, "learning_rate": 0.0002, "epoch": 4.053859964093357, "step": 56450}, {"loss": 0.6035, "grad_norm": 0.9634994864463806, "learning_rate": 0.0002, "epoch": 4.054578096947935, "step": 56460}, {"loss": 0.5907, "grad_norm": 1.2027593851089478, "learning_rate": 0.0002, "epoch": 4.055296229802513, "step": 56470}, {"loss": 0.5691, "grad_norm": 1.2347805500030518, "learning_rate": 0.0002, "epoch": 4.056014362657091, "step": 56480}, {"loss": 0.5789, "grad_norm": 0.8621458411216736, "learning_rate": 0.0002, "epoch": 4.056732495511669, "step": 56490}, {"loss": 0.6082, "grad_norm": 0.9194608330726624, "learning_rate": 0.0002, "epoch": 4.057450628366248, "step": 56500}, {"loss": 0.5667, "grad_norm": 1.0153663158416748, "learning_rate": 0.0002, "epoch": 4.058168761220826, "step": 56510}, {"loss": 0.5908, "grad_norm": 0.9170986413955688, "learning_rate": 0.0002, "epoch": 4.058886894075404, "step": 56520}, {"loss": 0.5672, "grad_norm": 1.033057689666748, "learning_rate": 0.0002, "epoch": 4.059605026929982, "step": 56530}, {"loss": 0.5577, "grad_norm": 1.0125197172164917, "learning_rate": 0.0002, "epoch": 4.06032315978456, "step": 56540}, {"loss": 0.5821, "grad_norm": 0.9429898262023926, "learning_rate": 0.0002, "epoch": 4.061041292639138, "step": 56550}, {"loss": 0.5655, "grad_norm": 0.9242179989814758, "learning_rate": 0.0002, "epoch": 4.061759425493716, "step": 56560}, {"loss": 0.5568, "grad_norm": 0.9365091323852539, "learning_rate": 0.0002, "epoch": 4.062477558348294, "step": 56570}, {"loss": 0.6104, "grad_norm": 0.9148455858230591, "learning_rate": 0.0002, "epoch": 4.063195691202872, "step": 56580}, {"loss": 0.5891, "grad_norm": 0.8546709418296814, "learning_rate": 0.0002, "epoch": 4.063913824057451, "step": 56590}, {"loss": 0.6079, "grad_norm": 0.9743902087211609, "learning_rate": 0.0002, "epoch": 4.064631956912029, "step": 56600}, {"loss": 0.6109, "grad_norm": 1.0599974393844604, "learning_rate": 0.0002, "epoch": 4.065350089766607, "step": 56610}, {"loss": 0.5746, "grad_norm": 0.9677841067314148, "learning_rate": 0.0002, "epoch": 4.066068222621185, "step": 56620}, {"loss": 0.5957, "grad_norm": 0.8892754316329956, "learning_rate": 0.0002, "epoch": 4.066786355475763, "step": 56630}, {"loss": 0.5899, "grad_norm": 0.8837814331054688, "learning_rate": 0.0002, "epoch": 4.067504488330341, "step": 56640}, {"loss": 0.5784, "grad_norm": 0.9284095764160156, "learning_rate": 0.0002, "epoch": 4.068222621184919, "step": 56650}, {"loss": 0.5829, "grad_norm": 1.0163567066192627, "learning_rate": 0.0002, "epoch": 4.068940754039497, "step": 56660}, {"loss": 0.5349, "grad_norm": 0.8713456988334656, "learning_rate": 0.0002, "epoch": 4.069658886894075, "step": 56670}, {"loss": 0.5345, "grad_norm": 0.8356686234474182, "learning_rate": 0.0002, "epoch": 4.070377019748653, "step": 56680}, {"loss": 0.5473, "grad_norm": 0.8998766541481018, "learning_rate": 0.0002, "epoch": 4.071095152603232, "step": 56690}, {"loss": 0.5896, "grad_norm": 1.0441967248916626, "learning_rate": 0.0002, "epoch": 4.07181328545781, "step": 56700}, {"loss": 0.5817, "grad_norm": 0.9313125610351562, "learning_rate": 0.0002, "epoch": 4.072531418312388, "step": 56710}, {"loss": 0.5477, "grad_norm": 0.9912964701652527, "learning_rate": 0.0002, "epoch": 4.073249551166966, "step": 56720}, {"loss": 0.5974, "grad_norm": 0.9048459529876709, "learning_rate": 0.0002, "epoch": 4.073967684021544, "step": 56730}, {"loss": 0.5927, "grad_norm": 1.0248944759368896, "learning_rate": 0.0002, "epoch": 4.074685816876122, "step": 56740}, {"loss": 0.6019, "grad_norm": 1.4526786804199219, "learning_rate": 0.0002, "epoch": 4.0754039497307, "step": 56750}, {"loss": 0.6267, "grad_norm": 0.9813178181648254, "learning_rate": 0.0002, "epoch": 4.076122082585278, "step": 56760}, {"loss": 0.5707, "grad_norm": 1.0686813592910767, "learning_rate": 0.0002, "epoch": 4.076840215439856, "step": 56770}, {"loss": 0.5857, "grad_norm": 1.1093482971191406, "learning_rate": 0.0002, "epoch": 4.077558348294435, "step": 56780}, {"loss": 0.5768, "grad_norm": 0.9377819895744324, "learning_rate": 0.0002, "epoch": 4.078276481149013, "step": 56790}, {"loss": 0.6342, "grad_norm": 0.8043649196624756, "learning_rate": 0.0002, "epoch": 4.078994614003591, "step": 56800}, {"loss": 0.6005, "grad_norm": 0.7995415925979614, "learning_rate": 0.0002, "epoch": 4.079712746858169, "step": 56810}, {"loss": 0.5466, "grad_norm": 1.0076148509979248, "learning_rate": 0.0002, "epoch": 4.080430879712747, "step": 56820}, {"loss": 0.6021, "grad_norm": 0.8192076683044434, "learning_rate": 0.0002, "epoch": 4.081149012567325, "step": 56830}, {"loss": 0.5439, "grad_norm": 0.9226266145706177, "learning_rate": 0.0002, "epoch": 4.081867145421903, "step": 56840}, {"loss": 0.5893, "grad_norm": 0.8877972960472107, "learning_rate": 0.0002, "epoch": 4.082585278276481, "step": 56850}, {"loss": 0.5774, "grad_norm": 0.9578937888145447, "learning_rate": 0.0002, "epoch": 4.083303411131059, "step": 56860}, {"loss": 0.5946, "grad_norm": 0.8929167985916138, "learning_rate": 0.0002, "epoch": 4.084021543985638, "step": 56870}, {"loss": 0.5226, "grad_norm": 1.0015977621078491, "learning_rate": 0.0002, "epoch": 4.084739676840216, "step": 56880}, {"loss": 0.5931, "grad_norm": 0.9768750667572021, "learning_rate": 0.0002, "epoch": 4.085457809694794, "step": 56890}, {"loss": 0.5983, "grad_norm": 1.0834569931030273, "learning_rate": 0.0002, "epoch": 4.086175942549372, "step": 56900}, {"loss": 0.5786, "grad_norm": 0.8761230707168579, "learning_rate": 0.0002, "epoch": 4.08689407540395, "step": 56910}, {"loss": 0.5708, "grad_norm": 1.027064323425293, "learning_rate": 0.0002, "epoch": 4.087612208258528, "step": 56920}, {"loss": 0.601, "grad_norm": 1.130336880683899, "learning_rate": 0.0002, "epoch": 4.088330341113106, "step": 56930}, {"loss": 0.5664, "grad_norm": 0.8157579898834229, "learning_rate": 0.0002, "epoch": 4.089048473967684, "step": 56940}, {"loss": 0.5789, "grad_norm": 1.071175217628479, "learning_rate": 0.0002, "epoch": 4.089766606822262, "step": 56950}, {"loss": 0.5942, "grad_norm": 0.9534492492675781, "learning_rate": 0.0002, "epoch": 4.09048473967684, "step": 56960}, {"loss": 0.5803, "grad_norm": 0.9584037661552429, "learning_rate": 0.0002, "epoch": 4.091202872531419, "step": 56970}, {"loss": 0.5647, "grad_norm": 1.1513131856918335, "learning_rate": 0.0002, "epoch": 4.091921005385997, "step": 56980}, {"loss": 0.5971, "grad_norm": 1.0167666673660278, "learning_rate": 0.0002, "epoch": 4.092639138240575, "step": 56990}, {"loss": 0.5981, "grad_norm": 1.0630987882614136, "learning_rate": 0.0002, "epoch": 4.093357271095153, "step": 57000}, {"loss": 0.5734, "grad_norm": 1.0326893329620361, "learning_rate": 0.0002, "epoch": 4.094075403949731, "step": 57010}, {"loss": 0.572, "grad_norm": 0.9701678156852722, "learning_rate": 0.0002, "epoch": 4.094793536804309, "step": 57020}, {"loss": 0.5815, "grad_norm": 0.839935302734375, "learning_rate": 0.0002, "epoch": 4.095511669658887, "step": 57030}, {"loss": 0.6051, "grad_norm": 0.8995838761329651, "learning_rate": 0.0002, "epoch": 4.096229802513465, "step": 57040}, {"loss": 0.6037, "grad_norm": 0.8039916157722473, "learning_rate": 0.0002, "epoch": 4.096947935368043, "step": 57050}, {"loss": 0.5597, "grad_norm": 1.126122236251831, "learning_rate": 0.0002, "epoch": 4.097666068222622, "step": 57060}, {"loss": 0.5943, "grad_norm": 0.8749837875366211, "learning_rate": 0.0002, "epoch": 4.0983842010772, "step": 57070}, {"loss": 0.6017, "grad_norm": 0.8630341291427612, "learning_rate": 0.0002, "epoch": 4.099102333931778, "step": 57080}, {"loss": 0.6083, "grad_norm": 0.8889496922492981, "learning_rate": 0.0002, "epoch": 4.099820466786356, "step": 57090}, {"loss": 0.5727, "grad_norm": 0.9050310254096985, "learning_rate": 0.0002, "epoch": 4.100538599640934, "step": 57100}, {"loss": 0.5824, "grad_norm": 0.943072497844696, "learning_rate": 0.0002, "epoch": 4.101256732495512, "step": 57110}, {"loss": 0.6036, "grad_norm": 0.9031552672386169, "learning_rate": 0.0002, "epoch": 4.10197486535009, "step": 57120}, {"loss": 0.5913, "grad_norm": 0.939862847328186, "learning_rate": 0.0002, "epoch": 4.102692998204668, "step": 57130}, {"loss": 0.5738, "grad_norm": 0.8080634474754333, "learning_rate": 0.0002, "epoch": 4.103411131059246, "step": 57140}, {"loss": 0.5841, "grad_norm": 0.9181693196296692, "learning_rate": 0.0002, "epoch": 4.1041292639138245, "step": 57150}, {"loss": 0.5561, "grad_norm": 0.9609217643737793, "learning_rate": 0.0002, "epoch": 4.1048473967684025, "step": 57160}, {"loss": 0.5572, "grad_norm": 1.1246516704559326, "learning_rate": 0.0002, "epoch": 4.1055655296229805, "step": 57170}, {"loss": 0.5886, "grad_norm": 1.0616880655288696, "learning_rate": 0.0002, "epoch": 4.1062836624775585, "step": 57180}, {"loss": 0.5579, "grad_norm": 0.9954505562782288, "learning_rate": 0.0002, "epoch": 4.1070017953321365, "step": 57190}, {"loss": 0.5899, "grad_norm": 1.0602279901504517, "learning_rate": 0.0002, "epoch": 4.1077199281867145, "step": 57200}, {"loss": 0.5747, "grad_norm": 0.8984764814376831, "learning_rate": 0.0002, "epoch": 4.1084380610412925, "step": 57210}, {"loss": 0.5502, "grad_norm": 0.845167875289917, "learning_rate": 0.0002, "epoch": 4.1091561938958705, "step": 57220}, {"loss": 0.6147, "grad_norm": 0.7901500463485718, "learning_rate": 0.0002, "epoch": 4.1098743267504485, "step": 57230}, {"loss": 0.5883, "grad_norm": 1.0462526082992554, "learning_rate": 0.0002, "epoch": 4.1105924596050265, "step": 57240}, {"loss": 0.6334, "grad_norm": 0.9098827838897705, "learning_rate": 0.0002, "epoch": 4.111310592459605, "step": 57250}, {"loss": 0.5794, "grad_norm": 0.9234077334403992, "learning_rate": 0.0002, "epoch": 4.112028725314183, "step": 57260}, {"loss": 0.623, "grad_norm": 1.0033560991287231, "learning_rate": 0.0002, "epoch": 4.112746858168761, "step": 57270}, {"loss": 0.5392, "grad_norm": 1.0620051622390747, "learning_rate": 0.0002, "epoch": 4.113464991023339, "step": 57280}, {"loss": 0.6144, "grad_norm": 0.8679345846176147, "learning_rate": 0.0002, "epoch": 4.114183123877917, "step": 57290}, {"loss": 0.5951, "grad_norm": 0.7557345628738403, "learning_rate": 0.0002, "epoch": 4.114901256732495, "step": 57300}, {"loss": 0.575, "grad_norm": 0.8970935344696045, "learning_rate": 0.0002, "epoch": 4.115619389587073, "step": 57310}, {"loss": 0.5595, "grad_norm": 1.0779842138290405, "learning_rate": 0.0002, "epoch": 4.116337522441651, "step": 57320}, {"loss": 0.5532, "grad_norm": 1.2036106586456299, "learning_rate": 0.0002, "epoch": 4.117055655296229, "step": 57330}, {"loss": 0.5959, "grad_norm": 0.8337953686714172, "learning_rate": 0.0002, "epoch": 4.117773788150808, "step": 57340}, {"loss": 0.6128, "grad_norm": 0.9850410223007202, "learning_rate": 0.0002, "epoch": 4.118491921005386, "step": 57350}, {"loss": 0.5676, "grad_norm": 0.8028770685195923, "learning_rate": 0.0002, "epoch": 4.119210053859964, "step": 57360}, {"loss": 0.5693, "grad_norm": 0.8693217039108276, "learning_rate": 0.0002, "epoch": 4.119928186714542, "step": 57370}, {"loss": 0.5897, "grad_norm": 0.8795534372329712, "learning_rate": 0.0002, "epoch": 4.12064631956912, "step": 57380}, {"loss": 0.5692, "grad_norm": 1.0081543922424316, "learning_rate": 0.0002, "epoch": 4.121364452423698, "step": 57390}, {"loss": 0.6027, "grad_norm": 0.8776742219924927, "learning_rate": 0.0002, "epoch": 4.122082585278276, "step": 57400}, {"loss": 0.6418, "grad_norm": 0.8247824311256409, "learning_rate": 0.0002, "epoch": 4.122800718132854, "step": 57410}, {"loss": 0.5537, "grad_norm": 1.1346335411071777, "learning_rate": 0.0002, "epoch": 4.123518850987432, "step": 57420}, {"loss": 0.5949, "grad_norm": 1.0671089887619019, "learning_rate": 0.0002, "epoch": 4.124236983842011, "step": 57430}, {"loss": 0.5908, "grad_norm": 0.8548333048820496, "learning_rate": 0.0002, "epoch": 4.124955116696589, "step": 57440}, {"loss": 0.5967, "grad_norm": 1.0221573114395142, "learning_rate": 0.0002, "epoch": 4.125673249551167, "step": 57450}, {"loss": 0.6238, "grad_norm": 0.9746617674827576, "learning_rate": 0.0002, "epoch": 4.126391382405745, "step": 57460}, {"loss": 0.5855, "grad_norm": 0.8104965090751648, "learning_rate": 0.0002, "epoch": 4.127109515260323, "step": 57470}, {"loss": 0.5724, "grad_norm": 1.0401487350463867, "learning_rate": 0.0002, "epoch": 4.127827648114901, "step": 57480}, {"loss": 0.5956, "grad_norm": 0.8828882575035095, "learning_rate": 0.0002, "epoch": 4.128545780969479, "step": 57490}, {"loss": 0.5851, "grad_norm": 1.0121098756790161, "learning_rate": 0.0002, "epoch": 4.129263913824057, "step": 57500}, {"loss": 0.5923, "grad_norm": 0.8789737820625305, "learning_rate": 0.0002, "epoch": 4.129982046678635, "step": 57510}, {"loss": 0.5929, "grad_norm": 1.0386744737625122, "learning_rate": 0.0002, "epoch": 4.130700179533213, "step": 57520}, {"loss": 0.6104, "grad_norm": 1.0092610120773315, "learning_rate": 0.0002, "epoch": 4.131418312387792, "step": 57530}, {"loss": 0.5974, "grad_norm": 0.8706282377243042, "learning_rate": 0.0002, "epoch": 4.13213644524237, "step": 57540}, {"loss": 0.5829, "grad_norm": 0.9270507097244263, "learning_rate": 0.0002, "epoch": 4.132854578096948, "step": 57550}, {"loss": 0.5826, "grad_norm": 1.0303068161010742, "learning_rate": 0.0002, "epoch": 4.133572710951526, "step": 57560}, {"loss": 0.5515, "grad_norm": 1.1169062852859497, "learning_rate": 0.0002, "epoch": 4.134290843806104, "step": 57570}, {"loss": 0.5848, "grad_norm": 0.8530599474906921, "learning_rate": 0.0002, "epoch": 4.135008976660682, "step": 57580}, {"loss": 0.6231, "grad_norm": 1.1395039558410645, "learning_rate": 0.0002, "epoch": 4.13572710951526, "step": 57590}, {"loss": 0.5739, "grad_norm": 0.8944115042686462, "learning_rate": 0.0002, "epoch": 4.136445242369838, "step": 57600}, {"loss": 0.6212, "grad_norm": 1.137966275215149, "learning_rate": 0.0002, "epoch": 4.137163375224416, "step": 57610}, {"loss": 0.6041, "grad_norm": 0.8244962692260742, "learning_rate": 0.0002, "epoch": 4.137881508078995, "step": 57620}, {"loss": 0.6078, "grad_norm": 1.1935817003250122, "learning_rate": 0.0002, "epoch": 4.138599640933573, "step": 57630}, {"loss": 0.5939, "grad_norm": 0.9774235486984253, "learning_rate": 0.0002, "epoch": 4.139317773788151, "step": 57640}, {"loss": 0.5963, "grad_norm": 1.066219449043274, "learning_rate": 0.0002, "epoch": 4.140035906642729, "step": 57650}, {"loss": 0.6008, "grad_norm": 0.8631396293640137, "learning_rate": 0.0002, "epoch": 4.140754039497307, "step": 57660}, {"loss": 0.5622, "grad_norm": 0.888410747051239, "learning_rate": 0.0002, "epoch": 4.141472172351885, "step": 57670}, {"loss": 0.5675, "grad_norm": 1.002642035484314, "learning_rate": 0.0002, "epoch": 4.142190305206463, "step": 57680}, {"loss": 0.5269, "grad_norm": 1.0092825889587402, "learning_rate": 0.0002, "epoch": 4.142908438061041, "step": 57690}, {"loss": 0.588, "grad_norm": 0.9126971364021301, "learning_rate": 0.0002, "epoch": 4.143626570915619, "step": 57700}, {"loss": 0.5593, "grad_norm": 1.0303562879562378, "learning_rate": 0.0002, "epoch": 4.144344703770198, "step": 57710}, {"loss": 0.6183, "grad_norm": 1.1230897903442383, "learning_rate": 0.0002, "epoch": 4.145062836624776, "step": 57720}, {"loss": 0.5934, "grad_norm": 1.0494099855422974, "learning_rate": 0.0002, "epoch": 4.145780969479354, "step": 57730}, {"loss": 0.6022, "grad_norm": 0.9555442333221436, "learning_rate": 0.0002, "epoch": 4.146499102333932, "step": 57740}, {"loss": 0.609, "grad_norm": 0.8255124092102051, "learning_rate": 0.0002, "epoch": 4.14721723518851, "step": 57750}, {"loss": 0.5659, "grad_norm": 1.097853660583496, "learning_rate": 0.0002, "epoch": 4.147935368043088, "step": 57760}, {"loss": 0.5698, "grad_norm": 1.0272663831710815, "learning_rate": 0.0002, "epoch": 4.148653500897666, "step": 57770}, {"loss": 0.5701, "grad_norm": 1.022571086883545, "learning_rate": 0.0002, "epoch": 4.149371633752244, "step": 57780}, {"loss": 0.579, "grad_norm": 0.964543342590332, "learning_rate": 0.0002, "epoch": 4.150089766606822, "step": 57790}, {"loss": 0.6175, "grad_norm": 0.9251219034194946, "learning_rate": 0.0002, "epoch": 4.1508078994614, "step": 57800}, {"loss": 0.564, "grad_norm": 1.081840991973877, "learning_rate": 0.0002, "epoch": 4.151526032315979, "step": 57810}, {"loss": 0.5956, "grad_norm": 0.8989445567131042, "learning_rate": 0.0002, "epoch": 4.152244165170557, "step": 57820}, {"loss": 0.5849, "grad_norm": 0.903629720211029, "learning_rate": 0.0002, "epoch": 4.152962298025135, "step": 57830}, {"loss": 0.6202, "grad_norm": 0.8985397219657898, "learning_rate": 0.0002, "epoch": 4.153680430879713, "step": 57840}, {"loss": 0.5629, "grad_norm": 1.047778844833374, "learning_rate": 0.0002, "epoch": 4.154398563734291, "step": 57850}, {"loss": 0.6045, "grad_norm": 0.9804165363311768, "learning_rate": 0.0002, "epoch": 4.155116696588869, "step": 57860}, {"loss": 0.5815, "grad_norm": 1.187309980392456, "learning_rate": 0.0002, "epoch": 4.155834829443447, "step": 57870}, {"loss": 0.6304, "grad_norm": 0.9854836463928223, "learning_rate": 0.0002, "epoch": 4.156552962298025, "step": 57880}, {"loss": 0.6076, "grad_norm": 0.8494308590888977, "learning_rate": 0.0002, "epoch": 4.157271095152603, "step": 57890}, {"loss": 0.6033, "grad_norm": 0.9359684586524963, "learning_rate": 0.0002, "epoch": 4.157989228007182, "step": 57900}, {"loss": 0.5546, "grad_norm": 0.8971988558769226, "learning_rate": 0.0002, "epoch": 4.15870736086176, "step": 57910}, {"loss": 0.5934, "grad_norm": 0.8848021030426025, "learning_rate": 0.0002, "epoch": 4.159425493716338, "step": 57920}, {"loss": 0.6102, "grad_norm": 0.982877790927887, "learning_rate": 0.0002, "epoch": 4.160143626570916, "step": 57930}, {"loss": 0.6091, "grad_norm": 0.8668819069862366, "learning_rate": 0.0002, "epoch": 4.160861759425494, "step": 57940}, {"loss": 0.5969, "grad_norm": 1.06569504737854, "learning_rate": 0.0002, "epoch": 4.161579892280072, "step": 57950}, {"loss": 0.5799, "grad_norm": 1.165740728378296, "learning_rate": 0.0002, "epoch": 4.16229802513465, "step": 57960}, {"loss": 0.6038, "grad_norm": 1.0534512996673584, "learning_rate": 0.0002, "epoch": 4.163016157989228, "step": 57970}, {"loss": 0.594, "grad_norm": 0.8785330653190613, "learning_rate": 0.0002, "epoch": 4.163734290843806, "step": 57980}, {"loss": 0.5981, "grad_norm": 1.1244874000549316, "learning_rate": 0.0002, "epoch": 4.164452423698384, "step": 57990}, {"loss": 0.6456, "grad_norm": 0.8839399218559265, "learning_rate": 0.0002, "epoch": 4.165170556552963, "step": 58000}, {"loss": 0.5767, "grad_norm": 1.0603798627853394, "learning_rate": 0.0002, "epoch": 4.165888689407541, "step": 58010}, {"loss": 0.6334, "grad_norm": 0.9737853407859802, "learning_rate": 0.0002, "epoch": 4.166606822262119, "step": 58020}, {"loss": 0.5901, "grad_norm": 1.0650558471679688, "learning_rate": 0.0002, "epoch": 4.167324955116697, "step": 58030}, {"loss": 0.6549, "grad_norm": 0.7528959512710571, "learning_rate": 0.0002, "epoch": 4.168043087971275, "step": 58040}, {"loss": 0.5593, "grad_norm": 0.9286156892776489, "learning_rate": 0.0002, "epoch": 4.168761220825853, "step": 58050}, {"loss": 0.6093, "grad_norm": 1.0225880146026611, "learning_rate": 0.0002, "epoch": 4.169479353680431, "step": 58060}, {"loss": 0.5993, "grad_norm": 0.9990654587745667, "learning_rate": 0.0002, "epoch": 4.170197486535009, "step": 58070}, {"loss": 0.6002, "grad_norm": 1.052057147026062, "learning_rate": 0.0002, "epoch": 4.170915619389587, "step": 58080}, {"loss": 0.5911, "grad_norm": 0.7366801500320435, "learning_rate": 0.0002, "epoch": 4.1716337522441655, "step": 58090}, {"loss": 0.6273, "grad_norm": 1.0943711996078491, "learning_rate": 0.0002, "epoch": 4.1723518850987436, "step": 58100}, {"loss": 0.6095, "grad_norm": 1.1297656297683716, "learning_rate": 0.0002, "epoch": 4.1730700179533216, "step": 58110}, {"loss": 0.6123, "grad_norm": 0.7861461639404297, "learning_rate": 0.0002, "epoch": 4.1737881508078996, "step": 58120}, {"loss": 0.6188, "grad_norm": 0.8643335103988647, "learning_rate": 0.0002, "epoch": 4.174506283662478, "step": 58130}, {"loss": 0.6103, "grad_norm": 0.957288384437561, "learning_rate": 0.0002, "epoch": 4.175224416517056, "step": 58140}, {"loss": 0.5636, "grad_norm": 0.9175366759300232, "learning_rate": 0.0002, "epoch": 4.175942549371634, "step": 58150}, {"loss": 0.6288, "grad_norm": 1.129935622215271, "learning_rate": 0.0002, "epoch": 4.176660682226212, "step": 58160}, {"loss": 0.5969, "grad_norm": 0.9683087468147278, "learning_rate": 0.0002, "epoch": 4.17737881508079, "step": 58170}, {"loss": 0.6249, "grad_norm": 1.045171856880188, "learning_rate": 0.0002, "epoch": 4.1780969479353685, "step": 58180}, {"loss": 0.5611, "grad_norm": 0.9858742952346802, "learning_rate": 0.0002, "epoch": 4.1788150807899465, "step": 58190}, {"loss": 0.5946, "grad_norm": 0.8513413071632385, "learning_rate": 0.0002, "epoch": 4.1795332136445245, "step": 58200}, {"loss": 0.5928, "grad_norm": 0.9584265947341919, "learning_rate": 0.0002, "epoch": 4.1802513464991025, "step": 58210}, {"loss": 0.5864, "grad_norm": 0.8828920722007751, "learning_rate": 0.0002, "epoch": 4.1809694793536805, "step": 58220}, {"loss": 0.5745, "grad_norm": 0.9849961400032043, "learning_rate": 0.0002, "epoch": 4.1816876122082585, "step": 58230}, {"loss": 0.5355, "grad_norm": 1.0601637363433838, "learning_rate": 0.0002, "epoch": 4.1824057450628365, "step": 58240}, {"loss": 0.6063, "grad_norm": 1.2206604480743408, "learning_rate": 0.0002, "epoch": 4.1831238779174145, "step": 58250}, {"loss": 0.6176, "grad_norm": 1.1768009662628174, "learning_rate": 0.0002, "epoch": 4.1838420107719925, "step": 58260}, {"loss": 0.5572, "grad_norm": 0.9521295428276062, "learning_rate": 0.0002, "epoch": 4.184560143626571, "step": 58270}, {"loss": 0.5978, "grad_norm": 0.892971932888031, "learning_rate": 0.0002, "epoch": 4.185278276481149, "step": 58280}, {"loss": 0.5727, "grad_norm": 0.8712016940116882, "learning_rate": 0.0002, "epoch": 4.185996409335727, "step": 58290}, {"loss": 0.6124, "grad_norm": 1.0190843343734741, "learning_rate": 0.0002, "epoch": 4.186714542190305, "step": 58300}, {"loss": 0.6324, "grad_norm": 1.0149270296096802, "learning_rate": 0.0002, "epoch": 4.187432675044883, "step": 58310}, {"loss": 0.6337, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 4.188150807899461, "step": 58320}, {"loss": 0.5588, "grad_norm": 0.7892335653305054, "learning_rate": 0.0002, "epoch": 4.188868940754039, "step": 58330}, {"loss": 0.6132, "grad_norm": 0.9792808890342712, "learning_rate": 0.0002, "epoch": 4.189587073608617, "step": 58340}, {"loss": 0.5841, "grad_norm": 0.9946883320808411, "learning_rate": 0.0002, "epoch": 4.190305206463195, "step": 58350}, {"loss": 0.6043, "grad_norm": 1.0363789796829224, "learning_rate": 0.0002, "epoch": 4.191023339317773, "step": 58360}, {"loss": 0.5843, "grad_norm": 0.9285917282104492, "learning_rate": 0.0002, "epoch": 4.191741472172352, "step": 58370}, {"loss": 0.6042, "grad_norm": 0.9461679458618164, "learning_rate": 0.0002, "epoch": 4.19245960502693, "step": 58380}, {"loss": 0.5666, "grad_norm": 1.0344175100326538, "learning_rate": 0.0002, "epoch": 4.193177737881508, "step": 58390}, {"loss": 0.6032, "grad_norm": 0.9530242085456848, "learning_rate": 0.0002, "epoch": 4.193895870736086, "step": 58400}, {"loss": 0.5887, "grad_norm": 0.9171900749206543, "learning_rate": 0.0002, "epoch": 4.194614003590664, "step": 58410}, {"loss": 0.6116, "grad_norm": 0.8094898462295532, "learning_rate": 0.0002, "epoch": 4.195332136445242, "step": 58420}, {"loss": 0.5268, "grad_norm": 0.921981930732727, "learning_rate": 0.0002, "epoch": 4.19605026929982, "step": 58430}, {"loss": 0.551, "grad_norm": 0.9783532023429871, "learning_rate": 0.0002, "epoch": 4.196768402154398, "step": 58440}, {"loss": 0.5774, "grad_norm": 1.017805576324463, "learning_rate": 0.0002, "epoch": 4.197486535008976, "step": 58450}, {"loss": 0.6261, "grad_norm": 0.9244308471679688, "learning_rate": 0.0002, "epoch": 4.198204667863555, "step": 58460}, {"loss": 0.6247, "grad_norm": 0.9942585229873657, "learning_rate": 0.0002, "epoch": 4.198922800718133, "step": 58470}, {"loss": 0.5803, "grad_norm": 1.1045037508010864, "learning_rate": 0.0002, "epoch": 4.199640933572711, "step": 58480}, {"loss": 0.5846, "grad_norm": 0.9483149647712708, "learning_rate": 0.0002, "epoch": 4.200359066427289, "step": 58490}, {"loss": 0.5997, "grad_norm": 1.0807271003723145, "learning_rate": 0.0002, "epoch": 4.201077199281867, "step": 58500}, {"loss": 0.5474, "grad_norm": 0.7697445750236511, "learning_rate": 0.0002, "epoch": 4.201795332136445, "step": 58510}, {"loss": 0.5692, "grad_norm": 1.0761178731918335, "learning_rate": 0.0002, "epoch": 4.202513464991023, "step": 58520}, {"loss": 0.5667, "grad_norm": 0.9992024898529053, "learning_rate": 0.0002, "epoch": 4.203231597845601, "step": 58530}, {"loss": 0.5606, "grad_norm": 0.8741498589515686, "learning_rate": 0.0002, "epoch": 4.203949730700179, "step": 58540}, {"loss": 0.6012, "grad_norm": 0.8557528853416443, "learning_rate": 0.0002, "epoch": 4.204667863554757, "step": 58550}, {"loss": 0.5191, "grad_norm": 0.8853630423545837, "learning_rate": 0.0002, "epoch": 4.205385996409336, "step": 58560}, {"loss": 0.5806, "grad_norm": 0.9858933687210083, "learning_rate": 0.0002, "epoch": 4.206104129263914, "step": 58570}, {"loss": 0.5908, "grad_norm": 1.104732871055603, "learning_rate": 0.0002, "epoch": 4.206822262118492, "step": 58580}, {"loss": 0.5993, "grad_norm": 0.9345462322235107, "learning_rate": 0.0002, "epoch": 4.20754039497307, "step": 58590}, {"loss": 0.6101, "grad_norm": 0.9620407819747925, "learning_rate": 0.0002, "epoch": 4.208258527827648, "step": 58600}, {"loss": 0.5848, "grad_norm": 0.8546963334083557, "learning_rate": 0.0002, "epoch": 4.208976660682226, "step": 58610}, {"loss": 0.5747, "grad_norm": 0.8125145435333252, "learning_rate": 0.0002, "epoch": 4.209694793536804, "step": 58620}, {"loss": 0.604, "grad_norm": 0.8481138944625854, "learning_rate": 0.0002, "epoch": 4.210412926391382, "step": 58630}, {"loss": 0.5928, "grad_norm": 0.8884692788124084, "learning_rate": 0.0002, "epoch": 4.21113105924596, "step": 58640}, {"loss": 0.5612, "grad_norm": 1.09279465675354, "learning_rate": 0.0002, "epoch": 4.211849192100539, "step": 58650}, {"loss": 0.644, "grad_norm": 0.9806583523750305, "learning_rate": 0.0002, "epoch": 4.212567324955117, "step": 58660}, {"loss": 0.5737, "grad_norm": 0.9510366916656494, "learning_rate": 0.0002, "epoch": 4.213285457809695, "step": 58670}, {"loss": 0.5996, "grad_norm": 0.7517459988594055, "learning_rate": 0.0002, "epoch": 4.214003590664273, "step": 58680}, {"loss": 0.6274, "grad_norm": 1.1134123802185059, "learning_rate": 0.0002, "epoch": 4.214721723518851, "step": 58690}, {"loss": 0.5842, "grad_norm": 0.8307328820228577, "learning_rate": 0.0002, "epoch": 4.215439856373429, "step": 58700}, {"loss": 0.5795, "grad_norm": 0.8211639523506165, "learning_rate": 0.0002, "epoch": 4.216157989228007, "step": 58710}, {"loss": 0.5613, "grad_norm": 1.0749584436416626, "learning_rate": 0.0002, "epoch": 4.216876122082585, "step": 58720}, {"loss": 0.5956, "grad_norm": 1.1394833326339722, "learning_rate": 0.0002, "epoch": 4.217594254937163, "step": 58730}, {"loss": 0.609, "grad_norm": 1.05130934715271, "learning_rate": 0.0002, "epoch": 4.218312387791742, "step": 58740}, {"loss": 0.6294, "grad_norm": 0.7949456572532654, "learning_rate": 0.0002, "epoch": 4.21903052064632, "step": 58750}, {"loss": 0.6148, "grad_norm": 0.906506359577179, "learning_rate": 0.0002, "epoch": 4.219748653500898, "step": 58760}, {"loss": 0.5778, "grad_norm": 0.8338989615440369, "learning_rate": 0.0002, "epoch": 4.220466786355476, "step": 58770}, {"loss": 0.5402, "grad_norm": 0.9325370788574219, "learning_rate": 0.0002, "epoch": 4.221184919210054, "step": 58780}, {"loss": 0.5657, "grad_norm": 1.0208096504211426, "learning_rate": 0.0002, "epoch": 4.221903052064632, "step": 58790}, {"loss": 0.6523, "grad_norm": 1.0075920820236206, "learning_rate": 0.0002, "epoch": 4.22262118491921, "step": 58800}, {"loss": 0.5545, "grad_norm": 0.9858701229095459, "learning_rate": 0.0002, "epoch": 4.223339317773788, "step": 58810}, {"loss": 0.6343, "grad_norm": 1.0010110139846802, "learning_rate": 0.0002, "epoch": 4.224057450628366, "step": 58820}, {"loss": 0.5991, "grad_norm": 0.9360540509223938, "learning_rate": 0.0002, "epoch": 4.224775583482945, "step": 58830}, {"loss": 0.5887, "grad_norm": 0.9021786451339722, "learning_rate": 0.0002, "epoch": 4.225493716337523, "step": 58840}, {"loss": 0.6132, "grad_norm": 1.1778476238250732, "learning_rate": 0.0002, "epoch": 4.226211849192101, "step": 58850}, {"loss": 0.5956, "grad_norm": 1.0061023235321045, "learning_rate": 0.0002, "epoch": 4.226929982046679, "step": 58860}, {"loss": 0.5846, "grad_norm": 0.8839752674102783, "learning_rate": 0.0002, "epoch": 4.227648114901257, "step": 58870}, {"loss": 0.6129, "grad_norm": 1.0078870058059692, "learning_rate": 0.0002, "epoch": 4.228366247755835, "step": 58880}, {"loss": 0.6403, "grad_norm": 0.8926451206207275, "learning_rate": 0.0002, "epoch": 4.229084380610413, "step": 58890}, {"loss": 0.5987, "grad_norm": 1.4018772840499878, "learning_rate": 0.0002, "epoch": 4.229802513464991, "step": 58900}, {"loss": 0.5925, "grad_norm": 0.9911289215087891, "learning_rate": 0.0002, "epoch": 4.230520646319569, "step": 58910}, {"loss": 0.5846, "grad_norm": 0.9374576807022095, "learning_rate": 0.0002, "epoch": 4.231238779174147, "step": 58920}, {"loss": 0.5856, "grad_norm": 1.179650068283081, "learning_rate": 0.0002, "epoch": 4.231956912028726, "step": 58930}, {"loss": 0.601, "grad_norm": 0.9434911012649536, "learning_rate": 0.0002, "epoch": 4.232675044883304, "step": 58940}, {"loss": 0.6137, "grad_norm": 1.0061911344528198, "learning_rate": 0.0002, "epoch": 4.233393177737882, "step": 58950}, {"loss": 0.5847, "grad_norm": 0.9663233757019043, "learning_rate": 0.0002, "epoch": 4.23411131059246, "step": 58960}, {"loss": 0.5748, "grad_norm": 0.8897581696510315, "learning_rate": 0.0002, "epoch": 4.234829443447038, "step": 58970}, {"loss": 0.5586, "grad_norm": 0.873281717300415, "learning_rate": 0.0002, "epoch": 4.235547576301616, "step": 58980}, {"loss": 0.6027, "grad_norm": 0.9146949052810669, "learning_rate": 0.0002, "epoch": 4.236265709156194, "step": 58990}, {"loss": 0.6356, "grad_norm": 0.9381195306777954, "learning_rate": 0.0002, "epoch": 4.236983842010772, "step": 59000}, {"loss": 0.5641, "grad_norm": 0.9700697064399719, "learning_rate": 0.0002, "epoch": 4.23770197486535, "step": 59010}, {"loss": 0.6099, "grad_norm": 0.9050154685974121, "learning_rate": 0.0002, "epoch": 4.238420107719929, "step": 59020}, {"loss": 0.552, "grad_norm": 0.9901503324508667, "learning_rate": 0.0002, "epoch": 4.239138240574507, "step": 59030}, {"loss": 0.6333, "grad_norm": 0.9009594321250916, "learning_rate": 0.0002, "epoch": 4.239856373429085, "step": 59040}, {"loss": 0.6104, "grad_norm": 1.0924968719482422, "learning_rate": 0.0002, "epoch": 4.240574506283663, "step": 59050}, {"loss": 0.6269, "grad_norm": 0.9939947724342346, "learning_rate": 0.0002, "epoch": 4.241292639138241, "step": 59060}, {"loss": 0.6039, "grad_norm": 1.0577857494354248, "learning_rate": 0.0002, "epoch": 4.242010771992819, "step": 59070}, {"loss": 0.5992, "grad_norm": 1.0836747884750366, "learning_rate": 0.0002, "epoch": 4.242728904847397, "step": 59080}, {"loss": 0.6518, "grad_norm": 0.97043377161026, "learning_rate": 0.0002, "epoch": 4.243447037701975, "step": 59090}, {"loss": 0.5877, "grad_norm": 0.7711901664733887, "learning_rate": 0.0002, "epoch": 4.244165170556553, "step": 59100}, {"loss": 0.6017, "grad_norm": 1.0143170356750488, "learning_rate": 0.0002, "epoch": 4.244883303411131, "step": 59110}, {"loss": 0.6245, "grad_norm": 0.9151925444602966, "learning_rate": 0.0002, "epoch": 4.2456014362657095, "step": 59120}, {"loss": 0.6436, "grad_norm": 0.9252700209617615, "learning_rate": 0.0002, "epoch": 4.2463195691202875, "step": 59130}, {"loss": 0.5696, "grad_norm": 0.8429408073425293, "learning_rate": 0.0002, "epoch": 4.2470377019748655, "step": 59140}, {"loss": 0.5737, "grad_norm": 0.9645987153053284, "learning_rate": 0.0002, "epoch": 4.2477558348294435, "step": 59150}, {"loss": 0.6045, "grad_norm": 0.9949791431427002, "learning_rate": 0.0002, "epoch": 4.2484739676840215, "step": 59160}, {"loss": 0.6069, "grad_norm": 0.9128350615501404, "learning_rate": 0.0002, "epoch": 4.2491921005385995, "step": 59170}, {"loss": 0.596, "grad_norm": 0.7406911849975586, "learning_rate": 0.0002, "epoch": 4.2499102333931775, "step": 59180}, {"loss": 0.5796, "grad_norm": 1.0237419605255127, "learning_rate": 0.0002, "epoch": 4.2506283662477555, "step": 59190}, {"loss": 0.631, "grad_norm": 0.805459201335907, "learning_rate": 0.0002, "epoch": 4.2513464991023335, "step": 59200}, {"loss": 0.6104, "grad_norm": 0.8477254509925842, "learning_rate": 0.0002, "epoch": 4.252064631956912, "step": 59210}, {"loss": 0.5608, "grad_norm": 0.984023928642273, "learning_rate": 0.0002, "epoch": 4.25278276481149, "step": 59220}, {"loss": 0.6185, "grad_norm": 1.0667484998703003, "learning_rate": 0.0002, "epoch": 4.253500897666068, "step": 59230}, {"loss": 0.5596, "grad_norm": 0.7192284464836121, "learning_rate": 0.0002, "epoch": 4.254219030520646, "step": 59240}, {"loss": 0.5971, "grad_norm": 0.9557451009750366, "learning_rate": 0.0002, "epoch": 4.254937163375224, "step": 59250}, {"loss": 0.6012, "grad_norm": 0.9209784865379333, "learning_rate": 0.0002, "epoch": 4.255655296229802, "step": 59260}, {"loss": 0.67, "grad_norm": 0.9785363674163818, "learning_rate": 0.0002, "epoch": 4.25637342908438, "step": 59270}, {"loss": 0.6185, "grad_norm": 0.910214364528656, "learning_rate": 0.0002, "epoch": 4.257091561938958, "step": 59280}, {"loss": 0.6451, "grad_norm": 0.8945858478546143, "learning_rate": 0.0002, "epoch": 4.257809694793536, "step": 59290}, {"loss": 0.5876, "grad_norm": 1.0984420776367188, "learning_rate": 0.0002, "epoch": 4.258527827648114, "step": 59300}, {"loss": 0.5616, "grad_norm": 1.0256640911102295, "learning_rate": 0.0002, "epoch": 4.259245960502693, "step": 59310}, {"loss": 0.5825, "grad_norm": 0.978397786617279, "learning_rate": 0.0002, "epoch": 4.259964093357271, "step": 59320}, {"loss": 0.6043, "grad_norm": 0.7587000727653503, "learning_rate": 0.0002, "epoch": 4.260682226211849, "step": 59330}, {"loss": 0.5616, "grad_norm": 0.9384620785713196, "learning_rate": 0.0002, "epoch": 4.261400359066427, "step": 59340}, {"loss": 0.6669, "grad_norm": 0.893992006778717, "learning_rate": 0.0002, "epoch": 4.262118491921005, "step": 59350}, {"loss": 0.561, "grad_norm": 1.0231536626815796, "learning_rate": 0.0002, "epoch": 4.262836624775583, "step": 59360}, {"loss": 0.5912, "grad_norm": 0.9810128211975098, "learning_rate": 0.0002, "epoch": 4.263554757630161, "step": 59370}, {"loss": 0.5871, "grad_norm": 1.0868116617202759, "learning_rate": 0.0002, "epoch": 4.264272890484739, "step": 59380}, {"loss": 0.5986, "grad_norm": 1.1433676481246948, "learning_rate": 0.0002, "epoch": 4.264991023339318, "step": 59390}, {"loss": 0.6306, "grad_norm": 0.9836946725845337, "learning_rate": 0.0002, "epoch": 4.265709156193896, "step": 59400}, {"loss": 0.5854, "grad_norm": 0.9473603963851929, "learning_rate": 0.0002, "epoch": 4.266427289048474, "step": 59410}, {"loss": 0.6095, "grad_norm": 0.9066835641860962, "learning_rate": 0.0002, "epoch": 4.267145421903052, "step": 59420}, {"loss": 0.656, "grad_norm": 1.0534718036651611, "learning_rate": 0.0002, "epoch": 4.26786355475763, "step": 59430}, {"loss": 0.5624, "grad_norm": 1.0392775535583496, "learning_rate": 0.0002, "epoch": 4.268581687612208, "step": 59440}, {"loss": 0.5697, "grad_norm": 1.011472463607788, "learning_rate": 0.0002, "epoch": 4.269299820466786, "step": 59450}, {"loss": 0.5971, "grad_norm": 1.0704147815704346, "learning_rate": 0.0002, "epoch": 4.270017953321364, "step": 59460}, {"loss": 0.5719, "grad_norm": 0.9349238872528076, "learning_rate": 0.0002, "epoch": 4.270736086175942, "step": 59470}, {"loss": 0.5637, "grad_norm": 0.8745087385177612, "learning_rate": 0.0002, "epoch": 4.27145421903052, "step": 59480}, {"loss": 0.6246, "grad_norm": 0.8823763728141785, "learning_rate": 0.0002, "epoch": 4.272172351885099, "step": 59490}, {"loss": 0.6021, "grad_norm": 1.110912799835205, "learning_rate": 0.0002, "epoch": 4.272890484739677, "step": 59500}, {"loss": 0.5939, "grad_norm": 1.0000925064086914, "learning_rate": 0.0002, "epoch": 4.273608617594255, "step": 59510}, {"loss": 0.5531, "grad_norm": 1.1578227281570435, "learning_rate": 0.0002, "epoch": 4.274326750448833, "step": 59520}, {"loss": 0.6372, "grad_norm": 0.875720202922821, "learning_rate": 0.0002, "epoch": 4.275044883303411, "step": 59530}, {"loss": 0.5956, "grad_norm": 0.9562238454818726, "learning_rate": 0.0002, "epoch": 4.275763016157989, "step": 59540}, {"loss": 0.5996, "grad_norm": 0.8384222388267517, "learning_rate": 0.0002, "epoch": 4.276481149012567, "step": 59550}, {"loss": 0.6001, "grad_norm": 1.2719428539276123, "learning_rate": 0.0002, "epoch": 4.277199281867145, "step": 59560}, {"loss": 0.6286, "grad_norm": 1.0656434297561646, "learning_rate": 0.0002, "epoch": 4.277917414721723, "step": 59570}, {"loss": 0.5895, "grad_norm": 1.0766716003417969, "learning_rate": 0.0002, "epoch": 4.278635547576302, "step": 59580}, {"loss": 0.5831, "grad_norm": 0.8892807960510254, "learning_rate": 0.0002, "epoch": 4.27935368043088, "step": 59590}, {"loss": 0.5717, "grad_norm": 0.8956300020217896, "learning_rate": 0.0002, "epoch": 4.280071813285458, "step": 59600}, {"loss": 0.5965, "grad_norm": 0.9562926888465881, "learning_rate": 0.0002, "epoch": 4.280789946140036, "step": 59610}, {"loss": 0.5487, "grad_norm": 1.009141445159912, "learning_rate": 0.0002, "epoch": 4.281508078994614, "step": 59620}, {"loss": 0.6337, "grad_norm": 1.0546064376831055, "learning_rate": 0.0002, "epoch": 4.282226211849192, "step": 59630}, {"loss": 0.5771, "grad_norm": 0.8831254243850708, "learning_rate": 0.0002, "epoch": 4.28294434470377, "step": 59640}, {"loss": 0.6241, "grad_norm": 0.9560053944587708, "learning_rate": 0.0002, "epoch": 4.283662477558348, "step": 59650}, {"loss": 0.6012, "grad_norm": 1.030339241027832, "learning_rate": 0.0002, "epoch": 4.284380610412926, "step": 59660}, {"loss": 0.6174, "grad_norm": 1.00662100315094, "learning_rate": 0.0002, "epoch": 4.285098743267504, "step": 59670}, {"loss": 0.5802, "grad_norm": 1.0759116411209106, "learning_rate": 0.0002, "epoch": 4.285816876122083, "step": 59680}, {"loss": 0.6429, "grad_norm": 0.9985393285751343, "learning_rate": 0.0002, "epoch": 4.286535008976661, "step": 59690}, {"loss": 0.5992, "grad_norm": 0.9044474959373474, "learning_rate": 0.0002, "epoch": 4.287253141831239, "step": 59700}, {"loss": 0.6263, "grad_norm": 1.1224442720413208, "learning_rate": 0.0002, "epoch": 4.287971274685817, "step": 59710}, {"loss": 0.6118, "grad_norm": 0.8436414003372192, "learning_rate": 0.0002, "epoch": 4.288689407540395, "step": 59720}, {"loss": 0.5881, "grad_norm": 1.0695041418075562, "learning_rate": 0.0002, "epoch": 4.289407540394973, "step": 59730}, {"loss": 0.5994, "grad_norm": 0.8809951543807983, "learning_rate": 0.0002, "epoch": 4.290125673249551, "step": 59740}, {"loss": 0.6508, "grad_norm": 1.0213792324066162, "learning_rate": 0.0002, "epoch": 4.290843806104129, "step": 59750}, {"loss": 0.5851, "grad_norm": 0.9660196900367737, "learning_rate": 0.0002, "epoch": 4.291561938958707, "step": 59760}, {"loss": 0.6582, "grad_norm": 0.8005787134170532, "learning_rate": 0.0002, "epoch": 4.292280071813286, "step": 59770}, {"loss": 0.6504, "grad_norm": 1.0016109943389893, "learning_rate": 0.0002, "epoch": 4.292998204667864, "step": 59780}, {"loss": 0.5765, "grad_norm": 0.9112903475761414, "learning_rate": 0.0002, "epoch": 4.293716337522442, "step": 59790}, {"loss": 0.5925, "grad_norm": 0.9999852180480957, "learning_rate": 0.0002, "epoch": 4.29443447037702, "step": 59800}, {"loss": 0.636, "grad_norm": 0.9323953986167908, "learning_rate": 0.0002, "epoch": 4.295152603231598, "step": 59810}, {"loss": 0.5743, "grad_norm": 0.903037965297699, "learning_rate": 0.0002, "epoch": 4.295870736086176, "step": 59820}, {"loss": 0.6008, "grad_norm": 1.2462431192398071, "learning_rate": 0.0002, "epoch": 4.296588868940754, "step": 59830}, {"loss": 0.6126, "grad_norm": 1.2322230339050293, "learning_rate": 0.0002, "epoch": 4.297307001795332, "step": 59840}, {"loss": 0.6029, "grad_norm": 0.9584668278694153, "learning_rate": 0.0002, "epoch": 4.29802513464991, "step": 59850}, {"loss": 0.6179, "grad_norm": 0.9664767980575562, "learning_rate": 0.0002, "epoch": 4.298743267504488, "step": 59860}, {"loss": 0.5909, "grad_norm": 0.8860437273979187, "learning_rate": 0.0002, "epoch": 4.299461400359067, "step": 59870}, {"loss": 0.5708, "grad_norm": 1.0825127363204956, "learning_rate": 0.0002, "epoch": 4.300179533213645, "step": 59880}, {"loss": 0.6338, "grad_norm": 1.1312100887298584, "learning_rate": 0.0002, "epoch": 4.300897666068223, "step": 59890}, {"loss": 0.6362, "grad_norm": 0.8289751410484314, "learning_rate": 0.0002, "epoch": 4.301615798922801, "step": 59900}, {"loss": 0.6061, "grad_norm": 0.8990927934646606, "learning_rate": 0.0002, "epoch": 4.302333931777379, "step": 59910}, {"loss": 0.5993, "grad_norm": 0.9667525887489319, "learning_rate": 0.0002, "epoch": 4.303052064631957, "step": 59920}, {"loss": 0.5756, "grad_norm": 0.8656060695648193, "learning_rate": 0.0002, "epoch": 4.303770197486535, "step": 59930}, {"loss": 0.6271, "grad_norm": 0.8909396529197693, "learning_rate": 0.0002, "epoch": 4.304488330341113, "step": 59940}, {"loss": 0.5918, "grad_norm": 0.9533283114433289, "learning_rate": 0.0002, "epoch": 4.305206463195692, "step": 59950}, {"loss": 0.6146, "grad_norm": 0.9090739488601685, "learning_rate": 0.0002, "epoch": 4.30592459605027, "step": 59960}, {"loss": 0.5949, "grad_norm": 1.096656322479248, "learning_rate": 0.0002, "epoch": 4.306642728904848, "step": 59970}, {"loss": 0.582, "grad_norm": 1.0392465591430664, "learning_rate": 0.0002, "epoch": 4.307360861759426, "step": 59980}, {"loss": 0.6552, "grad_norm": 0.8733913898468018, "learning_rate": 0.0002, "epoch": 4.308078994614004, "step": 59990}, {"loss": 0.5771, "grad_norm": 0.8287094235420227, "learning_rate": 0.0002, "epoch": 4.308797127468582, "step": 60000}, {"loss": 0.6157, "grad_norm": 0.9267017245292664, "learning_rate": 0.0002, "epoch": 4.30951526032316, "step": 60010}, {"loss": 0.6402, "grad_norm": 0.9969515800476074, "learning_rate": 0.0002, "epoch": 4.310233393177738, "step": 60020}, {"loss": 0.541, "grad_norm": 1.0005015134811401, "learning_rate": 0.0002, "epoch": 4.310951526032316, "step": 60030}, {"loss": 0.6295, "grad_norm": 1.1215369701385498, "learning_rate": 0.0002, "epoch": 4.311669658886894, "step": 60040}, {"loss": 0.6225, "grad_norm": 1.0434890985488892, "learning_rate": 0.0002, "epoch": 4.312387791741473, "step": 60050}, {"loss": 0.5962, "grad_norm": 0.967989981174469, "learning_rate": 0.0002, "epoch": 4.313105924596051, "step": 60060}, {"loss": 0.5862, "grad_norm": 1.007599115371704, "learning_rate": 0.0002, "epoch": 4.313824057450629, "step": 60070}, {"loss": 0.6233, "grad_norm": 0.9356340765953064, "learning_rate": 0.0002, "epoch": 4.314542190305207, "step": 60080}, {"loss": 0.5642, "grad_norm": 0.9566757678985596, "learning_rate": 0.0002, "epoch": 4.315260323159785, "step": 60090}, {"loss": 0.6142, "grad_norm": 1.1066830158233643, "learning_rate": 0.0002, "epoch": 4.315978456014363, "step": 60100}, {"loss": 0.5432, "grad_norm": 0.9895772933959961, "learning_rate": 0.0002, "epoch": 4.316696588868941, "step": 60110}, {"loss": 0.5542, "grad_norm": 1.07423734664917, "learning_rate": 0.0002, "epoch": 4.317414721723519, "step": 60120}, {"loss": 0.5975, "grad_norm": 1.0777037143707275, "learning_rate": 0.0002, "epoch": 4.318132854578097, "step": 60130}, {"loss": 0.6168, "grad_norm": 1.1475656032562256, "learning_rate": 0.0002, "epoch": 4.3188509874326755, "step": 60140}, {"loss": 0.6038, "grad_norm": 1.0705864429473877, "learning_rate": 0.0002, "epoch": 4.3195691202872535, "step": 60150}, {"loss": 0.6032, "grad_norm": 0.8676854968070984, "learning_rate": 0.0002, "epoch": 4.3202872531418315, "step": 60160}, {"loss": 0.632, "grad_norm": 0.9488174319267273, "learning_rate": 0.0002, "epoch": 4.3210053859964095, "step": 60170}, {"loss": 0.6137, "grad_norm": 1.1171153783798218, "learning_rate": 0.0002, "epoch": 4.3217235188509875, "step": 60180}, {"loss": 0.6477, "grad_norm": 1.091435194015503, "learning_rate": 0.0002, "epoch": 4.3224416517055655, "step": 60190}, {"loss": 0.6105, "grad_norm": 0.880944013595581, "learning_rate": 0.0002, "epoch": 4.3231597845601435, "step": 60200}, {"loss": 0.5736, "grad_norm": 0.8458809852600098, "learning_rate": 0.0002, "epoch": 4.3238779174147215, "step": 60210}, {"loss": 0.6211, "grad_norm": 0.7900225520133972, "learning_rate": 0.0002, "epoch": 4.3245960502692995, "step": 60220}, {"loss": 0.6205, "grad_norm": 0.966742753982544, "learning_rate": 0.0002, "epoch": 4.3253141831238775, "step": 60230}, {"loss": 0.6178, "grad_norm": 0.8948110342025757, "learning_rate": 0.0002, "epoch": 4.326032315978456, "step": 60240}, {"loss": 0.6176, "grad_norm": 0.8598700165748596, "learning_rate": 0.0002, "epoch": 4.326750448833034, "step": 60250}, {"loss": 0.6373, "grad_norm": 1.127610206604004, "learning_rate": 0.0002, "epoch": 4.327468581687612, "step": 60260}, {"loss": 0.6081, "grad_norm": 0.8357340693473816, "learning_rate": 0.0002, "epoch": 4.32818671454219, "step": 60270}, {"loss": 0.5839, "grad_norm": 0.8771896362304688, "learning_rate": 0.0002, "epoch": 4.328904847396768, "step": 60280}, {"loss": 0.5959, "grad_norm": 0.9202101826667786, "learning_rate": 0.0002, "epoch": 4.329622980251346, "step": 60290}, {"loss": 0.6387, "grad_norm": 1.1427538394927979, "learning_rate": 0.0002, "epoch": 4.330341113105924, "step": 60300}, {"loss": 0.6306, "grad_norm": 0.8711863160133362, "learning_rate": 0.0002, "epoch": 4.331059245960502, "step": 60310}, {"loss": 0.6011, "grad_norm": 0.972723662853241, "learning_rate": 0.0002, "epoch": 4.33177737881508, "step": 60320}, {"loss": 0.5761, "grad_norm": 1.1496877670288086, "learning_rate": 0.0002, "epoch": 4.332495511669659, "step": 60330}, {"loss": 0.6472, "grad_norm": 1.008581519126892, "learning_rate": 0.0002, "epoch": 4.333213644524237, "step": 60340}, {"loss": 0.6479, "grad_norm": 1.0802706480026245, "learning_rate": 0.0002, "epoch": 4.333931777378815, "step": 60350}, {"loss": 0.6105, "grad_norm": 0.8394291996955872, "learning_rate": 0.0002, "epoch": 4.334649910233393, "step": 60360}, {"loss": 0.6241, "grad_norm": 0.8355905413627625, "learning_rate": 0.0002, "epoch": 4.335368043087971, "step": 60370}, {"loss": 0.6282, "grad_norm": 0.9583960175514221, "learning_rate": 0.0002, "epoch": 4.336086175942549, "step": 60380}, {"loss": 0.6436, "grad_norm": 1.138934850692749, "learning_rate": 0.0002, "epoch": 4.336804308797127, "step": 60390}, {"loss": 0.587, "grad_norm": 1.0334709882736206, "learning_rate": 0.0002, "epoch": 4.337522441651705, "step": 60400}, {"loss": 0.5596, "grad_norm": 0.729686439037323, "learning_rate": 0.0002, "epoch": 4.338240574506283, "step": 60410}, {"loss": 0.5863, "grad_norm": 0.8735929727554321, "learning_rate": 0.0002, "epoch": 4.338958707360861, "step": 60420}, {"loss": 0.5732, "grad_norm": 0.9617681503295898, "learning_rate": 0.0002, "epoch": 4.33967684021544, "step": 60430}, {"loss": 0.5865, "grad_norm": 0.9439655542373657, "learning_rate": 0.0002, "epoch": 4.340394973070018, "step": 60440}, {"loss": 0.5959, "grad_norm": 0.9275408387184143, "learning_rate": 0.0002, "epoch": 4.341113105924596, "step": 60450}, {"loss": 0.6295, "grad_norm": 1.0693308115005493, "learning_rate": 0.0002, "epoch": 4.341831238779174, "step": 60460}, {"loss": 0.6455, "grad_norm": 0.9234438538551331, "learning_rate": 0.0002, "epoch": 4.342549371633752, "step": 60470}, {"loss": 0.6308, "grad_norm": 1.1376168727874756, "learning_rate": 0.0002, "epoch": 4.34326750448833, "step": 60480}, {"loss": 0.623, "grad_norm": 0.9218108654022217, "learning_rate": 0.0002, "epoch": 4.343985637342908, "step": 60490}, {"loss": 0.6291, "grad_norm": 1.1467362642288208, "learning_rate": 0.0002, "epoch": 4.344703770197486, "step": 60500}, {"loss": 0.5757, "grad_norm": 0.9459165930747986, "learning_rate": 0.0002, "epoch": 4.345421903052064, "step": 60510}, {"loss": 0.5963, "grad_norm": 0.9460827708244324, "learning_rate": 0.0002, "epoch": 4.346140035906643, "step": 60520}, {"loss": 0.5822, "grad_norm": 1.0845041275024414, "learning_rate": 0.0002, "epoch": 4.346858168761221, "step": 60530}, {"loss": 0.6326, "grad_norm": 1.082675576210022, "learning_rate": 0.0002, "epoch": 4.347576301615799, "step": 60540}, {"loss": 0.5419, "grad_norm": 0.8443698883056641, "learning_rate": 0.0002, "epoch": 4.348294434470377, "step": 60550}, {"loss": 0.5634, "grad_norm": 1.018393874168396, "learning_rate": 0.0002, "epoch": 4.349012567324955, "step": 60560}, {"loss": 0.6447, "grad_norm": 0.8796373009681702, "learning_rate": 0.0002, "epoch": 4.349730700179533, "step": 60570}, {"loss": 0.6108, "grad_norm": 1.097942590713501, "learning_rate": 0.0002, "epoch": 4.350448833034111, "step": 60580}, {"loss": 0.6161, "grad_norm": 0.8750485181808472, "learning_rate": 0.0002, "epoch": 4.351166965888689, "step": 60590}, {"loss": 0.5849, "grad_norm": 1.0339995622634888, "learning_rate": 0.0002, "epoch": 4.351885098743267, "step": 60600}, {"loss": 0.6097, "grad_norm": 0.9077731966972351, "learning_rate": 0.0002, "epoch": 4.352603231597846, "step": 60610}, {"loss": 0.5657, "grad_norm": 1.051321029663086, "learning_rate": 0.0002, "epoch": 4.353321364452424, "step": 60620}, {"loss": 0.6089, "grad_norm": 1.0018669366836548, "learning_rate": 0.0002, "epoch": 4.354039497307002, "step": 60630}, {"loss": 0.5957, "grad_norm": 1.0349196195602417, "learning_rate": 0.0002, "epoch": 4.35475763016158, "step": 60640}, {"loss": 0.6212, "grad_norm": 1.009589672088623, "learning_rate": 0.0002, "epoch": 4.355475763016158, "step": 60650}, {"loss": 0.5542, "grad_norm": 1.0463480949401855, "learning_rate": 0.0002, "epoch": 4.356193895870736, "step": 60660}, {"loss": 0.5797, "grad_norm": 0.9815132021903992, "learning_rate": 0.0002, "epoch": 4.356912028725314, "step": 60670}, {"loss": 0.6089, "grad_norm": 1.0977262258529663, "learning_rate": 0.0002, "epoch": 4.357630161579892, "step": 60680}, {"loss": 0.6061, "grad_norm": 0.8450005054473877, "learning_rate": 0.0002, "epoch": 4.35834829443447, "step": 60690}, {"loss": 0.5913, "grad_norm": 1.0959078073501587, "learning_rate": 0.0002, "epoch": 4.359066427289049, "step": 60700}, {"loss": 0.5957, "grad_norm": 0.9155098795890808, "learning_rate": 0.0002, "epoch": 4.359784560143627, "step": 60710}, {"loss": 0.6084, "grad_norm": 0.9267987012863159, "learning_rate": 0.0002, "epoch": 4.360502692998205, "step": 60720}, {"loss": 0.5974, "grad_norm": 1.177472472190857, "learning_rate": 0.0002, "epoch": 4.361220825852783, "step": 60730}, {"loss": 0.5911, "grad_norm": 0.8615312576293945, "learning_rate": 0.0002, "epoch": 4.361938958707361, "step": 60740}, {"loss": 0.5819, "grad_norm": 1.0939710140228271, "learning_rate": 0.0002, "epoch": 4.362657091561939, "step": 60750}, {"loss": 0.6263, "grad_norm": 1.0928049087524414, "learning_rate": 0.0002, "epoch": 4.363375224416517, "step": 60760}, {"loss": 0.5772, "grad_norm": 1.0796833038330078, "learning_rate": 0.0002, "epoch": 4.364093357271095, "step": 60770}, {"loss": 0.5879, "grad_norm": 0.9768339991569519, "learning_rate": 0.0002, "epoch": 4.364811490125673, "step": 60780}, {"loss": 0.6335, "grad_norm": 0.9082722067832947, "learning_rate": 0.0002, "epoch": 4.365529622980251, "step": 60790}, {"loss": 0.6037, "grad_norm": 0.9614832997322083, "learning_rate": 0.0002, "epoch": 4.36624775583483, "step": 60800}, {"loss": 0.6185, "grad_norm": 0.8874651789665222, "learning_rate": 0.0002, "epoch": 4.366965888689408, "step": 60810}, {"loss": 0.6524, "grad_norm": 0.8810178637504578, "learning_rate": 0.0002, "epoch": 4.367684021543986, "step": 60820}, {"loss": 0.5908, "grad_norm": 1.0893806219100952, "learning_rate": 0.0002, "epoch": 4.368402154398564, "step": 60830}, {"loss": 0.5782, "grad_norm": 0.9042278528213501, "learning_rate": 0.0002, "epoch": 4.369120287253142, "step": 60840}, {"loss": 0.5798, "grad_norm": 1.0832217931747437, "learning_rate": 0.0002, "epoch": 4.36983842010772, "step": 60850}, {"loss": 0.6235, "grad_norm": 0.9431114792823792, "learning_rate": 0.0002, "epoch": 4.370556552962298, "step": 60860}, {"loss": 0.5869, "grad_norm": 1.031553030014038, "learning_rate": 0.0002, "epoch": 4.371274685816876, "step": 60870}, {"loss": 0.5839, "grad_norm": 0.8702824711799622, "learning_rate": 0.0002, "epoch": 4.371992818671454, "step": 60880}, {"loss": 0.6028, "grad_norm": 1.1109199523925781, "learning_rate": 0.0002, "epoch": 4.372710951526033, "step": 60890}, {"loss": 0.6423, "grad_norm": 0.8369361162185669, "learning_rate": 0.0002, "epoch": 4.373429084380611, "step": 60900}, {"loss": 0.6011, "grad_norm": 0.988915205001831, "learning_rate": 0.0002, "epoch": 4.374147217235189, "step": 60910}, {"loss": 0.6266, "grad_norm": 0.9365919232368469, "learning_rate": 0.0002, "epoch": 4.374865350089767, "step": 60920}, {"loss": 0.5786, "grad_norm": 0.9789398908615112, "learning_rate": 0.0002, "epoch": 4.375583482944345, "step": 60930}, {"loss": 0.6459, "grad_norm": 0.8786931037902832, "learning_rate": 0.0002, "epoch": 4.376301615798923, "step": 60940}, {"loss": 0.631, "grad_norm": 0.8891511559486389, "learning_rate": 0.0002, "epoch": 4.377019748653501, "step": 60950}, {"loss": 0.5909, "grad_norm": 0.9561707377433777, "learning_rate": 0.0002, "epoch": 4.377737881508079, "step": 60960}, {"loss": 0.5815, "grad_norm": 0.8674200177192688, "learning_rate": 0.0002, "epoch": 4.378456014362657, "step": 60970}, {"loss": 0.5664, "grad_norm": 0.9285916090011597, "learning_rate": 0.0002, "epoch": 4.379174147217235, "step": 60980}, {"loss": 0.5727, "grad_norm": 0.9185547232627869, "learning_rate": 0.0002, "epoch": 4.379892280071814, "step": 60990}, {"loss": 0.6296, "grad_norm": 1.081664800643921, "learning_rate": 0.0002, "epoch": 4.380610412926392, "step": 61000}, {"loss": 0.6346, "grad_norm": 1.0475854873657227, "learning_rate": 0.0002, "epoch": 4.38132854578097, "step": 61010}, {"loss": 0.6394, "grad_norm": 1.1519653797149658, "learning_rate": 0.0002, "epoch": 4.382046678635548, "step": 61020}, {"loss": 0.6437, "grad_norm": 0.8757607936859131, "learning_rate": 0.0002, "epoch": 4.382764811490126, "step": 61030}, {"loss": 0.6143, "grad_norm": 0.8707934021949768, "learning_rate": 0.0002, "epoch": 4.383482944344704, "step": 61040}, {"loss": 0.5782, "grad_norm": 1.1807516813278198, "learning_rate": 0.0002, "epoch": 4.384201077199282, "step": 61050}, {"loss": 0.5901, "grad_norm": 1.0674688816070557, "learning_rate": 0.0002, "epoch": 4.38491921005386, "step": 61060}, {"loss": 0.6247, "grad_norm": 0.9321209788322449, "learning_rate": 0.0002, "epoch": 4.385637342908438, "step": 61070}, {"loss": 0.5882, "grad_norm": 1.0786446332931519, "learning_rate": 0.0002, "epoch": 4.3863554757630165, "step": 61080}, {"loss": 0.5966, "grad_norm": 0.9733907580375671, "learning_rate": 0.0002, "epoch": 4.3870736086175945, "step": 61090}, {"loss": 0.5826, "grad_norm": 0.9476010203361511, "learning_rate": 0.0002, "epoch": 4.3877917414721725, "step": 61100}, {"loss": 0.6204, "grad_norm": 1.1321563720703125, "learning_rate": 0.0002, "epoch": 4.3885098743267505, "step": 61110}, {"loss": 0.5908, "grad_norm": 0.9379117488861084, "learning_rate": 0.0002, "epoch": 4.3892280071813286, "step": 61120}, {"loss": 0.586, "grad_norm": 0.8409728407859802, "learning_rate": 0.0002, "epoch": 4.3899461400359066, "step": 61130}, {"loss": 0.614, "grad_norm": 0.8309189081192017, "learning_rate": 0.0002, "epoch": 4.3906642728904846, "step": 61140}, {"loss": 0.6284, "grad_norm": 0.8922196626663208, "learning_rate": 0.0002, "epoch": 4.391382405745063, "step": 61150}, {"loss": 0.6358, "grad_norm": 0.8274614214897156, "learning_rate": 0.0002, "epoch": 4.392100538599641, "step": 61160}, {"loss": 0.5827, "grad_norm": 1.0928618907928467, "learning_rate": 0.0002, "epoch": 4.392818671454219, "step": 61170}, {"loss": 0.616, "grad_norm": 0.9771125316619873, "learning_rate": 0.0002, "epoch": 4.3935368043087974, "step": 61180}, {"loss": 0.6238, "grad_norm": 0.8844535946846008, "learning_rate": 0.0002, "epoch": 4.3942549371633755, "step": 61190}, {"loss": 0.5974, "grad_norm": 1.0498822927474976, "learning_rate": 0.0002, "epoch": 4.3949730700179535, "step": 61200}, {"loss": 0.596, "grad_norm": 0.9882155060768127, "learning_rate": 0.0002, "epoch": 4.3956912028725315, "step": 61210}, {"loss": 0.6385, "grad_norm": 1.090356707572937, "learning_rate": 0.0002, "epoch": 4.3964093357271095, "step": 61220}, {"loss": 0.6298, "grad_norm": 1.0908088684082031, "learning_rate": 0.0002, "epoch": 4.3971274685816875, "step": 61230}, {"loss": 0.6405, "grad_norm": 1.0013501644134521, "learning_rate": 0.0002, "epoch": 4.3978456014362655, "step": 61240}, {"loss": 0.5995, "grad_norm": 1.0916062593460083, "learning_rate": 0.0002, "epoch": 4.3985637342908435, "step": 61250}, {"loss": 0.5938, "grad_norm": 1.0817667245864868, "learning_rate": 0.0002, "epoch": 4.399281867145422, "step": 61260}, {"loss": 0.604, "grad_norm": 0.9745162129402161, "learning_rate": 0.0002, "epoch": 4.4, "step": 61270}, {"loss": 0.6028, "grad_norm": 1.0653400421142578, "learning_rate": 0.0002, "epoch": 4.400718132854578, "step": 61280}, {"loss": 0.6064, "grad_norm": 1.0082067251205444, "learning_rate": 0.0002, "epoch": 4.401436265709156, "step": 61290}, {"loss": 0.5719, "grad_norm": 0.7963659167289734, "learning_rate": 0.0002, "epoch": 4.402154398563734, "step": 61300}, {"loss": 0.6724, "grad_norm": 1.0428845882415771, "learning_rate": 0.0002, "epoch": 4.402872531418312, "step": 61310}, {"loss": 0.5991, "grad_norm": 0.9205707311630249, "learning_rate": 0.0002, "epoch": 4.40359066427289, "step": 61320}, {"loss": 0.6169, "grad_norm": 1.0103533267974854, "learning_rate": 0.0002, "epoch": 4.404308797127468, "step": 61330}, {"loss": 0.6284, "grad_norm": 1.113547682762146, "learning_rate": 0.0002, "epoch": 4.405026929982046, "step": 61340}, {"loss": 0.6071, "grad_norm": 1.137488842010498, "learning_rate": 0.0002, "epoch": 4.405745062836624, "step": 61350}, {"loss": 0.6303, "grad_norm": 1.1284101009368896, "learning_rate": 0.0002, "epoch": 4.406463195691203, "step": 61360}, {"loss": 0.5613, "grad_norm": 0.8010451197624207, "learning_rate": 0.0002, "epoch": 4.407181328545781, "step": 61370}, {"loss": 0.5963, "grad_norm": 0.8893977403640747, "learning_rate": 0.0002, "epoch": 4.407899461400359, "step": 61380}, {"loss": 0.6154, "grad_norm": 0.9098272323608398, "learning_rate": 0.0002, "epoch": 4.408617594254937, "step": 61390}, {"loss": 0.6091, "grad_norm": 1.0613329410552979, "learning_rate": 0.0002, "epoch": 4.409335727109515, "step": 61400}, {"loss": 0.6222, "grad_norm": 1.0070269107818604, "learning_rate": 0.0002, "epoch": 4.410053859964093, "step": 61410}, {"loss": 0.5894, "grad_norm": 0.8632227778434753, "learning_rate": 0.0002, "epoch": 4.410771992818671, "step": 61420}, {"loss": 0.6412, "grad_norm": 1.0183731317520142, "learning_rate": 0.0002, "epoch": 4.411490125673249, "step": 61430}, {"loss": 0.596, "grad_norm": 0.9049941897392273, "learning_rate": 0.0002, "epoch": 4.412208258527827, "step": 61440}, {"loss": 0.5991, "grad_norm": 1.0184082984924316, "learning_rate": 0.0002, "epoch": 4.412926391382406, "step": 61450}, {"loss": 0.5758, "grad_norm": 0.9994277358055115, "learning_rate": 0.0002, "epoch": 4.413644524236984, "step": 61460}, {"loss": 0.6009, "grad_norm": 1.0112420320510864, "learning_rate": 0.0002, "epoch": 4.414362657091562, "step": 61470}, {"loss": 0.584, "grad_norm": 0.9751759171485901, "learning_rate": 0.0002, "epoch": 4.41508078994614, "step": 61480}, {"loss": 0.6307, "grad_norm": 1.047135591506958, "learning_rate": 0.0002, "epoch": 4.415798922800718, "step": 61490}, {"loss": 0.6645, "grad_norm": 0.886282742023468, "learning_rate": 0.0002, "epoch": 4.416517055655296, "step": 61500}, {"loss": 0.6168, "grad_norm": 0.971964418888092, "learning_rate": 0.0002, "epoch": 4.417235188509874, "step": 61510}, {"loss": 0.5822, "grad_norm": 0.9603846073150635, "learning_rate": 0.0002, "epoch": 4.417953321364452, "step": 61520}, {"loss": 0.6349, "grad_norm": 1.060042142868042, "learning_rate": 0.0002, "epoch": 4.41867145421903, "step": 61530}, {"loss": 0.6223, "grad_norm": 1.1231369972229004, "learning_rate": 0.0002, "epoch": 4.419389587073608, "step": 61540}, {"loss": 0.6175, "grad_norm": 0.8269591331481934, "learning_rate": 0.0002, "epoch": 4.420107719928187, "step": 61550}, {"loss": 0.6285, "grad_norm": 1.0341241359710693, "learning_rate": 0.0002, "epoch": 4.420825852782765, "step": 61560}, {"loss": 0.6054, "grad_norm": 0.7276636958122253, "learning_rate": 0.0002, "epoch": 4.421543985637343, "step": 61570}, {"loss": 0.6321, "grad_norm": 1.0663669109344482, "learning_rate": 0.0002, "epoch": 4.422262118491921, "step": 61580}, {"loss": 0.5944, "grad_norm": 0.9764387011528015, "learning_rate": 0.0002, "epoch": 4.422980251346499, "step": 61590}, {"loss": 0.6065, "grad_norm": 1.0953258275985718, "learning_rate": 0.0002, "epoch": 4.423698384201077, "step": 61600}, {"loss": 0.5815, "grad_norm": 0.8877012729644775, "learning_rate": 0.0002, "epoch": 4.424416517055655, "step": 61610}, {"loss": 0.5798, "grad_norm": 0.8781440854072571, "learning_rate": 0.0002, "epoch": 4.425134649910233, "step": 61620}, {"loss": 0.6223, "grad_norm": 0.8333432674407959, "learning_rate": 0.0002, "epoch": 4.425852782764811, "step": 61630}, {"loss": 0.5949, "grad_norm": 0.9647989869117737, "learning_rate": 0.0002, "epoch": 4.42657091561939, "step": 61640}, {"loss": 0.6135, "grad_norm": 1.0801783800125122, "learning_rate": 0.0002, "epoch": 4.427289048473968, "step": 61650}, {"loss": 0.6065, "grad_norm": 0.8215882778167725, "learning_rate": 0.0002, "epoch": 4.428007181328546, "step": 61660}, {"loss": 0.5851, "grad_norm": 0.9853931665420532, "learning_rate": 0.0002, "epoch": 4.428725314183124, "step": 61670}, {"loss": 0.5942, "grad_norm": 0.8658010959625244, "learning_rate": 0.0002, "epoch": 4.429443447037702, "step": 61680}, {"loss": 0.6413, "grad_norm": 1.124064326286316, "learning_rate": 0.0002, "epoch": 4.43016157989228, "step": 61690}, {"loss": 0.6021, "grad_norm": 1.009340763092041, "learning_rate": 0.0002, "epoch": 4.430879712746858, "step": 61700}, {"loss": 0.6127, "grad_norm": 0.8705293536186218, "learning_rate": 0.0002, "epoch": 4.431597845601436, "step": 61710}, {"loss": 0.5971, "grad_norm": 1.1323511600494385, "learning_rate": 0.0002, "epoch": 4.432315978456014, "step": 61720}, {"loss": 0.5985, "grad_norm": 1.1203019618988037, "learning_rate": 0.0002, "epoch": 4.433034111310592, "step": 61730}, {"loss": 0.6178, "grad_norm": 1.1683770418167114, "learning_rate": 0.0002, "epoch": 4.433752244165171, "step": 61740}, {"loss": 0.6132, "grad_norm": 1.0735899209976196, "learning_rate": 0.0002, "epoch": 4.434470377019749, "step": 61750}, {"loss": 0.5664, "grad_norm": 1.142496109008789, "learning_rate": 0.0002, "epoch": 4.435188509874327, "step": 61760}, {"loss": 0.6276, "grad_norm": 1.1157732009887695, "learning_rate": 0.0002, "epoch": 4.435906642728905, "step": 61770}, {"loss": 0.6237, "grad_norm": 0.8845949172973633, "learning_rate": 0.0002, "epoch": 4.436624775583483, "step": 61780}, {"loss": 0.5964, "grad_norm": 1.1212759017944336, "learning_rate": 0.0002, "epoch": 4.437342908438061, "step": 61790}, {"loss": 0.6185, "grad_norm": 0.8832488656044006, "learning_rate": 0.0002, "epoch": 4.438061041292639, "step": 61800}, {"loss": 0.6264, "grad_norm": 0.9059590101242065, "learning_rate": 0.0002, "epoch": 4.438779174147217, "step": 61810}, {"loss": 0.6303, "grad_norm": 1.0625685453414917, "learning_rate": 0.0002, "epoch": 4.439497307001796, "step": 61820}, {"loss": 0.5795, "grad_norm": 0.9565598368644714, "learning_rate": 0.0002, "epoch": 4.440215439856374, "step": 61830}, {"loss": 0.6027, "grad_norm": 0.8975377082824707, "learning_rate": 0.0002, "epoch": 4.440933572710952, "step": 61840}, {"loss": 0.6334, "grad_norm": 1.0412718057632446, "learning_rate": 0.0002, "epoch": 4.44165170556553, "step": 61850}, {"loss": 0.6455, "grad_norm": 0.9923529624938965, "learning_rate": 0.0002, "epoch": 4.442369838420108, "step": 61860}, {"loss": 0.5931, "grad_norm": 1.3025734424591064, "learning_rate": 0.0002, "epoch": 4.443087971274686, "step": 61870}, {"loss": 0.5804, "grad_norm": 1.0031960010528564, "learning_rate": 0.0002, "epoch": 4.443806104129264, "step": 61880}, {"loss": 0.602, "grad_norm": 1.0974701642990112, "learning_rate": 0.0002, "epoch": 4.444524236983842, "step": 61890}, {"loss": 0.6078, "grad_norm": 1.1044024229049683, "learning_rate": 0.0002, "epoch": 4.44524236983842, "step": 61900}, {"loss": 0.6454, "grad_norm": 1.0782772302627563, "learning_rate": 0.0002, "epoch": 4.445960502692998, "step": 61910}, {"loss": 0.6453, "grad_norm": 1.006304383277893, "learning_rate": 0.0002, "epoch": 4.446678635547577, "step": 61920}, {"loss": 0.5449, "grad_norm": 0.9258833527565002, "learning_rate": 0.0002, "epoch": 4.447396768402155, "step": 61930}, {"loss": 0.5744, "grad_norm": 0.9888426065444946, "learning_rate": 0.0002, "epoch": 4.448114901256733, "step": 61940}, {"loss": 0.5853, "grad_norm": 0.9592963457107544, "learning_rate": 0.0002, "epoch": 4.448833034111311, "step": 61950}, {"loss": 0.6142, "grad_norm": 1.0527986288070679, "learning_rate": 0.0002, "epoch": 4.449551166965889, "step": 61960}, {"loss": 0.5829, "grad_norm": 0.8613291382789612, "learning_rate": 0.0002, "epoch": 4.450269299820467, "step": 61970}, {"loss": 0.6176, "grad_norm": 1.1083767414093018, "learning_rate": 0.0002, "epoch": 4.450987432675045, "step": 61980}, {"loss": 0.5768, "grad_norm": 0.772679328918457, "learning_rate": 0.0002, "epoch": 4.451705565529623, "step": 61990}, {"loss": 0.6348, "grad_norm": 0.9052274227142334, "learning_rate": 0.0002, "epoch": 4.452423698384201, "step": 62000}, {"loss": 0.6202, "grad_norm": 1.129667043685913, "learning_rate": 0.0002, "epoch": 4.45314183123878, "step": 62010}, {"loss": 0.6265, "grad_norm": 0.9994529485702515, "learning_rate": 0.0002, "epoch": 4.453859964093358, "step": 62020}, {"loss": 0.6249, "grad_norm": 0.982155978679657, "learning_rate": 0.0002, "epoch": 4.454578096947936, "step": 62030}, {"loss": 0.6255, "grad_norm": 0.9139904975891113, "learning_rate": 0.0002, "epoch": 4.455296229802514, "step": 62040}, {"loss": 0.6237, "grad_norm": 1.0877810716629028, "learning_rate": 0.0002, "epoch": 4.456014362657092, "step": 62050}, {"loss": 0.6105, "grad_norm": 1.0535308122634888, "learning_rate": 0.0002, "epoch": 4.45673249551167, "step": 62060}, {"loss": 0.6084, "grad_norm": 1.0225313901901245, "learning_rate": 0.0002, "epoch": 4.457450628366248, "step": 62070}, {"loss": 0.6239, "grad_norm": 0.8443132042884827, "learning_rate": 0.0002, "epoch": 4.458168761220826, "step": 62080}, {"loss": 0.5895, "grad_norm": 1.0426654815673828, "learning_rate": 0.0002, "epoch": 4.458886894075404, "step": 62090}, {"loss": 0.6022, "grad_norm": 1.1110700368881226, "learning_rate": 0.0002, "epoch": 4.459605026929982, "step": 62100}, {"loss": 0.6436, "grad_norm": 1.0200893878936768, "learning_rate": 0.0002, "epoch": 4.4603231597845605, "step": 62110}, {"loss": 0.628, "grad_norm": 0.9102830290794373, "learning_rate": 0.0002, "epoch": 4.4610412926391385, "step": 62120}, {"loss": 0.5894, "grad_norm": 1.1395094394683838, "learning_rate": 0.0002, "epoch": 4.4617594254937165, "step": 62130}, {"loss": 0.5765, "grad_norm": 1.1202316284179688, "learning_rate": 0.0002, "epoch": 4.4624775583482945, "step": 62140}, {"loss": 0.6238, "grad_norm": 1.142580509185791, "learning_rate": 0.0002, "epoch": 4.4631956912028725, "step": 62150}, {"loss": 0.6502, "grad_norm": 0.9843677878379822, "learning_rate": 0.0002, "epoch": 4.4639138240574505, "step": 62160}, {"loss": 0.6734, "grad_norm": 1.0351676940917969, "learning_rate": 0.0002, "epoch": 4.4646319569120285, "step": 62170}, {"loss": 0.6371, "grad_norm": 0.9365093111991882, "learning_rate": 0.0002, "epoch": 4.4653500897666065, "step": 62180}, {"loss": 0.5827, "grad_norm": 1.041193962097168, "learning_rate": 0.0002, "epoch": 4.4660682226211845, "step": 62190}, {"loss": 0.555, "grad_norm": 0.9686329960823059, "learning_rate": 0.0002, "epoch": 4.466786355475763, "step": 62200}, {"loss": 0.6405, "grad_norm": 1.028622031211853, "learning_rate": 0.0002, "epoch": 4.467504488330341, "step": 62210}, {"loss": 0.5928, "grad_norm": 0.9717516899108887, "learning_rate": 0.0002, "epoch": 4.468222621184919, "step": 62220}, {"loss": 0.6028, "grad_norm": 1.0467450618743896, "learning_rate": 0.0002, "epoch": 4.468940754039497, "step": 62230}, {"loss": 0.593, "grad_norm": 0.943717896938324, "learning_rate": 0.0002, "epoch": 4.469658886894075, "step": 62240}, {"loss": 0.5861, "grad_norm": 0.909429132938385, "learning_rate": 0.0002, "epoch": 4.470377019748653, "step": 62250}, {"loss": 0.6211, "grad_norm": 1.0294792652130127, "learning_rate": 0.0002, "epoch": 4.471095152603231, "step": 62260}, {"loss": 0.6215, "grad_norm": 1.1044281721115112, "learning_rate": 0.0002, "epoch": 4.471813285457809, "step": 62270}, {"loss": 0.6147, "grad_norm": 1.1555784940719604, "learning_rate": 0.0002, "epoch": 4.472531418312387, "step": 62280}, {"loss": 0.627, "grad_norm": 0.9441297650337219, "learning_rate": 0.0002, "epoch": 4.473249551166965, "step": 62290}, {"loss": 0.6205, "grad_norm": 0.9164380431175232, "learning_rate": 0.0002, "epoch": 4.473967684021544, "step": 62300}, {"loss": 0.6413, "grad_norm": 1.1139159202575684, "learning_rate": 0.0002, "epoch": 4.474685816876122, "step": 62310}, {"loss": 0.6013, "grad_norm": 1.0201882123947144, "learning_rate": 0.0002, "epoch": 4.4754039497307, "step": 62320}, {"loss": 0.6127, "grad_norm": 1.1471681594848633, "learning_rate": 0.0002, "epoch": 4.476122082585278, "step": 62330}, {"loss": 0.6322, "grad_norm": 1.0333549976348877, "learning_rate": 0.0002, "epoch": 4.476840215439856, "step": 62340}, {"loss": 0.654, "grad_norm": 0.8929767608642578, "learning_rate": 0.0002, "epoch": 4.477558348294434, "step": 62350}, {"loss": 0.6325, "grad_norm": 0.9465752840042114, "learning_rate": 0.0002, "epoch": 4.478276481149012, "step": 62360}, {"loss": 0.619, "grad_norm": 1.2155033349990845, "learning_rate": 0.0002, "epoch": 4.47899461400359, "step": 62370}, {"loss": 0.5538, "grad_norm": 0.7181217074394226, "learning_rate": 0.0002, "epoch": 4.479712746858169, "step": 62380}, {"loss": 0.6236, "grad_norm": 1.0052744150161743, "learning_rate": 0.0002, "epoch": 4.480430879712747, "step": 62390}, {"loss": 0.6443, "grad_norm": 0.8522219061851501, "learning_rate": 0.0002, "epoch": 4.481149012567325, "step": 62400}, {"loss": 0.6073, "grad_norm": 0.8844723105430603, "learning_rate": 0.0002, "epoch": 4.481867145421903, "step": 62410}, {"loss": 0.6193, "grad_norm": 0.9542465209960938, "learning_rate": 0.0002, "epoch": 4.482585278276481, "step": 62420}, {"loss": 0.6099, "grad_norm": 0.8963674306869507, "learning_rate": 0.0002, "epoch": 4.483303411131059, "step": 62430}, {"loss": 0.5826, "grad_norm": 0.8105363845825195, "learning_rate": 0.0002, "epoch": 4.484021543985637, "step": 62440}, {"loss": 0.6688, "grad_norm": 0.9618421196937561, "learning_rate": 0.0002, "epoch": 4.484739676840215, "step": 62450}, {"loss": 0.6042, "grad_norm": 1.1931076049804688, "learning_rate": 0.0002, "epoch": 4.485457809694793, "step": 62460}, {"loss": 0.5869, "grad_norm": 0.7406999468803406, "learning_rate": 0.0002, "epoch": 4.486175942549371, "step": 62470}, {"loss": 0.604, "grad_norm": 0.7698216438293457, "learning_rate": 0.0002, "epoch": 4.48689407540395, "step": 62480}, {"loss": 0.6062, "grad_norm": 0.862271249294281, "learning_rate": 0.0002, "epoch": 4.487612208258528, "step": 62490}, {"loss": 0.645, "grad_norm": 1.0025171041488647, "learning_rate": 0.0002, "epoch": 4.488330341113106, "step": 62500}, {"loss": 0.5727, "grad_norm": 0.8474493622779846, "learning_rate": 0.0002, "epoch": 4.489048473967684, "step": 62510}, {"loss": 0.6907, "grad_norm": 0.8965697884559631, "learning_rate": 0.0002, "epoch": 4.489766606822262, "step": 62520}, {"loss": 0.5846, "grad_norm": 1.1276488304138184, "learning_rate": 0.0002, "epoch": 4.49048473967684, "step": 62530}, {"loss": 0.6018, "grad_norm": 1.0253537893295288, "learning_rate": 0.0002, "epoch": 4.491202872531418, "step": 62540}, {"loss": 0.5831, "grad_norm": 1.1750596761703491, "learning_rate": 0.0002, "epoch": 4.491921005385996, "step": 62550}, {"loss": 0.6272, "grad_norm": 0.9951794147491455, "learning_rate": 0.0002, "epoch": 4.492639138240574, "step": 62560}, {"loss": 0.5931, "grad_norm": 1.2510017156600952, "learning_rate": 0.0002, "epoch": 4.493357271095153, "step": 62570}, {"loss": 0.6268, "grad_norm": 1.4066375494003296, "learning_rate": 0.0002, "epoch": 4.494075403949731, "step": 62580}, {"loss": 0.6274, "grad_norm": 0.988175094127655, "learning_rate": 0.0002, "epoch": 4.494793536804309, "step": 62590}, {"loss": 0.607, "grad_norm": 1.2049115896224976, "learning_rate": 0.0002, "epoch": 4.495511669658887, "step": 62600}, {"loss": 0.6384, "grad_norm": 0.962464451789856, "learning_rate": 0.0002, "epoch": 4.496229802513465, "step": 62610}, {"loss": 0.6436, "grad_norm": 0.9324793815612793, "learning_rate": 0.0002, "epoch": 4.496947935368043, "step": 62620}, {"loss": 0.6568, "grad_norm": 0.9174214005470276, "learning_rate": 0.0002, "epoch": 4.497666068222621, "step": 62630}, {"loss": 0.6146, "grad_norm": 0.9729902148246765, "learning_rate": 0.0002, "epoch": 4.498384201077199, "step": 62640}, {"loss": 0.6564, "grad_norm": 1.0190484523773193, "learning_rate": 0.0002, "epoch": 4.499102333931777, "step": 62650}, {"loss": 0.6571, "grad_norm": 1.1473679542541504, "learning_rate": 0.0002, "epoch": 4.499820466786355, "step": 62660}, {"loss": 0.6115, "grad_norm": 1.0160558223724365, "learning_rate": 0.0002, "epoch": 4.500538599640934, "step": 62670}, {"loss": 0.6206, "grad_norm": 0.8083887100219727, "learning_rate": 0.0002, "epoch": 4.501256732495512, "step": 62680}, {"loss": 0.6107, "grad_norm": 0.941933274269104, "learning_rate": 0.0002, "epoch": 4.50197486535009, "step": 62690}, {"loss": 0.6181, "grad_norm": 0.9962822794914246, "learning_rate": 0.0002, "epoch": 4.502692998204668, "step": 62700}, {"loss": 0.6364, "grad_norm": 0.8993943333625793, "learning_rate": 0.0002, "epoch": 4.503411131059246, "step": 62710}, {"loss": 0.6141, "grad_norm": 0.9438319206237793, "learning_rate": 0.0002, "epoch": 4.504129263913824, "step": 62720}, {"loss": 0.6453, "grad_norm": 0.7951892018318176, "learning_rate": 0.0002, "epoch": 4.504847396768402, "step": 62730}, {"loss": 0.616, "grad_norm": 0.8875413537025452, "learning_rate": 0.0002, "epoch": 4.50556552962298, "step": 62740}, {"loss": 0.5702, "grad_norm": 0.993819534778595, "learning_rate": 0.0002, "epoch": 4.506283662477558, "step": 62750}, {"loss": 0.6427, "grad_norm": 0.9177559018135071, "learning_rate": 0.0002, "epoch": 4.507001795332137, "step": 62760}, {"loss": 0.6278, "grad_norm": 0.8632771968841553, "learning_rate": 0.0002, "epoch": 4.507719928186715, "step": 62770}, {"loss": 0.6665, "grad_norm": 0.943778395652771, "learning_rate": 0.0002, "epoch": 4.508438061041293, "step": 62780}, {"loss": 0.6068, "grad_norm": 0.8754997849464417, "learning_rate": 0.0002, "epoch": 4.509156193895871, "step": 62790}, {"loss": 0.6345, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 4.509874326750449, "step": 62800}, {"loss": 0.6057, "grad_norm": 1.1156457662582397, "learning_rate": 0.0002, "epoch": 4.510592459605027, "step": 62810}, {"loss": 0.5915, "grad_norm": 0.9178887009620667, "learning_rate": 0.0002, "epoch": 4.511310592459605, "step": 62820}, {"loss": 0.6081, "grad_norm": 0.9520689249038696, "learning_rate": 0.0002, "epoch": 4.512028725314183, "step": 62830}, {"loss": 0.6434, "grad_norm": 0.8880525231361389, "learning_rate": 0.0002, "epoch": 4.512746858168761, "step": 62840}, {"loss": 0.6895, "grad_norm": 0.9541497826576233, "learning_rate": 0.0002, "epoch": 4.513464991023339, "step": 62850}, {"loss": 0.6675, "grad_norm": 1.003766417503357, "learning_rate": 0.0002, "epoch": 4.514183123877918, "step": 62860}, {"loss": 0.6412, "grad_norm": 0.8844705820083618, "learning_rate": 0.0002, "epoch": 4.514901256732496, "step": 62870}, {"loss": 0.6289, "grad_norm": 1.1870828866958618, "learning_rate": 0.0002, "epoch": 4.515619389587074, "step": 62880}, {"loss": 0.6611, "grad_norm": 0.863487184047699, "learning_rate": 0.0002, "epoch": 4.516337522441652, "step": 62890}, {"loss": 0.59, "grad_norm": 0.997770369052887, "learning_rate": 0.0002, "epoch": 4.51705565529623, "step": 62900}, {"loss": 0.6476, "grad_norm": 0.9708612561225891, "learning_rate": 0.0002, "epoch": 4.517773788150808, "step": 62910}, {"loss": 0.6084, "grad_norm": 1.1381206512451172, "learning_rate": 0.0002, "epoch": 4.518491921005386, "step": 62920}, {"loss": 0.5739, "grad_norm": 1.0386693477630615, "learning_rate": 0.0002, "epoch": 4.519210053859964, "step": 62930}, {"loss": 0.6038, "grad_norm": 1.1711705923080444, "learning_rate": 0.0002, "epoch": 4.519928186714543, "step": 62940}, {"loss": 0.6276, "grad_norm": 0.8727447390556335, "learning_rate": 0.0002, "epoch": 4.520646319569121, "step": 62950}, {"loss": 0.6298, "grad_norm": 0.9215193390846252, "learning_rate": 0.0002, "epoch": 4.521364452423699, "step": 62960}, {"loss": 0.6199, "grad_norm": 1.005467176437378, "learning_rate": 0.0002, "epoch": 4.522082585278277, "step": 62970}, {"loss": 0.6324, "grad_norm": 0.8761187791824341, "learning_rate": 0.0002, "epoch": 4.522800718132855, "step": 62980}, {"loss": 0.6152, "grad_norm": 0.957848310470581, "learning_rate": 0.0002, "epoch": 4.523518850987433, "step": 62990}, {"loss": 0.5752, "grad_norm": 0.8634148836135864, "learning_rate": 0.0002, "epoch": 4.524236983842011, "step": 63000}, {"loss": 0.6127, "grad_norm": 0.9557477235794067, "learning_rate": 0.0002, "epoch": 4.524955116696589, "step": 63010}, {"loss": 0.5708, "grad_norm": 1.017720341682434, "learning_rate": 0.0002, "epoch": 4.525673249551167, "step": 63020}, {"loss": 0.6186, "grad_norm": 1.0281825065612793, "learning_rate": 0.0002, "epoch": 4.526391382405745, "step": 63030}, {"loss": 0.6221, "grad_norm": 1.253974437713623, "learning_rate": 0.0002, "epoch": 4.527109515260323, "step": 63040}, {"loss": 0.6381, "grad_norm": 0.8489068150520325, "learning_rate": 0.0002, "epoch": 4.527827648114902, "step": 63050}, {"loss": 0.6022, "grad_norm": 0.9681686162948608, "learning_rate": 0.0002, "epoch": 4.52854578096948, "step": 63060}, {"loss": 0.6166, "grad_norm": 1.10277259349823, "learning_rate": 0.0002, "epoch": 4.529263913824058, "step": 63070}, {"loss": 0.5838, "grad_norm": 0.9469163417816162, "learning_rate": 0.0002, "epoch": 4.529982046678636, "step": 63080}, {"loss": 0.6323, "grad_norm": 1.1228134632110596, "learning_rate": 0.0002, "epoch": 4.530700179533214, "step": 63090}, {"loss": 0.6143, "grad_norm": 0.9673212170600891, "learning_rate": 0.0002, "epoch": 4.531418312387792, "step": 63100}, {"loss": 0.713, "grad_norm": 1.0221107006072998, "learning_rate": 0.0002, "epoch": 4.53213644524237, "step": 63110}, {"loss": 0.6099, "grad_norm": 0.826372504234314, "learning_rate": 0.0002, "epoch": 4.532854578096948, "step": 63120}, {"loss": 0.6487, "grad_norm": 1.1805331707000732, "learning_rate": 0.0002, "epoch": 4.5335727109515265, "step": 63130}, {"loss": 0.6088, "grad_norm": 0.9645666480064392, "learning_rate": 0.0002, "epoch": 4.5342908438061045, "step": 63140}, {"loss": 0.6049, "grad_norm": 1.0838309526443481, "learning_rate": 0.0002, "epoch": 4.5350089766606825, "step": 63150}, {"loss": 0.5972, "grad_norm": 1.061414361000061, "learning_rate": 0.0002, "epoch": 4.5357271095152605, "step": 63160}, {"loss": 0.5706, "grad_norm": 0.841961145401001, "learning_rate": 0.0002, "epoch": 4.5364452423698385, "step": 63170}, {"loss": 0.6168, "grad_norm": 1.1220186948776245, "learning_rate": 0.0002, "epoch": 4.5371633752244165, "step": 63180}, {"loss": 0.6055, "grad_norm": 1.036441445350647, "learning_rate": 0.0002, "epoch": 4.5378815080789945, "step": 63190}, {"loss": 0.619, "grad_norm": 0.9089716076850891, "learning_rate": 0.0002, "epoch": 4.5385996409335725, "step": 63200}, {"loss": 0.6373, "grad_norm": 0.8699982762336731, "learning_rate": 0.0002, "epoch": 4.5393177737881505, "step": 63210}, {"loss": 0.6082, "grad_norm": 0.8489565253257751, "learning_rate": 0.0002, "epoch": 4.5400359066427285, "step": 63220}, {"loss": 0.5957, "grad_norm": 0.7778416275978088, "learning_rate": 0.0002, "epoch": 4.540754039497307, "step": 63230}, {"loss": 0.6109, "grad_norm": 1.0625852346420288, "learning_rate": 0.0002, "epoch": 4.541472172351885, "step": 63240}, {"loss": 0.6039, "grad_norm": 0.8515732884407043, "learning_rate": 0.0002, "epoch": 4.542190305206463, "step": 63250}, {"loss": 0.5827, "grad_norm": 0.7679561376571655, "learning_rate": 0.0002, "epoch": 4.542908438061041, "step": 63260}, {"loss": 0.5948, "grad_norm": 0.7358446717262268, "learning_rate": 0.0002, "epoch": 4.543626570915619, "step": 63270}, {"loss": 0.6265, "grad_norm": 1.0866128206253052, "learning_rate": 0.0002, "epoch": 4.544344703770197, "step": 63280}, {"loss": 0.6622, "grad_norm": 1.0870225429534912, "learning_rate": 0.0002, "epoch": 4.545062836624775, "step": 63290}, {"loss": 0.5859, "grad_norm": 0.951095461845398, "learning_rate": 0.0002, "epoch": 4.545780969479353, "step": 63300}, {"loss": 0.6252, "grad_norm": 1.0914306640625, "learning_rate": 0.0002, "epoch": 4.546499102333931, "step": 63310}, {"loss": 0.6504, "grad_norm": 0.8676106333732605, "learning_rate": 0.0002, "epoch": 4.54721723518851, "step": 63320}, {"loss": 0.6088, "grad_norm": 1.0129096508026123, "learning_rate": 0.0002, "epoch": 4.547935368043088, "step": 63330}, {"loss": 0.617, "grad_norm": 0.8710526823997498, "learning_rate": 0.0002, "epoch": 4.548653500897666, "step": 63340}, {"loss": 0.6336, "grad_norm": 0.7014815807342529, "learning_rate": 0.0002, "epoch": 4.549371633752244, "step": 63350}, {"loss": 0.5758, "grad_norm": 1.1546777486801147, "learning_rate": 0.0002, "epoch": 4.550089766606822, "step": 63360}, {"loss": 0.5976, "grad_norm": 0.7464957237243652, "learning_rate": 0.0002, "epoch": 4.5508078994614, "step": 63370}, {"loss": 0.6016, "grad_norm": 0.9976209998130798, "learning_rate": 0.0002, "epoch": 4.551526032315978, "step": 63380}, {"loss": 0.5784, "grad_norm": 0.9543681740760803, "learning_rate": 0.0002, "epoch": 4.552244165170556, "step": 63390}, {"loss": 0.5873, "grad_norm": 1.1498578786849976, "learning_rate": 0.0002, "epoch": 4.552962298025134, "step": 63400}, {"loss": 0.6445, "grad_norm": 1.0162293910980225, "learning_rate": 0.0002, "epoch": 4.553680430879712, "step": 63410}, {"loss": 0.5677, "grad_norm": 0.9015304446220398, "learning_rate": 0.0002, "epoch": 4.554398563734291, "step": 63420}, {"loss": 0.6257, "grad_norm": 1.1639831066131592, "learning_rate": 0.0002, "epoch": 4.555116696588869, "step": 63430}, {"loss": 0.6763, "grad_norm": 0.9494703412055969, "learning_rate": 0.0002, "epoch": 4.555834829443447, "step": 63440}, {"loss": 0.5955, "grad_norm": 1.0555956363677979, "learning_rate": 0.0002, "epoch": 4.556552962298025, "step": 63450}, {"loss": 0.6634, "grad_norm": 0.8513827919960022, "learning_rate": 0.0002, "epoch": 4.557271095152603, "step": 63460}, {"loss": 0.6507, "grad_norm": 1.0614275932312012, "learning_rate": 0.0002, "epoch": 4.557989228007181, "step": 63470}, {"loss": 0.5619, "grad_norm": 0.8341137766838074, "learning_rate": 0.0002, "epoch": 4.558707360861759, "step": 63480}, {"loss": 0.6147, "grad_norm": 1.2136222124099731, "learning_rate": 0.0002, "epoch": 4.559425493716337, "step": 63490}, {"loss": 0.6313, "grad_norm": 0.8806019425392151, "learning_rate": 0.0002, "epoch": 4.560143626570916, "step": 63500}, {"loss": 0.6012, "grad_norm": 1.2548854351043701, "learning_rate": 0.0002, "epoch": 4.560861759425494, "step": 63510}, {"loss": 0.5995, "grad_norm": 1.0162668228149414, "learning_rate": 0.0002, "epoch": 4.561579892280072, "step": 63520}, {"loss": 0.5895, "grad_norm": 1.0487624406814575, "learning_rate": 0.0002, "epoch": 4.56229802513465, "step": 63530}, {"loss": 0.5997, "grad_norm": 1.2505502700805664, "learning_rate": 0.0002, "epoch": 4.563016157989228, "step": 63540}, {"loss": 0.618, "grad_norm": 0.9930511713027954, "learning_rate": 0.0002, "epoch": 4.563734290843806, "step": 63550}, {"loss": 0.6695, "grad_norm": 0.8132568001747131, "learning_rate": 0.0002, "epoch": 4.564452423698384, "step": 63560}, {"loss": 0.6221, "grad_norm": 1.0129177570343018, "learning_rate": 0.0002, "epoch": 4.565170556552962, "step": 63570}, {"loss": 0.6463, "grad_norm": 0.9011693596839905, "learning_rate": 0.0002, "epoch": 4.56588868940754, "step": 63580}, {"loss": 0.6046, "grad_norm": 0.9161545634269714, "learning_rate": 0.0002, "epoch": 4.566606822262118, "step": 63590}, {"loss": 0.6413, "grad_norm": 0.8852348327636719, "learning_rate": 0.0002, "epoch": 4.567324955116696, "step": 63600}, {"loss": 0.6282, "grad_norm": 0.8579391837120056, "learning_rate": 0.0002, "epoch": 4.568043087971275, "step": 63610}, {"loss": 0.6041, "grad_norm": 0.9271050095558167, "learning_rate": 0.0002, "epoch": 4.568761220825853, "step": 63620}, {"loss": 0.6156, "grad_norm": 0.9881834983825684, "learning_rate": 0.0002, "epoch": 4.569479353680431, "step": 63630}, {"loss": 0.6164, "grad_norm": 1.0255686044692993, "learning_rate": 0.0002, "epoch": 4.570197486535009, "step": 63640}, {"loss": 0.6416, "grad_norm": 0.8758876919746399, "learning_rate": 0.0002, "epoch": 4.570915619389587, "step": 63650}, {"loss": 0.6787, "grad_norm": 1.0134185552597046, "learning_rate": 0.0002, "epoch": 4.571633752244165, "step": 63660}, {"loss": 0.6245, "grad_norm": 0.8535705208778381, "learning_rate": 0.0002, "epoch": 4.572351885098743, "step": 63670}, {"loss": 0.6282, "grad_norm": 0.9614834785461426, "learning_rate": 0.0002, "epoch": 4.573070017953321, "step": 63680}, {"loss": 0.6461, "grad_norm": 0.9004243612289429, "learning_rate": 0.0002, "epoch": 4.5737881508079, "step": 63690}, {"loss": 0.6172, "grad_norm": 0.9563080072402954, "learning_rate": 0.0002, "epoch": 4.574506283662478, "step": 63700}, {"loss": 0.6059, "grad_norm": 1.024857521057129, "learning_rate": 0.0002, "epoch": 4.575224416517056, "step": 63710}, {"loss": 0.6188, "grad_norm": 0.9345638155937195, "learning_rate": 0.0002, "epoch": 4.575942549371634, "step": 63720}, {"loss": 0.6814, "grad_norm": 1.27083158493042, "learning_rate": 0.0002, "epoch": 4.576660682226212, "step": 63730}, {"loss": 0.5987, "grad_norm": 1.0866559743881226, "learning_rate": 0.0002, "epoch": 4.57737881508079, "step": 63740}, {"loss": 0.5738, "grad_norm": 0.9253925681114197, "learning_rate": 0.0002, "epoch": 4.578096947935368, "step": 63750}, {"loss": 0.5981, "grad_norm": 0.8127399682998657, "learning_rate": 0.0002, "epoch": 4.578815080789946, "step": 63760}, {"loss": 0.6321, "grad_norm": 1.0453993082046509, "learning_rate": 0.0002, "epoch": 4.579533213644524, "step": 63770}, {"loss": 0.6423, "grad_norm": 1.2227544784545898, "learning_rate": 0.0002, "epoch": 4.580251346499102, "step": 63780}, {"loss": 0.6405, "grad_norm": 1.0207865238189697, "learning_rate": 0.0002, "epoch": 4.580969479353681, "step": 63790}, {"loss": 0.6268, "grad_norm": 1.030447244644165, "learning_rate": 0.0002, "epoch": 4.581687612208259, "step": 63800}, {"loss": 0.6014, "grad_norm": 1.0855677127838135, "learning_rate": 0.0002, "epoch": 4.582405745062837, "step": 63810}, {"loss": 0.6204, "grad_norm": 0.9572556018829346, "learning_rate": 0.0002, "epoch": 4.583123877917415, "step": 63820}, {"loss": 0.6094, "grad_norm": 0.9061040282249451, "learning_rate": 0.0002, "epoch": 4.583842010771993, "step": 63830}, {"loss": 0.6074, "grad_norm": 0.9267677068710327, "learning_rate": 0.0002, "epoch": 4.584560143626571, "step": 63840}, {"loss": 0.6525, "grad_norm": 1.070076823234558, "learning_rate": 0.0002, "epoch": 4.585278276481149, "step": 63850}, {"loss": 0.6074, "grad_norm": 1.045881748199463, "learning_rate": 0.0002, "epoch": 4.585996409335727, "step": 63860}, {"loss": 0.6106, "grad_norm": 0.9190576672554016, "learning_rate": 0.0002, "epoch": 4.586714542190305, "step": 63870}, {"loss": 0.6213, "grad_norm": 0.9263932704925537, "learning_rate": 0.0002, "epoch": 4.587432675044884, "step": 63880}, {"loss": 0.6077, "grad_norm": 1.0217589139938354, "learning_rate": 0.0002, "epoch": 4.588150807899462, "step": 63890}, {"loss": 0.5798, "grad_norm": 0.9200088381767273, "learning_rate": 0.0002, "epoch": 4.58886894075404, "step": 63900}, {"loss": 0.6311, "grad_norm": 0.9877251386642456, "learning_rate": 0.0002, "epoch": 4.589587073608618, "step": 63910}, {"loss": 0.5981, "grad_norm": 1.0059093236923218, "learning_rate": 0.0002, "epoch": 4.590305206463196, "step": 63920}, {"loss": 0.6265, "grad_norm": 1.2618095874786377, "learning_rate": 0.0002, "epoch": 4.591023339317774, "step": 63930}, {"loss": 0.583, "grad_norm": 1.1779268980026245, "learning_rate": 0.0002, "epoch": 4.591741472172352, "step": 63940}, {"loss": 0.6232, "grad_norm": 1.2339502573013306, "learning_rate": 0.0002, "epoch": 4.59245960502693, "step": 63950}, {"loss": 0.5985, "grad_norm": 0.7488788366317749, "learning_rate": 0.0002, "epoch": 4.593177737881508, "step": 63960}, {"loss": 0.5991, "grad_norm": 0.8366380929946899, "learning_rate": 0.0002, "epoch": 4.593895870736086, "step": 63970}, {"loss": 0.5864, "grad_norm": 1.0292677879333496, "learning_rate": 0.0002, "epoch": 4.594614003590665, "step": 63980}, {"loss": 0.666, "grad_norm": 0.7938551306724548, "learning_rate": 0.0002, "epoch": 4.595332136445243, "step": 63990}, {"loss": 0.6202, "grad_norm": 0.7958516478538513, "learning_rate": 0.0002, "epoch": 4.596050269299821, "step": 64000}, {"loss": 0.5868, "grad_norm": 0.9613908529281616, "learning_rate": 0.0002, "epoch": 4.596768402154399, "step": 64010}, {"loss": 0.6299, "grad_norm": 1.0253773927688599, "learning_rate": 0.0002, "epoch": 4.597486535008977, "step": 64020}, {"loss": 0.5964, "grad_norm": 1.0560888051986694, "learning_rate": 0.0002, "epoch": 4.598204667863555, "step": 64030}, {"loss": 0.6681, "grad_norm": 1.1093556880950928, "learning_rate": 0.0002, "epoch": 4.598922800718133, "step": 64040}, {"loss": 0.6097, "grad_norm": 0.8492098450660706, "learning_rate": 0.0002, "epoch": 4.599640933572711, "step": 64050}, {"loss": 0.6029, "grad_norm": 1.0070436000823975, "learning_rate": 0.0002, "epoch": 4.6003590664272895, "step": 64060}, {"loss": 0.6392, "grad_norm": 0.9774282574653625, "learning_rate": 0.0002, "epoch": 4.6010771992818675, "step": 64070}, {"loss": 0.6397, "grad_norm": 1.0744960308074951, "learning_rate": 0.0002, "epoch": 4.6017953321364455, "step": 64080}, {"loss": 0.6491, "grad_norm": 1.0101491212844849, "learning_rate": 0.0002, "epoch": 4.6025134649910235, "step": 64090}, {"loss": 0.594, "grad_norm": 1.2306591272354126, "learning_rate": 0.0002, "epoch": 4.6032315978456015, "step": 64100}, {"loss": 0.5783, "grad_norm": 0.9187033176422119, "learning_rate": 0.0002, "epoch": 4.6039497307001795, "step": 64110}, {"loss": 0.5982, "grad_norm": 0.9178676605224609, "learning_rate": 0.0002, "epoch": 4.6046678635547575, "step": 64120}, {"loss": 0.6074, "grad_norm": 1.006374716758728, "learning_rate": 0.0002, "epoch": 4.6053859964093355, "step": 64130}, {"loss": 0.6402, "grad_norm": 1.0774449110031128, "learning_rate": 0.0002, "epoch": 4.6061041292639135, "step": 64140}, {"loss": 0.6076, "grad_norm": 1.0360658168792725, "learning_rate": 0.0002, "epoch": 4.6068222621184916, "step": 64150}, {"loss": 0.6259, "grad_norm": 1.1061090230941772, "learning_rate": 0.0002, "epoch": 4.6075403949730696, "step": 64160}, {"loss": 0.6304, "grad_norm": 1.0320971012115479, "learning_rate": 0.0002, "epoch": 4.608258527827648, "step": 64170}, {"loss": 0.6182, "grad_norm": 0.8596988916397095, "learning_rate": 0.0002, "epoch": 4.6089766606822264, "step": 64180}, {"loss": 0.5646, "grad_norm": 1.1665741205215454, "learning_rate": 0.0002, "epoch": 4.6096947935368044, "step": 64190}, {"loss": 0.6219, "grad_norm": 0.857207715511322, "learning_rate": 0.0002, "epoch": 4.6104129263913824, "step": 64200}, {"loss": 0.6271, "grad_norm": 1.0088987350463867, "learning_rate": 0.0002, "epoch": 4.6111310592459605, "step": 64210}, {"loss": 0.6209, "grad_norm": 1.0985605716705322, "learning_rate": 0.0002, "epoch": 4.6118491921005385, "step": 64220}, {"loss": 0.6455, "grad_norm": 0.9504913687705994, "learning_rate": 0.0002, "epoch": 4.6125673249551165, "step": 64230}, {"loss": 0.6054, "grad_norm": 0.8415018916130066, "learning_rate": 0.0002, "epoch": 4.6132854578096945, "step": 64240}, {"loss": 0.5975, "grad_norm": 0.9857034087181091, "learning_rate": 0.0002, "epoch": 4.614003590664273, "step": 64250}, {"loss": 0.6347, "grad_norm": 1.0164235830307007, "learning_rate": 0.0002, "epoch": 4.614721723518851, "step": 64260}, {"loss": 0.5877, "grad_norm": 0.949481725692749, "learning_rate": 0.0002, "epoch": 4.615439856373429, "step": 64270}, {"loss": 0.5737, "grad_norm": 0.9526455998420715, "learning_rate": 0.0002, "epoch": 4.616157989228007, "step": 64280}, {"loss": 0.6134, "grad_norm": 1.1121242046356201, "learning_rate": 0.0002, "epoch": 4.616876122082585, "step": 64290}, {"loss": 0.6152, "grad_norm": 0.9598871469497681, "learning_rate": 0.0002, "epoch": 4.617594254937163, "step": 64300}, {"loss": 0.6405, "grad_norm": 1.0406304597854614, "learning_rate": 0.0002, "epoch": 4.618312387791741, "step": 64310}, {"loss": 0.5971, "grad_norm": 1.1816964149475098, "learning_rate": 0.0002, "epoch": 4.619030520646319, "step": 64320}, {"loss": 0.6483, "grad_norm": 0.9818326830863953, "learning_rate": 0.0002, "epoch": 4.619748653500897, "step": 64330}, {"loss": 0.6141, "grad_norm": 0.952017605304718, "learning_rate": 0.0002, "epoch": 4.620466786355475, "step": 64340}, {"loss": 0.6146, "grad_norm": 1.1263453960418701, "learning_rate": 0.0002, "epoch": 4.621184919210053, "step": 64350}, {"loss": 0.5973, "grad_norm": 1.1158473491668701, "learning_rate": 0.0002, "epoch": 4.621903052064632, "step": 64360}, {"loss": 0.6029, "grad_norm": 0.9056766033172607, "learning_rate": 0.0002, "epoch": 4.62262118491921, "step": 64370}, {"loss": 0.6488, "grad_norm": 0.8113203048706055, "learning_rate": 0.0002, "epoch": 4.623339317773788, "step": 64380}, {"loss": 0.6391, "grad_norm": 0.8646712899208069, "learning_rate": 0.0002, "epoch": 4.624057450628366, "step": 64390}, {"loss": 0.6191, "grad_norm": 1.0064425468444824, "learning_rate": 0.0002, "epoch": 4.624775583482944, "step": 64400}, {"loss": 0.5826, "grad_norm": 0.9867565631866455, "learning_rate": 0.0002, "epoch": 4.625493716337522, "step": 64410}, {"loss": 0.6409, "grad_norm": 1.018764615058899, "learning_rate": 0.0002, "epoch": 4.6262118491921, "step": 64420}, {"loss": 0.5992, "grad_norm": 1.0607863664627075, "learning_rate": 0.0002, "epoch": 4.626929982046678, "step": 64430}, {"loss": 0.6502, "grad_norm": 1.012825846672058, "learning_rate": 0.0002, "epoch": 4.627648114901257, "step": 64440}, {"loss": 0.6074, "grad_norm": 0.8441653847694397, "learning_rate": 0.0002, "epoch": 4.628366247755835, "step": 64450}, {"loss": 0.6462, "grad_norm": 0.9819194674491882, "learning_rate": 0.0002, "epoch": 4.629084380610413, "step": 64460}, {"loss": 0.5983, "grad_norm": 0.925519585609436, "learning_rate": 0.0002, "epoch": 4.629802513464991, "step": 64470}, {"loss": 0.5959, "grad_norm": 0.9409030079841614, "learning_rate": 0.0002, "epoch": 4.630520646319569, "step": 64480}, {"loss": 0.6265, "grad_norm": 1.148024559020996, "learning_rate": 0.0002, "epoch": 4.631238779174147, "step": 64490}, {"loss": 0.6556, "grad_norm": 0.8225533962249756, "learning_rate": 0.0002, "epoch": 4.631956912028725, "step": 64500}, {"loss": 0.5922, "grad_norm": 0.8806734681129456, "learning_rate": 0.0002, "epoch": 4.632675044883303, "step": 64510}, {"loss": 0.6202, "grad_norm": 0.9656694531440735, "learning_rate": 0.0002, "epoch": 4.633393177737881, "step": 64520}, {"loss": 0.6044, "grad_norm": 0.9977783560752869, "learning_rate": 0.0002, "epoch": 4.634111310592459, "step": 64530}, {"loss": 0.5741, "grad_norm": 0.9259420037269592, "learning_rate": 0.0002, "epoch": 4.634829443447038, "step": 64540}, {"loss": 0.5801, "grad_norm": 1.0215885639190674, "learning_rate": 0.0002, "epoch": 4.635547576301616, "step": 64550}, {"loss": 0.6492, "grad_norm": 1.1082557439804077, "learning_rate": 0.0002, "epoch": 4.636265709156194, "step": 64560}, {"loss": 0.6285, "grad_norm": 1.1183207035064697, "learning_rate": 0.0002, "epoch": 4.636983842010772, "step": 64570}, {"loss": 0.6216, "grad_norm": 0.9914339184761047, "learning_rate": 0.0002, "epoch": 4.63770197486535, "step": 64580}, {"loss": 0.6416, "grad_norm": 0.8065831661224365, "learning_rate": 0.0002, "epoch": 4.638420107719928, "step": 64590}, {"loss": 0.6078, "grad_norm": 1.1546721458435059, "learning_rate": 0.0002, "epoch": 4.639138240574506, "step": 64600}, {"loss": 0.6219, "grad_norm": 1.0395900011062622, "learning_rate": 0.0002, "epoch": 4.639856373429084, "step": 64610}, {"loss": 0.5939, "grad_norm": 0.9957455992698669, "learning_rate": 0.0002, "epoch": 4.640574506283663, "step": 64620}, {"loss": 0.6653, "grad_norm": 1.069557785987854, "learning_rate": 0.0002, "epoch": 4.641292639138241, "step": 64630}, {"loss": 0.6546, "grad_norm": 1.005236268043518, "learning_rate": 0.0002, "epoch": 4.642010771992819, "step": 64640}, {"loss": 0.6262, "grad_norm": 1.0216304063796997, "learning_rate": 0.0002, "epoch": 4.642728904847397, "step": 64650}, {"loss": 0.6756, "grad_norm": 0.8567317128181458, "learning_rate": 0.0002, "epoch": 4.643447037701975, "step": 64660}, {"loss": 0.5997, "grad_norm": 1.0386067628860474, "learning_rate": 0.0002, "epoch": 4.644165170556553, "step": 64670}, {"loss": 0.6471, "grad_norm": 0.9566055536270142, "learning_rate": 0.0002, "epoch": 4.644883303411131, "step": 64680}, {"loss": 0.6601, "grad_norm": 1.0990564823150635, "learning_rate": 0.0002, "epoch": 4.645601436265709, "step": 64690}, {"loss": 0.6418, "grad_norm": 0.9962695240974426, "learning_rate": 0.0002, "epoch": 4.646319569120287, "step": 64700}, {"loss": 0.6442, "grad_norm": 0.9041377305984497, "learning_rate": 0.0002, "epoch": 4.647037701974865, "step": 64710}, {"loss": 0.6276, "grad_norm": 0.8611233234405518, "learning_rate": 0.0002, "epoch": 4.647755834829443, "step": 64720}, {"loss": 0.6015, "grad_norm": 1.1569812297821045, "learning_rate": 0.0002, "epoch": 4.648473967684022, "step": 64730}, {"loss": 0.6169, "grad_norm": 0.7946197390556335, "learning_rate": 0.0002, "epoch": 4.6491921005386, "step": 64740}, {"loss": 0.668, "grad_norm": 0.9612061381340027, "learning_rate": 0.0002, "epoch": 4.649910233393178, "step": 64750}, {"loss": 0.6741, "grad_norm": 0.9669303297996521, "learning_rate": 0.0002, "epoch": 4.650628366247756, "step": 64760}, {"loss": 0.593, "grad_norm": 0.8117775321006775, "learning_rate": 0.0002, "epoch": 4.651346499102334, "step": 64770}, {"loss": 0.6915, "grad_norm": 1.2326241731643677, "learning_rate": 0.0002, "epoch": 4.652064631956912, "step": 64780}, {"loss": 0.6076, "grad_norm": 0.7494568228721619, "learning_rate": 0.0002, "epoch": 4.65278276481149, "step": 64790}, {"loss": 0.58, "grad_norm": 0.8145379424095154, "learning_rate": 0.0002, "epoch": 4.653500897666068, "step": 64800}, {"loss": 0.6351, "grad_norm": 1.0139610767364502, "learning_rate": 0.0002, "epoch": 4.654219030520647, "step": 64810}, {"loss": 0.6575, "grad_norm": 0.9887115359306335, "learning_rate": 0.0002, "epoch": 4.654937163375225, "step": 64820}, {"loss": 0.6338, "grad_norm": 0.9565147161483765, "learning_rate": 0.0002, "epoch": 4.655655296229803, "step": 64830}, {"loss": 0.6212, "grad_norm": 0.9022467136383057, "learning_rate": 0.0002, "epoch": 4.656373429084381, "step": 64840}, {"loss": 0.6395, "grad_norm": 1.075003981590271, "learning_rate": 0.0002, "epoch": 4.657091561938959, "step": 64850}, {"loss": 0.6191, "grad_norm": 0.8705733418464661, "learning_rate": 0.0002, "epoch": 4.657809694793537, "step": 64860}, {"loss": 0.5543, "grad_norm": 1.0826832056045532, "learning_rate": 0.0002, "epoch": 4.658527827648115, "step": 64870}, {"loss": 0.6363, "grad_norm": 1.1056268215179443, "learning_rate": 0.0002, "epoch": 4.659245960502693, "step": 64880}, {"loss": 0.6252, "grad_norm": 0.8664149641990662, "learning_rate": 0.0002, "epoch": 4.659964093357271, "step": 64890}, {"loss": 0.6126, "grad_norm": 0.9487230181694031, "learning_rate": 0.0002, "epoch": 4.660682226211849, "step": 64900}, {"loss": 0.5968, "grad_norm": 1.0357837677001953, "learning_rate": 0.0002, "epoch": 4.661400359066427, "step": 64910}, {"loss": 0.603, "grad_norm": 0.8620632290840149, "learning_rate": 0.0002, "epoch": 4.662118491921006, "step": 64920}, {"loss": 0.6113, "grad_norm": 1.108986735343933, "learning_rate": 0.0002, "epoch": 4.662836624775584, "step": 64930}, {"loss": 0.6115, "grad_norm": 0.8017674684524536, "learning_rate": 0.0002, "epoch": 4.663554757630162, "step": 64940}, {"loss": 0.6268, "grad_norm": 0.882347583770752, "learning_rate": 0.0002, "epoch": 4.66427289048474, "step": 64950}, {"loss": 0.657, "grad_norm": 0.9466867446899414, "learning_rate": 0.0002, "epoch": 4.664991023339318, "step": 64960}, {"loss": 0.645, "grad_norm": 1.1823636293411255, "learning_rate": 0.0002, "epoch": 4.665709156193896, "step": 64970}, {"loss": 0.5889, "grad_norm": 0.9535016417503357, "learning_rate": 0.0002, "epoch": 4.666427289048474, "step": 64980}, {"loss": 0.5986, "grad_norm": 0.9456726312637329, "learning_rate": 0.0002, "epoch": 4.667145421903052, "step": 64990}, {"loss": 0.6334, "grad_norm": 0.7761920690536499, "learning_rate": 0.0002, "epoch": 4.667863554757631, "step": 65000}, {"loss": 0.6645, "grad_norm": 1.060357689857483, "learning_rate": 0.0002, "epoch": 4.668581687612209, "step": 65010}, {"loss": 0.6369, "grad_norm": 0.9083862900733948, "learning_rate": 0.0002, "epoch": 4.669299820466787, "step": 65020}, {"loss": 0.5839, "grad_norm": 0.8745762705802917, "learning_rate": 0.0002, "epoch": 4.670017953321365, "step": 65030}, {"loss": 0.6517, "grad_norm": 0.8715422749519348, "learning_rate": 0.0002, "epoch": 4.670736086175943, "step": 65040}, {"loss": 0.6061, "grad_norm": 0.9407707452774048, "learning_rate": 0.0002, "epoch": 4.671454219030521, "step": 65050}, {"loss": 0.5928, "grad_norm": 0.8998945355415344, "learning_rate": 0.0002, "epoch": 4.672172351885099, "step": 65060}, {"loss": 0.6107, "grad_norm": 0.9147891998291016, "learning_rate": 0.0002, "epoch": 4.672890484739677, "step": 65070}, {"loss": 0.6215, "grad_norm": 1.116614580154419, "learning_rate": 0.0002, "epoch": 4.673608617594255, "step": 65080}, {"loss": 0.641, "grad_norm": 1.0764213800430298, "learning_rate": 0.0002, "epoch": 4.674326750448833, "step": 65090}, {"loss": 0.6353, "grad_norm": 0.9115945100784302, "learning_rate": 0.0002, "epoch": 4.6750448833034115, "step": 65100}, {"loss": 0.6506, "grad_norm": 1.001251459121704, "learning_rate": 0.0002, "epoch": 4.6757630161579895, "step": 65110}, {"loss": 0.6414, "grad_norm": 1.0330020189285278, "learning_rate": 0.0002, "epoch": 4.6764811490125675, "step": 65120}, {"loss": 0.6421, "grad_norm": 0.9083197116851807, "learning_rate": 0.0002, "epoch": 4.6771992818671455, "step": 65130}, {"loss": 0.5905, "grad_norm": 0.9298770427703857, "learning_rate": 0.0002, "epoch": 4.6779174147217235, "step": 65140}, {"loss": 0.633, "grad_norm": 1.0009549856185913, "learning_rate": 0.0002, "epoch": 4.6786355475763015, "step": 65150}, {"loss": 0.661, "grad_norm": 0.951389729976654, "learning_rate": 0.0002, "epoch": 4.6793536804308795, "step": 65160}, {"loss": 0.6282, "grad_norm": 1.151870608329773, "learning_rate": 0.0002, "epoch": 4.6800718132854575, "step": 65170}, {"loss": 0.5944, "grad_norm": 1.0074727535247803, "learning_rate": 0.0002, "epoch": 4.680789946140036, "step": 65180}, {"loss": 0.6539, "grad_norm": 1.0490152835845947, "learning_rate": 0.0002, "epoch": 4.681508078994614, "step": 65190}, {"loss": 0.6604, "grad_norm": 0.8967363834381104, "learning_rate": 0.0002, "epoch": 4.682226211849192, "step": 65200}, {"loss": 0.6582, "grad_norm": 1.2314889430999756, "learning_rate": 0.0002, "epoch": 4.68294434470377, "step": 65210}, {"loss": 0.6104, "grad_norm": 0.7764074802398682, "learning_rate": 0.0002, "epoch": 4.683662477558348, "step": 65220}, {"loss": 0.6401, "grad_norm": 1.0587822198867798, "learning_rate": 0.0002, "epoch": 4.684380610412926, "step": 65230}, {"loss": 0.556, "grad_norm": 0.916114091873169, "learning_rate": 0.0002, "epoch": 4.685098743267504, "step": 65240}, {"loss": 0.5912, "grad_norm": 0.9117472767829895, "learning_rate": 0.0002, "epoch": 4.685816876122082, "step": 65250}, {"loss": 0.6127, "grad_norm": 0.8369293212890625, "learning_rate": 0.0002, "epoch": 4.68653500897666, "step": 65260}, {"loss": 0.5715, "grad_norm": 0.9700121879577637, "learning_rate": 0.0002, "epoch": 4.687253141831238, "step": 65270}, {"loss": 0.6364, "grad_norm": 1.0008411407470703, "learning_rate": 0.0002, "epoch": 4.687971274685816, "step": 65280}, {"loss": 0.5816, "grad_norm": 0.9339549541473389, "learning_rate": 0.0002, "epoch": 4.688689407540395, "step": 65290}, {"loss": 0.6382, "grad_norm": 0.956701934337616, "learning_rate": 0.0002, "epoch": 4.689407540394973, "step": 65300}, {"loss": 0.6368, "grad_norm": 1.2042720317840576, "learning_rate": 0.0002, "epoch": 4.690125673249551, "step": 65310}, {"loss": 0.6138, "grad_norm": 0.8679144382476807, "learning_rate": 0.0002, "epoch": 4.690843806104129, "step": 65320}, {"loss": 0.6619, "grad_norm": 1.2320687770843506, "learning_rate": 0.0002, "epoch": 4.691561938958707, "step": 65330}, {"loss": 0.6212, "grad_norm": 0.8397238850593567, "learning_rate": 0.0002, "epoch": 4.692280071813285, "step": 65340}, {"loss": 0.578, "grad_norm": 0.7850362658500671, "learning_rate": 0.0002, "epoch": 4.692998204667863, "step": 65350}, {"loss": 0.632, "grad_norm": 0.9281290173530579, "learning_rate": 0.0002, "epoch": 4.693716337522441, "step": 65360}, {"loss": 0.6492, "grad_norm": 1.1506335735321045, "learning_rate": 0.0002, "epoch": 4.69443447037702, "step": 65370}, {"loss": 0.6503, "grad_norm": 1.0910584926605225, "learning_rate": 0.0002, "epoch": 4.695152603231598, "step": 65380}, {"loss": 0.66, "grad_norm": 0.8937386274337769, "learning_rate": 0.0002, "epoch": 4.695870736086176, "step": 65390}, {"loss": 0.6425, "grad_norm": 1.0163888931274414, "learning_rate": 0.0002, "epoch": 4.696588868940754, "step": 65400}, {"loss": 0.647, "grad_norm": 1.0290007591247559, "learning_rate": 0.0002, "epoch": 4.697307001795332, "step": 65410}, {"loss": 0.614, "grad_norm": 0.9046576023101807, "learning_rate": 0.0002, "epoch": 4.69802513464991, "step": 65420}, {"loss": 0.5844, "grad_norm": 1.0030237436294556, "learning_rate": 0.0002, "epoch": 4.698743267504488, "step": 65430}, {"loss": 0.6273, "grad_norm": 0.8196740746498108, "learning_rate": 0.0002, "epoch": 4.699461400359066, "step": 65440}, {"loss": 0.6273, "grad_norm": 0.9036651849746704, "learning_rate": 0.0002, "epoch": 4.700179533213644, "step": 65450}, {"loss": 0.6024, "grad_norm": 1.2080141305923462, "learning_rate": 0.0002, "epoch": 4.700897666068222, "step": 65460}, {"loss": 0.6461, "grad_norm": 0.8743635416030884, "learning_rate": 0.0002, "epoch": 4.7016157989228, "step": 65470}, {"loss": 0.6129, "grad_norm": 0.9566192030906677, "learning_rate": 0.0002, "epoch": 4.702333931777379, "step": 65480}, {"loss": 0.6721, "grad_norm": 1.0505144596099854, "learning_rate": 0.0002, "epoch": 4.703052064631957, "step": 65490}, {"loss": 0.6287, "grad_norm": 0.8797298073768616, "learning_rate": 0.0002, "epoch": 4.703770197486535, "step": 65500}, {"loss": 0.6515, "grad_norm": 0.9970770478248596, "learning_rate": 0.0002, "epoch": 4.704488330341113, "step": 65510}, {"loss": 0.6096, "grad_norm": 1.1743851900100708, "learning_rate": 0.0002, "epoch": 4.705206463195691, "step": 65520}, {"loss": 0.5755, "grad_norm": 0.9534381031990051, "learning_rate": 0.0002, "epoch": 4.705924596050269, "step": 65530}, {"loss": 0.6039, "grad_norm": 0.9735581278800964, "learning_rate": 0.0002, "epoch": 4.706642728904847, "step": 65540}, {"loss": 0.6217, "grad_norm": 1.185352087020874, "learning_rate": 0.0002, "epoch": 4.707360861759425, "step": 65550}, {"loss": 0.6398, "grad_norm": 0.9383901357650757, "learning_rate": 0.0002, "epoch": 4.708078994614004, "step": 65560}, {"loss": 0.6654, "grad_norm": 1.0194662809371948, "learning_rate": 0.0002, "epoch": 4.708797127468582, "step": 65570}, {"loss": 0.6008, "grad_norm": 0.8448300361633301, "learning_rate": 0.0002, "epoch": 4.70951526032316, "step": 65580}, {"loss": 0.6608, "grad_norm": 1.1930629014968872, "learning_rate": 0.0002, "epoch": 4.710233393177738, "step": 65590}, {"loss": 0.6082, "grad_norm": 1.0038636922836304, "learning_rate": 0.0002, "epoch": 4.710951526032316, "step": 65600}, {"loss": 0.6613, "grad_norm": 0.8206564784049988, "learning_rate": 0.0002, "epoch": 4.711669658886894, "step": 65610}, {"loss": 0.6142, "grad_norm": 1.0984861850738525, "learning_rate": 0.0002, "epoch": 4.712387791741472, "step": 65620}, {"loss": 0.6368, "grad_norm": 1.2891547679901123, "learning_rate": 0.0002, "epoch": 4.71310592459605, "step": 65630}, {"loss": 0.5857, "grad_norm": 0.927062451839447, "learning_rate": 0.0002, "epoch": 4.713824057450628, "step": 65640}, {"loss": 0.6187, "grad_norm": 0.8647334575653076, "learning_rate": 0.0002, "epoch": 4.714542190305206, "step": 65650}, {"loss": 0.6327, "grad_norm": 1.1017670631408691, "learning_rate": 0.0002, "epoch": 4.715260323159785, "step": 65660}, {"loss": 0.6398, "grad_norm": 0.9589072465896606, "learning_rate": 0.0002, "epoch": 4.715978456014363, "step": 65670}, {"loss": 0.6179, "grad_norm": 0.9496776461601257, "learning_rate": 0.0002, "epoch": 4.716696588868941, "step": 65680}, {"loss": 0.625, "grad_norm": 0.9266180396080017, "learning_rate": 0.0002, "epoch": 4.717414721723519, "step": 65690}, {"loss": 0.637, "grad_norm": 0.8699696063995361, "learning_rate": 0.0002, "epoch": 4.718132854578097, "step": 65700}, {"loss": 0.6402, "grad_norm": 1.0444015264511108, "learning_rate": 0.0002, "epoch": 4.718850987432675, "step": 65710}, {"loss": 0.6526, "grad_norm": 1.0100741386413574, "learning_rate": 0.0002, "epoch": 4.719569120287253, "step": 65720}, {"loss": 0.617, "grad_norm": 1.1442630290985107, "learning_rate": 0.0002, "epoch": 4.720287253141831, "step": 65730}, {"loss": 0.6214, "grad_norm": 0.8937877416610718, "learning_rate": 0.0002, "epoch": 4.721005385996409, "step": 65740}, {"loss": 0.625, "grad_norm": 1.0718764066696167, "learning_rate": 0.0002, "epoch": 4.721723518850988, "step": 65750}, {"loss": 0.6182, "grad_norm": 0.8838587999343872, "learning_rate": 0.0002, "epoch": 4.722441651705566, "step": 65760}, {"loss": 0.6254, "grad_norm": 1.1247940063476562, "learning_rate": 0.0002, "epoch": 4.723159784560144, "step": 65770}, {"loss": 0.5917, "grad_norm": 0.9491105675697327, "learning_rate": 0.0002, "epoch": 4.723877917414722, "step": 65780}, {"loss": 0.6178, "grad_norm": 1.0896921157836914, "learning_rate": 0.0002, "epoch": 4.7245960502693, "step": 65790}, {"loss": 0.5975, "grad_norm": 1.0097380876541138, "learning_rate": 0.0002, "epoch": 4.725314183123878, "step": 65800}, {"loss": 0.592, "grad_norm": 0.911763608455658, "learning_rate": 0.0002, "epoch": 4.726032315978456, "step": 65810}, {"loss": 0.6274, "grad_norm": 1.1295124292373657, "learning_rate": 0.0002, "epoch": 4.726750448833034, "step": 65820}, {"loss": 0.6004, "grad_norm": 0.7637538313865662, "learning_rate": 0.0002, "epoch": 4.727468581687612, "step": 65830}, {"loss": 0.6136, "grad_norm": 0.9255306720733643, "learning_rate": 0.0002, "epoch": 4.72818671454219, "step": 65840}, {"loss": 0.6013, "grad_norm": 0.9847530126571655, "learning_rate": 0.0002, "epoch": 4.728904847396769, "step": 65850}, {"loss": 0.6283, "grad_norm": 0.9036182761192322, "learning_rate": 0.0002, "epoch": 4.729622980251347, "step": 65860}, {"loss": 0.6374, "grad_norm": 0.8284199833869934, "learning_rate": 0.0002, "epoch": 4.730341113105925, "step": 65870}, {"loss": 0.6228, "grad_norm": 1.0142838954925537, "learning_rate": 0.0002, "epoch": 4.731059245960503, "step": 65880}, {"loss": 0.624, "grad_norm": 0.9389033913612366, "learning_rate": 0.0002, "epoch": 4.731777378815081, "step": 65890}, {"loss": 0.6414, "grad_norm": 0.8870056867599487, "learning_rate": 0.0002, "epoch": 4.732495511669659, "step": 65900}, {"loss": 0.6261, "grad_norm": 1.1211678981781006, "learning_rate": 0.0002, "epoch": 4.733213644524237, "step": 65910}, {"loss": 0.6065, "grad_norm": 0.7796614170074463, "learning_rate": 0.0002, "epoch": 4.733931777378815, "step": 65920}, {"loss": 0.6701, "grad_norm": 1.0360451936721802, "learning_rate": 0.0002, "epoch": 4.734649910233394, "step": 65930}, {"loss": 0.68, "grad_norm": 0.8383482098579407, "learning_rate": 0.0002, "epoch": 4.735368043087972, "step": 65940}, {"loss": 0.6014, "grad_norm": 0.7985122799873352, "learning_rate": 0.0002, "epoch": 4.73608617594255, "step": 65950}, {"loss": 0.6431, "grad_norm": 1.0314199924468994, "learning_rate": 0.0002, "epoch": 4.736804308797128, "step": 65960}, {"loss": 0.5894, "grad_norm": 0.9279016852378845, "learning_rate": 0.0002, "epoch": 4.737522441651706, "step": 65970}, {"loss": 0.6327, "grad_norm": 1.1046063899993896, "learning_rate": 0.0002, "epoch": 4.738240574506284, "step": 65980}, {"loss": 0.5778, "grad_norm": 0.9075793623924255, "learning_rate": 0.0002, "epoch": 4.738958707360862, "step": 65990}, {"loss": 0.5832, "grad_norm": 1.0945355892181396, "learning_rate": 0.0002, "epoch": 4.73967684021544, "step": 66000}, {"loss": 0.6256, "grad_norm": 0.8885519504547119, "learning_rate": 0.0002, "epoch": 4.740394973070018, "step": 66010}, {"loss": 0.6283, "grad_norm": 0.9312083125114441, "learning_rate": 0.0002, "epoch": 4.741113105924596, "step": 66020}, {"loss": 0.6328, "grad_norm": 1.1574538946151733, "learning_rate": 0.0002, "epoch": 4.741831238779174, "step": 66030}, {"loss": 0.6693, "grad_norm": 0.9346209168434143, "learning_rate": 0.0002, "epoch": 4.742549371633753, "step": 66040}, {"loss": 0.6252, "grad_norm": 0.8935149312019348, "learning_rate": 0.0002, "epoch": 4.743267504488331, "step": 66050}, {"loss": 0.6137, "grad_norm": 0.8958369493484497, "learning_rate": 0.0002, "epoch": 4.743985637342909, "step": 66060}, {"loss": 0.6088, "grad_norm": 0.9383506774902344, "learning_rate": 0.0002, "epoch": 4.744703770197487, "step": 66070}, {"loss": 0.6323, "grad_norm": 0.9868947863578796, "learning_rate": 0.0002, "epoch": 4.745421903052065, "step": 66080}, {"loss": 0.6426, "grad_norm": 1.3417645692825317, "learning_rate": 0.0002, "epoch": 4.746140035906643, "step": 66090}, {"loss": 0.5417, "grad_norm": 1.070693850517273, "learning_rate": 0.0002, "epoch": 4.746858168761221, "step": 66100}, {"loss": 0.6326, "grad_norm": 0.8841570019721985, "learning_rate": 0.0002, "epoch": 4.747576301615799, "step": 66110}, {"loss": 0.655, "grad_norm": 0.7963120341300964, "learning_rate": 0.0002, "epoch": 4.7482944344703775, "step": 66120}, {"loss": 0.6145, "grad_norm": 0.8145691156387329, "learning_rate": 0.0002, "epoch": 4.7490125673249555, "step": 66130}, {"loss": 0.6081, "grad_norm": 0.9074729681015015, "learning_rate": 0.0002, "epoch": 4.7497307001795335, "step": 66140}, {"loss": 0.5651, "grad_norm": 0.9129886627197266, "learning_rate": 0.0002, "epoch": 4.7504488330341115, "step": 66150}, {"loss": 0.6111, "grad_norm": 0.91527259349823, "learning_rate": 0.0002, "epoch": 4.7511669658886895, "step": 66160}, {"loss": 0.672, "grad_norm": 0.9569419622421265, "learning_rate": 0.0002, "epoch": 4.7518850987432675, "step": 66170}, {"loss": 0.597, "grad_norm": 0.8777104616165161, "learning_rate": 0.0002, "epoch": 4.7526032315978455, "step": 66180}, {"loss": 0.6433, "grad_norm": 0.9673085808753967, "learning_rate": 0.0002, "epoch": 4.7533213644524235, "step": 66190}, {"loss": 0.5783, "grad_norm": 1.0683966875076294, "learning_rate": 0.0002, "epoch": 4.7540394973070015, "step": 66200}, {"loss": 0.6356, "grad_norm": 1.1591907739639282, "learning_rate": 0.0002, "epoch": 4.7547576301615795, "step": 66210}, {"loss": 0.6482, "grad_norm": 1.1973309516906738, "learning_rate": 0.0002, "epoch": 4.755475763016158, "step": 66220}, {"loss": 0.5998, "grad_norm": 0.8472012281417847, "learning_rate": 0.0002, "epoch": 4.756193895870736, "step": 66230}, {"loss": 0.717, "grad_norm": 0.9896261692047119, "learning_rate": 0.0002, "epoch": 4.756912028725314, "step": 66240}, {"loss": 0.6368, "grad_norm": 0.8498432040214539, "learning_rate": 0.0002, "epoch": 4.757630161579892, "step": 66250}, {"loss": 0.5931, "grad_norm": 0.9624166488647461, "learning_rate": 0.0002, "epoch": 4.75834829443447, "step": 66260}, {"loss": 0.645, "grad_norm": 1.0951786041259766, "learning_rate": 0.0002, "epoch": 4.759066427289048, "step": 66270}, {"loss": 0.6092, "grad_norm": 0.9863157868385315, "learning_rate": 0.0002, "epoch": 4.759784560143626, "step": 66280}, {"loss": 0.6682, "grad_norm": 1.0062068700790405, "learning_rate": 0.0002, "epoch": 4.760502692998204, "step": 66290}, {"loss": 0.5704, "grad_norm": 0.8075495958328247, "learning_rate": 0.0002, "epoch": 4.761220825852782, "step": 66300}, {"loss": 0.6297, "grad_norm": 0.9617878198623657, "learning_rate": 0.0002, "epoch": 4.761938958707361, "step": 66310}, {"loss": 0.6141, "grad_norm": 1.097091555595398, "learning_rate": 0.0002, "epoch": 4.762657091561939, "step": 66320}, {"loss": 0.6152, "grad_norm": 1.2713453769683838, "learning_rate": 0.0002, "epoch": 4.763375224416517, "step": 66330}, {"loss": 0.6726, "grad_norm": 0.9473448991775513, "learning_rate": 0.0002, "epoch": 4.764093357271095, "step": 66340}, {"loss": 0.6032, "grad_norm": 1.0176854133605957, "learning_rate": 0.0002, "epoch": 4.764811490125673, "step": 66350}, {"loss": 0.6429, "grad_norm": 1.0486242771148682, "learning_rate": 0.0002, "epoch": 4.765529622980251, "step": 66360}, {"loss": 0.6875, "grad_norm": 1.249985694885254, "learning_rate": 0.0002, "epoch": 4.766247755834829, "step": 66370}, {"loss": 0.6086, "grad_norm": 1.283875584602356, "learning_rate": 0.0002, "epoch": 4.766965888689407, "step": 66380}, {"loss": 0.5997, "grad_norm": 1.0009022951126099, "learning_rate": 0.0002, "epoch": 4.767684021543985, "step": 66390}, {"loss": 0.5782, "grad_norm": 0.9718021750450134, "learning_rate": 0.0002, "epoch": 4.768402154398563, "step": 66400}, {"loss": 0.6292, "grad_norm": 1.0865732431411743, "learning_rate": 0.0002, "epoch": 4.769120287253142, "step": 66410}, {"loss": 0.6038, "grad_norm": 0.9273189306259155, "learning_rate": 0.0002, "epoch": 4.76983842010772, "step": 66420}, {"loss": 0.6244, "grad_norm": 1.067535638809204, "learning_rate": 0.0002, "epoch": 4.770556552962298, "step": 66430}, {"loss": 0.6434, "grad_norm": 1.0551011562347412, "learning_rate": 0.0002, "epoch": 4.771274685816876, "step": 66440}, {"loss": 0.6151, "grad_norm": 1.0336146354675293, "learning_rate": 0.0002, "epoch": 4.771992818671454, "step": 66450}, {"loss": 0.5955, "grad_norm": 0.8738380670547485, "learning_rate": 0.0002, "epoch": 4.772710951526032, "step": 66460}, {"loss": 0.6386, "grad_norm": 1.1048321723937988, "learning_rate": 0.0002, "epoch": 4.77342908438061, "step": 66470}, {"loss": 0.592, "grad_norm": 0.8471167683601379, "learning_rate": 0.0002, "epoch": 4.774147217235188, "step": 66480}, {"loss": 0.6139, "grad_norm": 1.2527031898498535, "learning_rate": 0.0002, "epoch": 4.774865350089767, "step": 66490}, {"loss": 0.579, "grad_norm": 1.0056052207946777, "learning_rate": 0.0002, "epoch": 4.775583482944345, "step": 66500}, {"loss": 0.6448, "grad_norm": 1.142456293106079, "learning_rate": 0.0002, "epoch": 4.776301615798923, "step": 66510}, {"loss": 0.6399, "grad_norm": 1.1813132762908936, "learning_rate": 0.0002, "epoch": 4.777019748653501, "step": 66520}, {"loss": 0.6575, "grad_norm": 0.8683654069900513, "learning_rate": 0.0002, "epoch": 4.777737881508079, "step": 66530}, {"loss": 0.6059, "grad_norm": 1.0577980279922485, "learning_rate": 0.0002, "epoch": 4.778456014362657, "step": 66540}, {"loss": 0.5923, "grad_norm": 1.077438473701477, "learning_rate": 0.0002, "epoch": 4.779174147217235, "step": 66550}, {"loss": 0.5744, "grad_norm": 1.0107938051223755, "learning_rate": 0.0002, "epoch": 4.779892280071813, "step": 66560}, {"loss": 0.6155, "grad_norm": 0.8071168065071106, "learning_rate": 0.0002, "epoch": 4.780610412926391, "step": 66570}, {"loss": 0.6126, "grad_norm": 0.8887564539909363, "learning_rate": 0.0002, "epoch": 4.781328545780969, "step": 66580}, {"loss": 0.6417, "grad_norm": 0.9823092222213745, "learning_rate": 0.0002, "epoch": 4.782046678635547, "step": 66590}, {"loss": 0.6108, "grad_norm": 0.9026784300804138, "learning_rate": 0.0002, "epoch": 4.782764811490126, "step": 66600}, {"loss": 0.6252, "grad_norm": 0.8912792205810547, "learning_rate": 0.0002, "epoch": 4.783482944344704, "step": 66610}, {"loss": 0.6285, "grad_norm": 1.0955979824066162, "learning_rate": 0.0002, "epoch": 4.784201077199282, "step": 66620}, {"loss": 0.6161, "grad_norm": 0.8614793419837952, "learning_rate": 0.0002, "epoch": 4.78491921005386, "step": 66630}, {"loss": 0.6343, "grad_norm": 0.7247269153594971, "learning_rate": 0.0002, "epoch": 4.785637342908438, "step": 66640}, {"loss": 0.5634, "grad_norm": 0.9685400724411011, "learning_rate": 0.0002, "epoch": 4.786355475763016, "step": 66650}, {"loss": 0.6419, "grad_norm": 0.9219905734062195, "learning_rate": 0.0002, "epoch": 4.787073608617594, "step": 66660}, {"loss": 0.6509, "grad_norm": 0.9217489361763, "learning_rate": 0.0002, "epoch": 4.787791741472172, "step": 66670}, {"loss": 0.6151, "grad_norm": 1.13791823387146, "learning_rate": 0.0002, "epoch": 4.788509874326751, "step": 66680}, {"loss": 0.6114, "grad_norm": 0.857542872428894, "learning_rate": 0.0002, "epoch": 4.789228007181329, "step": 66690}, {"loss": 0.6317, "grad_norm": 0.9886694550514221, "learning_rate": 0.0002, "epoch": 4.789946140035907, "step": 66700}, {"loss": 0.6436, "grad_norm": 0.987952470779419, "learning_rate": 0.0002, "epoch": 4.790664272890485, "step": 66710}, {"loss": 0.6284, "grad_norm": 1.051612377166748, "learning_rate": 0.0002, "epoch": 4.791382405745063, "step": 66720}, {"loss": 0.6207, "grad_norm": 0.9816454648971558, "learning_rate": 0.0002, "epoch": 4.792100538599641, "step": 66730}, {"loss": 0.6618, "grad_norm": 1.0953829288482666, "learning_rate": 0.0002, "epoch": 4.792818671454219, "step": 66740}, {"loss": 0.652, "grad_norm": 0.8720369935035706, "learning_rate": 0.0002, "epoch": 4.793536804308797, "step": 66750}, {"loss": 0.569, "grad_norm": 0.8910234570503235, "learning_rate": 0.0002, "epoch": 4.794254937163375, "step": 66760}, {"loss": 0.5814, "grad_norm": 0.8300510048866272, "learning_rate": 0.0002, "epoch": 4.794973070017953, "step": 66770}, {"loss": 0.591, "grad_norm": 0.9380533695220947, "learning_rate": 0.0002, "epoch": 4.795691202872531, "step": 66780}, {"loss": 0.6201, "grad_norm": 0.8361864686012268, "learning_rate": 0.0002, "epoch": 4.79640933572711, "step": 66790}, {"loss": 0.6192, "grad_norm": 1.051262617111206, "learning_rate": 0.0002, "epoch": 4.797127468581688, "step": 66800}, {"loss": 0.6408, "grad_norm": 1.1324400901794434, "learning_rate": 0.0002, "epoch": 4.797845601436266, "step": 66810}, {"loss": 0.6156, "grad_norm": 0.853903591632843, "learning_rate": 0.0002, "epoch": 4.798563734290844, "step": 66820}, {"loss": 0.5923, "grad_norm": 0.9949867725372314, "learning_rate": 0.0002, "epoch": 4.799281867145422, "step": 66830}, {"loss": 0.6453, "grad_norm": 0.9204033017158508, "learning_rate": 0.0002, "epoch": 4.8, "step": 66840}, {"loss": 0.6221, "grad_norm": 0.7461584806442261, "learning_rate": 0.0002, "epoch": 4.800718132854578, "step": 66850}, {"loss": 0.6019, "grad_norm": 1.1019874811172485, "learning_rate": 0.0002, "epoch": 4.801436265709156, "step": 66860}, {"loss": 0.6514, "grad_norm": 1.1695797443389893, "learning_rate": 0.0002, "epoch": 4.802154398563735, "step": 66870}, {"loss": 0.6105, "grad_norm": 1.0902758836746216, "learning_rate": 0.0002, "epoch": 4.802872531418313, "step": 66880}, {"loss": 0.6297, "grad_norm": 0.8778618574142456, "learning_rate": 0.0002, "epoch": 4.803590664272891, "step": 66890}, {"loss": 0.6608, "grad_norm": 0.905505359172821, "learning_rate": 0.0002, "epoch": 4.804308797127469, "step": 66900}, {"loss": 0.6386, "grad_norm": 1.0802056789398193, "learning_rate": 0.0002, "epoch": 4.805026929982047, "step": 66910}, {"loss": 0.5866, "grad_norm": 0.7899449467658997, "learning_rate": 0.0002, "epoch": 4.805745062836625, "step": 66920}, {"loss": 0.6169, "grad_norm": 1.1938519477844238, "learning_rate": 0.0002, "epoch": 4.806463195691203, "step": 66930}, {"loss": 0.5979, "grad_norm": 1.0213780403137207, "learning_rate": 0.0002, "epoch": 4.807181328545781, "step": 66940}, {"loss": 0.6518, "grad_norm": 0.9925506711006165, "learning_rate": 0.0002, "epoch": 4.807899461400359, "step": 66950}, {"loss": 0.6229, "grad_norm": 1.0174424648284912, "learning_rate": 0.0002, "epoch": 4.808617594254937, "step": 66960}, {"loss": 0.5932, "grad_norm": 1.0515072345733643, "learning_rate": 0.0002, "epoch": 4.809335727109516, "step": 66970}, {"loss": 0.6169, "grad_norm": 1.0161492824554443, "learning_rate": 0.0002, "epoch": 4.810053859964094, "step": 66980}, {"loss": 0.5804, "grad_norm": 0.8421840071678162, "learning_rate": 0.0002, "epoch": 4.810771992818672, "step": 66990}, {"loss": 0.6792, "grad_norm": 1.0493539571762085, "learning_rate": 0.0002, "epoch": 4.81149012567325, "step": 67000}, {"loss": 0.5906, "grad_norm": 1.1133309602737427, "learning_rate": 0.0002, "epoch": 4.812208258527828, "step": 67010}, {"loss": 0.5771, "grad_norm": 0.924017071723938, "learning_rate": 0.0002, "epoch": 4.812926391382406, "step": 67020}, {"loss": 0.625, "grad_norm": 1.0568689107894897, "learning_rate": 0.0002, "epoch": 4.813644524236984, "step": 67030}, {"loss": 0.6654, "grad_norm": 0.989414632320404, "learning_rate": 0.0002, "epoch": 4.814362657091562, "step": 67040}, {"loss": 0.6186, "grad_norm": 0.9256827235221863, "learning_rate": 0.0002, "epoch": 4.8150807899461405, "step": 67050}, {"loss": 0.637, "grad_norm": 0.9538901448249817, "learning_rate": 0.0002, "epoch": 4.8157989228007185, "step": 67060}, {"loss": 0.632, "grad_norm": 1.0373849868774414, "learning_rate": 0.0002, "epoch": 4.8165170556552965, "step": 67070}, {"loss": 0.5956, "grad_norm": 1.0019729137420654, "learning_rate": 0.0002, "epoch": 4.8172351885098745, "step": 67080}, {"loss": 0.636, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.8179533213644525, "step": 67090}, {"loss": 0.6106, "grad_norm": 1.0008453130722046, "learning_rate": 0.0002, "epoch": 4.8186714542190305, "step": 67100}, {"loss": 0.5841, "grad_norm": 1.0153851509094238, "learning_rate": 0.0002, "epoch": 4.8193895870736085, "step": 67110}, {"loss": 0.6012, "grad_norm": 1.0193161964416504, "learning_rate": 0.0002, "epoch": 4.8201077199281865, "step": 67120}, {"loss": 0.6602, "grad_norm": 1.0204501152038574, "learning_rate": 0.0002, "epoch": 4.8208258527827645, "step": 67130}, {"loss": 0.6235, "grad_norm": 0.9097670316696167, "learning_rate": 0.0002, "epoch": 4.8215439856373425, "step": 67140}, {"loss": 0.5836, "grad_norm": 0.9288716912269592, "learning_rate": 0.0002, "epoch": 4.8222621184919205, "step": 67150}, {"loss": 0.604, "grad_norm": 0.9975850582122803, "learning_rate": 0.0002, "epoch": 4.822980251346499, "step": 67160}, {"loss": 0.6877, "grad_norm": 0.8502511382102966, "learning_rate": 0.0002, "epoch": 4.823698384201077, "step": 67170}, {"loss": 0.6194, "grad_norm": 1.0129257440567017, "learning_rate": 0.0002, "epoch": 4.824416517055655, "step": 67180}, {"loss": 0.6294, "grad_norm": 1.0009492635726929, "learning_rate": 0.0002, "epoch": 4.825134649910233, "step": 67190}, {"loss": 0.5757, "grad_norm": 0.9273321032524109, "learning_rate": 0.0002, "epoch": 4.825852782764811, "step": 67200}, {"loss": 0.5749, "grad_norm": 1.0438604354858398, "learning_rate": 0.0002, "epoch": 4.8265709156193894, "step": 67210}, {"loss": 0.6273, "grad_norm": 1.119573712348938, "learning_rate": 0.0002, "epoch": 4.8272890484739674, "step": 67220}, {"loss": 0.6284, "grad_norm": 0.9607422351837158, "learning_rate": 0.0002, "epoch": 4.8280071813285454, "step": 67230}, {"loss": 0.6259, "grad_norm": 0.9614062905311584, "learning_rate": 0.0002, "epoch": 4.828725314183124, "step": 67240}, {"loss": 0.5709, "grad_norm": 1.1017652750015259, "learning_rate": 0.0002, "epoch": 4.829443447037702, "step": 67250}, {"loss": 0.6203, "grad_norm": 1.0521706342697144, "learning_rate": 0.0002, "epoch": 4.83016157989228, "step": 67260}, {"loss": 0.6266, "grad_norm": 0.7685959339141846, "learning_rate": 0.0002, "epoch": 4.830879712746858, "step": 67270}, {"loss": 0.5809, "grad_norm": 0.7894896268844604, "learning_rate": 0.0002, "epoch": 4.831597845601436, "step": 67280}, {"loss": 0.6349, "grad_norm": 1.0882996320724487, "learning_rate": 0.0002, "epoch": 4.832315978456014, "step": 67290}, {"loss": 0.6129, "grad_norm": 0.9215409755706787, "learning_rate": 0.0002, "epoch": 4.833034111310592, "step": 67300}, {"loss": 0.6142, "grad_norm": 0.8660635352134705, "learning_rate": 0.0002, "epoch": 4.83375224416517, "step": 67310}, {"loss": 0.6378, "grad_norm": 0.980879008769989, "learning_rate": 0.0002, "epoch": 4.834470377019748, "step": 67320}, {"loss": 0.6291, "grad_norm": 1.0356814861297607, "learning_rate": 0.0002, "epoch": 4.835188509874326, "step": 67330}, {"loss": 0.6271, "grad_norm": 1.0265507698059082, "learning_rate": 0.0002, "epoch": 4.835906642728904, "step": 67340}, {"loss": 0.6009, "grad_norm": 1.0659137964248657, "learning_rate": 0.0002, "epoch": 4.836624775583483, "step": 67350}, {"loss": 0.5946, "grad_norm": 0.9485231637954712, "learning_rate": 0.0002, "epoch": 4.837342908438061, "step": 67360}, {"loss": 0.6338, "grad_norm": 1.0950140953063965, "learning_rate": 0.0002, "epoch": 4.838061041292639, "step": 67370}, {"loss": 0.6314, "grad_norm": 0.8907382488250732, "learning_rate": 0.0002, "epoch": 4.838779174147217, "step": 67380}, {"loss": 0.6066, "grad_norm": 0.9777120351791382, "learning_rate": 0.0002, "epoch": 4.839497307001795, "step": 67390}, {"loss": 0.6258, "grad_norm": 0.8482252955436707, "learning_rate": 0.0002, "epoch": 4.840215439856373, "step": 67400}, {"loss": 0.603, "grad_norm": 0.8505899906158447, "learning_rate": 0.0002, "epoch": 4.840933572710951, "step": 67410}, {"loss": 0.609, "grad_norm": 0.8574482798576355, "learning_rate": 0.0002, "epoch": 4.841651705565529, "step": 67420}, {"loss": 0.6188, "grad_norm": 1.092310905456543, "learning_rate": 0.0002, "epoch": 4.842369838420108, "step": 67430}, {"loss": 0.619, "grad_norm": 0.9418560266494751, "learning_rate": 0.0002, "epoch": 4.843087971274686, "step": 67440}, {"loss": 0.6367, "grad_norm": 1.1310782432556152, "learning_rate": 0.0002, "epoch": 4.843806104129264, "step": 67450}, {"loss": 0.664, "grad_norm": 0.9993671774864197, "learning_rate": 0.0002, "epoch": 4.844524236983842, "step": 67460}, {"loss": 0.6247, "grad_norm": 0.8322528600692749, "learning_rate": 0.0002, "epoch": 4.84524236983842, "step": 67470}, {"loss": 0.5828, "grad_norm": 0.8488435745239258, "learning_rate": 0.0002, "epoch": 4.845960502692998, "step": 67480}, {"loss": 0.6023, "grad_norm": 0.8070611357688904, "learning_rate": 0.0002, "epoch": 4.846678635547576, "step": 67490}, {"loss": 0.6362, "grad_norm": 0.8200163245201111, "learning_rate": 0.0002, "epoch": 4.847396768402154, "step": 67500}, {"loss": 0.612, "grad_norm": 0.91901034116745, "learning_rate": 0.0002, "epoch": 4.848114901256732, "step": 67510}, {"loss": 0.6191, "grad_norm": 1.0938435792922974, "learning_rate": 0.0002, "epoch": 4.84883303411131, "step": 67520}, {"loss": 0.6736, "grad_norm": 0.7926174402236938, "learning_rate": 0.0002, "epoch": 4.849551166965889, "step": 67530}, {"loss": 0.6252, "grad_norm": 0.9914385676383972, "learning_rate": 0.0002, "epoch": 4.850269299820467, "step": 67540}, {"loss": 0.6278, "grad_norm": 1.033065915107727, "learning_rate": 0.0002, "epoch": 4.850987432675045, "step": 67550}, {"loss": 0.6334, "grad_norm": 0.9700239300727844, "learning_rate": 0.0002, "epoch": 4.851705565529623, "step": 67560}, {"loss": 0.6308, "grad_norm": 0.8550103902816772, "learning_rate": 0.0002, "epoch": 4.852423698384201, "step": 67570}, {"loss": 0.6194, "grad_norm": 1.0009654760360718, "learning_rate": 0.0002, "epoch": 4.853141831238779, "step": 67580}, {"loss": 0.5825, "grad_norm": 1.0766186714172363, "learning_rate": 0.0002, "epoch": 4.853859964093357, "step": 67590}, {"loss": 0.6216, "grad_norm": 0.9512220621109009, "learning_rate": 0.0002, "epoch": 4.854578096947935, "step": 67600}, {"loss": 0.6301, "grad_norm": 0.8434456586837769, "learning_rate": 0.0002, "epoch": 4.855296229802514, "step": 67610}, {"loss": 0.6416, "grad_norm": 1.0276665687561035, "learning_rate": 0.0002, "epoch": 4.856014362657092, "step": 67620}, {"loss": 0.6063, "grad_norm": 0.9758516550064087, "learning_rate": 0.0002, "epoch": 4.85673249551167, "step": 67630}, {"loss": 0.622, "grad_norm": 0.8988076448440552, "learning_rate": 0.0002, "epoch": 4.857450628366248, "step": 67640}, {"loss": 0.6516, "grad_norm": 1.0038257837295532, "learning_rate": 0.0002, "epoch": 4.858168761220826, "step": 67650}, {"loss": 0.6322, "grad_norm": 0.9973093867301941, "learning_rate": 0.0002, "epoch": 4.858886894075404, "step": 67660}, {"loss": 0.6065, "grad_norm": 0.9754974246025085, "learning_rate": 0.0002, "epoch": 4.859605026929982, "step": 67670}, {"loss": 0.6191, "grad_norm": 1.1829560995101929, "learning_rate": 0.0002, "epoch": 4.86032315978456, "step": 67680}, {"loss": 0.6267, "grad_norm": 1.1077659130096436, "learning_rate": 0.0002, "epoch": 4.861041292639138, "step": 67690}, {"loss": 0.6312, "grad_norm": 0.9862872958183289, "learning_rate": 0.0002, "epoch": 4.861759425493716, "step": 67700}, {"loss": 0.6281, "grad_norm": 0.9826052188873291, "learning_rate": 0.0002, "epoch": 4.862477558348294, "step": 67710}, {"loss": 0.6227, "grad_norm": 0.940082848072052, "learning_rate": 0.0002, "epoch": 4.863195691202873, "step": 67720}, {"loss": 0.6232, "grad_norm": 0.895434558391571, "learning_rate": 0.0002, "epoch": 4.863913824057451, "step": 67730}, {"loss": 0.6674, "grad_norm": 1.1194682121276855, "learning_rate": 0.0002, "epoch": 4.864631956912029, "step": 67740}, {"loss": 0.5981, "grad_norm": 0.9984544515609741, "learning_rate": 0.0002, "epoch": 4.865350089766607, "step": 67750}, {"loss": 0.6583, "grad_norm": 1.049224615097046, "learning_rate": 0.0002, "epoch": 4.866068222621185, "step": 67760}, {"loss": 0.583, "grad_norm": 1.009515643119812, "learning_rate": 0.0002, "epoch": 4.866786355475763, "step": 67770}, {"loss": 0.6466, "grad_norm": 1.0336902141571045, "learning_rate": 0.0002, "epoch": 4.867504488330341, "step": 67780}, {"loss": 0.6909, "grad_norm": 0.9310635924339294, "learning_rate": 0.0002, "epoch": 4.868222621184919, "step": 67790}, {"loss": 0.7267, "grad_norm": 0.934882640838623, "learning_rate": 0.0002, "epoch": 4.868940754039498, "step": 67800}, {"loss": 0.648, "grad_norm": 0.8663495779037476, "learning_rate": 0.0002, "epoch": 4.869658886894076, "step": 67810}, {"loss": 0.6275, "grad_norm": 1.0085018873214722, "learning_rate": 0.0002, "epoch": 4.870377019748654, "step": 67820}, {"loss": 0.6571, "grad_norm": 0.896507978439331, "learning_rate": 0.0002, "epoch": 4.871095152603232, "step": 67830}, {"loss": 0.6711, "grad_norm": 0.925809919834137, "learning_rate": 0.0002, "epoch": 4.87181328545781, "step": 67840}, {"loss": 0.5917, "grad_norm": 0.8044029474258423, "learning_rate": 0.0002, "epoch": 4.872531418312388, "step": 67850}, {"loss": 0.6671, "grad_norm": 1.0026800632476807, "learning_rate": 0.0002, "epoch": 4.873249551166966, "step": 67860}, {"loss": 0.6175, "grad_norm": 0.9577589631080627, "learning_rate": 0.0002, "epoch": 4.873967684021544, "step": 67870}, {"loss": 0.591, "grad_norm": 0.8225193619728088, "learning_rate": 0.0002, "epoch": 4.874685816876122, "step": 67880}, {"loss": 0.6, "grad_norm": 1.0019139051437378, "learning_rate": 0.0002, "epoch": 4.8754039497307, "step": 67890}, {"loss": 0.6521, "grad_norm": 0.9282827377319336, "learning_rate": 0.0002, "epoch": 4.876122082585278, "step": 67900}, {"loss": 0.6251, "grad_norm": 0.8204836249351501, "learning_rate": 0.0002, "epoch": 4.876840215439857, "step": 67910}, {"loss": 0.6345, "grad_norm": 0.907356321811676, "learning_rate": 0.0002, "epoch": 4.877558348294435, "step": 67920}, {"loss": 0.6438, "grad_norm": 1.12422776222229, "learning_rate": 0.0002, "epoch": 4.878276481149013, "step": 67930}, {"loss": 0.6727, "grad_norm": 0.8230205178260803, "learning_rate": 0.0002, "epoch": 4.878994614003591, "step": 67940}, {"loss": 0.6361, "grad_norm": 1.1588479280471802, "learning_rate": 0.0002, "epoch": 4.879712746858169, "step": 67950}, {"loss": 0.6489, "grad_norm": 1.1064553260803223, "learning_rate": 0.0002, "epoch": 4.880430879712747, "step": 67960}, {"loss": 0.5851, "grad_norm": 0.9311534762382507, "learning_rate": 0.0002, "epoch": 4.881149012567325, "step": 67970}, {"loss": 0.6238, "grad_norm": 0.7575639486312866, "learning_rate": 0.0002, "epoch": 4.881867145421903, "step": 67980}, {"loss": 0.5933, "grad_norm": 0.9201191067695618, "learning_rate": 0.0002, "epoch": 4.882585278276482, "step": 67990}, {"loss": 0.5806, "grad_norm": 0.8487658500671387, "learning_rate": 0.0002, "epoch": 4.88330341113106, "step": 68000}, {"loss": 0.598, "grad_norm": 0.9645208716392517, "learning_rate": 0.0002, "epoch": 4.884021543985638, "step": 68010}, {"loss": 0.6112, "grad_norm": 0.8594469428062439, "learning_rate": 0.0002, "epoch": 4.884739676840216, "step": 68020}, {"loss": 0.6115, "grad_norm": 0.9518412947654724, "learning_rate": 0.0002, "epoch": 4.885457809694794, "step": 68030}, {"loss": 0.6071, "grad_norm": 1.0934258699417114, "learning_rate": 0.0002, "epoch": 4.886175942549372, "step": 68040}, {"loss": 0.6265, "grad_norm": 0.988761842250824, "learning_rate": 0.0002, "epoch": 4.88689407540395, "step": 68050}, {"loss": 0.5981, "grad_norm": 0.7572013735771179, "learning_rate": 0.0002, "epoch": 4.887612208258528, "step": 68060}, {"loss": 0.6286, "grad_norm": 0.8801929950714111, "learning_rate": 0.0002, "epoch": 4.888330341113106, "step": 68070}, {"loss": 0.6503, "grad_norm": 1.0080658197402954, "learning_rate": 0.0002, "epoch": 4.889048473967684, "step": 68080}, {"loss": 0.6064, "grad_norm": 0.9588785171508789, "learning_rate": 0.0002, "epoch": 4.8897666068222625, "step": 68090}, {"loss": 0.6159, "grad_norm": 1.0994032621383667, "learning_rate": 0.0002, "epoch": 4.8904847396768405, "step": 68100}, {"loss": 0.6357, "grad_norm": 0.9851962924003601, "learning_rate": 0.0002, "epoch": 4.8912028725314185, "step": 68110}, {"loss": 0.5999, "grad_norm": 0.9566116333007812, "learning_rate": 0.0002, "epoch": 4.8919210053859965, "step": 68120}, {"loss": 0.6742, "grad_norm": 0.8708083033561707, "learning_rate": 0.0002, "epoch": 4.8926391382405745, "step": 68130}, {"loss": 0.6489, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 4.8933572710951525, "step": 68140}, {"loss": 0.6442, "grad_norm": 1.047988772392273, "learning_rate": 0.0002, "epoch": 4.8940754039497305, "step": 68150}, {"loss": 0.6176, "grad_norm": 0.8665831685066223, "learning_rate": 0.0002, "epoch": 4.8947935368043085, "step": 68160}, {"loss": 0.5721, "grad_norm": 0.9313908219337463, "learning_rate": 0.0002, "epoch": 4.8955116696588865, "step": 68170}, {"loss": 0.6073, "grad_norm": 0.9568582773208618, "learning_rate": 0.0002, "epoch": 4.896229802513465, "step": 68180}, {"loss": 0.6308, "grad_norm": 1.0427594184875488, "learning_rate": 0.0002, "epoch": 4.896947935368043, "step": 68190}, {"loss": 0.6357, "grad_norm": 0.9132021069526672, "learning_rate": 0.0002, "epoch": 4.897666068222621, "step": 68200}, {"loss": 0.6264, "grad_norm": 0.9597318768501282, "learning_rate": 0.0002, "epoch": 4.898384201077199, "step": 68210}, {"loss": 0.6025, "grad_norm": 1.0736947059631348, "learning_rate": 0.0002, "epoch": 4.899102333931777, "step": 68220}, {"loss": 0.5942, "grad_norm": 0.9318404793739319, "learning_rate": 0.0002, "epoch": 4.899820466786355, "step": 68230}, {"loss": 0.5991, "grad_norm": 0.8594326972961426, "learning_rate": 0.0002, "epoch": 4.900538599640933, "step": 68240}, {"loss": 0.6145, "grad_norm": 1.1437443494796753, "learning_rate": 0.0002, "epoch": 4.901256732495511, "step": 68250}, {"loss": 0.6414, "grad_norm": 1.1599408388137817, "learning_rate": 0.0002, "epoch": 4.901974865350089, "step": 68260}, {"loss": 0.6148, "grad_norm": 1.160628080368042, "learning_rate": 0.0002, "epoch": 4.902692998204667, "step": 68270}, {"loss": 0.613, "grad_norm": 1.0147801637649536, "learning_rate": 0.0002, "epoch": 4.903411131059246, "step": 68280}, {"loss": 0.6502, "grad_norm": 0.8622691631317139, "learning_rate": 0.0002, "epoch": 4.904129263913824, "step": 68290}, {"loss": 0.618, "grad_norm": 0.7179980874061584, "learning_rate": 0.0002, "epoch": 4.904847396768402, "step": 68300}, {"loss": 0.6388, "grad_norm": 1.1705092191696167, "learning_rate": 0.0002, "epoch": 4.90556552962298, "step": 68310}, {"loss": 0.6164, "grad_norm": 1.1687676906585693, "learning_rate": 0.0002, "epoch": 4.906283662477558, "step": 68320}, {"loss": 0.6791, "grad_norm": 1.1621531248092651, "learning_rate": 0.0002, "epoch": 4.907001795332136, "step": 68330}, {"loss": 0.6474, "grad_norm": 1.0241422653198242, "learning_rate": 0.0002, "epoch": 4.907719928186714, "step": 68340}, {"loss": 0.6225, "grad_norm": 0.943354070186615, "learning_rate": 0.0002, "epoch": 4.908438061041292, "step": 68350}, {"loss": 0.6596, "grad_norm": 0.8091703653335571, "learning_rate": 0.0002, "epoch": 4.909156193895871, "step": 68360}, {"loss": 0.6196, "grad_norm": 0.8871228694915771, "learning_rate": 0.0002, "epoch": 4.909874326750449, "step": 68370}, {"loss": 0.5714, "grad_norm": 1.0951069593429565, "learning_rate": 0.0002, "epoch": 4.910592459605027, "step": 68380}, {"loss": 0.6407, "grad_norm": 1.1355193853378296, "learning_rate": 0.0002, "epoch": 4.911310592459605, "step": 68390}, {"loss": 0.6369, "grad_norm": 1.0741122961044312, "learning_rate": 0.0002, "epoch": 4.912028725314183, "step": 68400}, {"loss": 0.6176, "grad_norm": 0.9285269975662231, "learning_rate": 0.0002, "epoch": 4.912746858168761, "step": 68410}, {"loss": 0.6433, "grad_norm": 1.080695390701294, "learning_rate": 0.0002, "epoch": 4.913464991023339, "step": 68420}, {"loss": 0.6505, "grad_norm": 0.921331524848938, "learning_rate": 0.0002, "epoch": 4.914183123877917, "step": 68430}, {"loss": 0.701, "grad_norm": 0.9763174057006836, "learning_rate": 0.0002, "epoch": 4.914901256732495, "step": 68440}, {"loss": 0.6429, "grad_norm": 1.1133354902267456, "learning_rate": 0.0002, "epoch": 4.915619389587073, "step": 68450}, {"loss": 0.6117, "grad_norm": 0.8373502492904663, "learning_rate": 0.0002, "epoch": 4.916337522441651, "step": 68460}, {"loss": 0.5993, "grad_norm": 0.9192346334457397, "learning_rate": 0.0002, "epoch": 4.91705565529623, "step": 68470}, {"loss": 0.626, "grad_norm": 1.0724657773971558, "learning_rate": 0.0002, "epoch": 4.917773788150808, "step": 68480}, {"loss": 0.6339, "grad_norm": 0.9209843873977661, "learning_rate": 0.0002, "epoch": 4.918491921005386, "step": 68490}, {"loss": 0.6427, "grad_norm": 0.9201577305793762, "learning_rate": 0.0002, "epoch": 4.919210053859964, "step": 68500}, {"loss": 0.6686, "grad_norm": 0.8086138963699341, "learning_rate": 0.0002, "epoch": 4.919928186714542, "step": 68510}, {"loss": 0.564, "grad_norm": 1.0917785167694092, "learning_rate": 0.0002, "epoch": 4.92064631956912, "step": 68520}, {"loss": 0.6177, "grad_norm": 0.9287897944450378, "learning_rate": 0.0002, "epoch": 4.921364452423698, "step": 68530}, {"loss": 0.6344, "grad_norm": 0.9830158948898315, "learning_rate": 0.0002, "epoch": 4.922082585278276, "step": 68540}, {"loss": 0.6583, "grad_norm": 0.8674678802490234, "learning_rate": 0.0002, "epoch": 4.922800718132855, "step": 68550}, {"loss": 0.6284, "grad_norm": 0.7996176481246948, "learning_rate": 0.0002, "epoch": 4.923518850987433, "step": 68560}, {"loss": 0.6089, "grad_norm": 1.1284033060073853, "learning_rate": 0.0002, "epoch": 4.924236983842011, "step": 68570}, {"loss": 0.6454, "grad_norm": 0.894339919090271, "learning_rate": 0.0002, "epoch": 4.924955116696589, "step": 68580}, {"loss": 0.6231, "grad_norm": 1.1140280961990356, "learning_rate": 0.0002, "epoch": 4.925673249551167, "step": 68590}, {"loss": 0.6318, "grad_norm": 0.9048344492912292, "learning_rate": 0.0002, "epoch": 4.926391382405745, "step": 68600}, {"loss": 0.5963, "grad_norm": 0.9380471706390381, "learning_rate": 0.0002, "epoch": 4.927109515260323, "step": 68610}, {"loss": 0.6384, "grad_norm": 0.8598429560661316, "learning_rate": 0.0002, "epoch": 4.927827648114901, "step": 68620}, {"loss": 0.6486, "grad_norm": 1.0813355445861816, "learning_rate": 0.0002, "epoch": 4.928545780969479, "step": 68630}, {"loss": 0.6367, "grad_norm": 0.979053795337677, "learning_rate": 0.0002, "epoch": 4.929263913824057, "step": 68640}, {"loss": 0.6084, "grad_norm": 0.8194574117660522, "learning_rate": 0.0002, "epoch": 4.929982046678636, "step": 68650}, {"loss": 0.6469, "grad_norm": 0.8593540787696838, "learning_rate": 0.0002, "epoch": 4.930700179533214, "step": 68660}, {"loss": 0.6465, "grad_norm": 1.0134016275405884, "learning_rate": 0.0002, "epoch": 4.931418312387792, "step": 68670}, {"loss": 0.6221, "grad_norm": 1.060586929321289, "learning_rate": 0.0002, "epoch": 4.93213644524237, "step": 68680}, {"loss": 0.5861, "grad_norm": 0.84132319688797, "learning_rate": 0.0002, "epoch": 4.932854578096948, "step": 68690}, {"loss": 0.6206, "grad_norm": 1.0767526626586914, "learning_rate": 0.0002, "epoch": 4.933572710951526, "step": 68700}, {"loss": 0.6294, "grad_norm": 0.8858519792556763, "learning_rate": 0.0002, "epoch": 4.934290843806104, "step": 68710}, {"loss": 0.6727, "grad_norm": 1.194031000137329, "learning_rate": 0.0002, "epoch": 4.935008976660682, "step": 68720}, {"loss": 0.6231, "grad_norm": 0.8270226120948792, "learning_rate": 0.0002, "epoch": 4.93572710951526, "step": 68730}, {"loss": 0.6538, "grad_norm": 1.0385973453521729, "learning_rate": 0.0002, "epoch": 4.936445242369839, "step": 68740}, {"loss": 0.623, "grad_norm": 0.9062243700027466, "learning_rate": 0.0002, "epoch": 4.937163375224417, "step": 68750}, {"loss": 0.6578, "grad_norm": 1.0526955127716064, "learning_rate": 0.0002, "epoch": 4.937881508078995, "step": 68760}, {"loss": 0.6425, "grad_norm": 0.930604100227356, "learning_rate": 0.0002, "epoch": 4.938599640933573, "step": 68770}, {"loss": 0.6228, "grad_norm": 0.9635265469551086, "learning_rate": 0.0002, "epoch": 4.939317773788151, "step": 68780}, {"loss": 0.6269, "grad_norm": 0.9825171232223511, "learning_rate": 0.0002, "epoch": 4.940035906642729, "step": 68790}, {"loss": 0.6063, "grad_norm": 0.9621182680130005, "learning_rate": 0.0002, "epoch": 4.940754039497307, "step": 68800}, {"loss": 0.6558, "grad_norm": 0.9655307531356812, "learning_rate": 0.0002, "epoch": 4.941472172351885, "step": 68810}, {"loss": 0.6441, "grad_norm": 1.2948180437088013, "learning_rate": 0.0002, "epoch": 4.942190305206463, "step": 68820}, {"loss": 0.6757, "grad_norm": 0.9206728339195251, "learning_rate": 0.0002, "epoch": 4.942908438061041, "step": 68830}, {"loss": 0.6554, "grad_norm": 1.0235631465911865, "learning_rate": 0.0002, "epoch": 4.94362657091562, "step": 68840}, {"loss": 0.6386, "grad_norm": 1.0542538166046143, "learning_rate": 0.0002, "epoch": 4.944344703770198, "step": 68850}, {"loss": 0.6359, "grad_norm": 0.9787087440490723, "learning_rate": 0.0002, "epoch": 4.945062836624776, "step": 68860}, {"loss": 0.659, "grad_norm": 0.9527219533920288, "learning_rate": 0.0002, "epoch": 4.945780969479354, "step": 68870}, {"loss": 0.6504, "grad_norm": 1.1525826454162598, "learning_rate": 0.0002, "epoch": 4.946499102333932, "step": 68880}, {"loss": 0.6345, "grad_norm": 0.8610072731971741, "learning_rate": 0.0002, "epoch": 4.94721723518851, "step": 68890}, {"loss": 0.6029, "grad_norm": 1.1403616666793823, "learning_rate": 0.0002, "epoch": 4.947935368043088, "step": 68900}, {"loss": 0.6476, "grad_norm": 1.10334312915802, "learning_rate": 0.0002, "epoch": 4.948653500897666, "step": 68910}, {"loss": 0.6123, "grad_norm": 0.8633760809898376, "learning_rate": 0.0002, "epoch": 4.949371633752245, "step": 68920}, {"loss": 0.6619, "grad_norm": 1.1291080713272095, "learning_rate": 0.0002, "epoch": 4.950089766606823, "step": 68930}, {"loss": 0.6003, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 4.950807899461401, "step": 68940}, {"loss": 0.6126, "grad_norm": 0.9207960963249207, "learning_rate": 0.0002, "epoch": 4.951526032315979, "step": 68950}, {"loss": 0.6031, "grad_norm": 0.9815934300422668, "learning_rate": 0.0002, "epoch": 4.952244165170557, "step": 68960}, {"loss": 0.6201, "grad_norm": 0.9725701808929443, "learning_rate": 0.0002, "epoch": 4.952962298025135, "step": 68970}, {"loss": 0.6251, "grad_norm": 0.844926655292511, "learning_rate": 0.0002, "epoch": 4.953680430879713, "step": 68980}, {"loss": 0.6446, "grad_norm": 0.9898511171340942, "learning_rate": 0.0002, "epoch": 4.954398563734291, "step": 68990}, {"loss": 0.629, "grad_norm": 1.1311410665512085, "learning_rate": 0.0002, "epoch": 4.955116696588869, "step": 69000}, {"loss": 0.6525, "grad_norm": 1.218610405921936, "learning_rate": 0.0002, "epoch": 4.955834829443447, "step": 69010}, {"loss": 0.6639, "grad_norm": 1.1536420583724976, "learning_rate": 0.0002, "epoch": 4.956552962298025, "step": 69020}, {"loss": 0.6375, "grad_norm": 1.1857786178588867, "learning_rate": 0.0002, "epoch": 4.957271095152604, "step": 69030}, {"loss": 0.6618, "grad_norm": 0.9969246983528137, "learning_rate": 0.0002, "epoch": 4.957989228007182, "step": 69040}, {"loss": 0.633, "grad_norm": 1.138635277748108, "learning_rate": 0.0002, "epoch": 4.95870736086176, "step": 69050}, {"loss": 0.6344, "grad_norm": 1.110474705696106, "learning_rate": 0.0002, "epoch": 4.959425493716338, "step": 69060}, {"loss": 0.687, "grad_norm": 1.0366318225860596, "learning_rate": 0.0002, "epoch": 4.960143626570916, "step": 69070}, {"loss": 0.6384, "grad_norm": 0.6927996277809143, "learning_rate": 0.0002, "epoch": 4.960861759425494, "step": 69080}, {"loss": 0.6337, "grad_norm": 1.0368026494979858, "learning_rate": 0.0002, "epoch": 4.961579892280072, "step": 69090}, {"loss": 0.6077, "grad_norm": 1.0638312101364136, "learning_rate": 0.0002, "epoch": 4.96229802513465, "step": 69100}, {"loss": 0.6403, "grad_norm": 1.0372415781021118, "learning_rate": 0.0002, "epoch": 4.9630161579892285, "step": 69110}, {"loss": 0.6347, "grad_norm": 0.8257387280464172, "learning_rate": 0.0002, "epoch": 4.9637342908438065, "step": 69120}, {"loss": 0.6405, "grad_norm": 1.0046974420547485, "learning_rate": 0.0002, "epoch": 4.9644524236983845, "step": 69130}, {"loss": 0.623, "grad_norm": 1.0139652490615845, "learning_rate": 0.0002, "epoch": 4.9651705565529625, "step": 69140}, {"loss": 0.5857, "grad_norm": 1.0214691162109375, "learning_rate": 0.0002, "epoch": 4.9658886894075405, "step": 69150}, {"loss": 0.624, "grad_norm": 1.1042424440383911, "learning_rate": 0.0002, "epoch": 4.9666068222621185, "step": 69160}, {"loss": 0.6475, "grad_norm": 0.8749067783355713, "learning_rate": 0.0002, "epoch": 4.9673249551166965, "step": 69170}, {"loss": 0.6734, "grad_norm": 0.9894024133682251, "learning_rate": 0.0002, "epoch": 4.9680430879712745, "step": 69180}, {"loss": 0.5894, "grad_norm": 1.0218034982681274, "learning_rate": 0.0002, "epoch": 4.9687612208258525, "step": 69190}, {"loss": 0.6423, "grad_norm": 0.9782929420471191, "learning_rate": 0.0002, "epoch": 4.9694793536804305, "step": 69200}, {"loss": 0.6455, "grad_norm": 0.9373409748077393, "learning_rate": 0.0002, "epoch": 4.9701974865350085, "step": 69210}, {"loss": 0.6105, "grad_norm": 1.0329546928405762, "learning_rate": 0.0002, "epoch": 4.970915619389587, "step": 69220}, {"loss": 0.6877, "grad_norm": 0.9746108055114746, "learning_rate": 0.0002, "epoch": 4.971633752244165, "step": 69230}, {"loss": 0.6342, "grad_norm": 0.9202073216438293, "learning_rate": 0.0002, "epoch": 4.972351885098743, "step": 69240}, {"loss": 0.6102, "grad_norm": 1.078032374382019, "learning_rate": 0.0002, "epoch": 4.973070017953321, "step": 69250}, {"loss": 0.6349, "grad_norm": 0.8860024809837341, "learning_rate": 0.0002, "epoch": 4.973788150807899, "step": 69260}, {"loss": 0.5971, "grad_norm": 0.915212094783783, "learning_rate": 0.0002, "epoch": 4.974506283662477, "step": 69270}, {"loss": 0.623, "grad_norm": 1.1192166805267334, "learning_rate": 0.0002, "epoch": 4.975224416517055, "step": 69280}, {"loss": 0.6347, "grad_norm": 0.8387445211410522, "learning_rate": 0.0002, "epoch": 4.975942549371633, "step": 69290}, {"loss": 0.6392, "grad_norm": 1.1210044622421265, "learning_rate": 0.0002, "epoch": 4.976660682226212, "step": 69300}, {"loss": 0.6565, "grad_norm": 1.0051207542419434, "learning_rate": 0.0002, "epoch": 4.97737881508079, "step": 69310}, {"loss": 0.5961, "grad_norm": 0.9248682856559753, "learning_rate": 0.0002, "epoch": 4.978096947935368, "step": 69320}, {"loss": 0.6067, "grad_norm": 0.8265128135681152, "learning_rate": 0.0002, "epoch": 4.978815080789946, "step": 69330}, {"loss": 0.6068, "grad_norm": 0.9432681798934937, "learning_rate": 0.0002, "epoch": 4.979533213644524, "step": 69340}, {"loss": 0.627, "grad_norm": 1.0135977268218994, "learning_rate": 0.0002, "epoch": 4.980251346499102, "step": 69350}, {"loss": 0.5882, "grad_norm": 0.9857245683670044, "learning_rate": 0.0002, "epoch": 4.98096947935368, "step": 69360}, {"loss": 0.6396, "grad_norm": 0.9215952157974243, "learning_rate": 0.0002, "epoch": 4.981687612208258, "step": 69370}, {"loss": 0.565, "grad_norm": 1.1518077850341797, "learning_rate": 0.0002, "epoch": 4.982405745062836, "step": 69380}, {"loss": 0.6022, "grad_norm": 0.8836095929145813, "learning_rate": 0.0002, "epoch": 4.983123877917414, "step": 69390}, {"loss": 0.6442, "grad_norm": 0.8082528710365295, "learning_rate": 0.0002, "epoch": 4.983842010771993, "step": 69400}, {"loss": 0.597, "grad_norm": 0.9295604825019836, "learning_rate": 0.0002, "epoch": 4.984560143626571, "step": 69410}, {"loss": 0.5811, "grad_norm": 1.002057433128357, "learning_rate": 0.0002, "epoch": 4.985278276481149, "step": 69420}, {"loss": 0.6275, "grad_norm": 0.8127216100692749, "learning_rate": 0.0002, "epoch": 4.985996409335727, "step": 69430}, {"loss": 0.6223, "grad_norm": 1.058138370513916, "learning_rate": 0.0002, "epoch": 4.986714542190305, "step": 69440}, {"loss": 0.6317, "grad_norm": 0.8451166749000549, "learning_rate": 0.0002, "epoch": 4.987432675044883, "step": 69450}, {"loss": 0.6135, "grad_norm": 0.9687268137931824, "learning_rate": 0.0002, "epoch": 4.988150807899461, "step": 69460}, {"loss": 0.5926, "grad_norm": 1.0342036485671997, "learning_rate": 0.0002, "epoch": 4.988868940754039, "step": 69470}, {"loss": 0.636, "grad_norm": 0.9042398929595947, "learning_rate": 0.0002, "epoch": 4.989587073608618, "step": 69480}, {"loss": 0.6193, "grad_norm": 1.0575438737869263, "learning_rate": 0.0002, "epoch": 4.990305206463196, "step": 69490}, {"loss": 0.5887, "grad_norm": 0.9364935159683228, "learning_rate": 0.0002, "epoch": 4.991023339317774, "step": 69500}, {"loss": 0.6532, "grad_norm": 1.0327378511428833, "learning_rate": 0.0002, "epoch": 4.991741472172352, "step": 69510}, {"loss": 0.6397, "grad_norm": 0.815592885017395, "learning_rate": 0.0002, "epoch": 4.99245960502693, "step": 69520}, {"loss": 0.6776, "grad_norm": 1.0813369750976562, "learning_rate": 0.0002, "epoch": 4.993177737881508, "step": 69530}, {"loss": 0.6964, "grad_norm": 1.0277023315429688, "learning_rate": 0.0002, "epoch": 4.993895870736086, "step": 69540}, {"loss": 0.6369, "grad_norm": 1.0291162729263306, "learning_rate": 0.0002, "epoch": 4.994614003590664, "step": 69550}, {"loss": 0.5842, "grad_norm": 0.8435685634613037, "learning_rate": 0.0002, "epoch": 4.995332136445242, "step": 69560}, {"loss": 0.6146, "grad_norm": 1.1972291469573975, "learning_rate": 0.0002, "epoch": 4.99605026929982, "step": 69570}, {"loss": 0.5977, "grad_norm": 0.8114907741546631, "learning_rate": 0.0002, "epoch": 4.996768402154398, "step": 69580}, {"loss": 0.6137, "grad_norm": 0.8296133875846863, "learning_rate": 0.0002, "epoch": 4.997486535008977, "step": 69590}, {"loss": 0.6273, "grad_norm": 1.1728706359863281, "learning_rate": 0.0002, "epoch": 4.998204667863555, "step": 69600}, {"loss": 0.6579, "grad_norm": 0.9586578607559204, "learning_rate": 0.0002, "epoch": 4.998922800718133, "step": 69610}, {"loss": 0.612, "grad_norm": 0.9725151062011719, "learning_rate": 0.0002, "epoch": 4.999640933572711, "step": 69620}, {"eval_loss": 1.133581519126892, "eval_runtime": 55.2151, "eval_samples_per_second": 13.275, "eval_steps_per_second": 1.666, "epoch": 5.0, "step": 69625}, {"loss": 0.5741, "grad_norm": 0.9312055706977844, "learning_rate": 0.0002, "epoch": 5.000359066427289, "step": 69630}, {"loss": 0.5625, "grad_norm": 1.0534896850585938, "learning_rate": 0.0002, "epoch": 5.001077199281867, "step": 69640}, {"loss": 0.581, "grad_norm": 0.8891698718070984, "learning_rate": 0.0002, "epoch": 5.001795332136445, "step": 69650}, {"loss": 0.554, "grad_norm": 0.7791097164154053, "learning_rate": 0.0002, "epoch": 5.002513464991023, "step": 69660}, {"loss": 0.5146, "grad_norm": 1.2891173362731934, "learning_rate": 0.0002, "epoch": 5.003231597845601, "step": 69670}, {"loss": 0.551, "grad_norm": 0.7909513711929321, "learning_rate": 0.0002, "epoch": 5.00394973070018, "step": 69680}, {"loss": 0.5671, "grad_norm": 0.988648533821106, "learning_rate": 0.0002, "epoch": 5.004667863554758, "step": 69690}, {"loss": 0.5113, "grad_norm": 0.9669296741485596, "learning_rate": 0.0002, "epoch": 5.005385996409336, "step": 69700}, {"loss": 0.5974, "grad_norm": 1.2393349409103394, "learning_rate": 0.0002, "epoch": 5.006104129263914, "step": 69710}, {"loss": 0.5481, "grad_norm": 1.2420750856399536, "learning_rate": 0.0002, "epoch": 5.006822262118492, "step": 69720}, {"loss": 0.5725, "grad_norm": 1.1698096990585327, "learning_rate": 0.0002, "epoch": 5.00754039497307, "step": 69730}, {"loss": 0.5646, "grad_norm": 1.2228301763534546, "learning_rate": 0.0002, "epoch": 5.008258527827648, "step": 69740}, {"loss": 0.6048, "grad_norm": 0.9350621104240417, "learning_rate": 0.0002, "epoch": 5.008976660682226, "step": 69750}, {"loss": 0.5278, "grad_norm": 0.9828507304191589, "learning_rate": 0.0002, "epoch": 5.009694793536804, "step": 69760}, {"loss": 0.5188, "grad_norm": 0.9372149109840393, "learning_rate": 0.0002, "epoch": 5.010412926391383, "step": 69770}, {"loss": 0.5408, "grad_norm": 0.8098477125167847, "learning_rate": 0.0002, "epoch": 5.011131059245961, "step": 69780}, {"loss": 0.533, "grad_norm": 1.0418338775634766, "learning_rate": 0.0002, "epoch": 5.011849192100539, "step": 69790}, {"loss": 0.5423, "grad_norm": 1.0175801515579224, "learning_rate": 0.0002, "epoch": 5.012567324955117, "step": 69800}, {"loss": 0.5389, "grad_norm": 1.2128081321716309, "learning_rate": 0.0002, "epoch": 5.013285457809695, "step": 69810}, {"loss": 0.5307, "grad_norm": 1.001805067062378, "learning_rate": 0.0002, "epoch": 5.014003590664273, "step": 69820}, {"loss": 0.533, "grad_norm": 0.8957470059394836, "learning_rate": 0.0002, "epoch": 5.014721723518851, "step": 69830}, {"loss": 0.6017, "grad_norm": 0.9344548583030701, "learning_rate": 0.0002, "epoch": 5.015439856373429, "step": 69840}, {"loss": 0.6182, "grad_norm": 0.8545927405357361, "learning_rate": 0.0002, "epoch": 5.016157989228007, "step": 69850}, {"loss": 0.5543, "grad_norm": 1.3907777070999146, "learning_rate": 0.0002, "epoch": 5.016876122082586, "step": 69860}, {"loss": 0.5028, "grad_norm": 0.8112093806266785, "learning_rate": 0.0002, "epoch": 5.017594254937164, "step": 69870}, {"loss": 0.5, "grad_norm": 1.0151532888412476, "learning_rate": 0.0002, "epoch": 5.018312387791742, "step": 69880}, {"loss": 0.5622, "grad_norm": 1.249021053314209, "learning_rate": 0.0002, "epoch": 5.01903052064632, "step": 69890}, {"loss": 0.5419, "grad_norm": 0.9310314059257507, "learning_rate": 0.0002, "epoch": 5.019748653500898, "step": 69900}, {"loss": 0.5628, "grad_norm": 0.9444572925567627, "learning_rate": 0.0002, "epoch": 5.020466786355476, "step": 69910}, {"loss": 0.5436, "grad_norm": 1.0952081680297852, "learning_rate": 0.0002, "epoch": 5.021184919210054, "step": 69920}, {"loss": 0.5532, "grad_norm": 1.2106375694274902, "learning_rate": 0.0002, "epoch": 5.021903052064632, "step": 69930}, {"loss": 0.5307, "grad_norm": 1.0179580450057983, "learning_rate": 0.0002, "epoch": 5.02262118491921, "step": 69940}, {"loss": 0.5537, "grad_norm": 1.0865367650985718, "learning_rate": 0.0002, "epoch": 5.023339317773788, "step": 69950}, {"loss": 0.6011, "grad_norm": 1.0965075492858887, "learning_rate": 0.0002, "epoch": 5.024057450628367, "step": 69960}, {"loss": 0.5255, "grad_norm": 0.8879445791244507, "learning_rate": 0.0002, "epoch": 5.024775583482945, "step": 69970}, {"loss": 0.5681, "grad_norm": 1.2588363885879517, "learning_rate": 0.0002, "epoch": 5.025493716337523, "step": 69980}, {"loss": 0.5288, "grad_norm": 0.935705304145813, "learning_rate": 0.0002, "epoch": 5.026211849192101, "step": 69990}, {"loss": 0.4922, "grad_norm": 1.072012186050415, "learning_rate": 0.0002, "epoch": 5.026929982046679, "step": 70000}, {"loss": 0.5729, "grad_norm": 1.286438226699829, "learning_rate": 0.0002, "epoch": 5.027648114901257, "step": 70010}, {"loss": 0.5569, "grad_norm": 1.1165392398834229, "learning_rate": 0.0002, "epoch": 5.028366247755835, "step": 70020}, {"loss": 0.5348, "grad_norm": 0.7998424172401428, "learning_rate": 0.0002, "epoch": 5.029084380610413, "step": 70030}, {"loss": 0.5436, "grad_norm": 1.5669852495193481, "learning_rate": 0.0002, "epoch": 5.029802513464991, "step": 70040}, {"loss": 0.5595, "grad_norm": 0.9780290722846985, "learning_rate": 0.0002, "epoch": 5.0305206463195695, "step": 70050}, {"loss": 0.5612, "grad_norm": 0.9837628602981567, "learning_rate": 0.0002, "epoch": 5.0312387791741475, "step": 70060}, {"loss": 0.5369, "grad_norm": 0.9558916091918945, "learning_rate": 0.0002, "epoch": 5.0319569120287255, "step": 70070}, {"loss": 0.552, "grad_norm": 0.8893155455589294, "learning_rate": 0.0002, "epoch": 5.0326750448833035, "step": 70080}, {"loss": 0.5684, "grad_norm": 1.1403675079345703, "learning_rate": 0.0002, "epoch": 5.0333931777378815, "step": 70090}, {"loss": 0.5352, "grad_norm": 1.0453649759292603, "learning_rate": 0.0002, "epoch": 5.0341113105924595, "step": 70100}, {"loss": 0.5691, "grad_norm": 0.8127498030662537, "learning_rate": 0.0002, "epoch": 5.0348294434470375, "step": 70110}, {"loss": 0.5254, "grad_norm": 0.9344680309295654, "learning_rate": 0.0002, "epoch": 5.0355475763016155, "step": 70120}, {"loss": 0.5385, "grad_norm": 1.0302079916000366, "learning_rate": 0.0002, "epoch": 5.0362657091561935, "step": 70130}, {"loss": 0.5949, "grad_norm": 1.0549713373184204, "learning_rate": 0.0002, "epoch": 5.036983842010772, "step": 70140}, {"loss": 0.4886, "grad_norm": 0.8916767835617065, "learning_rate": 0.0002, "epoch": 5.03770197486535, "step": 70150}, {"loss": 0.5761, "grad_norm": 0.9799798130989075, "learning_rate": 0.0002, "epoch": 5.038420107719928, "step": 70160}, {"loss": 0.5138, "grad_norm": 1.15560781955719, "learning_rate": 0.0002, "epoch": 5.039138240574506, "step": 70170}, {"loss": 0.6075, "grad_norm": 1.0577017068862915, "learning_rate": 0.0002, "epoch": 5.039856373429084, "step": 70180}, {"loss": 0.5316, "grad_norm": 1.027990698814392, "learning_rate": 0.0002, "epoch": 5.040574506283662, "step": 70190}, {"loss": 0.567, "grad_norm": 1.0818232297897339, "learning_rate": 0.0002, "epoch": 5.04129263913824, "step": 70200}, {"loss": 0.5699, "grad_norm": 1.0287196636199951, "learning_rate": 0.0002, "epoch": 5.042010771992818, "step": 70210}, {"loss": 0.5129, "grad_norm": 1.1569273471832275, "learning_rate": 0.0002, "epoch": 5.042728904847396, "step": 70220}, {"loss": 0.5407, "grad_norm": 1.0485484600067139, "learning_rate": 0.0002, "epoch": 5.0434470377019744, "step": 70230}, {"loss": 0.5203, "grad_norm": 0.9244540333747864, "learning_rate": 0.0002, "epoch": 5.044165170556553, "step": 70240}, {"loss": 0.5277, "grad_norm": 0.9576422572135925, "learning_rate": 0.0002, "epoch": 5.044883303411131, "step": 70250}, {"loss": 0.539, "grad_norm": 0.8719421625137329, "learning_rate": 0.0002, "epoch": 5.045601436265709, "step": 70260}, {"loss": 0.5725, "grad_norm": 0.8685409426689148, "learning_rate": 0.0002, "epoch": 5.046319569120287, "step": 70270}, {"loss": 0.5111, "grad_norm": 1.2735247611999512, "learning_rate": 0.0002, "epoch": 5.047037701974865, "step": 70280}, {"loss": 0.5768, "grad_norm": 0.9082128405570984, "learning_rate": 0.0002, "epoch": 5.047755834829443, "step": 70290}, {"loss": 0.5649, "grad_norm": 1.0626471042633057, "learning_rate": 0.0002, "epoch": 5.048473967684021, "step": 70300}, {"loss": 0.5694, "grad_norm": 1.1463991403579712, "learning_rate": 0.0002, "epoch": 5.049192100538599, "step": 70310}, {"loss": 0.5912, "grad_norm": 0.8825355172157288, "learning_rate": 0.0002, "epoch": 5.049910233393177, "step": 70320}, {"loss": 0.5814, "grad_norm": 1.0549408197402954, "learning_rate": 0.0002, "epoch": 5.050628366247756, "step": 70330}, {"loss": 0.5658, "grad_norm": 1.3740944862365723, "learning_rate": 0.0002, "epoch": 5.051346499102334, "step": 70340}, {"loss": 0.5665, "grad_norm": 1.4197895526885986, "learning_rate": 0.0002, "epoch": 5.052064631956912, "step": 70350}, {"loss": 0.5852, "grad_norm": 1.1764925718307495, "learning_rate": 0.0002, "epoch": 5.05278276481149, "step": 70360}, {"loss": 0.5551, "grad_norm": 1.0443403720855713, "learning_rate": 0.0002, "epoch": 5.053500897666068, "step": 70370}, {"loss": 0.5647, "grad_norm": 1.1807527542114258, "learning_rate": 0.0002, "epoch": 5.054219030520646, "step": 70380}, {"loss": 0.5712, "grad_norm": 1.4032433032989502, "learning_rate": 0.0002, "epoch": 5.054937163375224, "step": 70390}, {"loss": 0.5656, "grad_norm": 0.9815662503242493, "learning_rate": 0.0002, "epoch": 5.055655296229802, "step": 70400}, {"loss": 0.5878, "grad_norm": 0.9368446469306946, "learning_rate": 0.0002, "epoch": 5.05637342908438, "step": 70410}, {"loss": 0.5639, "grad_norm": 1.1156736612319946, "learning_rate": 0.0002, "epoch": 5.057091561938959, "step": 70420}, {"loss": 0.5564, "grad_norm": 1.01651132106781, "learning_rate": 0.0002, "epoch": 5.057809694793537, "step": 70430}, {"loss": 0.5276, "grad_norm": 0.9906342029571533, "learning_rate": 0.0002, "epoch": 5.058527827648115, "step": 70440}, {"loss": 0.5533, "grad_norm": 0.8666667938232422, "learning_rate": 0.0002, "epoch": 5.059245960502693, "step": 70450}, {"loss": 0.5253, "grad_norm": 1.0508924722671509, "learning_rate": 0.0002, "epoch": 5.059964093357271, "step": 70460}, {"loss": 0.5456, "grad_norm": 1.2472858428955078, "learning_rate": 0.0002, "epoch": 5.060682226211849, "step": 70470}, {"loss": 0.5836, "grad_norm": 1.019073724746704, "learning_rate": 0.0002, "epoch": 5.061400359066427, "step": 70480}, {"loss": 0.5206, "grad_norm": 0.9745403528213501, "learning_rate": 0.0002, "epoch": 5.062118491921005, "step": 70490}, {"loss": 0.5543, "grad_norm": 1.121208906173706, "learning_rate": 0.0002, "epoch": 5.062836624775583, "step": 70500}, {"loss": 0.54, "grad_norm": 1.0535147190093994, "learning_rate": 0.0002, "epoch": 5.063554757630161, "step": 70510}, {"loss": 0.5601, "grad_norm": 1.0368950366973877, "learning_rate": 0.0002, "epoch": 5.06427289048474, "step": 70520}, {"loss": 0.5495, "grad_norm": 0.948964536190033, "learning_rate": 0.0002, "epoch": 5.064991023339318, "step": 70530}, {"loss": 0.5254, "grad_norm": 1.0289826393127441, "learning_rate": 0.0002, "epoch": 5.065709156193896, "step": 70540}, {"loss": 0.591, "grad_norm": 1.118374228477478, "learning_rate": 0.0002, "epoch": 5.066427289048474, "step": 70550}, {"loss": 0.5874, "grad_norm": 0.8712816834449768, "learning_rate": 0.0002, "epoch": 5.067145421903052, "step": 70560}, {"loss": 0.557, "grad_norm": 0.9057969450950623, "learning_rate": 0.0002, "epoch": 5.06786355475763, "step": 70570}, {"loss": 0.5606, "grad_norm": 0.9292685985565186, "learning_rate": 0.0002, "epoch": 5.068581687612208, "step": 70580}, {"loss": 0.5468, "grad_norm": 0.9159911274909973, "learning_rate": 0.0002, "epoch": 5.069299820466786, "step": 70590}, {"loss": 0.5608, "grad_norm": 0.973848819732666, "learning_rate": 0.0002, "epoch": 5.070017953321364, "step": 70600}, {"loss": 0.5199, "grad_norm": 0.7892279028892517, "learning_rate": 0.0002, "epoch": 5.070736086175943, "step": 70610}, {"loss": 0.6009, "grad_norm": 0.9943311214447021, "learning_rate": 0.0002, "epoch": 5.071454219030521, "step": 70620}, {"loss": 0.5224, "grad_norm": 1.1457926034927368, "learning_rate": 0.0002, "epoch": 5.072172351885099, "step": 70630}, {"loss": 0.5821, "grad_norm": 0.9307738542556763, "learning_rate": 0.0002, "epoch": 5.072890484739677, "step": 70640}, {"loss": 0.5375, "grad_norm": 1.0899816751480103, "learning_rate": 0.0002, "epoch": 5.073608617594255, "step": 70650}, {"loss": 0.5407, "grad_norm": 0.8357672691345215, "learning_rate": 0.0002, "epoch": 5.074326750448833, "step": 70660}, {"loss": 0.5745, "grad_norm": 0.8889468312263489, "learning_rate": 0.0002, "epoch": 5.075044883303411, "step": 70670}, {"loss": 0.5595, "grad_norm": 0.9152118563652039, "learning_rate": 0.0002, "epoch": 5.075763016157989, "step": 70680}, {"loss": 0.5706, "grad_norm": 1.106160044670105, "learning_rate": 0.0002, "epoch": 5.076481149012567, "step": 70690}, {"loss": 0.5659, "grad_norm": 0.8519207835197449, "learning_rate": 0.0002, "epoch": 5.077199281867145, "step": 70700}, {"loss": 0.5312, "grad_norm": 0.9754986763000488, "learning_rate": 0.0002, "epoch": 5.077917414721724, "step": 70710}, {"loss": 0.5602, "grad_norm": 1.167883276939392, "learning_rate": 0.0002, "epoch": 5.078635547576302, "step": 70720}, {"loss": 0.5427, "grad_norm": 0.987622082233429, "learning_rate": 0.0002, "epoch": 5.07935368043088, "step": 70730}, {"loss": 0.5346, "grad_norm": 1.0008184909820557, "learning_rate": 0.0002, "epoch": 5.080071813285458, "step": 70740}, {"loss": 0.5219, "grad_norm": 0.6318819522857666, "learning_rate": 0.0002, "epoch": 5.080789946140036, "step": 70750}, {"loss": 0.5838, "grad_norm": 0.984886884689331, "learning_rate": 0.0002, "epoch": 5.081508078994614, "step": 70760}, {"loss": 0.5775, "grad_norm": 1.0583622455596924, "learning_rate": 0.0002, "epoch": 5.082226211849192, "step": 70770}, {"loss": 0.579, "grad_norm": 0.9730119705200195, "learning_rate": 0.0002, "epoch": 5.08294434470377, "step": 70780}, {"loss": 0.5806, "grad_norm": 1.0201330184936523, "learning_rate": 0.0002, "epoch": 5.083662477558348, "step": 70790}, {"loss": 0.5568, "grad_norm": 1.0479248762130737, "learning_rate": 0.0002, "epoch": 5.084380610412927, "step": 70800}, {"loss": 0.5619, "grad_norm": 0.9185113906860352, "learning_rate": 0.0002, "epoch": 5.085098743267505, "step": 70810}, {"loss": 0.5468, "grad_norm": 0.9326799511909485, "learning_rate": 0.0002, "epoch": 5.085816876122083, "step": 70820}, {"loss": 0.5424, "grad_norm": 0.958739697933197, "learning_rate": 0.0002, "epoch": 5.086535008976661, "step": 70830}, {"loss": 0.6098, "grad_norm": 0.9643770456314087, "learning_rate": 0.0002, "epoch": 5.087253141831239, "step": 70840}, {"loss": 0.5427, "grad_norm": 0.8650234341621399, "learning_rate": 0.0002, "epoch": 5.087971274685817, "step": 70850}, {"loss": 0.5452, "grad_norm": 0.9354105591773987, "learning_rate": 0.0002, "epoch": 5.088689407540395, "step": 70860}, {"loss": 0.5467, "grad_norm": 0.8736345171928406, "learning_rate": 0.0002, "epoch": 5.089407540394973, "step": 70870}, {"loss": 0.5607, "grad_norm": 0.9172632098197937, "learning_rate": 0.0002, "epoch": 5.090125673249551, "step": 70880}, {"loss": 0.5136, "grad_norm": 0.9495565295219421, "learning_rate": 0.0002, "epoch": 5.09084380610413, "step": 70890}, {"loss": 0.5633, "grad_norm": 1.0328829288482666, "learning_rate": 0.0002, "epoch": 5.091561938958708, "step": 70900}, {"loss": 0.566, "grad_norm": 0.9335703253746033, "learning_rate": 0.0002, "epoch": 5.092280071813286, "step": 70910}, {"loss": 0.5393, "grad_norm": 1.0919437408447266, "learning_rate": 0.0002, "epoch": 5.092998204667864, "step": 70920}, {"loss": 0.5931, "grad_norm": 1.03340744972229, "learning_rate": 0.0002, "epoch": 5.093716337522442, "step": 70930}, {"loss": 0.5228, "grad_norm": 1.0501604080200195, "learning_rate": 0.0002, "epoch": 5.09443447037702, "step": 70940}, {"loss": 0.5518, "grad_norm": 0.9442012310028076, "learning_rate": 0.0002, "epoch": 5.095152603231598, "step": 70950}, {"loss": 0.5185, "grad_norm": 1.2592464685440063, "learning_rate": 0.0002, "epoch": 5.095870736086176, "step": 70960}, {"loss": 0.5524, "grad_norm": 1.0961427688598633, "learning_rate": 0.0002, "epoch": 5.096588868940754, "step": 70970}, {"loss": 0.5702, "grad_norm": 1.0472424030303955, "learning_rate": 0.0002, "epoch": 5.097307001795333, "step": 70980}, {"loss": 0.5697, "grad_norm": 0.9489352107048035, "learning_rate": 0.0002, "epoch": 5.098025134649911, "step": 70990}, {"loss": 0.5559, "grad_norm": 1.0499446392059326, "learning_rate": 0.0002, "epoch": 5.098743267504489, "step": 71000}, {"loss": 0.5815, "grad_norm": 1.013005018234253, "learning_rate": 0.0002, "epoch": 5.099461400359067, "step": 71010}, {"loss": 0.5524, "grad_norm": 0.9594261050224304, "learning_rate": 0.0002, "epoch": 5.100179533213645, "step": 71020}, {"loss": 0.5746, "grad_norm": 1.2016123533248901, "learning_rate": 0.0002, "epoch": 5.100897666068223, "step": 71030}, {"loss": 0.5605, "grad_norm": 1.0389765501022339, "learning_rate": 0.0002, "epoch": 5.101615798922801, "step": 71040}, {"loss": 0.5036, "grad_norm": 1.053534746170044, "learning_rate": 0.0002, "epoch": 5.102333931777379, "step": 71050}, {"loss": 0.5764, "grad_norm": 1.1379448175430298, "learning_rate": 0.0002, "epoch": 5.103052064631957, "step": 71060}, {"loss": 0.5487, "grad_norm": 0.8796491622924805, "learning_rate": 0.0002, "epoch": 5.103770197486535, "step": 71070}, {"loss": 0.59, "grad_norm": 1.0591254234313965, "learning_rate": 0.0002, "epoch": 5.1044883303411135, "step": 71080}, {"loss": 0.5591, "grad_norm": 0.9622171521186829, "learning_rate": 0.0002, "epoch": 5.1052064631956915, "step": 71090}, {"loss": 0.5737, "grad_norm": 0.9173060059547424, "learning_rate": 0.0002, "epoch": 5.1059245960502695, "step": 71100}, {"loss": 0.5794, "grad_norm": 0.8363444805145264, "learning_rate": 0.0002, "epoch": 5.1066427289048475, "step": 71110}, {"loss": 0.5689, "grad_norm": 1.1006172895431519, "learning_rate": 0.0002, "epoch": 5.1073608617594255, "step": 71120}, {"loss": 0.5753, "grad_norm": 1.0720574855804443, "learning_rate": 0.0002, "epoch": 5.1080789946140035, "step": 71130}, {"loss": 0.5585, "grad_norm": 1.0560680627822876, "learning_rate": 0.0002, "epoch": 5.1087971274685815, "step": 71140}, {"loss": 0.5535, "grad_norm": 0.8485415577888489, "learning_rate": 0.0002, "epoch": 5.1095152603231595, "step": 71150}, {"loss": 0.545, "grad_norm": 1.109383225440979, "learning_rate": 0.0002, "epoch": 5.1102333931777375, "step": 71160}, {"loss": 0.568, "grad_norm": 0.9296035766601562, "learning_rate": 0.0002, "epoch": 5.110951526032316, "step": 71170}, {"loss": 0.5151, "grad_norm": 1.2855182886123657, "learning_rate": 0.0002, "epoch": 5.111669658886894, "step": 71180}, {"loss": 0.5578, "grad_norm": 1.0313524007797241, "learning_rate": 0.0002, "epoch": 5.112387791741472, "step": 71190}, {"loss": 0.5486, "grad_norm": 1.0436697006225586, "learning_rate": 0.0002, "epoch": 5.11310592459605, "step": 71200}, {"loss": 0.5592, "grad_norm": 0.901333212852478, "learning_rate": 0.0002, "epoch": 5.113824057450628, "step": 71210}, {"loss": 0.5644, "grad_norm": 1.2170051336288452, "learning_rate": 0.0002, "epoch": 5.114542190305206, "step": 71220}, {"loss": 0.5508, "grad_norm": 0.8850961327552795, "learning_rate": 0.0002, "epoch": 5.115260323159784, "step": 71230}, {"loss": 0.5814, "grad_norm": 1.0147113800048828, "learning_rate": 0.0002, "epoch": 5.115978456014362, "step": 71240}, {"loss": 0.5824, "grad_norm": 1.0043506622314453, "learning_rate": 0.0002, "epoch": 5.11669658886894, "step": 71250}, {"loss": 0.5363, "grad_norm": 0.9887113571166992, "learning_rate": 0.0002, "epoch": 5.117414721723518, "step": 71260}, {"loss": 0.5956, "grad_norm": 1.1013392210006714, "learning_rate": 0.0002, "epoch": 5.118132854578097, "step": 71270}, {"loss": 0.5596, "grad_norm": 0.9213799238204956, "learning_rate": 0.0002, "epoch": 5.118850987432675, "step": 71280}, {"loss": 0.5473, "grad_norm": 1.047400712966919, "learning_rate": 0.0002, "epoch": 5.119569120287253, "step": 71290}, {"loss": 0.5866, "grad_norm": 1.030534029006958, "learning_rate": 0.0002, "epoch": 5.120287253141831, "step": 71300}, {"loss": 0.5713, "grad_norm": 0.9464976191520691, "learning_rate": 0.0002, "epoch": 5.121005385996409, "step": 71310}, {"loss": 0.5707, "grad_norm": 0.8610315918922424, "learning_rate": 0.0002, "epoch": 5.121723518850987, "step": 71320}, {"loss": 0.5498, "grad_norm": 1.0824426412582397, "learning_rate": 0.0002, "epoch": 5.122441651705565, "step": 71330}, {"loss": 0.5802, "grad_norm": 0.9382733106613159, "learning_rate": 0.0002, "epoch": 5.123159784560143, "step": 71340}, {"loss": 0.5899, "grad_norm": 0.9364684224128723, "learning_rate": 0.0002, "epoch": 5.123877917414721, "step": 71350}, {"loss": 0.5839, "grad_norm": 0.9583013653755188, "learning_rate": 0.0002, "epoch": 5.1245960502693, "step": 71360}, {"loss": 0.5446, "grad_norm": 1.287533164024353, "learning_rate": 0.0002, "epoch": 5.125314183123878, "step": 71370}, {"loss": 0.5602, "grad_norm": 1.5031169652938843, "learning_rate": 0.0002, "epoch": 5.126032315978456, "step": 71380}, {"loss": 0.5143, "grad_norm": 0.9891406297683716, "learning_rate": 0.0002, "epoch": 5.126750448833034, "step": 71390}, {"loss": 0.5408, "grad_norm": 1.1851537227630615, "learning_rate": 0.0002, "epoch": 5.127468581687612, "step": 71400}, {"loss": 0.586, "grad_norm": 0.9869971871376038, "learning_rate": 0.0002, "epoch": 5.12818671454219, "step": 71410}, {"loss": 0.575, "grad_norm": 0.961662769317627, "learning_rate": 0.0002, "epoch": 5.128904847396768, "step": 71420}, {"loss": 0.5686, "grad_norm": 1.1036419868469238, "learning_rate": 0.0002, "epoch": 5.129622980251346, "step": 71430}, {"loss": 0.5642, "grad_norm": 1.175361156463623, "learning_rate": 0.0002, "epoch": 5.130341113105924, "step": 71440}, {"loss": 0.5294, "grad_norm": 0.9801875948905945, "learning_rate": 0.0002, "epoch": 5.131059245960503, "step": 71450}, {"loss": 0.5123, "grad_norm": 0.9424611330032349, "learning_rate": 0.0002, "epoch": 5.131777378815081, "step": 71460}, {"loss": 0.651, "grad_norm": 1.11662757396698, "learning_rate": 0.0002, "epoch": 5.132495511669659, "step": 71470}, {"loss": 0.5498, "grad_norm": 0.9969366192817688, "learning_rate": 0.0002, "epoch": 5.133213644524237, "step": 71480}, {"loss": 0.5315, "grad_norm": 1.278640866279602, "learning_rate": 0.0002, "epoch": 5.133931777378815, "step": 71490}, {"loss": 0.5525, "grad_norm": 1.1090457439422607, "learning_rate": 0.0002, "epoch": 5.134649910233393, "step": 71500}, {"loss": 0.5307, "grad_norm": 1.01808500289917, "learning_rate": 0.0002, "epoch": 5.135368043087971, "step": 71510}, {"loss": 0.5465, "grad_norm": 1.029135823249817, "learning_rate": 0.0002, "epoch": 5.136086175942549, "step": 71520}, {"loss": 0.588, "grad_norm": 1.1207175254821777, "learning_rate": 0.0002, "epoch": 5.136804308797127, "step": 71530}, {"loss": 0.5451, "grad_norm": 1.0327218770980835, "learning_rate": 0.0002, "epoch": 5.137522441651706, "step": 71540}, {"loss": 0.5944, "grad_norm": 1.042490839958191, "learning_rate": 0.0002, "epoch": 5.138240574506284, "step": 71550}, {"loss": 0.5777, "grad_norm": 1.1800413131713867, "learning_rate": 0.0002, "epoch": 5.138958707360862, "step": 71560}, {"loss": 0.6002, "grad_norm": 1.0748766660690308, "learning_rate": 0.0002, "epoch": 5.13967684021544, "step": 71570}, {"loss": 0.5418, "grad_norm": 0.9983090758323669, "learning_rate": 0.0002, "epoch": 5.140394973070018, "step": 71580}, {"loss": 0.5423, "grad_norm": 1.30636727809906, "learning_rate": 0.0002, "epoch": 5.141113105924596, "step": 71590}, {"loss": 0.5742, "grad_norm": 0.9960222840309143, "learning_rate": 0.0002, "epoch": 5.141831238779174, "step": 71600}, {"loss": 0.5496, "grad_norm": 1.237027645111084, "learning_rate": 0.0002, "epoch": 5.142549371633752, "step": 71610}, {"loss": 0.564, "grad_norm": 1.0913307666778564, "learning_rate": 0.0002, "epoch": 5.14326750448833, "step": 71620}, {"loss": 0.5458, "grad_norm": 0.940657913684845, "learning_rate": 0.0002, "epoch": 5.143985637342908, "step": 71630}, {"loss": 0.5918, "grad_norm": 1.093796730041504, "learning_rate": 0.0002, "epoch": 5.144703770197487, "step": 71640}, {"loss": 0.5519, "grad_norm": 0.9703856110572815, "learning_rate": 0.0002, "epoch": 5.145421903052065, "step": 71650}, {"loss": 0.5859, "grad_norm": 0.9874776005744934, "learning_rate": 0.0002, "epoch": 5.146140035906643, "step": 71660}, {"loss": 0.555, "grad_norm": 0.9723859429359436, "learning_rate": 0.0002, "epoch": 5.146858168761221, "step": 71670}, {"loss": 0.5866, "grad_norm": 0.997107207775116, "learning_rate": 0.0002, "epoch": 5.147576301615799, "step": 71680}, {"loss": 0.5399, "grad_norm": 1.0261175632476807, "learning_rate": 0.0002, "epoch": 5.148294434470377, "step": 71690}, {"loss": 0.5427, "grad_norm": 0.9093905687332153, "learning_rate": 0.0002, "epoch": 5.149012567324955, "step": 71700}, {"loss": 0.557, "grad_norm": 0.9909888505935669, "learning_rate": 0.0002, "epoch": 5.149730700179533, "step": 71710}, {"loss": 0.5343, "grad_norm": 0.9111971259117126, "learning_rate": 0.0002, "epoch": 5.150448833034111, "step": 71720}, {"loss": 0.5717, "grad_norm": 0.9319643974304199, "learning_rate": 0.0002, "epoch": 5.15116696588869, "step": 71730}, {"loss": 0.5676, "grad_norm": 1.0744104385375977, "learning_rate": 0.0002, "epoch": 5.151885098743268, "step": 71740}, {"loss": 0.5914, "grad_norm": 1.1555477380752563, "learning_rate": 0.0002, "epoch": 5.152603231597846, "step": 71750}, {"loss": 0.5859, "grad_norm": 0.9809171557426453, "learning_rate": 0.0002, "epoch": 5.153321364452424, "step": 71760}, {"loss": 0.5663, "grad_norm": 0.7937686443328857, "learning_rate": 0.0002, "epoch": 5.154039497307002, "step": 71770}, {"loss": 0.5637, "grad_norm": 1.1925430297851562, "learning_rate": 0.0002, "epoch": 5.15475763016158, "step": 71780}, {"loss": 0.5759, "grad_norm": 1.077412486076355, "learning_rate": 0.0002, "epoch": 5.155475763016158, "step": 71790}, {"loss": 0.5653, "grad_norm": 0.7992808222770691, "learning_rate": 0.0002, "epoch": 5.156193895870736, "step": 71800}, {"loss": 0.5596, "grad_norm": 1.0938535928726196, "learning_rate": 0.0002, "epoch": 5.156912028725314, "step": 71810}, {"loss": 0.5562, "grad_norm": 0.9458112120628357, "learning_rate": 0.0002, "epoch": 5.157630161579892, "step": 71820}, {"loss": 0.5514, "grad_norm": 0.984940230846405, "learning_rate": 0.0002, "epoch": 5.158348294434471, "step": 71830}, {"loss": 0.5262, "grad_norm": 0.9242565035820007, "learning_rate": 0.0002, "epoch": 5.159066427289049, "step": 71840}, {"loss": 0.5591, "grad_norm": 0.8386720418930054, "learning_rate": 0.0002, "epoch": 5.159784560143627, "step": 71850}, {"loss": 0.5871, "grad_norm": 0.9627357721328735, "learning_rate": 0.0002, "epoch": 5.160502692998205, "step": 71860}, {"loss": 0.6063, "grad_norm": 1.0118762254714966, "learning_rate": 0.0002, "epoch": 5.161220825852783, "step": 71870}, {"loss": 0.5558, "grad_norm": 1.1552608013153076, "learning_rate": 0.0002, "epoch": 5.161938958707361, "step": 71880}, {"loss": 0.5789, "grad_norm": 1.0910389423370361, "learning_rate": 0.0002, "epoch": 5.162657091561939, "step": 71890}, {"loss": 0.5568, "grad_norm": 1.046639084815979, "learning_rate": 0.0002, "epoch": 5.163375224416517, "step": 71900}, {"loss": 0.5646, "grad_norm": 1.0087649822235107, "learning_rate": 0.0002, "epoch": 5.164093357271095, "step": 71910}, {"loss": 0.5663, "grad_norm": 0.9418644309043884, "learning_rate": 0.0002, "epoch": 5.164811490125674, "step": 71920}, {"loss": 0.5668, "grad_norm": 1.1213915348052979, "learning_rate": 0.0002, "epoch": 5.165529622980252, "step": 71930}, {"loss": 0.5979, "grad_norm": 1.043786644935608, "learning_rate": 0.0002, "epoch": 5.16624775583483, "step": 71940}, {"loss": 0.5714, "grad_norm": 1.2150449752807617, "learning_rate": 0.0002, "epoch": 5.166965888689408, "step": 71950}, {"loss": 0.5766, "grad_norm": 1.1214520931243896, "learning_rate": 0.0002, "epoch": 5.167684021543986, "step": 71960}, {"loss": 0.5851, "grad_norm": 0.9235218167304993, "learning_rate": 0.0002, "epoch": 5.168402154398564, "step": 71970}, {"loss": 0.5917, "grad_norm": 0.8736480474472046, "learning_rate": 0.0002, "epoch": 5.169120287253142, "step": 71980}, {"loss": 0.5508, "grad_norm": 0.8723195195198059, "learning_rate": 0.0002, "epoch": 5.16983842010772, "step": 71990}, {"loss": 0.5927, "grad_norm": 1.0873022079467773, "learning_rate": 0.0002, "epoch": 5.170556552962298, "step": 72000}, {"loss": 0.5507, "grad_norm": 0.9196295142173767, "learning_rate": 0.0002, "epoch": 5.1712746858168765, "step": 72010}, {"loss": 0.5416, "grad_norm": 0.9244471192359924, "learning_rate": 0.0002, "epoch": 5.1719928186714546, "step": 72020}, {"loss": 0.5626, "grad_norm": 1.0555505752563477, "learning_rate": 0.0002, "epoch": 5.1727109515260326, "step": 72030}, {"loss": 0.6181, "grad_norm": 1.1527929306030273, "learning_rate": 0.0002, "epoch": 5.1734290843806106, "step": 72040}, {"loss": 0.6129, "grad_norm": 0.9069058895111084, "learning_rate": 0.0002, "epoch": 5.174147217235189, "step": 72050}, {"loss": 0.5597, "grad_norm": 1.1047141551971436, "learning_rate": 0.0002, "epoch": 5.174865350089767, "step": 72060}, {"loss": 0.5307, "grad_norm": 0.9805511832237244, "learning_rate": 0.0002, "epoch": 5.175583482944345, "step": 72070}, {"loss": 0.5672, "grad_norm": 1.1636970043182373, "learning_rate": 0.0002, "epoch": 5.176301615798923, "step": 72080}, {"loss": 0.6424, "grad_norm": 1.0193538665771484, "learning_rate": 0.0002, "epoch": 5.177019748653501, "step": 72090}, {"loss": 0.5722, "grad_norm": 0.8850618600845337, "learning_rate": 0.0002, "epoch": 5.177737881508079, "step": 72100}, {"loss": 0.5938, "grad_norm": 1.042271614074707, "learning_rate": 0.0002, "epoch": 5.1784560143626575, "step": 72110}, {"loss": 0.569, "grad_norm": 1.1405227184295654, "learning_rate": 0.0002, "epoch": 5.1791741472172355, "step": 72120}, {"loss": 0.5762, "grad_norm": 1.0013195276260376, "learning_rate": 0.0002, "epoch": 5.1798922800718135, "step": 72130}, {"loss": 0.5948, "grad_norm": 1.0474903583526611, "learning_rate": 0.0002, "epoch": 5.1806104129263915, "step": 72140}, {"loss": 0.5692, "grad_norm": 1.0384612083435059, "learning_rate": 0.0002, "epoch": 5.1813285457809695, "step": 72150}, {"loss": 0.5588, "grad_norm": 1.145086646080017, "learning_rate": 0.0002, "epoch": 5.1820466786355475, "step": 72160}, {"loss": 0.5294, "grad_norm": 1.0845173597335815, "learning_rate": 0.0002, "epoch": 5.1827648114901255, "step": 72170}, {"loss": 0.5796, "grad_norm": 0.9870346188545227, "learning_rate": 0.0002, "epoch": 5.1834829443447035, "step": 72180}, {"loss": 0.5844, "grad_norm": 1.1098768711090088, "learning_rate": 0.0002, "epoch": 5.1842010771992815, "step": 72190}, {"loss": 0.5536, "grad_norm": 0.9397785067558289, "learning_rate": 0.0002, "epoch": 5.18491921005386, "step": 72200}, {"loss": 0.5847, "grad_norm": 1.0817532539367676, "learning_rate": 0.0002, "epoch": 5.185637342908438, "step": 72210}, {"loss": 0.5492, "grad_norm": 1.0027309656143188, "learning_rate": 0.0002, "epoch": 5.186355475763016, "step": 72220}, {"loss": 0.5685, "grad_norm": 0.8262016773223877, "learning_rate": 0.0002, "epoch": 5.187073608617594, "step": 72230}, {"loss": 0.53, "grad_norm": 0.9968137741088867, "learning_rate": 0.0002, "epoch": 5.187791741472172, "step": 72240}, {"loss": 0.5663, "grad_norm": 0.9072695970535278, "learning_rate": 0.0002, "epoch": 5.18850987432675, "step": 72250}, {"loss": 0.5799, "grad_norm": 1.0388357639312744, "learning_rate": 0.0002, "epoch": 5.189228007181328, "step": 72260}, {"loss": 0.5805, "grad_norm": 0.8883537650108337, "learning_rate": 0.0002, "epoch": 5.189946140035906, "step": 72270}, {"loss": 0.5723, "grad_norm": 1.0161921977996826, "learning_rate": 0.0002, "epoch": 5.190664272890484, "step": 72280}, {"loss": 0.5805, "grad_norm": 0.964936375617981, "learning_rate": 0.0002, "epoch": 5.191382405745063, "step": 72290}, {"loss": 0.5145, "grad_norm": 0.9728496670722961, "learning_rate": 0.0002, "epoch": 5.192100538599641, "step": 72300}, {"loss": 0.552, "grad_norm": 1.2411649227142334, "learning_rate": 0.0002, "epoch": 5.192818671454219, "step": 72310}, {"loss": 0.5482, "grad_norm": 0.9430946111679077, "learning_rate": 0.0002, "epoch": 5.193536804308797, "step": 72320}, {"loss": 0.5007, "grad_norm": 1.1522886753082275, "learning_rate": 0.0002, "epoch": 5.194254937163375, "step": 72330}, {"loss": 0.5013, "grad_norm": 1.0727189779281616, "learning_rate": 0.0002, "epoch": 5.194973070017953, "step": 72340}, {"loss": 0.5157, "grad_norm": 1.2506077289581299, "learning_rate": 0.0002, "epoch": 5.195691202872531, "step": 72350}, {"loss": 0.592, "grad_norm": 1.0949938297271729, "learning_rate": 0.0002, "epoch": 5.196409335727109, "step": 72360}, {"loss": 0.5642, "grad_norm": 1.191125750541687, "learning_rate": 0.0002, "epoch": 5.197127468581687, "step": 72370}, {"loss": 0.5756, "grad_norm": 1.1154223680496216, "learning_rate": 0.0002, "epoch": 5.197845601436265, "step": 72380}, {"loss": 0.5996, "grad_norm": 0.9623886942863464, "learning_rate": 0.0002, "epoch": 5.198563734290844, "step": 72390}, {"loss": 0.5579, "grad_norm": 0.9432680010795593, "learning_rate": 0.0002, "epoch": 5.199281867145422, "step": 72400}, {"loss": 0.6055, "grad_norm": 1.035905122756958, "learning_rate": 0.0002, "epoch": 5.2, "step": 72410}, {"loss": 0.5515, "grad_norm": 0.9044913053512573, "learning_rate": 0.0002, "epoch": 5.200718132854578, "step": 72420}, {"loss": 0.5845, "grad_norm": 1.082187533378601, "learning_rate": 0.0002, "epoch": 5.201436265709156, "step": 72430}, {"loss": 0.6215, "grad_norm": 0.9368400573730469, "learning_rate": 0.0002, "epoch": 5.202154398563734, "step": 72440}, {"loss": 0.5903, "grad_norm": 1.1515194177627563, "learning_rate": 0.0002, "epoch": 5.202872531418312, "step": 72450}, {"loss": 0.5698, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 5.20359066427289, "step": 72460}, {"loss": 0.5534, "grad_norm": 1.0885688066482544, "learning_rate": 0.0002, "epoch": 5.204308797127468, "step": 72470}, {"loss": 0.5459, "grad_norm": 0.8189428448677063, "learning_rate": 0.0002, "epoch": 5.205026929982047, "step": 72480}, {"loss": 0.5981, "grad_norm": 1.0145429372787476, "learning_rate": 0.0002, "epoch": 5.205745062836625, "step": 72490}, {"loss": 0.5451, "grad_norm": 1.132490634918213, "learning_rate": 0.0002, "epoch": 5.206463195691203, "step": 72500}, {"loss": 0.5566, "grad_norm": 0.8866808414459229, "learning_rate": 0.0002, "epoch": 5.207181328545781, "step": 72510}, {"loss": 0.5469, "grad_norm": 0.9681518077850342, "learning_rate": 0.0002, "epoch": 5.207899461400359, "step": 72520}, {"loss": 0.5716, "grad_norm": 0.9992330074310303, "learning_rate": 0.0002, "epoch": 5.208617594254937, "step": 72530}, {"loss": 0.5894, "grad_norm": 1.0767436027526855, "learning_rate": 0.0002, "epoch": 5.209335727109515, "step": 72540}, {"loss": 0.5828, "grad_norm": 1.1362388134002686, "learning_rate": 0.0002, "epoch": 5.210053859964093, "step": 72550}, {"loss": 0.6156, "grad_norm": 0.9741758704185486, "learning_rate": 0.0002, "epoch": 5.210771992818671, "step": 72560}, {"loss": 0.6119, "grad_norm": 0.8216298818588257, "learning_rate": 0.0002, "epoch": 5.211490125673249, "step": 72570}, {"loss": 0.5813, "grad_norm": 0.7500724792480469, "learning_rate": 0.0002, "epoch": 5.212208258527828, "step": 72580}, {"loss": 0.5427, "grad_norm": 0.9152594804763794, "learning_rate": 0.0002, "epoch": 5.212926391382406, "step": 72590}, {"loss": 0.5792, "grad_norm": 1.014940857887268, "learning_rate": 0.0002, "epoch": 5.213644524236984, "step": 72600}, {"loss": 0.5487, "grad_norm": 0.9333099722862244, "learning_rate": 0.0002, "epoch": 5.214362657091562, "step": 72610}, {"loss": 0.5647, "grad_norm": 0.7940610647201538, "learning_rate": 0.0002, "epoch": 5.21508078994614, "step": 72620}, {"loss": 0.5474, "grad_norm": 1.0365521907806396, "learning_rate": 0.0002, "epoch": 5.215798922800718, "step": 72630}, {"loss": 0.6009, "grad_norm": 1.37727952003479, "learning_rate": 0.0002, "epoch": 5.216517055655296, "step": 72640}, {"loss": 0.5389, "grad_norm": 1.2019168138504028, "learning_rate": 0.0002, "epoch": 5.217235188509874, "step": 72650}, {"loss": 0.5593, "grad_norm": 1.1696226596832275, "learning_rate": 0.0002, "epoch": 5.217953321364452, "step": 72660}, {"loss": 0.5507, "grad_norm": 0.9608798623085022, "learning_rate": 0.0002, "epoch": 5.218671454219031, "step": 72670}, {"loss": 0.5502, "grad_norm": 0.9139777421951294, "learning_rate": 0.0002, "epoch": 5.219389587073609, "step": 72680}, {"loss": 0.5955, "grad_norm": 0.9937016367912292, "learning_rate": 0.0002, "epoch": 5.220107719928187, "step": 72690}, {"loss": 0.6031, "grad_norm": 1.2787059545516968, "learning_rate": 0.0002, "epoch": 5.220825852782765, "step": 72700}, {"loss": 0.5601, "grad_norm": 1.0757197141647339, "learning_rate": 0.0002, "epoch": 5.221543985637343, "step": 72710}, {"loss": 0.5556, "grad_norm": 0.8053579926490784, "learning_rate": 0.0002, "epoch": 5.222262118491921, "step": 72720}, {"loss": 0.5655, "grad_norm": 1.0239759683609009, "learning_rate": 0.0002, "epoch": 5.222980251346499, "step": 72730}, {"loss": 0.6153, "grad_norm": 0.9972975850105286, "learning_rate": 0.0002, "epoch": 5.223698384201077, "step": 72740}, {"loss": 0.569, "grad_norm": 1.0504519939422607, "learning_rate": 0.0002, "epoch": 5.224416517055655, "step": 72750}, {"loss": 0.5345, "grad_norm": 1.1793010234832764, "learning_rate": 0.0002, "epoch": 5.225134649910234, "step": 72760}, {"loss": 0.5674, "grad_norm": 1.1098815202713013, "learning_rate": 0.0002, "epoch": 5.225852782764812, "step": 72770}, {"loss": 0.5689, "grad_norm": 1.1078516244888306, "learning_rate": 0.0002, "epoch": 5.22657091561939, "step": 72780}, {"loss": 0.5614, "grad_norm": 0.8684433698654175, "learning_rate": 0.0002, "epoch": 5.227289048473968, "step": 72790}, {"loss": 0.5545, "grad_norm": 1.159390926361084, "learning_rate": 0.0002, "epoch": 5.228007181328546, "step": 72800}, {"loss": 0.5726, "grad_norm": 1.0468506813049316, "learning_rate": 0.0002, "epoch": 5.228725314183124, "step": 72810}, {"loss": 0.5662, "grad_norm": 0.8684625029563904, "learning_rate": 0.0002, "epoch": 5.229443447037702, "step": 72820}, {"loss": 0.6074, "grad_norm": 1.0117321014404297, "learning_rate": 0.0002, "epoch": 5.23016157989228, "step": 72830}, {"loss": 0.5956, "grad_norm": 1.0513219833374023, "learning_rate": 0.0002, "epoch": 5.230879712746858, "step": 72840}, {"loss": 0.5796, "grad_norm": 1.0659555196762085, "learning_rate": 0.0002, "epoch": 5.231597845601437, "step": 72850}, {"loss": 0.5916, "grad_norm": 0.7726831436157227, "learning_rate": 0.0002, "epoch": 5.232315978456015, "step": 72860}, {"loss": 0.557, "grad_norm": 1.0346935987472534, "learning_rate": 0.0002, "epoch": 5.233034111310593, "step": 72870}, {"loss": 0.567, "grad_norm": 0.9112410545349121, "learning_rate": 0.0002, "epoch": 5.233752244165171, "step": 72880}, {"loss": 0.575, "grad_norm": 1.2933332920074463, "learning_rate": 0.0002, "epoch": 5.234470377019749, "step": 72890}, {"loss": 0.5733, "grad_norm": 0.9740806221961975, "learning_rate": 0.0002, "epoch": 5.235188509874327, "step": 72900}, {"loss": 0.5661, "grad_norm": 0.8041712641716003, "learning_rate": 0.0002, "epoch": 5.235906642728905, "step": 72910}, {"loss": 0.5936, "grad_norm": 0.9510180950164795, "learning_rate": 0.0002, "epoch": 5.236624775583483, "step": 72920}, {"loss": 0.6312, "grad_norm": 0.9103419780731201, "learning_rate": 0.0002, "epoch": 5.237342908438061, "step": 72930}, {"loss": 0.5298, "grad_norm": 0.8317763805389404, "learning_rate": 0.0002, "epoch": 5.238061041292639, "step": 72940}, {"loss": 0.5887, "grad_norm": 1.0269867181777954, "learning_rate": 0.0002, "epoch": 5.238779174147218, "step": 72950}, {"loss": 0.6141, "grad_norm": 1.0599713325500488, "learning_rate": 0.0002, "epoch": 5.239497307001796, "step": 72960}, {"loss": 0.5785, "grad_norm": 0.9341228008270264, "learning_rate": 0.0002, "epoch": 5.240215439856374, "step": 72970}, {"loss": 0.5256, "grad_norm": 1.1216323375701904, "learning_rate": 0.0002, "epoch": 5.240933572710952, "step": 72980}, {"loss": 0.5995, "grad_norm": 0.9396152496337891, "learning_rate": 0.0002, "epoch": 5.24165170556553, "step": 72990}, {"loss": 0.6281, "grad_norm": 1.1474549770355225, "learning_rate": 0.0002, "epoch": 5.242369838420108, "step": 73000}, {"loss": 0.5693, "grad_norm": 1.2160102128982544, "learning_rate": 0.0002, "epoch": 5.243087971274686, "step": 73010}, {"loss": 0.5914, "grad_norm": 1.0755409002304077, "learning_rate": 0.0002, "epoch": 5.243806104129264, "step": 73020}, {"loss": 0.5697, "grad_norm": 1.0645225048065186, "learning_rate": 0.0002, "epoch": 5.244524236983842, "step": 73030}, {"loss": 0.5669, "grad_norm": 1.1155469417572021, "learning_rate": 0.0002, "epoch": 5.2452423698384205, "step": 73040}, {"loss": 0.5448, "grad_norm": 1.1631708145141602, "learning_rate": 0.0002, "epoch": 5.2459605026929985, "step": 73050}, {"loss": 0.6034, "grad_norm": 0.8747480511665344, "learning_rate": 0.0002, "epoch": 5.2466786355475765, "step": 73060}, {"loss": 0.5647, "grad_norm": 0.9174497723579407, "learning_rate": 0.0002, "epoch": 5.2473967684021545, "step": 73070}, {"loss": 0.5804, "grad_norm": 1.334018349647522, "learning_rate": 0.0002, "epoch": 5.2481149012567325, "step": 73080}, {"loss": 0.5491, "grad_norm": 1.0842393636703491, "learning_rate": 0.0002, "epoch": 5.2488330341113105, "step": 73090}, {"loss": 0.6078, "grad_norm": 1.0531692504882812, "learning_rate": 0.0002, "epoch": 5.2495511669658885, "step": 73100}, {"loss": 0.5912, "grad_norm": 0.9069980978965759, "learning_rate": 0.0002, "epoch": 5.2502692998204665, "step": 73110}, {"loss": 0.5845, "grad_norm": 1.1319832801818848, "learning_rate": 0.0002, "epoch": 5.2509874326750445, "step": 73120}, {"loss": 0.5921, "grad_norm": 1.0468456745147705, "learning_rate": 0.0002, "epoch": 5.2517055655296225, "step": 73130}, {"loss": 0.5688, "grad_norm": 1.1752768754959106, "learning_rate": 0.0002, "epoch": 5.252423698384201, "step": 73140}, {"loss": 0.5709, "grad_norm": 1.0697909593582153, "learning_rate": 0.0002, "epoch": 5.253141831238779, "step": 73150}, {"loss": 0.6187, "grad_norm": 1.1179429292678833, "learning_rate": 0.0002, "epoch": 5.253859964093357, "step": 73160}, {"loss": 0.6127, "grad_norm": 0.9088113903999329, "learning_rate": 0.0002, "epoch": 5.254578096947935, "step": 73170}, {"loss": 0.629, "grad_norm": 0.8814208507537842, "learning_rate": 0.0002, "epoch": 5.255296229802513, "step": 73180}, {"loss": 0.5881, "grad_norm": 1.026688814163208, "learning_rate": 0.0002, "epoch": 5.256014362657091, "step": 73190}, {"loss": 0.5883, "grad_norm": 0.9974902868270874, "learning_rate": 0.0002, "epoch": 5.256732495511669, "step": 73200}, {"loss": 0.5219, "grad_norm": 0.948743999004364, "learning_rate": 0.0002, "epoch": 5.257450628366247, "step": 73210}, {"loss": 0.5489, "grad_norm": 0.9069591164588928, "learning_rate": 0.0002, "epoch": 5.258168761220825, "step": 73220}, {"loss": 0.5667, "grad_norm": 1.0574030876159668, "learning_rate": 0.0002, "epoch": 5.258886894075404, "step": 73230}, {"loss": 0.5903, "grad_norm": 0.9299649596214294, "learning_rate": 0.0002, "epoch": 5.259605026929982, "step": 73240}, {"loss": 0.5678, "grad_norm": 0.9888820648193359, "learning_rate": 0.0002, "epoch": 5.26032315978456, "step": 73250}, {"loss": 0.5993, "grad_norm": 1.0164920091629028, "learning_rate": 0.0002, "epoch": 5.261041292639138, "step": 73260}, {"loss": 0.5585, "grad_norm": 0.933210551738739, "learning_rate": 0.0002, "epoch": 5.261759425493716, "step": 73270}, {"loss": 0.6061, "grad_norm": 1.1754034757614136, "learning_rate": 0.0002, "epoch": 5.262477558348294, "step": 73280}, {"loss": 0.5727, "grad_norm": 1.1599570512771606, "learning_rate": 0.0002, "epoch": 5.263195691202872, "step": 73290}, {"loss": 0.6252, "grad_norm": 1.0497905015945435, "learning_rate": 0.0002, "epoch": 5.26391382405745, "step": 73300}, {"loss": 0.5861, "grad_norm": 1.3603366613388062, "learning_rate": 0.0002, "epoch": 5.264631956912028, "step": 73310}, {"loss": 0.5713, "grad_norm": 1.0283215045928955, "learning_rate": 0.0002, "epoch": 5.265350089766607, "step": 73320}, {"loss": 0.6048, "grad_norm": 1.1043906211853027, "learning_rate": 0.0002, "epoch": 5.266068222621185, "step": 73330}, {"loss": 0.5383, "grad_norm": 0.9386111497879028, "learning_rate": 0.0002, "epoch": 5.266786355475763, "step": 73340}, {"loss": 0.5826, "grad_norm": 1.3586112260818481, "learning_rate": 0.0002, "epoch": 5.267504488330341, "step": 73350}, {"loss": 0.6213, "grad_norm": 1.034179449081421, "learning_rate": 0.0002, "epoch": 5.268222621184919, "step": 73360}, {"loss": 0.5809, "grad_norm": 0.9645284414291382, "learning_rate": 0.0002, "epoch": 5.268940754039497, "step": 73370}, {"loss": 0.5595, "grad_norm": 1.1078046560287476, "learning_rate": 0.0002, "epoch": 5.269658886894075, "step": 73380}, {"loss": 0.5518, "grad_norm": 0.9737151265144348, "learning_rate": 0.0002, "epoch": 5.270377019748653, "step": 73390}, {"loss": 0.5984, "grad_norm": 1.1911388635635376, "learning_rate": 0.0002, "epoch": 5.271095152603231, "step": 73400}, {"loss": 0.5867, "grad_norm": 0.9089180827140808, "learning_rate": 0.0002, "epoch": 5.27181328545781, "step": 73410}, {"loss": 0.6021, "grad_norm": 1.094515085220337, "learning_rate": 0.0002, "epoch": 5.272531418312388, "step": 73420}, {"loss": 0.652, "grad_norm": 1.2531700134277344, "learning_rate": 0.0002, "epoch": 5.273249551166966, "step": 73430}, {"loss": 0.5616, "grad_norm": 0.9279667139053345, "learning_rate": 0.0002, "epoch": 5.273967684021544, "step": 73440}, {"loss": 0.5378, "grad_norm": 0.9872317314147949, "learning_rate": 0.0002, "epoch": 5.274685816876122, "step": 73450}, {"loss": 0.5732, "grad_norm": 1.0645262002944946, "learning_rate": 0.0002, "epoch": 5.2754039497307, "step": 73460}, {"loss": 0.5331, "grad_norm": 0.9505489468574524, "learning_rate": 0.0002, "epoch": 5.276122082585278, "step": 73470}, {"loss": 0.5826, "grad_norm": 1.0444035530090332, "learning_rate": 0.0002, "epoch": 5.276840215439856, "step": 73480}, {"loss": 0.6267, "grad_norm": 1.1813455820083618, "learning_rate": 0.0002, "epoch": 5.277558348294434, "step": 73490}, {"loss": 0.5645, "grad_norm": 0.782117486000061, "learning_rate": 0.0002, "epoch": 5.278276481149012, "step": 73500}, {"loss": 0.5829, "grad_norm": 0.8837172389030457, "learning_rate": 0.0002, "epoch": 5.278994614003591, "step": 73510}, {"loss": 0.5894, "grad_norm": 0.8320443630218506, "learning_rate": 0.0002, "epoch": 5.279712746858169, "step": 73520}, {"loss": 0.5793, "grad_norm": 1.111466407775879, "learning_rate": 0.0002, "epoch": 5.280430879712747, "step": 73530}, {"loss": 0.5796, "grad_norm": 1.0448017120361328, "learning_rate": 0.0002, "epoch": 5.281149012567325, "step": 73540}, {"loss": 0.5642, "grad_norm": 1.2046639919281006, "learning_rate": 0.0002, "epoch": 5.281867145421903, "step": 73550}, {"loss": 0.5859, "grad_norm": 1.084886074066162, "learning_rate": 0.0002, "epoch": 5.282585278276481, "step": 73560}, {"loss": 0.6055, "grad_norm": 0.8321937918663025, "learning_rate": 0.0002, "epoch": 5.283303411131059, "step": 73570}, {"loss": 0.5735, "grad_norm": 1.172440767288208, "learning_rate": 0.0002, "epoch": 5.284021543985637, "step": 73580}, {"loss": 0.5491, "grad_norm": 0.937133252620697, "learning_rate": 0.0002, "epoch": 5.284739676840215, "step": 73590}, {"loss": 0.5575, "grad_norm": 1.0996583700180054, "learning_rate": 0.0002, "epoch": 5.285457809694794, "step": 73600}, {"loss": 0.5813, "grad_norm": 1.2459958791732788, "learning_rate": 0.0002, "epoch": 5.286175942549372, "step": 73610}, {"loss": 0.6146, "grad_norm": 0.8362332582473755, "learning_rate": 0.0002, "epoch": 5.28689407540395, "step": 73620}, {"loss": 0.5333, "grad_norm": 0.9784061312675476, "learning_rate": 0.0002, "epoch": 5.287612208258528, "step": 73630}, {"loss": 0.6146, "grad_norm": 1.087041974067688, "learning_rate": 0.0002, "epoch": 5.288330341113106, "step": 73640}, {"loss": 0.5775, "grad_norm": 0.8641281723976135, "learning_rate": 0.0002, "epoch": 5.289048473967684, "step": 73650}, {"loss": 0.5592, "grad_norm": 1.030386209487915, "learning_rate": 0.0002, "epoch": 5.289766606822262, "step": 73660}, {"loss": 0.5899, "grad_norm": 1.0551509857177734, "learning_rate": 0.0002, "epoch": 5.29048473967684, "step": 73670}, {"loss": 0.5805, "grad_norm": 0.9969013333320618, "learning_rate": 0.0002, "epoch": 5.291202872531418, "step": 73680}, {"loss": 0.5841, "grad_norm": 0.9566490054130554, "learning_rate": 0.0002, "epoch": 5.291921005385996, "step": 73690}, {"loss": 0.5756, "grad_norm": 1.1376742124557495, "learning_rate": 0.0002, "epoch": 5.292639138240575, "step": 73700}, {"loss": 0.5697, "grad_norm": 1.0127843618392944, "learning_rate": 0.0002, "epoch": 5.293357271095153, "step": 73710}, {"loss": 0.5673, "grad_norm": 0.9500759243965149, "learning_rate": 0.0002, "epoch": 5.294075403949731, "step": 73720}, {"loss": 0.6251, "grad_norm": 0.9597342610359192, "learning_rate": 0.0002, "epoch": 5.294793536804309, "step": 73730}, {"loss": 0.5887, "grad_norm": 1.0982595682144165, "learning_rate": 0.0002, "epoch": 5.295511669658887, "step": 73740}, {"loss": 0.5623, "grad_norm": 0.9007689952850342, "learning_rate": 0.0002, "epoch": 5.296229802513465, "step": 73750}, {"loss": 0.5854, "grad_norm": 0.9329614639282227, "learning_rate": 0.0002, "epoch": 5.296947935368043, "step": 73760}, {"loss": 0.5867, "grad_norm": 1.235142469406128, "learning_rate": 0.0002, "epoch": 5.297666068222621, "step": 73770}, {"loss": 0.6009, "grad_norm": 1.0875943899154663, "learning_rate": 0.0002, "epoch": 5.298384201077199, "step": 73780}, {"loss": 0.6009, "grad_norm": 1.0499054193496704, "learning_rate": 0.0002, "epoch": 5.299102333931778, "step": 73790}, {"loss": 0.625, "grad_norm": 1.117954969406128, "learning_rate": 0.0002, "epoch": 5.299820466786356, "step": 73800}, {"loss": 0.5502, "grad_norm": 0.800291121006012, "learning_rate": 0.0002, "epoch": 5.300538599640934, "step": 73810}, {"loss": 0.5815, "grad_norm": 1.1461842060089111, "learning_rate": 0.0002, "epoch": 5.301256732495512, "step": 73820}, {"loss": 0.6091, "grad_norm": 1.0084760189056396, "learning_rate": 0.0002, "epoch": 5.30197486535009, "step": 73830}, {"loss": 0.5802, "grad_norm": 1.1249386072158813, "learning_rate": 0.0002, "epoch": 5.302692998204668, "step": 73840}, {"loss": 0.55, "grad_norm": 1.0846004486083984, "learning_rate": 0.0002, "epoch": 5.303411131059246, "step": 73850}, {"loss": 0.5923, "grad_norm": 1.1557925939559937, "learning_rate": 0.0002, "epoch": 5.304129263913824, "step": 73860}, {"loss": 0.5904, "grad_norm": 1.2287988662719727, "learning_rate": 0.0002, "epoch": 5.304847396768402, "step": 73870}, {"loss": 0.554, "grad_norm": 0.9618542194366455, "learning_rate": 0.0002, "epoch": 5.30556552962298, "step": 73880}, {"loss": 0.5787, "grad_norm": 0.9429472088813782, "learning_rate": 0.0002, "epoch": 5.306283662477559, "step": 73890}, {"loss": 0.5937, "grad_norm": 0.9032631516456604, "learning_rate": 0.0002, "epoch": 5.307001795332137, "step": 73900}, {"loss": 0.577, "grad_norm": 1.0008580684661865, "learning_rate": 0.0002, "epoch": 5.307719928186715, "step": 73910}, {"loss": 0.5462, "grad_norm": 0.9795624017715454, "learning_rate": 0.0002, "epoch": 5.308438061041293, "step": 73920}, {"loss": 0.582, "grad_norm": 1.1194090843200684, "learning_rate": 0.0002, "epoch": 5.309156193895871, "step": 73930}, {"loss": 0.5859, "grad_norm": 1.1057528257369995, "learning_rate": 0.0002, "epoch": 5.309874326750449, "step": 73940}, {"loss": 0.5503, "grad_norm": 0.7807615995407104, "learning_rate": 0.0002, "epoch": 5.310592459605027, "step": 73950}, {"loss": 0.6128, "grad_norm": 0.9465593099594116, "learning_rate": 0.0002, "epoch": 5.311310592459605, "step": 73960}, {"loss": 0.5831, "grad_norm": 1.104210615158081, "learning_rate": 0.0002, "epoch": 5.312028725314184, "step": 73970}, {"loss": 0.5478, "grad_norm": 1.0452964305877686, "learning_rate": 0.0002, "epoch": 5.312746858168762, "step": 73980}, {"loss": 0.5856, "grad_norm": 1.0314992666244507, "learning_rate": 0.0002, "epoch": 5.31346499102334, "step": 73990}, {"loss": 0.6222, "grad_norm": 0.9187130928039551, "learning_rate": 0.0002, "epoch": 5.314183123877918, "step": 74000}, {"loss": 0.5739, "grad_norm": 0.8660678267478943, "learning_rate": 0.0002, "epoch": 5.314901256732496, "step": 74010}, {"loss": 0.5296, "grad_norm": 0.9470953345298767, "learning_rate": 0.0002, "epoch": 5.315619389587074, "step": 74020}, {"loss": 0.5772, "grad_norm": 1.0028631687164307, "learning_rate": 0.0002, "epoch": 5.316337522441652, "step": 74030}, {"loss": 0.6159, "grad_norm": 1.0237356424331665, "learning_rate": 0.0002, "epoch": 5.31705565529623, "step": 74040}, {"loss": 0.6277, "grad_norm": 1.0299798250198364, "learning_rate": 0.0002, "epoch": 5.317773788150808, "step": 74050}, {"loss": 0.568, "grad_norm": 1.0326799154281616, "learning_rate": 0.0002, "epoch": 5.318491921005386, "step": 74060}, {"loss": 0.5766, "grad_norm": 1.156346082687378, "learning_rate": 0.0002, "epoch": 5.3192100538599645, "step": 74070}, {"loss": 0.598, "grad_norm": 1.1542664766311646, "learning_rate": 0.0002, "epoch": 5.3199281867145425, "step": 74080}, {"loss": 0.5736, "grad_norm": 1.0503013134002686, "learning_rate": 0.0002, "epoch": 5.3206463195691205, "step": 74090}, {"loss": 0.6172, "grad_norm": 1.1088979244232178, "learning_rate": 0.0002, "epoch": 5.3213644524236985, "step": 74100}, {"loss": 0.5536, "grad_norm": 0.9314014911651611, "learning_rate": 0.0002, "epoch": 5.3220825852782765, "step": 74110}, {"loss": 0.6205, "grad_norm": 1.0813525915145874, "learning_rate": 0.0002, "epoch": 5.3228007181328545, "step": 74120}, {"loss": 0.6019, "grad_norm": 0.7824062705039978, "learning_rate": 0.0002, "epoch": 5.3235188509874325, "step": 74130}, {"loss": 0.6183, "grad_norm": 1.0552699565887451, "learning_rate": 0.0002, "epoch": 5.3242369838420105, "step": 74140}, {"loss": 0.5714, "grad_norm": 1.0916554927825928, "learning_rate": 0.0002, "epoch": 5.3249551166965885, "step": 74150}, {"loss": 0.6128, "grad_norm": 1.205618143081665, "learning_rate": 0.0002, "epoch": 5.325673249551167, "step": 74160}, {"loss": 0.616, "grad_norm": 1.2551230192184448, "learning_rate": 0.0002, "epoch": 5.326391382405745, "step": 74170}, {"loss": 0.5467, "grad_norm": 0.7715005278587341, "learning_rate": 0.0002, "epoch": 5.327109515260323, "step": 74180}, {"loss": 0.5793, "grad_norm": 1.1059352159500122, "learning_rate": 0.0002, "epoch": 5.327827648114901, "step": 74190}, {"loss": 0.5768, "grad_norm": 0.9441812634468079, "learning_rate": 0.0002, "epoch": 5.328545780969479, "step": 74200}, {"loss": 0.5708, "grad_norm": 1.0012084245681763, "learning_rate": 0.0002, "epoch": 5.329263913824057, "step": 74210}, {"loss": 0.5289, "grad_norm": 0.8594073057174683, "learning_rate": 0.0002, "epoch": 5.329982046678635, "step": 74220}, {"loss": 0.5933, "grad_norm": 0.8931775093078613, "learning_rate": 0.0002, "epoch": 5.330700179533213, "step": 74230}, {"loss": 0.5722, "grad_norm": 0.967250406742096, "learning_rate": 0.0002, "epoch": 5.331418312387791, "step": 74240}, {"loss": 0.5483, "grad_norm": 0.9776269793510437, "learning_rate": 0.0002, "epoch": 5.332136445242369, "step": 74250}, {"loss": 0.5655, "grad_norm": 0.9393186569213867, "learning_rate": 0.0002, "epoch": 5.332854578096948, "step": 74260}, {"loss": 0.5704, "grad_norm": 1.0081093311309814, "learning_rate": 0.0002, "epoch": 5.333572710951526, "step": 74270}, {"loss": 0.5588, "grad_norm": 0.9002147316932678, "learning_rate": 0.0002, "epoch": 5.334290843806104, "step": 74280}, {"loss": 0.5851, "grad_norm": 0.9237701296806335, "learning_rate": 0.0002, "epoch": 5.335008976660682, "step": 74290}, {"loss": 0.5958, "grad_norm": 1.070694923400879, "learning_rate": 0.0002, "epoch": 5.33572710951526, "step": 74300}, {"loss": 0.5877, "grad_norm": 1.0134668350219727, "learning_rate": 0.0002, "epoch": 5.336445242369838, "step": 74310}, {"loss": 0.5828, "grad_norm": 1.0903294086456299, "learning_rate": 0.0002, "epoch": 5.337163375224416, "step": 74320}, {"loss": 0.5146, "grad_norm": 0.9000239372253418, "learning_rate": 0.0002, "epoch": 5.337881508078994, "step": 74330}, {"loss": 0.5357, "grad_norm": 1.0584321022033691, "learning_rate": 0.0002, "epoch": 5.338599640933572, "step": 74340}, {"loss": 0.5844, "grad_norm": 1.046420931816101, "learning_rate": 0.0002, "epoch": 5.339317773788151, "step": 74350}, {"loss": 0.5489, "grad_norm": 0.8862320184707642, "learning_rate": 0.0002, "epoch": 5.340035906642729, "step": 74360}, {"loss": 0.5923, "grad_norm": 0.8197309970855713, "learning_rate": 0.0002, "epoch": 5.340754039497307, "step": 74370}, {"loss": 0.5408, "grad_norm": 0.9539661407470703, "learning_rate": 0.0002, "epoch": 5.341472172351885, "step": 74380}, {"loss": 0.5943, "grad_norm": 1.481026530265808, "learning_rate": 0.0002, "epoch": 5.342190305206463, "step": 74390}, {"loss": 0.6242, "grad_norm": 1.0685169696807861, "learning_rate": 0.0002, "epoch": 5.342908438061041, "step": 74400}, {"loss": 0.5917, "grad_norm": 1.1468359231948853, "learning_rate": 0.0002, "epoch": 5.343626570915619, "step": 74410}, {"loss": 0.556, "grad_norm": 0.9982373714447021, "learning_rate": 0.0002, "epoch": 5.344344703770197, "step": 74420}, {"loss": 0.6003, "grad_norm": 0.9273471236228943, "learning_rate": 0.0002, "epoch": 5.345062836624775, "step": 74430}, {"loss": 0.5239, "grad_norm": 1.058828592300415, "learning_rate": 0.0002, "epoch": 5.345780969479353, "step": 74440}, {"loss": 0.5434, "grad_norm": 1.0442006587982178, "learning_rate": 0.0002, "epoch": 5.346499102333932, "step": 74450}, {"loss": 0.5614, "grad_norm": 1.0955053567886353, "learning_rate": 0.0002, "epoch": 5.34721723518851, "step": 74460}, {"loss": 0.5992, "grad_norm": 0.9326002597808838, "learning_rate": 0.0002, "epoch": 5.347935368043088, "step": 74470}, {"loss": 0.6173, "grad_norm": 0.9496979117393494, "learning_rate": 0.0002, "epoch": 5.348653500897666, "step": 74480}, {"loss": 0.5483, "grad_norm": 1.1995937824249268, "learning_rate": 0.0002, "epoch": 5.349371633752244, "step": 74490}, {"loss": 0.5759, "grad_norm": 0.8761899471282959, "learning_rate": 0.0002, "epoch": 5.350089766606822, "step": 74500}, {"loss": 0.5866, "grad_norm": 1.2390170097351074, "learning_rate": 0.0002, "epoch": 5.3508078994614, "step": 74510}, {"loss": 0.6065, "grad_norm": 0.9101138114929199, "learning_rate": 0.0002, "epoch": 5.351526032315978, "step": 74520}, {"loss": 0.5908, "grad_norm": 0.925466001033783, "learning_rate": 0.0002, "epoch": 5.352244165170557, "step": 74530}, {"loss": 0.5992, "grad_norm": 0.9483969807624817, "learning_rate": 0.0002, "epoch": 5.352962298025135, "step": 74540}, {"loss": 0.5881, "grad_norm": 1.0530859231948853, "learning_rate": 0.0002, "epoch": 5.353680430879713, "step": 74550}, {"loss": 0.5607, "grad_norm": 1.209647536277771, "learning_rate": 0.0002, "epoch": 5.354398563734291, "step": 74560}, {"loss": 0.5782, "grad_norm": 0.9849331378936768, "learning_rate": 0.0002, "epoch": 5.355116696588869, "step": 74570}, {"loss": 0.6448, "grad_norm": 1.0822848081588745, "learning_rate": 0.0002, "epoch": 5.355834829443447, "step": 74580}, {"loss": 0.631, "grad_norm": 1.1460528373718262, "learning_rate": 0.0002, "epoch": 5.356552962298025, "step": 74590}, {"loss": 0.5634, "grad_norm": 0.9509134292602539, "learning_rate": 0.0002, "epoch": 5.357271095152603, "step": 74600}, {"loss": 0.5492, "grad_norm": 0.9884999394416809, "learning_rate": 0.0002, "epoch": 5.357989228007181, "step": 74610}, {"loss": 0.6096, "grad_norm": 0.9619579911231995, "learning_rate": 0.0002, "epoch": 5.358707360861759, "step": 74620}, {"loss": 0.5686, "grad_norm": 0.8596125245094299, "learning_rate": 0.0002, "epoch": 5.359425493716338, "step": 74630}, {"loss": 0.6112, "grad_norm": 1.16913640499115, "learning_rate": 0.0002, "epoch": 5.360143626570916, "step": 74640}, {"loss": 0.5779, "grad_norm": 0.99276202917099, "learning_rate": 0.0002, "epoch": 5.360861759425494, "step": 74650}, {"loss": 0.5699, "grad_norm": 1.1293696165084839, "learning_rate": 0.0002, "epoch": 5.361579892280072, "step": 74660}, {"loss": 0.5727, "grad_norm": 1.187947154045105, "learning_rate": 0.0002, "epoch": 5.36229802513465, "step": 74670}, {"loss": 0.5574, "grad_norm": 0.8637247681617737, "learning_rate": 0.0002, "epoch": 5.363016157989228, "step": 74680}, {"loss": 0.5738, "grad_norm": 1.1049476861953735, "learning_rate": 0.0002, "epoch": 5.363734290843806, "step": 74690}, {"loss": 0.6082, "grad_norm": 1.1736515760421753, "learning_rate": 0.0002, "epoch": 5.364452423698384, "step": 74700}, {"loss": 0.6238, "grad_norm": 1.0203301906585693, "learning_rate": 0.0002, "epoch": 5.365170556552962, "step": 74710}, {"loss": 0.5612, "grad_norm": 1.15559720993042, "learning_rate": 0.0002, "epoch": 5.365888689407541, "step": 74720}, {"loss": 0.5699, "grad_norm": 1.2008144855499268, "learning_rate": 0.0002, "epoch": 5.366606822262119, "step": 74730}, {"loss": 0.5749, "grad_norm": 1.0385756492614746, "learning_rate": 0.0002, "epoch": 5.367324955116697, "step": 74740}, {"loss": 0.5745, "grad_norm": 0.8964240550994873, "learning_rate": 0.0002, "epoch": 5.368043087971275, "step": 74750}, {"loss": 0.5799, "grad_norm": 0.9824761748313904, "learning_rate": 0.0002, "epoch": 5.368761220825853, "step": 74760}, {"loss": 0.5714, "grad_norm": 0.8815994262695312, "learning_rate": 0.0002, "epoch": 5.369479353680431, "step": 74770}, {"loss": 0.584, "grad_norm": 0.9729493856430054, "learning_rate": 0.0002, "epoch": 5.370197486535009, "step": 74780}, {"loss": 0.5884, "grad_norm": 1.1032123565673828, "learning_rate": 0.0002, "epoch": 5.370915619389587, "step": 74790}, {"loss": 0.5804, "grad_norm": 1.039591908454895, "learning_rate": 0.0002, "epoch": 5.371633752244165, "step": 74800}, {"loss": 0.5693, "grad_norm": 0.9741610884666443, "learning_rate": 0.0002, "epoch": 5.372351885098743, "step": 74810}, {"loss": 0.6225, "grad_norm": 0.9789814949035645, "learning_rate": 0.0002, "epoch": 5.373070017953322, "step": 74820}, {"loss": 0.5765, "grad_norm": 1.0777033567428589, "learning_rate": 0.0002, "epoch": 5.3737881508079, "step": 74830}, {"loss": 0.5553, "grad_norm": 0.9058641195297241, "learning_rate": 0.0002, "epoch": 5.374506283662478, "step": 74840}, {"loss": 0.5733, "grad_norm": 1.2161815166473389, "learning_rate": 0.0002, "epoch": 5.375224416517056, "step": 74850}, {"loss": 0.5679, "grad_norm": 1.1079481840133667, "learning_rate": 0.0002, "epoch": 5.375942549371634, "step": 74860}, {"loss": 0.605, "grad_norm": 0.9494470357894897, "learning_rate": 0.0002, "epoch": 5.376660682226212, "step": 74870}, {"loss": 0.6155, "grad_norm": 1.0116358995437622, "learning_rate": 0.0002, "epoch": 5.37737881508079, "step": 74880}, {"loss": 0.5595, "grad_norm": 0.9382423162460327, "learning_rate": 0.0002, "epoch": 5.378096947935368, "step": 74890}, {"loss": 0.5441, "grad_norm": 1.036151647567749, "learning_rate": 0.0002, "epoch": 5.378815080789946, "step": 74900}, {"loss": 0.5441, "grad_norm": 0.9436623454093933, "learning_rate": 0.0002, "epoch": 5.379533213644525, "step": 74910}, {"loss": 0.5327, "grad_norm": 1.0149152278900146, "learning_rate": 0.0002, "epoch": 5.380251346499103, "step": 74920}, {"loss": 0.5554, "grad_norm": 1.1645641326904297, "learning_rate": 0.0002, "epoch": 5.380969479353681, "step": 74930}, {"loss": 0.5662, "grad_norm": 1.002287745475769, "learning_rate": 0.0002, "epoch": 5.381687612208259, "step": 74940}, {"loss": 0.5602, "grad_norm": 1.1176437139511108, "learning_rate": 0.0002, "epoch": 5.382405745062837, "step": 74950}, {"loss": 0.582, "grad_norm": 0.9210802912712097, "learning_rate": 0.0002, "epoch": 5.383123877917415, "step": 74960}, {"loss": 0.5996, "grad_norm": 1.1873447895050049, "learning_rate": 0.0002, "epoch": 5.383842010771993, "step": 74970}, {"loss": 0.5391, "grad_norm": 0.8372976779937744, "learning_rate": 0.0002, "epoch": 5.384560143626571, "step": 74980}, {"loss": 0.5808, "grad_norm": 0.9220532178878784, "learning_rate": 0.0002, "epoch": 5.385278276481149, "step": 74990}, {"loss": 0.5897, "grad_norm": 0.9196901917457581, "learning_rate": 0.0002, "epoch": 5.385996409335727, "step": 75000}, {"loss": 0.5838, "grad_norm": 0.9325235486030579, "learning_rate": 0.0002, "epoch": 5.3867145421903055, "step": 75010}, {"loss": 0.5652, "grad_norm": 1.0902531147003174, "learning_rate": 0.0002, "epoch": 5.3874326750448835, "step": 75020}, {"loss": 0.581, "grad_norm": 1.049468755722046, "learning_rate": 0.0002, "epoch": 5.3881508078994615, "step": 75030}, {"loss": 0.6184, "grad_norm": 0.9372574687004089, "learning_rate": 0.0002, "epoch": 5.3888689407540395, "step": 75040}, {"loss": 0.6158, "grad_norm": 0.9013437628746033, "learning_rate": 0.0002, "epoch": 5.3895870736086176, "step": 75050}, {"loss": 0.5656, "grad_norm": 1.2111071348190308, "learning_rate": 0.0002, "epoch": 5.3903052064631956, "step": 75060}, {"loss": 0.5983, "grad_norm": 1.0006011724472046, "learning_rate": 0.0002, "epoch": 5.3910233393177736, "step": 75070}, {"loss": 0.5807, "grad_norm": 0.9180546402931213, "learning_rate": 0.0002, "epoch": 5.391741472172352, "step": 75080}, {"loss": 0.5878, "grad_norm": 1.096113920211792, "learning_rate": 0.0002, "epoch": 5.3924596050269304, "step": 75090}, {"loss": 0.5416, "grad_norm": 0.9041603207588196, "learning_rate": 0.0002, "epoch": 5.3931777378815084, "step": 75100}, {"loss": 0.5933, "grad_norm": 0.9675783514976501, "learning_rate": 0.0002, "epoch": 5.3938958707360865, "step": 75110}, {"loss": 0.5813, "grad_norm": 1.0952513217926025, "learning_rate": 0.0002, "epoch": 5.3946140035906645, "step": 75120}, {"loss": 0.5961, "grad_norm": 1.0166294574737549, "learning_rate": 0.0002, "epoch": 5.3953321364452425, "step": 75130}, {"loss": 0.6119, "grad_norm": 1.0892874002456665, "learning_rate": 0.0002, "epoch": 5.3960502692998205, "step": 75140}, {"loss": 0.6036, "grad_norm": 0.9894046187400818, "learning_rate": 0.0002, "epoch": 5.3967684021543985, "step": 75150}, {"loss": 0.5844, "grad_norm": 0.9991754293441772, "learning_rate": 0.0002, "epoch": 5.3974865350089765, "step": 75160}, {"loss": 0.5746, "grad_norm": 1.1027519702911377, "learning_rate": 0.0002, "epoch": 5.3982046678635545, "step": 75170}, {"loss": 0.5464, "grad_norm": 1.0579880475997925, "learning_rate": 0.0002, "epoch": 5.3989228007181325, "step": 75180}, {"loss": 0.5705, "grad_norm": 1.1149101257324219, "learning_rate": 0.0002, "epoch": 5.399640933572711, "step": 75190}, {"loss": 0.579, "grad_norm": 0.8802945017814636, "learning_rate": 0.0002, "epoch": 5.400359066427289, "step": 75200}, {"loss": 0.6117, "grad_norm": 0.9168137907981873, "learning_rate": 0.0002, "epoch": 5.401077199281867, "step": 75210}, {"loss": 0.543, "grad_norm": 1.232630968093872, "learning_rate": 0.0002, "epoch": 5.401795332136445, "step": 75220}, {"loss": 0.5739, "grad_norm": 1.1038591861724854, "learning_rate": 0.0002, "epoch": 5.402513464991023, "step": 75230}, {"loss": 0.5754, "grad_norm": 0.8985993266105652, "learning_rate": 0.0002, "epoch": 5.403231597845601, "step": 75240}, {"loss": 0.5517, "grad_norm": 1.1096316576004028, "learning_rate": 0.0002, "epoch": 5.403949730700179, "step": 75250}, {"loss": 0.5834, "grad_norm": 0.8516051173210144, "learning_rate": 0.0002, "epoch": 5.404667863554757, "step": 75260}, {"loss": 0.5779, "grad_norm": 0.9967356324195862, "learning_rate": 0.0002, "epoch": 5.405385996409335, "step": 75270}, {"loss": 0.6065, "grad_norm": 1.0092874765396118, "learning_rate": 0.0002, "epoch": 5.406104129263914, "step": 75280}, {"loss": 0.59, "grad_norm": 1.049838662147522, "learning_rate": 0.0002, "epoch": 5.406822262118492, "step": 75290}, {"loss": 0.6077, "grad_norm": 1.1491070985794067, "learning_rate": 0.0002, "epoch": 5.40754039497307, "step": 75300}, {"loss": 0.6423, "grad_norm": 0.9348118901252747, "learning_rate": 0.0002, "epoch": 5.408258527827648, "step": 75310}, {"loss": 0.5505, "grad_norm": 1.1226147413253784, "learning_rate": 0.0002, "epoch": 5.408976660682226, "step": 75320}, {"loss": 0.5906, "grad_norm": 0.9042587876319885, "learning_rate": 0.0002, "epoch": 5.409694793536804, "step": 75330}, {"loss": 0.5885, "grad_norm": 1.1212877035140991, "learning_rate": 0.0002, "epoch": 5.410412926391382, "step": 75340}, {"loss": 0.6056, "grad_norm": 0.9805570840835571, "learning_rate": 0.0002, "epoch": 5.41113105924596, "step": 75350}, {"loss": 0.5891, "grad_norm": 0.9803917407989502, "learning_rate": 0.0002, "epoch": 5.411849192100538, "step": 75360}, {"loss": 0.6338, "grad_norm": 1.2139064073562622, "learning_rate": 0.0002, "epoch": 5.412567324955116, "step": 75370}, {"loss": 0.5694, "grad_norm": 0.9510865211486816, "learning_rate": 0.0002, "epoch": 5.413285457809695, "step": 75380}, {"loss": 0.6072, "grad_norm": 1.0752202272415161, "learning_rate": 0.0002, "epoch": 5.414003590664273, "step": 75390}, {"loss": 0.5998, "grad_norm": 1.1144053936004639, "learning_rate": 0.0002, "epoch": 5.414721723518851, "step": 75400}, {"loss": 0.5783, "grad_norm": 1.128998875617981, "learning_rate": 0.0002, "epoch": 5.415439856373429, "step": 75410}, {"loss": 0.6092, "grad_norm": 1.2901849746704102, "learning_rate": 0.0002, "epoch": 5.416157989228007, "step": 75420}, {"loss": 0.5799, "grad_norm": 1.2822786569595337, "learning_rate": 0.0002, "epoch": 5.416876122082585, "step": 75430}, {"loss": 0.5744, "grad_norm": 0.8724783658981323, "learning_rate": 0.0002, "epoch": 5.417594254937163, "step": 75440}, {"loss": 0.5821, "grad_norm": 1.1321152448654175, "learning_rate": 0.0002, "epoch": 5.418312387791741, "step": 75450}, {"loss": 0.6394, "grad_norm": 1.1211779117584229, "learning_rate": 0.0002, "epoch": 5.419030520646319, "step": 75460}, {"loss": 0.584, "grad_norm": 1.0542290210723877, "learning_rate": 0.0002, "epoch": 5.419748653500898, "step": 75470}, {"loss": 0.5472, "grad_norm": 0.9432206153869629, "learning_rate": 0.0002, "epoch": 5.420466786355476, "step": 75480}, {"loss": 0.6053, "grad_norm": 1.2051608562469482, "learning_rate": 0.0002, "epoch": 5.421184919210054, "step": 75490}, {"loss": 0.5698, "grad_norm": 1.188256859779358, "learning_rate": 0.0002, "epoch": 5.421903052064632, "step": 75500}, {"loss": 0.5762, "grad_norm": 1.2768784761428833, "learning_rate": 0.0002, "epoch": 5.42262118491921, "step": 75510}, {"loss": 0.5961, "grad_norm": 0.8228567242622375, "learning_rate": 0.0002, "epoch": 5.423339317773788, "step": 75520}, {"loss": 0.602, "grad_norm": 1.235684871673584, "learning_rate": 0.0002, "epoch": 5.424057450628366, "step": 75530}, {"loss": 0.5923, "grad_norm": 0.8361109495162964, "learning_rate": 0.0002, "epoch": 5.424775583482944, "step": 75540}, {"loss": 0.578, "grad_norm": 1.0450727939605713, "learning_rate": 0.0002, "epoch": 5.425493716337522, "step": 75550}, {"loss": 0.6383, "grad_norm": 0.9942979216575623, "learning_rate": 0.0002, "epoch": 5.4262118491921, "step": 75560}, {"loss": 0.6406, "grad_norm": 0.8162592053413391, "learning_rate": 0.0002, "epoch": 5.426929982046679, "step": 75570}, {"loss": 0.5684, "grad_norm": 0.9193033576011658, "learning_rate": 0.0002, "epoch": 5.427648114901257, "step": 75580}, {"loss": 0.5773, "grad_norm": 1.095130443572998, "learning_rate": 0.0002, "epoch": 5.428366247755835, "step": 75590}, {"loss": 0.6036, "grad_norm": 1.1752824783325195, "learning_rate": 0.0002, "epoch": 5.429084380610413, "step": 75600}, {"loss": 0.5773, "grad_norm": 1.2007960081100464, "learning_rate": 0.0002, "epoch": 5.429802513464991, "step": 75610}, {"loss": 0.5928, "grad_norm": 0.997347354888916, "learning_rate": 0.0002, "epoch": 5.430520646319569, "step": 75620}, {"loss": 0.5798, "grad_norm": 1.3878827095031738, "learning_rate": 0.0002, "epoch": 5.431238779174147, "step": 75630}, {"loss": 0.5954, "grad_norm": 1.1839812994003296, "learning_rate": 0.0002, "epoch": 5.431956912028725, "step": 75640}, {"loss": 0.5789, "grad_norm": 0.9912546873092651, "learning_rate": 0.0002, "epoch": 5.432675044883303, "step": 75650}, {"loss": 0.5916, "grad_norm": 0.9305517673492432, "learning_rate": 0.0002, "epoch": 5.433393177737882, "step": 75660}, {"loss": 0.5869, "grad_norm": 1.0036604404449463, "learning_rate": 0.0002, "epoch": 5.43411131059246, "step": 75670}, {"loss": 0.5797, "grad_norm": 1.2500226497650146, "learning_rate": 0.0002, "epoch": 5.434829443447038, "step": 75680}, {"loss": 0.5923, "grad_norm": 0.9476167559623718, "learning_rate": 0.0002, "epoch": 5.435547576301616, "step": 75690}, {"loss": 0.5426, "grad_norm": 0.9769760370254517, "learning_rate": 0.0002, "epoch": 5.436265709156194, "step": 75700}, {"loss": 0.5397, "grad_norm": 1.1001025438308716, "learning_rate": 0.0002, "epoch": 5.436983842010772, "step": 75710}, {"loss": 0.5832, "grad_norm": 1.1783069372177124, "learning_rate": 0.0002, "epoch": 5.43770197486535, "step": 75720}, {"loss": 0.5961, "grad_norm": 0.887438952922821, "learning_rate": 0.0002, "epoch": 5.438420107719928, "step": 75730}, {"loss": 0.5904, "grad_norm": 0.9631154537200928, "learning_rate": 0.0002, "epoch": 5.439138240574506, "step": 75740}, {"loss": 0.5827, "grad_norm": 1.0824158191680908, "learning_rate": 0.0002, "epoch": 5.439856373429085, "step": 75750}, {"loss": 0.5824, "grad_norm": 1.0108296871185303, "learning_rate": 0.0002, "epoch": 5.440574506283663, "step": 75760}, {"loss": 0.6338, "grad_norm": 1.1728253364562988, "learning_rate": 0.0002, "epoch": 5.441292639138241, "step": 75770}, {"loss": 0.5661, "grad_norm": 1.0904773473739624, "learning_rate": 0.0002, "epoch": 5.442010771992819, "step": 75780}, {"loss": 0.638, "grad_norm": 0.8982957601547241, "learning_rate": 0.0002, "epoch": 5.442728904847397, "step": 75790}, {"loss": 0.583, "grad_norm": 1.0233404636383057, "learning_rate": 0.0002, "epoch": 5.443447037701975, "step": 75800}, {"loss": 0.6279, "grad_norm": 1.0092064142227173, "learning_rate": 0.0002, "epoch": 5.444165170556553, "step": 75810}, {"loss": 0.5673, "grad_norm": 1.2747842073440552, "learning_rate": 0.0002, "epoch": 5.444883303411131, "step": 75820}, {"loss": 0.5604, "grad_norm": 1.0365403890609741, "learning_rate": 0.0002, "epoch": 5.445601436265709, "step": 75830}, {"loss": 0.591, "grad_norm": 1.0413976907730103, "learning_rate": 0.0002, "epoch": 5.446319569120288, "step": 75840}, {"loss": 0.5995, "grad_norm": 0.8858456015586853, "learning_rate": 0.0002, "epoch": 5.447037701974866, "step": 75850}, {"loss": 0.5628, "grad_norm": 0.9823445677757263, "learning_rate": 0.0002, "epoch": 5.447755834829444, "step": 75860}, {"loss": 0.5691, "grad_norm": 0.8515284061431885, "learning_rate": 0.0002, "epoch": 5.448473967684022, "step": 75870}, {"loss": 0.5702, "grad_norm": 1.130850911140442, "learning_rate": 0.0002, "epoch": 5.4491921005386, "step": 75880}, {"loss": 0.5669, "grad_norm": 0.984725832939148, "learning_rate": 0.0002, "epoch": 5.449910233393178, "step": 75890}, {"loss": 0.5658, "grad_norm": 1.1701595783233643, "learning_rate": 0.0002, "epoch": 5.450628366247756, "step": 75900}, {"loss": 0.5555, "grad_norm": 0.8988107442855835, "learning_rate": 0.0002, "epoch": 5.451346499102334, "step": 75910}, {"loss": 0.6669, "grad_norm": 0.9909947514533997, "learning_rate": 0.0002, "epoch": 5.452064631956912, "step": 75920}, {"loss": 0.5528, "grad_norm": 0.8861672282218933, "learning_rate": 0.0002, "epoch": 5.45278276481149, "step": 75930}, {"loss": 0.5826, "grad_norm": 0.9513981938362122, "learning_rate": 0.0002, "epoch": 5.453500897666069, "step": 75940}, {"loss": 0.5827, "grad_norm": 1.0320760011672974, "learning_rate": 0.0002, "epoch": 5.454219030520647, "step": 75950}, {"loss": 0.5816, "grad_norm": 0.9830206632614136, "learning_rate": 0.0002, "epoch": 5.454937163375225, "step": 75960}, {"loss": 0.5228, "grad_norm": 0.9816349148750305, "learning_rate": 0.0002, "epoch": 5.455655296229803, "step": 75970}, {"loss": 0.594, "grad_norm": 0.9741218090057373, "learning_rate": 0.0002, "epoch": 5.456373429084381, "step": 75980}, {"loss": 0.634, "grad_norm": 1.1291148662567139, "learning_rate": 0.0002, "epoch": 5.457091561938959, "step": 75990}, {"loss": 0.5986, "grad_norm": 0.9770109057426453, "learning_rate": 0.0002, "epoch": 5.457809694793537, "step": 76000}, {"loss": 0.5783, "grad_norm": 1.0204377174377441, "learning_rate": 0.0002, "epoch": 5.458527827648115, "step": 76010}, {"loss": 0.5881, "grad_norm": 1.0453336238861084, "learning_rate": 0.0002, "epoch": 5.459245960502693, "step": 76020}, {"loss": 0.5798, "grad_norm": 1.1595505475997925, "learning_rate": 0.0002, "epoch": 5.4599640933572715, "step": 76030}, {"loss": 0.5787, "grad_norm": 1.1686701774597168, "learning_rate": 0.0002, "epoch": 5.4606822262118495, "step": 76040}, {"loss": 0.5746, "grad_norm": 1.14364755153656, "learning_rate": 0.0002, "epoch": 5.4614003590664275, "step": 76050}, {"loss": 0.5925, "grad_norm": 0.9742125868797302, "learning_rate": 0.0002, "epoch": 5.4621184919210055, "step": 76060}, {"loss": 0.6067, "grad_norm": 0.8235608339309692, "learning_rate": 0.0002, "epoch": 5.4628366247755835, "step": 76070}, {"loss": 0.5908, "grad_norm": 0.9801425337791443, "learning_rate": 0.0002, "epoch": 5.4635547576301615, "step": 76080}, {"loss": 0.6126, "grad_norm": 0.9001221060752869, "learning_rate": 0.0002, "epoch": 5.4642728904847395, "step": 76090}, {"loss": 0.6682, "grad_norm": 0.9292157888412476, "learning_rate": 0.0002, "epoch": 5.4649910233393175, "step": 76100}, {"loss": 0.6412, "grad_norm": 1.0024322271347046, "learning_rate": 0.0002, "epoch": 5.4657091561938955, "step": 76110}, {"loss": 0.5398, "grad_norm": 0.8057159781455994, "learning_rate": 0.0002, "epoch": 5.4664272890484735, "step": 76120}, {"loss": 0.5881, "grad_norm": 1.0617927312850952, "learning_rate": 0.0002, "epoch": 5.467145421903052, "step": 76130}, {"loss": 0.598, "grad_norm": 1.003967046737671, "learning_rate": 0.0002, "epoch": 5.46786355475763, "step": 76140}, {"loss": 0.5427, "grad_norm": 0.903408944606781, "learning_rate": 0.0002, "epoch": 5.468581687612208, "step": 76150}, {"loss": 0.5884, "grad_norm": 0.8173895478248596, "learning_rate": 0.0002, "epoch": 5.469299820466786, "step": 76160}, {"loss": 0.5526, "grad_norm": 1.0187482833862305, "learning_rate": 0.0002, "epoch": 5.470017953321364, "step": 76170}, {"loss": 0.5392, "grad_norm": 1.0418041944503784, "learning_rate": 0.0002, "epoch": 5.470736086175942, "step": 76180}, {"loss": 0.5761, "grad_norm": 0.9768357872962952, "learning_rate": 0.0002, "epoch": 5.47145421903052, "step": 76190}, {"loss": 0.5595, "grad_norm": 1.0834382772445679, "learning_rate": 0.0002, "epoch": 5.472172351885098, "step": 76200}, {"loss": 0.5783, "grad_norm": 0.8447439670562744, "learning_rate": 0.0002, "epoch": 5.472890484739676, "step": 76210}, {"loss": 0.5695, "grad_norm": 0.9379050135612488, "learning_rate": 0.0002, "epoch": 5.473608617594255, "step": 76220}, {"loss": 0.6053, "grad_norm": 1.0395485162734985, "learning_rate": 0.0002, "epoch": 5.474326750448833, "step": 76230}, {"loss": 0.5587, "grad_norm": 1.2082624435424805, "learning_rate": 0.0002, "epoch": 5.475044883303411, "step": 76240}, {"loss": 0.5891, "grad_norm": 1.0714443922042847, "learning_rate": 0.0002, "epoch": 5.475763016157989, "step": 76250}, {"loss": 0.5819, "grad_norm": 0.945319414138794, "learning_rate": 0.0002, "epoch": 5.476481149012567, "step": 76260}, {"loss": 0.5791, "grad_norm": 1.1415241956710815, "learning_rate": 0.0002, "epoch": 5.477199281867145, "step": 76270}, {"loss": 0.5586, "grad_norm": 0.9221673011779785, "learning_rate": 0.0002, "epoch": 5.477917414721723, "step": 76280}, {"loss": 0.5999, "grad_norm": 1.0118398666381836, "learning_rate": 0.0002, "epoch": 5.478635547576301, "step": 76290}, {"loss": 0.621, "grad_norm": 1.396807312965393, "learning_rate": 0.0002, "epoch": 5.479353680430879, "step": 76300}, {"loss": 0.5808, "grad_norm": 1.0437991619110107, "learning_rate": 0.0002, "epoch": 5.480071813285457, "step": 76310}, {"loss": 0.5846, "grad_norm": 1.5910401344299316, "learning_rate": 0.0002, "epoch": 5.480789946140036, "step": 76320}, {"loss": 0.6047, "grad_norm": 0.9262010455131531, "learning_rate": 0.0002, "epoch": 5.481508078994614, "step": 76330}, {"loss": 0.6079, "grad_norm": 1.2534247636795044, "learning_rate": 0.0002, "epoch": 5.482226211849192, "step": 76340}, {"loss": 0.5918, "grad_norm": 1.186294674873352, "learning_rate": 0.0002, "epoch": 5.48294434470377, "step": 76350}, {"loss": 0.5957, "grad_norm": 0.9822857975959778, "learning_rate": 0.0002, "epoch": 5.483662477558348, "step": 76360}, {"loss": 0.5427, "grad_norm": 1.0006381273269653, "learning_rate": 0.0002, "epoch": 5.484380610412926, "step": 76370}, {"loss": 0.5893, "grad_norm": 0.8960304260253906, "learning_rate": 0.0002, "epoch": 5.485098743267504, "step": 76380}, {"loss": 0.5515, "grad_norm": 0.7309539914131165, "learning_rate": 0.0002, "epoch": 5.485816876122082, "step": 76390}, {"loss": 0.5796, "grad_norm": 0.9747139811515808, "learning_rate": 0.0002, "epoch": 5.486535008976661, "step": 76400}, {"loss": 0.5898, "grad_norm": 0.9586864113807678, "learning_rate": 0.0002, "epoch": 5.487253141831239, "step": 76410}, {"loss": 0.6236, "grad_norm": 1.0815327167510986, "learning_rate": 0.0002, "epoch": 5.487971274685817, "step": 76420}, {"loss": 0.5923, "grad_norm": 1.1324117183685303, "learning_rate": 0.0002, "epoch": 5.488689407540395, "step": 76430}, {"loss": 0.5904, "grad_norm": 0.8575648069381714, "learning_rate": 0.0002, "epoch": 5.489407540394973, "step": 76440}, {"loss": 0.5477, "grad_norm": 0.9821682572364807, "learning_rate": 0.0002, "epoch": 5.490125673249551, "step": 76450}, {"loss": 0.5821, "grad_norm": 1.1611464023590088, "learning_rate": 0.0002, "epoch": 5.490843806104129, "step": 76460}, {"loss": 0.5227, "grad_norm": 1.0340297222137451, "learning_rate": 0.0002, "epoch": 5.491561938958707, "step": 76470}, {"loss": 0.6143, "grad_norm": 1.0116628408432007, "learning_rate": 0.0002, "epoch": 5.492280071813285, "step": 76480}, {"loss": 0.5968, "grad_norm": 0.9619752764701843, "learning_rate": 0.0002, "epoch": 5.492998204667863, "step": 76490}, {"loss": 0.5898, "grad_norm": 0.9924456477165222, "learning_rate": 0.0002, "epoch": 5.493716337522442, "step": 76500}, {"loss": 0.6041, "grad_norm": 0.9449224472045898, "learning_rate": 0.0002, "epoch": 5.49443447037702, "step": 76510}, {"loss": 0.5902, "grad_norm": 0.9075009822845459, "learning_rate": 0.0002, "epoch": 5.495152603231598, "step": 76520}, {"loss": 0.5602, "grad_norm": 1.3078763484954834, "learning_rate": 0.0002, "epoch": 5.495870736086176, "step": 76530}, {"loss": 0.5474, "grad_norm": 1.3162729740142822, "learning_rate": 0.0002, "epoch": 5.496588868940754, "step": 76540}, {"loss": 0.5938, "grad_norm": 1.144333839416504, "learning_rate": 0.0002, "epoch": 5.497307001795332, "step": 76550}, {"loss": 0.6105, "grad_norm": 0.9332208633422852, "learning_rate": 0.0002, "epoch": 5.49802513464991, "step": 76560}, {"loss": 0.5795, "grad_norm": 0.9660165309906006, "learning_rate": 0.0002, "epoch": 5.498743267504488, "step": 76570}, {"loss": 0.6023, "grad_norm": 1.0954749584197998, "learning_rate": 0.0002, "epoch": 5.499461400359066, "step": 76580}, {"loss": 0.5583, "grad_norm": 1.0537810325622559, "learning_rate": 0.0002, "epoch": 5.500179533213645, "step": 76590}, {"loss": 0.5976, "grad_norm": 0.9944321513175964, "learning_rate": 0.0002, "epoch": 5.500897666068223, "step": 76600}, {"loss": 0.5622, "grad_norm": 1.094462513923645, "learning_rate": 0.0002, "epoch": 5.501615798922801, "step": 76610}, {"loss": 0.6031, "grad_norm": 1.0246481895446777, "learning_rate": 0.0002, "epoch": 5.502333931777379, "step": 76620}, {"loss": 0.6211, "grad_norm": 0.9705453515052795, "learning_rate": 0.0002, "epoch": 5.503052064631957, "step": 76630}, {"loss": 0.6118, "grad_norm": 1.5252249240875244, "learning_rate": 0.0002, "epoch": 5.503770197486535, "step": 76640}, {"loss": 0.6351, "grad_norm": 0.8469606637954712, "learning_rate": 0.0002, "epoch": 5.504488330341113, "step": 76650}, {"loss": 0.6125, "grad_norm": 1.1882504224777222, "learning_rate": 0.0002, "epoch": 5.505206463195691, "step": 76660}, {"loss": 0.612, "grad_norm": 0.8447994589805603, "learning_rate": 0.0002, "epoch": 5.505924596050269, "step": 76670}, {"loss": 0.6233, "grad_norm": 0.9340696930885315, "learning_rate": 0.0002, "epoch": 5.506642728904847, "step": 76680}, {"loss": 0.5655, "grad_norm": 0.9622383713722229, "learning_rate": 0.0002, "epoch": 5.507360861759426, "step": 76690}, {"loss": 0.6346, "grad_norm": 1.1516523361206055, "learning_rate": 0.0002, "epoch": 5.508078994614004, "step": 76700}, {"loss": 0.5675, "grad_norm": 1.207190990447998, "learning_rate": 0.0002, "epoch": 5.508797127468582, "step": 76710}, {"loss": 0.5614, "grad_norm": 1.1244179010391235, "learning_rate": 0.0002, "epoch": 5.50951526032316, "step": 76720}, {"loss": 0.531, "grad_norm": 1.052288293838501, "learning_rate": 0.0002, "epoch": 5.510233393177738, "step": 76730}, {"loss": 0.5977, "grad_norm": 0.9571291208267212, "learning_rate": 0.0002, "epoch": 5.510951526032316, "step": 76740}, {"loss": 0.5974, "grad_norm": 0.9449458122253418, "learning_rate": 0.0002, "epoch": 5.511669658886894, "step": 76750}, {"loss": 0.59, "grad_norm": 1.0140511989593506, "learning_rate": 0.0002, "epoch": 5.512387791741472, "step": 76760}, {"loss": 0.5992, "grad_norm": 1.057715654373169, "learning_rate": 0.0002, "epoch": 5.513105924596051, "step": 76770}, {"loss": 0.5643, "grad_norm": 0.930642306804657, "learning_rate": 0.0002, "epoch": 5.513824057450629, "step": 76780}, {"loss": 0.5695, "grad_norm": 1.1213828325271606, "learning_rate": 0.0002, "epoch": 5.514542190305207, "step": 76790}, {"loss": 0.584, "grad_norm": 0.9147387742996216, "learning_rate": 0.0002, "epoch": 5.515260323159785, "step": 76800}, {"loss": 0.5759, "grad_norm": 1.1786983013153076, "learning_rate": 0.0002, "epoch": 5.515978456014363, "step": 76810}, {"loss": 0.5762, "grad_norm": 1.1022626161575317, "learning_rate": 0.0002, "epoch": 5.516696588868941, "step": 76820}, {"loss": 0.5795, "grad_norm": 1.0389000177383423, "learning_rate": 0.0002, "epoch": 5.517414721723519, "step": 76830}, {"loss": 0.5932, "grad_norm": 1.0750621557235718, "learning_rate": 0.0002, "epoch": 5.518132854578097, "step": 76840}, {"loss": 0.6177, "grad_norm": 1.0372626781463623, "learning_rate": 0.0002, "epoch": 5.518850987432675, "step": 76850}, {"loss": 0.5659, "grad_norm": 1.0989108085632324, "learning_rate": 0.0002, "epoch": 5.519569120287253, "step": 76860}, {"loss": 0.5525, "grad_norm": 1.030346155166626, "learning_rate": 0.0002, "epoch": 5.520287253141831, "step": 76870}, {"loss": 0.6669, "grad_norm": 1.1362419128417969, "learning_rate": 0.0002, "epoch": 5.52100538599641, "step": 76880}, {"loss": 0.5951, "grad_norm": 0.9110873937606812, "learning_rate": 0.0002, "epoch": 5.521723518850988, "step": 76890}, {"loss": 0.6161, "grad_norm": 1.0214358568191528, "learning_rate": 0.0002, "epoch": 5.522441651705566, "step": 76900}, {"loss": 0.6055, "grad_norm": 1.3764830827713013, "learning_rate": 0.0002, "epoch": 5.523159784560144, "step": 76910}, {"loss": 0.5822, "grad_norm": 1.0396335124969482, "learning_rate": 0.0002, "epoch": 5.523877917414722, "step": 76920}, {"loss": 0.6262, "grad_norm": 1.1942898035049438, "learning_rate": 0.0002, "epoch": 5.5245960502693, "step": 76930}, {"loss": 0.5927, "grad_norm": 0.8795760869979858, "learning_rate": 0.0002, "epoch": 5.525314183123878, "step": 76940}, {"loss": 0.5788, "grad_norm": 1.1081048250198364, "learning_rate": 0.0002, "epoch": 5.526032315978456, "step": 76950}, {"loss": 0.6101, "grad_norm": 0.9652274250984192, "learning_rate": 0.0002, "epoch": 5.526750448833035, "step": 76960}, {"loss": 0.6382, "grad_norm": 0.96559739112854, "learning_rate": 0.0002, "epoch": 5.527468581687613, "step": 76970}, {"loss": 0.6412, "grad_norm": 1.0416076183319092, "learning_rate": 0.0002, "epoch": 5.528186714542191, "step": 76980}, {"loss": 0.6027, "grad_norm": 0.9854229092597961, "learning_rate": 0.0002, "epoch": 5.528904847396769, "step": 76990}, {"loss": 0.6306, "grad_norm": 1.0515462160110474, "learning_rate": 0.0002, "epoch": 5.529622980251347, "step": 77000}, {"loss": 0.5783, "grad_norm": 1.0287327766418457, "learning_rate": 0.0002, "epoch": 5.530341113105925, "step": 77010}, {"loss": 0.6038, "grad_norm": 0.9579883217811584, "learning_rate": 0.0002, "epoch": 5.531059245960503, "step": 77020}, {"loss": 0.5908, "grad_norm": 1.0365805625915527, "learning_rate": 0.0002, "epoch": 5.531777378815081, "step": 77030}, {"loss": 0.5564, "grad_norm": 1.1600725650787354, "learning_rate": 0.0002, "epoch": 5.532495511669659, "step": 77040}, {"loss": 0.6147, "grad_norm": 0.8598031401634216, "learning_rate": 0.0002, "epoch": 5.533213644524237, "step": 77050}, {"loss": 0.5648, "grad_norm": 0.8884791731834412, "learning_rate": 0.0002, "epoch": 5.533931777378815, "step": 77060}, {"loss": 0.5559, "grad_norm": 0.900223433971405, "learning_rate": 0.0002, "epoch": 5.5346499102333935, "step": 77070}, {"loss": 0.5725, "grad_norm": 1.0212652683258057, "learning_rate": 0.0002, "epoch": 5.5353680430879715, "step": 77080}, {"loss": 0.6645, "grad_norm": 1.0924701690673828, "learning_rate": 0.0002, "epoch": 5.5360861759425495, "step": 77090}, {"loss": 0.5957, "grad_norm": 1.1955485343933105, "learning_rate": 0.0002, "epoch": 5.5368043087971275, "step": 77100}, {"loss": 0.5855, "grad_norm": 1.2157706022262573, "learning_rate": 0.0002, "epoch": 5.5375224416517055, "step": 77110}, {"loss": 0.6067, "grad_norm": 1.1118255853652954, "learning_rate": 0.0002, "epoch": 5.5382405745062835, "step": 77120}, {"loss": 0.5813, "grad_norm": 1.0146820545196533, "learning_rate": 0.0002, "epoch": 5.5389587073608615, "step": 77130}, {"loss": 0.6004, "grad_norm": 1.0876632928848267, "learning_rate": 0.0002, "epoch": 5.5396768402154395, "step": 77140}, {"loss": 0.5934, "grad_norm": 0.7914495468139648, "learning_rate": 0.0002, "epoch": 5.540394973070018, "step": 77150}, {"loss": 0.5666, "grad_norm": 1.0584027767181396, "learning_rate": 0.0002, "epoch": 5.541113105924596, "step": 77160}, {"loss": 0.523, "grad_norm": 0.9816845059394836, "learning_rate": 0.0002, "epoch": 5.541831238779174, "step": 77170}, {"loss": 0.5487, "grad_norm": 1.219076156616211, "learning_rate": 0.0002, "epoch": 5.542549371633752, "step": 77180}, {"loss": 0.639, "grad_norm": 0.9526635408401489, "learning_rate": 0.0002, "epoch": 5.54326750448833, "step": 77190}, {"loss": 0.5849, "grad_norm": 0.8437230587005615, "learning_rate": 0.0002, "epoch": 5.543985637342908, "step": 77200}, {"loss": 0.5858, "grad_norm": 0.9670451283454895, "learning_rate": 0.0002, "epoch": 5.544703770197486, "step": 77210}, {"loss": 0.559, "grad_norm": 1.015687346458435, "learning_rate": 0.0002, "epoch": 5.545421903052064, "step": 77220}, {"loss": 0.6065, "grad_norm": 0.8280553817749023, "learning_rate": 0.0002, "epoch": 5.546140035906642, "step": 77230}, {"loss": 0.5999, "grad_norm": 1.1320816278457642, "learning_rate": 0.0002, "epoch": 5.54685816876122, "step": 77240}, {"loss": 0.5894, "grad_norm": 1.3338711261749268, "learning_rate": 0.0002, "epoch": 5.547576301615799, "step": 77250}, {"loss": 0.591, "grad_norm": 0.9553194642066956, "learning_rate": 0.0002, "epoch": 5.548294434470377, "step": 77260}, {"loss": 0.6286, "grad_norm": 1.0604912042617798, "learning_rate": 0.0002, "epoch": 5.549012567324955, "step": 77270}, {"loss": 0.6362, "grad_norm": 1.1037590503692627, "learning_rate": 0.0002, "epoch": 5.549730700179533, "step": 77280}, {"loss": 0.6021, "grad_norm": 1.166212558746338, "learning_rate": 0.0002, "epoch": 5.550448833034111, "step": 77290}, {"loss": 0.5624, "grad_norm": 1.0189802646636963, "learning_rate": 0.0002, "epoch": 5.551166965888689, "step": 77300}, {"loss": 0.5998, "grad_norm": 0.9592387080192566, "learning_rate": 0.0002, "epoch": 5.551885098743267, "step": 77310}, {"loss": 0.609, "grad_norm": 0.9533785581588745, "learning_rate": 0.0002, "epoch": 5.552603231597845, "step": 77320}, {"loss": 0.5879, "grad_norm": 0.9666807055473328, "learning_rate": 0.0002, "epoch": 5.553321364452424, "step": 77330}, {"loss": 0.6049, "grad_norm": 0.8827478289604187, "learning_rate": 0.0002, "epoch": 5.554039497307002, "step": 77340}, {"loss": 0.5644, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 5.55475763016158, "step": 77350}, {"loss": 0.6083, "grad_norm": 1.14597487449646, "learning_rate": 0.0002, "epoch": 5.555475763016158, "step": 77360}, {"loss": 0.6025, "grad_norm": 1.009392499923706, "learning_rate": 0.0002, "epoch": 5.556193895870736, "step": 77370}, {"loss": 0.6141, "grad_norm": 1.115757942199707, "learning_rate": 0.0002, "epoch": 5.556912028725314, "step": 77380}, {"loss": 0.5538, "grad_norm": 0.9907452464103699, "learning_rate": 0.0002, "epoch": 5.557630161579892, "step": 77390}, {"loss": 0.6142, "grad_norm": 1.0667012929916382, "learning_rate": 0.0002, "epoch": 5.55834829443447, "step": 77400}, {"loss": 0.5728, "grad_norm": 0.9301251173019409, "learning_rate": 0.0002, "epoch": 5.559066427289048, "step": 77410}, {"loss": 0.6174, "grad_norm": 1.090384602546692, "learning_rate": 0.0002, "epoch": 5.559784560143626, "step": 77420}, {"loss": 0.5802, "grad_norm": 0.8073469996452332, "learning_rate": 0.0002, "epoch": 5.560502692998204, "step": 77430}, {"loss": 0.5757, "grad_norm": 1.1003652811050415, "learning_rate": 0.0002, "epoch": 5.561220825852783, "step": 77440}, {"loss": 0.5899, "grad_norm": 0.9493791460990906, "learning_rate": 0.0002, "epoch": 5.561938958707361, "step": 77450}, {"loss": 0.6029, "grad_norm": 0.925388514995575, "learning_rate": 0.0002, "epoch": 5.562657091561939, "step": 77460}, {"loss": 0.5893, "grad_norm": 1.0946427583694458, "learning_rate": 0.0002, "epoch": 5.563375224416517, "step": 77470}, {"loss": 0.58, "grad_norm": 0.9791404008865356, "learning_rate": 0.0002, "epoch": 5.564093357271095, "step": 77480}, {"loss": 0.5887, "grad_norm": 1.0534733533859253, "learning_rate": 0.0002, "epoch": 5.564811490125673, "step": 77490}, {"loss": 0.564, "grad_norm": 0.9351776242256165, "learning_rate": 0.0002, "epoch": 5.565529622980251, "step": 77500}, {"loss": 0.5489, "grad_norm": 1.004448413848877, "learning_rate": 0.0002, "epoch": 5.566247755834829, "step": 77510}, {"loss": 0.5717, "grad_norm": 1.0199403762817383, "learning_rate": 0.0002, "epoch": 5.566965888689408, "step": 77520}, {"loss": 0.6358, "grad_norm": 1.0693204402923584, "learning_rate": 0.0002, "epoch": 5.567684021543986, "step": 77530}, {"loss": 0.5896, "grad_norm": 1.0635178089141846, "learning_rate": 0.0002, "epoch": 5.568402154398564, "step": 77540}, {"loss": 0.6399, "grad_norm": 1.1154648065567017, "learning_rate": 0.0002, "epoch": 5.569120287253142, "step": 77550}, {"loss": 0.5748, "grad_norm": 0.999116837978363, "learning_rate": 0.0002, "epoch": 5.56983842010772, "step": 77560}, {"loss": 0.6159, "grad_norm": 0.9967397451400757, "learning_rate": 0.0002, "epoch": 5.570556552962298, "step": 77570}, {"loss": 0.6041, "grad_norm": 0.9684699773788452, "learning_rate": 0.0002, "epoch": 5.571274685816876, "step": 77580}, {"loss": 0.5876, "grad_norm": 1.027213454246521, "learning_rate": 0.0002, "epoch": 5.571992818671454, "step": 77590}, {"loss": 0.6631, "grad_norm": 1.0571194887161255, "learning_rate": 0.0002, "epoch": 5.572710951526032, "step": 77600}, {"loss": 0.5927, "grad_norm": 1.2010499238967896, "learning_rate": 0.0002, "epoch": 5.57342908438061, "step": 77610}, {"loss": 0.5962, "grad_norm": 1.1033680438995361, "learning_rate": 0.0002, "epoch": 5.574147217235188, "step": 77620}, {"loss": 0.5668, "grad_norm": 0.9394578337669373, "learning_rate": 0.0002, "epoch": 5.574865350089767, "step": 77630}, {"loss": 0.6018, "grad_norm": 1.379382610321045, "learning_rate": 0.0002, "epoch": 5.575583482944345, "step": 77640}, {"loss": 0.5921, "grad_norm": 0.9787197709083557, "learning_rate": 0.0002, "epoch": 5.576301615798923, "step": 77650}, {"loss": 0.569, "grad_norm": 0.9680284261703491, "learning_rate": 0.0002, "epoch": 5.577019748653501, "step": 77660}, {"loss": 0.5761, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.577737881508079, "step": 77670}, {"loss": 0.5835, "grad_norm": 1.1243085861206055, "learning_rate": 0.0002, "epoch": 5.578456014362657, "step": 77680}, {"loss": 0.5873, "grad_norm": 0.9228966236114502, "learning_rate": 0.0002, "epoch": 5.579174147217235, "step": 77690}, {"loss": 0.5888, "grad_norm": 1.1349890232086182, "learning_rate": 0.0002, "epoch": 5.579892280071813, "step": 77700}, {"loss": 0.6272, "grad_norm": 1.2248499393463135, "learning_rate": 0.0002, "epoch": 5.580610412926392, "step": 77710}, {"loss": 0.5734, "grad_norm": 1.0066324472427368, "learning_rate": 0.0002, "epoch": 5.58132854578097, "step": 77720}, {"loss": 0.6047, "grad_norm": 1.2642878293991089, "learning_rate": 0.0002, "epoch": 5.582046678635548, "step": 77730}, {"loss": 0.5946, "grad_norm": 1.031591534614563, "learning_rate": 0.0002, "epoch": 5.582764811490126, "step": 77740}, {"loss": 0.5743, "grad_norm": 1.0925929546356201, "learning_rate": 0.0002, "epoch": 5.583482944344704, "step": 77750}, {"loss": 0.6113, "grad_norm": 1.0567110776901245, "learning_rate": 0.0002, "epoch": 5.584201077199282, "step": 77760}, {"loss": 0.5523, "grad_norm": 1.246246099472046, "learning_rate": 0.0002, "epoch": 5.58491921005386, "step": 77770}, {"loss": 0.5934, "grad_norm": 1.2467739582061768, "learning_rate": 0.0002, "epoch": 5.585637342908438, "step": 77780}, {"loss": 0.6211, "grad_norm": 1.2695211172103882, "learning_rate": 0.0002, "epoch": 5.586355475763016, "step": 77790}, {"loss": 0.5824, "grad_norm": 1.0498571395874023, "learning_rate": 0.0002, "epoch": 5.587073608617594, "step": 77800}, {"loss": 0.5545, "grad_norm": 1.0078339576721191, "learning_rate": 0.0002, "epoch": 5.587791741472173, "step": 77810}, {"loss": 0.5995, "grad_norm": 1.108199954032898, "learning_rate": 0.0002, "epoch": 5.588509874326751, "step": 77820}, {"loss": 0.5716, "grad_norm": 1.0577641725540161, "learning_rate": 0.0002, "epoch": 5.589228007181329, "step": 77830}, {"loss": 0.6106, "grad_norm": 1.2169439792633057, "learning_rate": 0.0002, "epoch": 5.589946140035907, "step": 77840}, {"loss": 0.563, "grad_norm": 0.8310868740081787, "learning_rate": 0.0002, "epoch": 5.590664272890485, "step": 77850}, {"loss": 0.5749, "grad_norm": 0.9794082045555115, "learning_rate": 0.0002, "epoch": 5.591382405745063, "step": 77860}, {"loss": 0.6025, "grad_norm": 0.8867404460906982, "learning_rate": 0.0002, "epoch": 5.592100538599641, "step": 77870}, {"loss": 0.5581, "grad_norm": 0.9204208254814148, "learning_rate": 0.0002, "epoch": 5.592818671454219, "step": 77880}, {"loss": 0.5646, "grad_norm": 0.9801714420318604, "learning_rate": 0.0002, "epoch": 5.593536804308797, "step": 77890}, {"loss": 0.6036, "grad_norm": 0.9383925199508667, "learning_rate": 0.0002, "epoch": 5.594254937163376, "step": 77900}, {"loss": 0.6417, "grad_norm": 0.9124664068222046, "learning_rate": 0.0002, "epoch": 5.594973070017954, "step": 77910}, {"loss": 0.559, "grad_norm": 0.9618783593177795, "learning_rate": 0.0002, "epoch": 5.595691202872532, "step": 77920}, {"loss": 0.604, "grad_norm": 0.9575216770172119, "learning_rate": 0.0002, "epoch": 5.59640933572711, "step": 77930}, {"loss": 0.5987, "grad_norm": 1.1223464012145996, "learning_rate": 0.0002, "epoch": 5.597127468581688, "step": 77940}, {"loss": 0.615, "grad_norm": 0.9947475790977478, "learning_rate": 0.0002, "epoch": 5.597845601436266, "step": 77950}, {"loss": 0.5618, "grad_norm": 1.141959309577942, "learning_rate": 0.0002, "epoch": 5.598563734290844, "step": 77960}, {"loss": 0.5966, "grad_norm": 1.095525860786438, "learning_rate": 0.0002, "epoch": 5.599281867145422, "step": 77970}, {"loss": 0.5619, "grad_norm": 0.9396624565124512, "learning_rate": 0.0002, "epoch": 5.6, "step": 77980}, {"loss": 0.5549, "grad_norm": 0.8162274956703186, "learning_rate": 0.0002, "epoch": 5.600718132854578, "step": 77990}, {"loss": 0.5815, "grad_norm": 1.0130535364151, "learning_rate": 0.0002, "epoch": 5.6014362657091565, "step": 78000}, {"loss": 0.5891, "grad_norm": 1.0016634464263916, "learning_rate": 0.0002, "epoch": 5.6021543985637345, "step": 78010}, {"loss": 0.6029, "grad_norm": 0.8936169743537903, "learning_rate": 0.0002, "epoch": 5.6028725314183125, "step": 78020}, {"loss": 0.6284, "grad_norm": 1.169625163078308, "learning_rate": 0.0002, "epoch": 5.6035906642728905, "step": 78030}, {"loss": 0.6038, "grad_norm": 0.8896323442459106, "learning_rate": 0.0002, "epoch": 5.6043087971274685, "step": 78040}, {"loss": 0.6219, "grad_norm": 1.0939475297927856, "learning_rate": 0.0002, "epoch": 5.6050269299820465, "step": 78050}, {"loss": 0.6009, "grad_norm": 1.0880711078643799, "learning_rate": 0.0002, "epoch": 5.6057450628366245, "step": 78060}, {"loss": 0.6416, "grad_norm": 1.1426655054092407, "learning_rate": 0.0002, "epoch": 5.6064631956912026, "step": 78070}, {"loss": 0.6124, "grad_norm": 1.118586540222168, "learning_rate": 0.0002, "epoch": 5.607181328545781, "step": 78080}, {"loss": 0.5791, "grad_norm": 0.8784464597702026, "learning_rate": 0.0002, "epoch": 5.607899461400359, "step": 78090}, {"loss": 0.6385, "grad_norm": 1.137229561805725, "learning_rate": 0.0002, "epoch": 5.608617594254937, "step": 78100}, {"loss": 0.5998, "grad_norm": 1.1041932106018066, "learning_rate": 0.0002, "epoch": 5.6093357271095154, "step": 78110}, {"loss": 0.5985, "grad_norm": 1.0170503854751587, "learning_rate": 0.0002, "epoch": 5.6100538599640934, "step": 78120}, {"loss": 0.6376, "grad_norm": 1.298754334449768, "learning_rate": 0.0002, "epoch": 5.6107719928186714, "step": 78130}, {"loss": 0.6284, "grad_norm": 0.9344905018806458, "learning_rate": 0.0002, "epoch": 5.6114901256732495, "step": 78140}, {"loss": 0.5835, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.6122082585278275, "step": 78150}, {"loss": 0.5661, "grad_norm": 1.0617443323135376, "learning_rate": 0.0002, "epoch": 5.6129263913824055, "step": 78160}, {"loss": 0.5908, "grad_norm": 0.9017760753631592, "learning_rate": 0.0002, "epoch": 5.6136445242369835, "step": 78170}, {"loss": 0.5701, "grad_norm": 1.152601957321167, "learning_rate": 0.0002, "epoch": 5.6143626570915615, "step": 78180}, {"loss": 0.6319, "grad_norm": 0.9889463186264038, "learning_rate": 0.0002, "epoch": 5.61508078994614, "step": 78190}, {"loss": 0.5733, "grad_norm": 1.0367393493652344, "learning_rate": 0.0002, "epoch": 5.615798922800718, "step": 78200}, {"loss": 0.5785, "grad_norm": 0.8466457724571228, "learning_rate": 0.0002, "epoch": 5.616517055655296, "step": 78210}, {"loss": 0.563, "grad_norm": 0.936083197593689, "learning_rate": 0.0002, "epoch": 5.617235188509874, "step": 78220}, {"loss": 0.6077, "grad_norm": 1.018784999847412, "learning_rate": 0.0002, "epoch": 5.617953321364452, "step": 78230}, {"loss": 0.5676, "grad_norm": 0.8527804017066956, "learning_rate": 0.0002, "epoch": 5.61867145421903, "step": 78240}, {"loss": 0.5721, "grad_norm": 1.1873106956481934, "learning_rate": 0.0002, "epoch": 5.619389587073608, "step": 78250}, {"loss": 0.5905, "grad_norm": 0.9401728510856628, "learning_rate": 0.0002, "epoch": 5.620107719928186, "step": 78260}, {"loss": 0.5986, "grad_norm": 1.0801159143447876, "learning_rate": 0.0002, "epoch": 5.620825852782765, "step": 78270}, {"loss": 0.5769, "grad_norm": 1.0053739547729492, "learning_rate": 0.0002, "epoch": 5.621543985637343, "step": 78280}, {"loss": 0.5907, "grad_norm": 0.8599331378936768, "learning_rate": 0.0002, "epoch": 5.622262118491921, "step": 78290}, {"loss": 0.5689, "grad_norm": 2.3157296180725098, "learning_rate": 0.0002, "epoch": 5.622980251346499, "step": 78300}, {"loss": 0.5749, "grad_norm": 1.0027490854263306, "learning_rate": 0.0002, "epoch": 5.623698384201077, "step": 78310}, {"loss": 0.5452, "grad_norm": 0.996688961982727, "learning_rate": 0.0002, "epoch": 5.624416517055655, "step": 78320}, {"loss": 0.5979, "grad_norm": 1.0462113618850708, "learning_rate": 0.0002, "epoch": 5.625134649910233, "step": 78330}, {"loss": 0.5547, "grad_norm": 0.8750988245010376, "learning_rate": 0.0002, "epoch": 5.625852782764811, "step": 78340}, {"loss": 0.6076, "grad_norm": 0.8078145384788513, "learning_rate": 0.0002, "epoch": 5.626570915619389, "step": 78350}, {"loss": 0.6431, "grad_norm": 0.9047532081604004, "learning_rate": 0.0002, "epoch": 5.627289048473967, "step": 78360}, {"loss": 0.6027, "grad_norm": 0.9784479737281799, "learning_rate": 0.0002, "epoch": 5.628007181328546, "step": 78370}, {"loss": 0.6005, "grad_norm": 0.9529541730880737, "learning_rate": 0.0002, "epoch": 5.628725314183124, "step": 78380}, {"loss": 0.6057, "grad_norm": 0.8264740109443665, "learning_rate": 0.0002, "epoch": 5.629443447037702, "step": 78390}, {"loss": 0.5991, "grad_norm": 1.049724817276001, "learning_rate": 0.0002, "epoch": 5.63016157989228, "step": 78400}, {"loss": 0.5637, "grad_norm": 0.9866746068000793, "learning_rate": 0.0002, "epoch": 5.630879712746858, "step": 78410}, {"loss": 0.5622, "grad_norm": 0.897155225276947, "learning_rate": 0.0002, "epoch": 5.631597845601436, "step": 78420}, {"loss": 0.5838, "grad_norm": 1.225464940071106, "learning_rate": 0.0002, "epoch": 5.632315978456014, "step": 78430}, {"loss": 0.5928, "grad_norm": 0.8793753981590271, "learning_rate": 0.0002, "epoch": 5.633034111310592, "step": 78440}, {"loss": 0.6009, "grad_norm": 1.082482099533081, "learning_rate": 0.0002, "epoch": 5.63375224416517, "step": 78450}, {"loss": 0.6546, "grad_norm": 1.054064393043518, "learning_rate": 0.0002, "epoch": 5.634470377019749, "step": 78460}, {"loss": 0.5795, "grad_norm": 1.0032247304916382, "learning_rate": 0.0002, "epoch": 5.635188509874327, "step": 78470}, {"loss": 0.5697, "grad_norm": 0.8544651865959167, "learning_rate": 0.0002, "epoch": 5.635906642728905, "step": 78480}, {"loss": 0.6196, "grad_norm": 0.9475075602531433, "learning_rate": 0.0002, "epoch": 5.636624775583483, "step": 78490}, {"loss": 0.5975, "grad_norm": 1.0814138650894165, "learning_rate": 0.0002, "epoch": 5.637342908438061, "step": 78500}, {"loss": 0.5853, "grad_norm": 1.0813153982162476, "learning_rate": 0.0002, "epoch": 5.638061041292639, "step": 78510}, {"loss": 0.5806, "grad_norm": 1.0225616693496704, "learning_rate": 0.0002, "epoch": 5.638779174147217, "step": 78520}, {"loss": 0.5913, "grad_norm": 1.0777465105056763, "learning_rate": 0.0002, "epoch": 5.639497307001795, "step": 78530}, {"loss": 0.6207, "grad_norm": 1.156148910522461, "learning_rate": 0.0002, "epoch": 5.640215439856373, "step": 78540}, {"loss": 0.5843, "grad_norm": 1.0147465467453003, "learning_rate": 0.0002, "epoch": 5.640933572710951, "step": 78550}, {"loss": 0.6045, "grad_norm": 0.9606683850288391, "learning_rate": 0.0002, "epoch": 5.64165170556553, "step": 78560}, {"loss": 0.6457, "grad_norm": 0.9478723406791687, "learning_rate": 0.0002, "epoch": 5.642369838420108, "step": 78570}, {"loss": 0.5502, "grad_norm": 1.0653880834579468, "learning_rate": 0.0002, "epoch": 5.643087971274686, "step": 78580}, {"loss": 0.5938, "grad_norm": 1.7519923448562622, "learning_rate": 0.0002, "epoch": 5.643806104129264, "step": 78590}, {"loss": 0.6015, "grad_norm": 1.0567299127578735, "learning_rate": 0.0002, "epoch": 5.644524236983842, "step": 78600}, {"loss": 0.6329, "grad_norm": 0.8980287909507751, "learning_rate": 0.0002, "epoch": 5.64524236983842, "step": 78610}, {"loss": 0.6319, "grad_norm": 0.8792264461517334, "learning_rate": 0.0002, "epoch": 5.645960502692998, "step": 78620}, {"loss": 0.6234, "grad_norm": 1.2306275367736816, "learning_rate": 0.0002, "epoch": 5.646678635547576, "step": 78630}, {"loss": 0.5567, "grad_norm": 0.8259932398796082, "learning_rate": 0.0002, "epoch": 5.647396768402155, "step": 78640}, {"loss": 0.5484, "grad_norm": 0.9605076313018799, "learning_rate": 0.0002, "epoch": 5.648114901256733, "step": 78650}, {"loss": 0.5934, "grad_norm": 0.9967419505119324, "learning_rate": 0.0002, "epoch": 5.648833034111311, "step": 78660}, {"loss": 0.5755, "grad_norm": 0.9774024486541748, "learning_rate": 0.0002, "epoch": 5.649551166965889, "step": 78670}, {"loss": 0.6079, "grad_norm": 0.9838066697120667, "learning_rate": 0.0002, "epoch": 5.650269299820467, "step": 78680}, {"loss": 0.5674, "grad_norm": 1.1617798805236816, "learning_rate": 0.0002, "epoch": 5.650987432675045, "step": 78690}, {"loss": 0.6252, "grad_norm": 1.075006365776062, "learning_rate": 0.0002, "epoch": 5.651705565529623, "step": 78700}, {"loss": 0.5404, "grad_norm": 0.8859893679618835, "learning_rate": 0.0002, "epoch": 5.652423698384201, "step": 78710}, {"loss": 0.5657, "grad_norm": 1.0774717330932617, "learning_rate": 0.0002, "epoch": 5.653141831238779, "step": 78720}, {"loss": 0.625, "grad_norm": 1.147273302078247, "learning_rate": 0.0002, "epoch": 5.653859964093357, "step": 78730}, {"loss": 0.5819, "grad_norm": 1.1403213739395142, "learning_rate": 0.0002, "epoch": 5.654578096947935, "step": 78740}, {"loss": 0.5721, "grad_norm": 0.9115353226661682, "learning_rate": 0.0002, "epoch": 5.655296229802514, "step": 78750}, {"loss": 0.5521, "grad_norm": 0.9303002953529358, "learning_rate": 0.0002, "epoch": 5.656014362657092, "step": 78760}, {"loss": 0.6078, "grad_norm": 0.9324957728385925, "learning_rate": 0.0002, "epoch": 5.65673249551167, "step": 78770}, {"loss": 0.589, "grad_norm": 0.9688063859939575, "learning_rate": 0.0002, "epoch": 5.657450628366248, "step": 78780}, {"loss": 0.614, "grad_norm": 0.9019638299942017, "learning_rate": 0.0002, "epoch": 5.658168761220826, "step": 78790}, {"loss": 0.5594, "grad_norm": 0.8236798048019409, "learning_rate": 0.0002, "epoch": 5.658886894075404, "step": 78800}, {"loss": 0.6074, "grad_norm": 1.2702386379241943, "learning_rate": 0.0002, "epoch": 5.659605026929982, "step": 78810}, {"loss": 0.5738, "grad_norm": 1.041077971458435, "learning_rate": 0.0002, "epoch": 5.66032315978456, "step": 78820}, {"loss": 0.5773, "grad_norm": 0.9028838276863098, "learning_rate": 0.0002, "epoch": 5.661041292639139, "step": 78830}, {"loss": 0.5871, "grad_norm": 0.9874144196510315, "learning_rate": 0.0002, "epoch": 5.661759425493717, "step": 78840}, {"loss": 0.6039, "grad_norm": 0.9633761048316956, "learning_rate": 0.0002, "epoch": 5.662477558348295, "step": 78850}, {"loss": 0.5794, "grad_norm": 0.9069564342498779, "learning_rate": 0.0002, "epoch": 5.663195691202873, "step": 78860}, {"loss": 0.5836, "grad_norm": 0.9560621976852417, "learning_rate": 0.0002, "epoch": 5.663913824057451, "step": 78870}, {"loss": 0.579, "grad_norm": 0.9941161870956421, "learning_rate": 0.0002, "epoch": 5.664631956912029, "step": 78880}, {"loss": 0.6184, "grad_norm": 0.920407235622406, "learning_rate": 0.0002, "epoch": 5.665350089766607, "step": 78890}, {"loss": 0.6223, "grad_norm": 0.9909250140190125, "learning_rate": 0.0002, "epoch": 5.666068222621185, "step": 78900}, {"loss": 0.6154, "grad_norm": 0.9528568983078003, "learning_rate": 0.0002, "epoch": 5.666786355475763, "step": 78910}, {"loss": 0.6153, "grad_norm": 1.041440725326538, "learning_rate": 0.0002, "epoch": 5.667504488330341, "step": 78920}, {"loss": 0.609, "grad_norm": 1.0072191953659058, "learning_rate": 0.0002, "epoch": 5.66822262118492, "step": 78930}, {"loss": 0.6136, "grad_norm": 1.0740574598312378, "learning_rate": 0.0002, "epoch": 5.668940754039498, "step": 78940}, {"loss": 0.583, "grad_norm": 0.9168822169303894, "learning_rate": 0.0002, "epoch": 5.669658886894076, "step": 78950}, {"loss": 0.5808, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 5.670377019748654, "step": 78960}, {"loss": 0.6584, "grad_norm": 1.1925201416015625, "learning_rate": 0.0002, "epoch": 5.671095152603232, "step": 78970}, {"loss": 0.6074, "grad_norm": 0.879940390586853, "learning_rate": 0.0002, "epoch": 5.67181328545781, "step": 78980}, {"loss": 0.5863, "grad_norm": 1.0998331308364868, "learning_rate": 0.0002, "epoch": 5.672531418312388, "step": 78990}, {"loss": 0.5688, "grad_norm": 1.076637625694275, "learning_rate": 0.0002, "epoch": 5.673249551166966, "step": 79000}, {"loss": 0.6183, "grad_norm": 1.076864242553711, "learning_rate": 0.0002, "epoch": 5.673967684021544, "step": 79010}, {"loss": 0.6031, "grad_norm": 1.0206586122512817, "learning_rate": 0.0002, "epoch": 5.6746858168761225, "step": 79020}, {"loss": 0.5658, "grad_norm": 0.8242515325546265, "learning_rate": 0.0002, "epoch": 5.6754039497307005, "step": 79030}, {"loss": 0.5782, "grad_norm": 1.1180634498596191, "learning_rate": 0.0002, "epoch": 5.6761220825852785, "step": 79040}, {"loss": 0.6039, "grad_norm": 1.0155152082443237, "learning_rate": 0.0002, "epoch": 5.6768402154398565, "step": 79050}, {"loss": 0.5877, "grad_norm": 1.0445241928100586, "learning_rate": 0.0002, "epoch": 5.6775583482944345, "step": 79060}, {"loss": 0.5809, "grad_norm": 0.9851725697517395, "learning_rate": 0.0002, "epoch": 5.6782764811490125, "step": 79070}, {"loss": 0.5807, "grad_norm": 0.9979640245437622, "learning_rate": 0.0002, "epoch": 5.6789946140035905, "step": 79080}, {"loss": 0.6049, "grad_norm": 1.0398952960968018, "learning_rate": 0.0002, "epoch": 5.6797127468581685, "step": 79090}, {"loss": 0.6279, "grad_norm": 1.094164252281189, "learning_rate": 0.0002, "epoch": 5.6804308797127465, "step": 79100}, {"loss": 0.6325, "grad_norm": 0.9546816945075989, "learning_rate": 0.0002, "epoch": 5.6811490125673245, "step": 79110}, {"loss": 0.5658, "grad_norm": 1.1635938882827759, "learning_rate": 0.0002, "epoch": 5.681867145421903, "step": 79120}, {"loss": 0.5849, "grad_norm": 1.0260306596755981, "learning_rate": 0.0002, "epoch": 5.682585278276481, "step": 79130}, {"loss": 0.5653, "grad_norm": 0.9900122284889221, "learning_rate": 0.0002, "epoch": 5.683303411131059, "step": 79140}, {"loss": 0.6107, "grad_norm": 1.049688458442688, "learning_rate": 0.0002, "epoch": 5.684021543985637, "step": 79150}, {"loss": 0.5887, "grad_norm": 1.124272108078003, "learning_rate": 0.0002, "epoch": 5.684739676840215, "step": 79160}, {"loss": 0.5695, "grad_norm": 1.1109849214553833, "learning_rate": 0.0002, "epoch": 5.685457809694793, "step": 79170}, {"loss": 0.6014, "grad_norm": 0.739007830619812, "learning_rate": 0.0002, "epoch": 5.686175942549371, "step": 79180}, {"loss": 0.5995, "grad_norm": 1.2063007354736328, "learning_rate": 0.0002, "epoch": 5.686894075403949, "step": 79190}, {"loss": 0.5563, "grad_norm": 1.223317265510559, "learning_rate": 0.0002, "epoch": 5.687612208258528, "step": 79200}, {"loss": 0.6017, "grad_norm": 0.8042855858802795, "learning_rate": 0.0002, "epoch": 5.688330341113106, "step": 79210}, {"loss": 0.5909, "grad_norm": 0.9294175505638123, "learning_rate": 0.0002, "epoch": 5.689048473967684, "step": 79220}, {"loss": 0.6091, "grad_norm": 0.978084146976471, "learning_rate": 0.0002, "epoch": 5.689766606822262, "step": 79230}, {"loss": 0.6094, "grad_norm": 0.9271620512008667, "learning_rate": 0.0002, "epoch": 5.69048473967684, "step": 79240}, {"loss": 0.6454, "grad_norm": 1.158677339553833, "learning_rate": 0.0002, "epoch": 5.691202872531418, "step": 79250}, {"loss": 0.6054, "grad_norm": 0.9468576312065125, "learning_rate": 0.0002, "epoch": 5.691921005385996, "step": 79260}, {"loss": 0.6094, "grad_norm": 1.2025824785232544, "learning_rate": 0.0002, "epoch": 5.692639138240574, "step": 79270}, {"loss": 0.5995, "grad_norm": 1.0167860984802246, "learning_rate": 0.0002, "epoch": 5.693357271095152, "step": 79280}, {"loss": 0.5596, "grad_norm": 0.971199631690979, "learning_rate": 0.0002, "epoch": 5.69407540394973, "step": 79290}, {"loss": 0.6051, "grad_norm": 1.1757864952087402, "learning_rate": 0.0002, "epoch": 5.694793536804308, "step": 79300}, {"loss": 0.5915, "grad_norm": 1.0199662446975708, "learning_rate": 0.0002, "epoch": 5.695511669658887, "step": 79310}, {"loss": 0.5654, "grad_norm": 0.9662485122680664, "learning_rate": 0.0002, "epoch": 5.696229802513465, "step": 79320}, {"loss": 0.5602, "grad_norm": 0.9324414134025574, "learning_rate": 0.0002, "epoch": 5.696947935368043, "step": 79330}, {"loss": 0.5939, "grad_norm": 0.855752170085907, "learning_rate": 0.0002, "epoch": 5.697666068222621, "step": 79340}, {"loss": 0.6202, "grad_norm": 1.2723703384399414, "learning_rate": 0.0002, "epoch": 5.698384201077199, "step": 79350}, {"loss": 0.6028, "grad_norm": 1.0254011154174805, "learning_rate": 0.0002, "epoch": 5.699102333931777, "step": 79360}, {"loss": 0.5853, "grad_norm": 1.0958263874053955, "learning_rate": 0.0002, "epoch": 5.699820466786355, "step": 79370}, {"loss": 0.6292, "grad_norm": 1.0214145183563232, "learning_rate": 0.0002, "epoch": 5.700538599640933, "step": 79380}, {"loss": 0.6576, "grad_norm": 1.1087455749511719, "learning_rate": 0.0002, "epoch": 5.701256732495512, "step": 79390}, {"loss": 0.576, "grad_norm": 0.8885074853897095, "learning_rate": 0.0002, "epoch": 5.70197486535009, "step": 79400}, {"loss": 0.5452, "grad_norm": 0.9854450821876526, "learning_rate": 0.0002, "epoch": 5.702692998204668, "step": 79410}, {"loss": 0.5903, "grad_norm": 0.858744204044342, "learning_rate": 0.0002, "epoch": 5.703411131059246, "step": 79420}, {"loss": 0.5975, "grad_norm": 0.9434788823127747, "learning_rate": 0.0002, "epoch": 5.704129263913824, "step": 79430}, {"loss": 0.648, "grad_norm": 1.1388801336288452, "learning_rate": 0.0002, "epoch": 5.704847396768402, "step": 79440}, {"loss": 0.5895, "grad_norm": 1.0701899528503418, "learning_rate": 0.0002, "epoch": 5.70556552962298, "step": 79450}, {"loss": 0.5697, "grad_norm": 0.9147594571113586, "learning_rate": 0.0002, "epoch": 5.706283662477558, "step": 79460}, {"loss": 0.6043, "grad_norm": 1.055008053779602, "learning_rate": 0.0002, "epoch": 5.707001795332136, "step": 79470}, {"loss": 0.5625, "grad_norm": 0.7841609716415405, "learning_rate": 0.0002, "epoch": 5.707719928186714, "step": 79480}, {"loss": 0.6048, "grad_norm": 1.0334571599960327, "learning_rate": 0.0002, "epoch": 5.708438061041292, "step": 79490}, {"loss": 0.5924, "grad_norm": 1.2841367721557617, "learning_rate": 0.0002, "epoch": 5.709156193895871, "step": 79500}, {"loss": 0.5957, "grad_norm": 1.0296638011932373, "learning_rate": 0.0002, "epoch": 5.709874326750449, "step": 79510}, {"loss": 0.6015, "grad_norm": 0.9161922931671143, "learning_rate": 0.0002, "epoch": 5.710592459605027, "step": 79520}, {"loss": 0.6056, "grad_norm": 1.056856632232666, "learning_rate": 0.0002, "epoch": 5.711310592459605, "step": 79530}, {"loss": 0.5762, "grad_norm": 0.9919893145561218, "learning_rate": 0.0002, "epoch": 5.712028725314183, "step": 79540}, {"loss": 0.5987, "grad_norm": 1.1128891706466675, "learning_rate": 0.0002, "epoch": 5.712746858168761, "step": 79550}, {"loss": 0.5835, "grad_norm": 1.1171997785568237, "learning_rate": 0.0002, "epoch": 5.713464991023339, "step": 79560}, {"loss": 0.6037, "grad_norm": 0.9389346837997437, "learning_rate": 0.0002, "epoch": 5.714183123877917, "step": 79570}, {"loss": 0.5805, "grad_norm": 0.9869245886802673, "learning_rate": 0.0002, "epoch": 5.714901256732496, "step": 79580}, {"loss": 0.5776, "grad_norm": 0.9019966721534729, "learning_rate": 0.0002, "epoch": 5.715619389587074, "step": 79590}, {"loss": 0.567, "grad_norm": 0.9791252017021179, "learning_rate": 0.0002, "epoch": 5.716337522441652, "step": 79600}, {"loss": 0.5817, "grad_norm": 1.0269849300384521, "learning_rate": 0.0002, "epoch": 5.71705565529623, "step": 79610}, {"loss": 0.602, "grad_norm": 1.0340129137039185, "learning_rate": 0.0002, "epoch": 5.717773788150808, "step": 79620}, {"loss": 0.5969, "grad_norm": 0.9742604494094849, "learning_rate": 0.0002, "epoch": 5.718491921005386, "step": 79630}, {"loss": 0.5945, "grad_norm": 1.126868724822998, "learning_rate": 0.0002, "epoch": 5.719210053859964, "step": 79640}, {"loss": 0.601, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 5.719928186714542, "step": 79650}, {"loss": 0.6071, "grad_norm": 0.8300277590751648, "learning_rate": 0.0002, "epoch": 5.72064631956912, "step": 79660}, {"loss": 0.6121, "grad_norm": 0.8482570052146912, "learning_rate": 0.0002, "epoch": 5.721364452423698, "step": 79670}, {"loss": 0.5937, "grad_norm": 1.0777807235717773, "learning_rate": 0.0002, "epoch": 5.722082585278277, "step": 79680}, {"loss": 0.5739, "grad_norm": 1.2682723999023438, "learning_rate": 0.0002, "epoch": 5.722800718132855, "step": 79690}, {"loss": 0.5759, "grad_norm": 0.8742772340774536, "learning_rate": 0.0002, "epoch": 5.723518850987433, "step": 79700}, {"loss": 0.5839, "grad_norm": 0.9218387603759766, "learning_rate": 0.0002, "epoch": 5.724236983842011, "step": 79710}, {"loss": 0.5968, "grad_norm": 0.8977975845336914, "learning_rate": 0.0002, "epoch": 5.724955116696589, "step": 79720}, {"loss": 0.5743, "grad_norm": 1.0873085260391235, "learning_rate": 0.0002, "epoch": 5.725673249551167, "step": 79730}, {"loss": 0.5986, "grad_norm": 0.9811807870864868, "learning_rate": 0.0002, "epoch": 5.726391382405745, "step": 79740}, {"loss": 0.5881, "grad_norm": 0.926764965057373, "learning_rate": 0.0002, "epoch": 5.727109515260323, "step": 79750}, {"loss": 0.5738, "grad_norm": 1.0103713274002075, "learning_rate": 0.0002, "epoch": 5.727827648114902, "step": 79760}, {"loss": 0.5807, "grad_norm": 1.1389189958572388, "learning_rate": 0.0002, "epoch": 5.72854578096948, "step": 79770}, {"loss": 0.636, "grad_norm": 1.1654961109161377, "learning_rate": 0.0002, "epoch": 5.729263913824058, "step": 79780}, {"loss": 0.5863, "grad_norm": 0.7925996780395508, "learning_rate": 0.0002, "epoch": 5.729982046678636, "step": 79790}, {"loss": 0.6005, "grad_norm": 1.3329131603240967, "learning_rate": 0.0002, "epoch": 5.730700179533214, "step": 79800}, {"loss": 0.6295, "grad_norm": 1.158328890800476, "learning_rate": 0.0002, "epoch": 5.731418312387792, "step": 79810}, {"loss": 0.5832, "grad_norm": 0.9904412031173706, "learning_rate": 0.0002, "epoch": 5.73213644524237, "step": 79820}, {"loss": 0.582, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.732854578096948, "step": 79830}, {"loss": 0.6135, "grad_norm": 1.0224473476409912, "learning_rate": 0.0002, "epoch": 5.733572710951526, "step": 79840}, {"loss": 0.6063, "grad_norm": 1.0482215881347656, "learning_rate": 0.0002, "epoch": 5.734290843806104, "step": 79850}, {"loss": 0.5792, "grad_norm": 0.9790018200874329, "learning_rate": 0.0002, "epoch": 5.735008976660682, "step": 79860}, {"loss": 0.6089, "grad_norm": 1.034548044204712, "learning_rate": 0.0002, "epoch": 5.735727109515261, "step": 79870}, {"loss": 0.5676, "grad_norm": 0.799286961555481, "learning_rate": 0.0002, "epoch": 5.736445242369839, "step": 79880}, {"loss": 0.5344, "grad_norm": 1.0119048357009888, "learning_rate": 0.0002, "epoch": 5.737163375224417, "step": 79890}, {"loss": 0.5859, "grad_norm": 0.9742264151573181, "learning_rate": 0.0002, "epoch": 5.737881508078995, "step": 79900}, {"loss": 0.5992, "grad_norm": 1.0408239364624023, "learning_rate": 0.0002, "epoch": 5.738599640933573, "step": 79910}, {"loss": 0.6009, "grad_norm": 0.9165748953819275, "learning_rate": 0.0002, "epoch": 5.739317773788151, "step": 79920}, {"loss": 0.5864, "grad_norm": 1.1859451532363892, "learning_rate": 0.0002, "epoch": 5.740035906642729, "step": 79930}, {"loss": 0.5948, "grad_norm": 0.8772084712982178, "learning_rate": 0.0002, "epoch": 5.740754039497307, "step": 79940}, {"loss": 0.5614, "grad_norm": 1.0123273134231567, "learning_rate": 0.0002, "epoch": 5.741472172351886, "step": 79950}, {"loss": 0.6405, "grad_norm": 1.1873936653137207, "learning_rate": 0.0002, "epoch": 5.742190305206464, "step": 79960}, {"loss": 0.5818, "grad_norm": 0.9065699577331543, "learning_rate": 0.0002, "epoch": 5.742908438061042, "step": 79970}, {"loss": 0.6068, "grad_norm": 1.1626464128494263, "learning_rate": 0.0002, "epoch": 5.74362657091562, "step": 79980}, {"loss": 0.5814, "grad_norm": 1.0311716794967651, "learning_rate": 0.0002, "epoch": 5.744344703770198, "step": 79990}, {"loss": 0.5752, "grad_norm": 1.0865558385849, "learning_rate": 0.0002, "epoch": 5.745062836624776, "step": 80000}, {"loss": 0.6477, "grad_norm": 1.0257176160812378, "learning_rate": 0.0002, "epoch": 5.745780969479354, "step": 80010}, {"loss": 0.6172, "grad_norm": 0.9805439710617065, "learning_rate": 0.0002, "epoch": 5.746499102333932, "step": 80020}, {"loss": 0.5949, "grad_norm": 0.9744977355003357, "learning_rate": 0.0002, "epoch": 5.74721723518851, "step": 80030}, {"loss": 0.5893, "grad_norm": 1.302816390991211, "learning_rate": 0.0002, "epoch": 5.747935368043088, "step": 80040}, {"loss": 0.5653, "grad_norm": 0.8866990208625793, "learning_rate": 0.0002, "epoch": 5.748653500897666, "step": 80050}, {"loss": 0.5648, "grad_norm": 1.0133726596832275, "learning_rate": 0.0002, "epoch": 5.7493716337522445, "step": 80060}, {"loss": 0.6016, "grad_norm": 1.0043569803237915, "learning_rate": 0.0002, "epoch": 5.7500897666068225, "step": 80070}, {"loss": 0.6493, "grad_norm": 0.9100040197372437, "learning_rate": 0.0002, "epoch": 5.7508078994614005, "step": 80080}, {"loss": 0.5469, "grad_norm": 0.7994180917739868, "learning_rate": 0.0002, "epoch": 5.7515260323159785, "step": 80090}, {"loss": 0.6521, "grad_norm": 1.120188593864441, "learning_rate": 0.0002, "epoch": 5.7522441651705565, "step": 80100}, {"loss": 0.5737, "grad_norm": 0.9555420279502869, "learning_rate": 0.0002, "epoch": 5.7529622980251345, "step": 80110}, {"loss": 0.5897, "grad_norm": 1.0305951833724976, "learning_rate": 0.0002, "epoch": 5.7536804308797125, "step": 80120}, {"loss": 0.5821, "grad_norm": 0.9632731676101685, "learning_rate": 0.0002, "epoch": 5.7543985637342905, "step": 80130}, {"loss": 0.5618, "grad_norm": 1.2654297351837158, "learning_rate": 0.0002, "epoch": 5.755116696588869, "step": 80140}, {"loss": 0.6044, "grad_norm": 1.027190089225769, "learning_rate": 0.0002, "epoch": 5.755834829443447, "step": 80150}, {"loss": 0.6131, "grad_norm": 0.9829175472259521, "learning_rate": 0.0002, "epoch": 5.756552962298025, "step": 80160}, {"loss": 0.609, "grad_norm": 1.083803653717041, "learning_rate": 0.0002, "epoch": 5.757271095152603, "step": 80170}, {"loss": 0.6134, "grad_norm": 0.9353913068771362, "learning_rate": 0.0002, "epoch": 5.757989228007181, "step": 80180}, {"loss": 0.6515, "grad_norm": 1.1824370622634888, "learning_rate": 0.0002, "epoch": 5.758707360861759, "step": 80190}, {"loss": 0.6012, "grad_norm": 1.0901048183441162, "learning_rate": 0.0002, "epoch": 5.759425493716337, "step": 80200}, {"loss": 0.5639, "grad_norm": 1.0389254093170166, "learning_rate": 0.0002, "epoch": 5.760143626570915, "step": 80210}, {"loss": 0.6085, "grad_norm": 0.9746400117874146, "learning_rate": 0.0002, "epoch": 5.760861759425493, "step": 80220}, {"loss": 0.5874, "grad_norm": 0.9319248795509338, "learning_rate": 0.0002, "epoch": 5.761579892280071, "step": 80230}, {"loss": 0.5726, "grad_norm": 1.152784824371338, "learning_rate": 0.0002, "epoch": 5.76229802513465, "step": 80240}, {"loss": 0.5998, "grad_norm": 0.9462733864784241, "learning_rate": 0.0002, "epoch": 5.763016157989228, "step": 80250}, {"loss": 0.5755, "grad_norm": 0.8884182572364807, "learning_rate": 0.0002, "epoch": 5.763734290843806, "step": 80260}, {"loss": 0.5864, "grad_norm": 0.8755964636802673, "learning_rate": 0.0002, "epoch": 5.764452423698384, "step": 80270}, {"loss": 0.5659, "grad_norm": 0.8983452320098877, "learning_rate": 0.0002, "epoch": 5.765170556552962, "step": 80280}, {"loss": 0.5799, "grad_norm": 0.8565991520881653, "learning_rate": 0.0002, "epoch": 5.76588868940754, "step": 80290}, {"loss": 0.598, "grad_norm": 1.0557159185409546, "learning_rate": 0.0002, "epoch": 5.766606822262118, "step": 80300}, {"loss": 0.6441, "grad_norm": 1.057214379310608, "learning_rate": 0.0002, "epoch": 5.767324955116696, "step": 80310}, {"loss": 0.6038, "grad_norm": 0.9852516055107117, "learning_rate": 0.0002, "epoch": 5.768043087971275, "step": 80320}, {"loss": 0.5676, "grad_norm": 1.0339698791503906, "learning_rate": 0.0002, "epoch": 5.768761220825853, "step": 80330}, {"loss": 0.5963, "grad_norm": 1.0056889057159424, "learning_rate": 0.0002, "epoch": 5.769479353680431, "step": 80340}, {"loss": 0.5588, "grad_norm": 1.0941663980484009, "learning_rate": 0.0002, "epoch": 5.770197486535009, "step": 80350}, {"loss": 0.5729, "grad_norm": 1.2145589590072632, "learning_rate": 0.0002, "epoch": 5.770915619389587, "step": 80360}, {"loss": 0.5819, "grad_norm": 0.9609606862068176, "learning_rate": 0.0002, "epoch": 5.771633752244165, "step": 80370}, {"loss": 0.6313, "grad_norm": 0.8815773129463196, "learning_rate": 0.0002, "epoch": 5.772351885098743, "step": 80380}, {"loss": 0.6046, "grad_norm": 1.2630987167358398, "learning_rate": 0.0002, "epoch": 5.773070017953321, "step": 80390}, {"loss": 0.5918, "grad_norm": 1.0605450868606567, "learning_rate": 0.0002, "epoch": 5.773788150807899, "step": 80400}, {"loss": 0.6074, "grad_norm": 1.165069341659546, "learning_rate": 0.0002, "epoch": 5.774506283662477, "step": 80410}, {"loss": 0.5683, "grad_norm": 0.9038028717041016, "learning_rate": 0.0002, "epoch": 5.775224416517055, "step": 80420}, {"loss": 0.6024, "grad_norm": 1.0571858882904053, "learning_rate": 0.0002, "epoch": 5.775942549371634, "step": 80430}, {"loss": 0.624, "grad_norm": 1.0388168096542358, "learning_rate": 0.0002, "epoch": 5.776660682226212, "step": 80440}, {"loss": 0.6139, "grad_norm": 1.0552119016647339, "learning_rate": 0.0002, "epoch": 5.77737881508079, "step": 80450}, {"loss": 0.5988, "grad_norm": 1.0610109567642212, "learning_rate": 0.0002, "epoch": 5.778096947935368, "step": 80460}, {"loss": 0.6264, "grad_norm": 0.9906430244445801, "learning_rate": 0.0002, "epoch": 5.778815080789946, "step": 80470}, {"loss": 0.5807, "grad_norm": 1.1511857509613037, "learning_rate": 0.0002, "epoch": 5.779533213644524, "step": 80480}, {"loss": 0.6202, "grad_norm": 1.2738412618637085, "learning_rate": 0.0002, "epoch": 5.780251346499102, "step": 80490}, {"loss": 0.5957, "grad_norm": 0.8945937752723694, "learning_rate": 0.0002, "epoch": 5.78096947935368, "step": 80500}, {"loss": 0.6049, "grad_norm": 1.1105149984359741, "learning_rate": 0.0002, "epoch": 5.781687612208259, "step": 80510}, {"loss": 0.5989, "grad_norm": 0.8432297110557556, "learning_rate": 0.0002, "epoch": 5.782405745062837, "step": 80520}, {"loss": 0.6321, "grad_norm": 0.9257984757423401, "learning_rate": 0.0002, "epoch": 5.783123877917415, "step": 80530}, {"loss": 0.6191, "grad_norm": 1.1708799600601196, "learning_rate": 0.0002, "epoch": 5.783842010771993, "step": 80540}, {"loss": 0.5465, "grad_norm": 0.9969521164894104, "learning_rate": 0.0002, "epoch": 5.784560143626571, "step": 80550}, {"loss": 0.6569, "grad_norm": 1.0361413955688477, "learning_rate": 0.0002, "epoch": 5.785278276481149, "step": 80560}, {"loss": 0.6131, "grad_norm": 0.9876393675804138, "learning_rate": 0.0002, "epoch": 5.785996409335727, "step": 80570}, {"loss": 0.5586, "grad_norm": 1.0356241464614868, "learning_rate": 0.0002, "epoch": 5.786714542190305, "step": 80580}, {"loss": 0.5647, "grad_norm": 1.178865671157837, "learning_rate": 0.0002, "epoch": 5.787432675044883, "step": 80590}, {"loss": 0.578, "grad_norm": 0.8614338636398315, "learning_rate": 0.0002, "epoch": 5.788150807899461, "step": 80600}, {"loss": 0.5916, "grad_norm": 1.020734429359436, "learning_rate": 0.0002, "epoch": 5.788868940754039, "step": 80610}, {"loss": 0.6015, "grad_norm": 1.035951852798462, "learning_rate": 0.0002, "epoch": 5.789587073608618, "step": 80620}, {"loss": 0.5838, "grad_norm": 0.898637592792511, "learning_rate": 0.0002, "epoch": 5.790305206463196, "step": 80630}, {"loss": 0.5894, "grad_norm": 0.9803016781806946, "learning_rate": 0.0002, "epoch": 5.791023339317774, "step": 80640}, {"loss": 0.5806, "grad_norm": 1.2902555465698242, "learning_rate": 0.0002, "epoch": 5.791741472172352, "step": 80650}, {"loss": 0.6136, "grad_norm": 1.3364112377166748, "learning_rate": 0.0002, "epoch": 5.79245960502693, "step": 80660}, {"loss": 0.6071, "grad_norm": 0.8553985953330994, "learning_rate": 0.0002, "epoch": 5.793177737881508, "step": 80670}, {"loss": 0.5853, "grad_norm": 0.8211889863014221, "learning_rate": 0.0002, "epoch": 5.793895870736086, "step": 80680}, {"loss": 0.5732, "grad_norm": 0.9288306832313538, "learning_rate": 0.0002, "epoch": 5.794614003590664, "step": 80690}, {"loss": 0.6241, "grad_norm": 1.0716029405593872, "learning_rate": 0.0002, "epoch": 5.795332136445243, "step": 80700}, {"loss": 0.643, "grad_norm": 0.9957329034805298, "learning_rate": 0.0002, "epoch": 5.796050269299821, "step": 80710}, {"loss": 0.5762, "grad_norm": 0.9691376090049744, "learning_rate": 0.0002, "epoch": 5.796768402154399, "step": 80720}, {"loss": 0.6227, "grad_norm": 1.0590804815292358, "learning_rate": 0.0002, "epoch": 5.797486535008977, "step": 80730}, {"loss": 0.59, "grad_norm": 1.0408968925476074, "learning_rate": 0.0002, "epoch": 5.798204667863555, "step": 80740}, {"loss": 0.5656, "grad_norm": 1.0249526500701904, "learning_rate": 0.0002, "epoch": 5.798922800718133, "step": 80750}, {"loss": 0.5991, "grad_norm": 1.3658806085586548, "learning_rate": 0.0002, "epoch": 5.799640933572711, "step": 80760}, {"loss": 0.5671, "grad_norm": 0.9562603831291199, "learning_rate": 0.0002, "epoch": 5.800359066427289, "step": 80770}, {"loss": 0.5929, "grad_norm": 0.8790915012359619, "learning_rate": 0.0002, "epoch": 5.801077199281867, "step": 80780}, {"loss": 0.5864, "grad_norm": 0.8351004123687744, "learning_rate": 0.0002, "epoch": 5.801795332136445, "step": 80790}, {"loss": 0.5544, "grad_norm": 0.964562714099884, "learning_rate": 0.0002, "epoch": 5.802513464991024, "step": 80800}, {"loss": 0.6388, "grad_norm": 1.0873116254806519, "learning_rate": 0.0002, "epoch": 5.803231597845602, "step": 80810}, {"loss": 0.5891, "grad_norm": 0.9821216464042664, "learning_rate": 0.0002, "epoch": 5.80394973070018, "step": 80820}, {"loss": 0.631, "grad_norm": 1.1158807277679443, "learning_rate": 0.0002, "epoch": 5.804667863554758, "step": 80830}, {"loss": 0.6068, "grad_norm": 1.0098856687545776, "learning_rate": 0.0002, "epoch": 5.805385996409336, "step": 80840}, {"loss": 0.6112, "grad_norm": 0.9628035426139832, "learning_rate": 0.0002, "epoch": 5.806104129263914, "step": 80850}, {"loss": 0.6003, "grad_norm": 1.133800983428955, "learning_rate": 0.0002, "epoch": 5.806822262118492, "step": 80860}, {"loss": 0.5802, "grad_norm": 0.9423992037773132, "learning_rate": 0.0002, "epoch": 5.80754039497307, "step": 80870}, {"loss": 0.5729, "grad_norm": 1.0758612155914307, "learning_rate": 0.0002, "epoch": 5.808258527827648, "step": 80880}, {"loss": 0.586, "grad_norm": 1.232029914855957, "learning_rate": 0.0002, "epoch": 5.808976660682227, "step": 80890}, {"loss": 0.5932, "grad_norm": 1.1063108444213867, "learning_rate": 0.0002, "epoch": 5.809694793536805, "step": 80900}, {"loss": 0.5627, "grad_norm": 0.9759877920150757, "learning_rate": 0.0002, "epoch": 5.810412926391383, "step": 80910}, {"loss": 0.6169, "grad_norm": 0.9180193543434143, "learning_rate": 0.0002, "epoch": 5.811131059245961, "step": 80920}, {"loss": 0.6198, "grad_norm": 1.0818052291870117, "learning_rate": 0.0002, "epoch": 5.811849192100539, "step": 80930}, {"loss": 0.5997, "grad_norm": 0.998986542224884, "learning_rate": 0.0002, "epoch": 5.812567324955117, "step": 80940}, {"loss": 0.6183, "grad_norm": 1.1549060344696045, "learning_rate": 0.0002, "epoch": 5.813285457809695, "step": 80950}, {"loss": 0.5858, "grad_norm": 1.1900213956832886, "learning_rate": 0.0002, "epoch": 5.814003590664273, "step": 80960}, {"loss": 0.6249, "grad_norm": 0.8114368915557861, "learning_rate": 0.0002, "epoch": 5.814721723518851, "step": 80970}, {"loss": 0.6199, "grad_norm": 1.0296406745910645, "learning_rate": 0.0002, "epoch": 5.815439856373429, "step": 80980}, {"loss": 0.6226, "grad_norm": 1.0466746091842651, "learning_rate": 0.0002, "epoch": 5.8161579892280075, "step": 80990}, {"loss": 0.6303, "grad_norm": 1.0524508953094482, "learning_rate": 0.0002, "epoch": 5.8168761220825855, "step": 81000}, {"loss": 0.5708, "grad_norm": 1.1588358879089355, "learning_rate": 0.0002, "epoch": 5.8175942549371635, "step": 81010}, {"loss": 0.5818, "grad_norm": 0.9378601908683777, "learning_rate": 0.0002, "epoch": 5.8183123877917415, "step": 81020}, {"loss": 0.6404, "grad_norm": 0.9486441612243652, "learning_rate": 0.0002, "epoch": 5.8190305206463195, "step": 81030}, {"loss": 0.566, "grad_norm": 0.9805227518081665, "learning_rate": 0.0002, "epoch": 5.8197486535008975, "step": 81040}, {"loss": 0.6025, "grad_norm": 1.1627717018127441, "learning_rate": 0.0002, "epoch": 5.8204667863554755, "step": 81050}, {"loss": 0.5954, "grad_norm": 1.0716841220855713, "learning_rate": 0.0002, "epoch": 5.8211849192100535, "step": 81060}, {"loss": 0.6045, "grad_norm": 1.2398899793624878, "learning_rate": 0.0002, "epoch": 5.821903052064632, "step": 81070}, {"loss": 0.5813, "grad_norm": 1.0934730768203735, "learning_rate": 0.0002, "epoch": 5.82262118491921, "step": 81080}, {"loss": 0.5601, "grad_norm": 0.9701796174049377, "learning_rate": 0.0002, "epoch": 5.823339317773788, "step": 81090}, {"loss": 0.6493, "grad_norm": 1.0218969583511353, "learning_rate": 0.0002, "epoch": 5.824057450628366, "step": 81100}, {"loss": 0.6121, "grad_norm": 1.3066465854644775, "learning_rate": 0.0002, "epoch": 5.824775583482944, "step": 81110}, {"loss": 0.6145, "grad_norm": 1.1067441701889038, "learning_rate": 0.0002, "epoch": 5.825493716337522, "step": 81120}, {"loss": 0.5959, "grad_norm": 0.9750344753265381, "learning_rate": 0.0002, "epoch": 5.8262118491921004, "step": 81130}, {"loss": 0.6192, "grad_norm": 1.129191279411316, "learning_rate": 0.0002, "epoch": 5.8269299820466784, "step": 81140}, {"loss": 0.6191, "grad_norm": 1.05964195728302, "learning_rate": 0.0002, "epoch": 5.8276481149012564, "step": 81150}, {"loss": 0.6353, "grad_norm": 1.1094872951507568, "learning_rate": 0.0002, "epoch": 5.8283662477558345, "step": 81160}, {"loss": 0.5835, "grad_norm": 0.9163196086883545, "learning_rate": 0.0002, "epoch": 5.8290843806104125, "step": 81170}, {"loss": 0.6513, "grad_norm": 1.0035687685012817, "learning_rate": 0.0002, "epoch": 5.829802513464991, "step": 81180}, {"loss": 0.5948, "grad_norm": 1.0353461503982544, "learning_rate": 0.0002, "epoch": 5.830520646319569, "step": 81190}, {"loss": 0.602, "grad_norm": 1.0566555261611938, "learning_rate": 0.0002, "epoch": 5.831238779174147, "step": 81200}, {"loss": 0.6086, "grad_norm": 1.2373290061950684, "learning_rate": 0.0002, "epoch": 5.831956912028725, "step": 81210}, {"loss": 0.6054, "grad_norm": 0.8818837404251099, "learning_rate": 0.0002, "epoch": 5.832675044883303, "step": 81220}, {"loss": 0.604, "grad_norm": 1.1024713516235352, "learning_rate": 0.0002, "epoch": 5.833393177737881, "step": 81230}, {"loss": 0.6649, "grad_norm": 1.2478809356689453, "learning_rate": 0.0002, "epoch": 5.834111310592459, "step": 81240}, {"loss": 0.584, "grad_norm": 0.8647364377975464, "learning_rate": 0.0002, "epoch": 5.834829443447037, "step": 81250}, {"loss": 0.6089, "grad_norm": 1.1106358766555786, "learning_rate": 0.0002, "epoch": 5.835547576301616, "step": 81260}, {"loss": 0.5934, "grad_norm": 0.9432938694953918, "learning_rate": 0.0002, "epoch": 5.836265709156194, "step": 81270}, {"loss": 0.6401, "grad_norm": 1.0283797979354858, "learning_rate": 0.0002, "epoch": 5.836983842010772, "step": 81280}, {"loss": 0.6549, "grad_norm": 1.158918857574463, "learning_rate": 0.0002, "epoch": 5.83770197486535, "step": 81290}, {"loss": 0.5974, "grad_norm": 0.9700069427490234, "learning_rate": 0.0002, "epoch": 5.838420107719928, "step": 81300}, {"loss": 0.5841, "grad_norm": 1.08310866355896, "learning_rate": 0.0002, "epoch": 5.839138240574506, "step": 81310}, {"loss": 0.6234, "grad_norm": 1.05460524559021, "learning_rate": 0.0002, "epoch": 5.839856373429084, "step": 81320}, {"loss": 0.5586, "grad_norm": 0.9849268794059753, "learning_rate": 0.0002, "epoch": 5.840574506283662, "step": 81330}, {"loss": 0.5927, "grad_norm": 0.888306736946106, "learning_rate": 0.0002, "epoch": 5.84129263913824, "step": 81340}, {"loss": 0.6106, "grad_norm": 1.0337001085281372, "learning_rate": 0.0002, "epoch": 5.842010771992818, "step": 81350}, {"loss": 0.5957, "grad_norm": 1.0778567790985107, "learning_rate": 0.0002, "epoch": 5.842728904847397, "step": 81360}, {"loss": 0.5801, "grad_norm": 1.1484156847000122, "learning_rate": 0.0002, "epoch": 5.843447037701975, "step": 81370}, {"loss": 0.6348, "grad_norm": 1.0948245525360107, "learning_rate": 0.0002, "epoch": 5.844165170556553, "step": 81380}, {"loss": 0.5561, "grad_norm": 0.9363969564437866, "learning_rate": 0.0002, "epoch": 5.844883303411131, "step": 81390}, {"loss": 0.6336, "grad_norm": 1.0151013135910034, "learning_rate": 0.0002, "epoch": 5.845601436265709, "step": 81400}, {"loss": 0.6063, "grad_norm": 0.9925733804702759, "learning_rate": 0.0002, "epoch": 5.846319569120287, "step": 81410}, {"loss": 0.6512, "grad_norm": 1.0356744527816772, "learning_rate": 0.0002, "epoch": 5.847037701974865, "step": 81420}, {"loss": 0.5947, "grad_norm": 1.0633001327514648, "learning_rate": 0.0002, "epoch": 5.847755834829443, "step": 81430}, {"loss": 0.5851, "grad_norm": 0.9900460839271545, "learning_rate": 0.0002, "epoch": 5.848473967684021, "step": 81440}, {"loss": 0.6216, "grad_norm": 1.2677979469299316, "learning_rate": 0.0002, "epoch": 5.8491921005386, "step": 81450}, {"loss": 0.5633, "grad_norm": 0.8174138069152832, "learning_rate": 0.0002, "epoch": 5.849910233393178, "step": 81460}, {"loss": 0.6283, "grad_norm": 1.1986393928527832, "learning_rate": 0.0002, "epoch": 5.850628366247756, "step": 81470}, {"loss": 0.6056, "grad_norm": 1.1009358167648315, "learning_rate": 0.0002, "epoch": 5.851346499102334, "step": 81480}, {"loss": 0.6244, "grad_norm": 0.966446578502655, "learning_rate": 0.0002, "epoch": 5.852064631956912, "step": 81490}, {"loss": 0.5687, "grad_norm": 0.9657767415046692, "learning_rate": 0.0002, "epoch": 5.85278276481149, "step": 81500}, {"loss": 0.547, "grad_norm": 1.0480058193206787, "learning_rate": 0.0002, "epoch": 5.853500897666068, "step": 81510}, {"loss": 0.5737, "grad_norm": 1.2003830671310425, "learning_rate": 0.0002, "epoch": 5.854219030520646, "step": 81520}, {"loss": 0.602, "grad_norm": 0.8683754205703735, "learning_rate": 0.0002, "epoch": 5.854937163375224, "step": 81530}, {"loss": 0.5923, "grad_norm": 1.0860967636108398, "learning_rate": 0.0002, "epoch": 5.855655296229802, "step": 81540}, {"loss": 0.5959, "grad_norm": 1.0415282249450684, "learning_rate": 0.0002, "epoch": 5.856373429084381, "step": 81550}, {"loss": 0.6017, "grad_norm": 0.9897454380989075, "learning_rate": 0.0002, "epoch": 5.857091561938959, "step": 81560}, {"loss": 0.5588, "grad_norm": 1.173884630203247, "learning_rate": 0.0002, "epoch": 5.857809694793537, "step": 81570}, {"loss": 0.5715, "grad_norm": 1.2426209449768066, "learning_rate": 0.0002, "epoch": 5.858527827648115, "step": 81580}, {"loss": 0.6079, "grad_norm": 0.9390465021133423, "learning_rate": 0.0002, "epoch": 5.859245960502693, "step": 81590}, {"loss": 0.5896, "grad_norm": 1.1387195587158203, "learning_rate": 0.0002, "epoch": 5.859964093357271, "step": 81600}, {"loss": 0.6025, "grad_norm": 0.9902143478393555, "learning_rate": 0.0002, "epoch": 5.860682226211849, "step": 81610}, {"loss": 0.6197, "grad_norm": 0.8328776359558105, "learning_rate": 0.0002, "epoch": 5.861400359066427, "step": 81620}, {"loss": 0.6586, "grad_norm": 0.9837837815284729, "learning_rate": 0.0002, "epoch": 5.862118491921006, "step": 81630}, {"loss": 0.5793, "grad_norm": 1.0013370513916016, "learning_rate": 0.0002, "epoch": 5.862836624775584, "step": 81640}, {"loss": 0.6129, "grad_norm": 0.9408028721809387, "learning_rate": 0.0002, "epoch": 5.863554757630162, "step": 81650}, {"loss": 0.572, "grad_norm": 1.093140959739685, "learning_rate": 0.0002, "epoch": 5.86427289048474, "step": 81660}, {"loss": 0.6037, "grad_norm": 0.9554300904273987, "learning_rate": 0.0002, "epoch": 5.864991023339318, "step": 81670}, {"loss": 0.6136, "grad_norm": 1.1276485919952393, "learning_rate": 0.0002, "epoch": 5.865709156193896, "step": 81680}, {"loss": 0.6072, "grad_norm": 0.9628785252571106, "learning_rate": 0.0002, "epoch": 5.866427289048474, "step": 81690}, {"loss": 0.5962, "grad_norm": 0.9844689965248108, "learning_rate": 0.0002, "epoch": 5.867145421903052, "step": 81700}, {"loss": 0.5883, "grad_norm": 0.9679856896400452, "learning_rate": 0.0002, "epoch": 5.86786355475763, "step": 81710}, {"loss": 0.6244, "grad_norm": 1.0225571393966675, "learning_rate": 0.0002, "epoch": 5.868581687612208, "step": 81720}, {"loss": 0.6132, "grad_norm": 0.9330390691757202, "learning_rate": 0.0002, "epoch": 5.869299820466786, "step": 81730}, {"loss": 0.5895, "grad_norm": 1.0584566593170166, "learning_rate": 0.0002, "epoch": 5.870017953321365, "step": 81740}, {"loss": 0.5618, "grad_norm": 0.781548023223877, "learning_rate": 0.0002, "epoch": 5.870736086175943, "step": 81750}, {"loss": 0.5651, "grad_norm": 0.8906106352806091, "learning_rate": 0.0002, "epoch": 5.871454219030521, "step": 81760}, {"loss": 0.6258, "grad_norm": 1.1402281522750854, "learning_rate": 0.0002, "epoch": 5.872172351885099, "step": 81770}, {"loss": 0.5943, "grad_norm": 0.9991076588630676, "learning_rate": 0.0002, "epoch": 5.872890484739677, "step": 81780}, {"loss": 0.6095, "grad_norm": 1.0120140314102173, "learning_rate": 0.0002, "epoch": 5.873608617594255, "step": 81790}, {"loss": 0.6114, "grad_norm": 0.8857715725898743, "learning_rate": 0.0002, "epoch": 5.874326750448833, "step": 81800}, {"loss": 0.6027, "grad_norm": 0.8531954288482666, "learning_rate": 0.0002, "epoch": 5.875044883303411, "step": 81810}, {"loss": 0.6468, "grad_norm": 1.1601015329360962, "learning_rate": 0.0002, "epoch": 5.87576301615799, "step": 81820}, {"loss": 0.643, "grad_norm": 1.1435350179672241, "learning_rate": 0.0002, "epoch": 5.876481149012568, "step": 81830}, {"loss": 0.6195, "grad_norm": 0.9526153802871704, "learning_rate": 0.0002, "epoch": 5.877199281867146, "step": 81840}, {"loss": 0.648, "grad_norm": 1.06845223903656, "learning_rate": 0.0002, "epoch": 5.877917414721724, "step": 81850}, {"loss": 0.5963, "grad_norm": 0.9239344596862793, "learning_rate": 0.0002, "epoch": 5.878635547576302, "step": 81860}, {"loss": 0.5669, "grad_norm": 0.8632398247718811, "learning_rate": 0.0002, "epoch": 5.87935368043088, "step": 81870}, {"loss": 0.5904, "grad_norm": 0.9148443341255188, "learning_rate": 0.0002, "epoch": 5.880071813285458, "step": 81880}, {"loss": 0.5554, "grad_norm": 0.9910652041435242, "learning_rate": 0.0002, "epoch": 5.880789946140036, "step": 81890}, {"loss": 0.6132, "grad_norm": 0.8335179090499878, "learning_rate": 0.0002, "epoch": 5.881508078994614, "step": 81900}, {"loss": 0.6106, "grad_norm": 0.9921387434005737, "learning_rate": 0.0002, "epoch": 5.882226211849192, "step": 81910}, {"loss": 0.6327, "grad_norm": 1.0532517433166504, "learning_rate": 0.0002, "epoch": 5.88294434470377, "step": 81920}, {"loss": 0.6071, "grad_norm": 1.026400089263916, "learning_rate": 0.0002, "epoch": 5.883662477558349, "step": 81930}, {"loss": 0.6759, "grad_norm": 1.019195318222046, "learning_rate": 0.0002, "epoch": 5.884380610412927, "step": 81940}, {"loss": 0.5922, "grad_norm": 0.987238347530365, "learning_rate": 0.0002, "epoch": 5.885098743267505, "step": 81950}, {"loss": 0.5864, "grad_norm": 1.1714487075805664, "learning_rate": 0.0002, "epoch": 5.885816876122083, "step": 81960}, {"loss": 0.6006, "grad_norm": 1.0854483842849731, "learning_rate": 0.0002, "epoch": 5.886535008976661, "step": 81970}, {"loss": 0.588, "grad_norm": 1.0678396224975586, "learning_rate": 0.0002, "epoch": 5.887253141831239, "step": 81980}, {"loss": 0.6061, "grad_norm": 1.1009471416473389, "learning_rate": 0.0002, "epoch": 5.887971274685817, "step": 81990}, {"loss": 0.6397, "grad_norm": 1.2056844234466553, "learning_rate": 0.0002, "epoch": 5.888689407540395, "step": 82000}, {"loss": 0.6018, "grad_norm": 1.131302833557129, "learning_rate": 0.0002, "epoch": 5.8894075403949735, "step": 82010}, {"loss": 0.5822, "grad_norm": 1.4466036558151245, "learning_rate": 0.0002, "epoch": 5.8901256732495515, "step": 82020}, {"loss": 0.6295, "grad_norm": 1.051228404045105, "learning_rate": 0.0002, "epoch": 5.8908438061041295, "step": 82030}, {"loss": 0.5567, "grad_norm": 1.0010617971420288, "learning_rate": 0.0002, "epoch": 5.8915619389587075, "step": 82040}, {"loss": 0.5674, "grad_norm": 0.9095138311386108, "learning_rate": 0.0002, "epoch": 5.8922800718132855, "step": 82050}, {"loss": 0.5947, "grad_norm": 1.0237005949020386, "learning_rate": 0.0002, "epoch": 5.8929982046678635, "step": 82060}, {"loss": 0.6258, "grad_norm": 1.035122036933899, "learning_rate": 0.0002, "epoch": 5.8937163375224415, "step": 82070}, {"loss": 0.5866, "grad_norm": 1.0271964073181152, "learning_rate": 0.0002, "epoch": 5.8944344703770195, "step": 82080}, {"loss": 0.637, "grad_norm": 1.2044503688812256, "learning_rate": 0.0002, "epoch": 5.8951526032315975, "step": 82090}, {"loss": 0.6356, "grad_norm": 1.0275284051895142, "learning_rate": 0.0002, "epoch": 5.8958707360861755, "step": 82100}, {"loss": 0.6216, "grad_norm": 0.9974840879440308, "learning_rate": 0.0002, "epoch": 5.896588868940754, "step": 82110}, {"loss": 0.572, "grad_norm": 1.009968638420105, "learning_rate": 0.0002, "epoch": 5.897307001795332, "step": 82120}, {"loss": 0.6432, "grad_norm": 0.8396142721176147, "learning_rate": 0.0002, "epoch": 5.89802513464991, "step": 82130}, {"loss": 0.5671, "grad_norm": 1.002354621887207, "learning_rate": 0.0002, "epoch": 5.898743267504488, "step": 82140}, {"loss": 0.565, "grad_norm": 0.9998893737792969, "learning_rate": 0.0002, "epoch": 5.899461400359066, "step": 82150}, {"loss": 0.5836, "grad_norm": 1.1027010679244995, "learning_rate": 0.0002, "epoch": 5.900179533213644, "step": 82160}, {"loss": 0.6069, "grad_norm": 1.2028530836105347, "learning_rate": 0.0002, "epoch": 5.900897666068222, "step": 82170}, {"loss": 0.6184, "grad_norm": 1.0018759965896606, "learning_rate": 0.0002, "epoch": 5.9016157989228, "step": 82180}, {"loss": 0.5866, "grad_norm": 0.8911277055740356, "learning_rate": 0.0002, "epoch": 5.902333931777379, "step": 82190}, {"loss": 0.5638, "grad_norm": 1.0172009468078613, "learning_rate": 0.0002, "epoch": 5.903052064631957, "step": 82200}, {"loss": 0.6181, "grad_norm": 1.1664029359817505, "learning_rate": 0.0002, "epoch": 5.903770197486535, "step": 82210}, {"loss": 0.5863, "grad_norm": 1.0620089769363403, "learning_rate": 0.0002, "epoch": 5.904488330341113, "step": 82220}, {"loss": 0.6175, "grad_norm": 1.0756114721298218, "learning_rate": 0.0002, "epoch": 5.905206463195691, "step": 82230}, {"loss": 0.6223, "grad_norm": 1.1727497577667236, "learning_rate": 0.0002, "epoch": 5.905924596050269, "step": 82240}, {"loss": 0.5777, "grad_norm": 0.9833515882492065, "learning_rate": 0.0002, "epoch": 5.906642728904847, "step": 82250}, {"loss": 0.6344, "grad_norm": 0.9236368536949158, "learning_rate": 0.0002, "epoch": 5.907360861759425, "step": 82260}, {"loss": 0.6301, "grad_norm": 0.9773947596549988, "learning_rate": 0.0002, "epoch": 5.908078994614003, "step": 82270}, {"loss": 0.6255, "grad_norm": 1.1427783966064453, "learning_rate": 0.0002, "epoch": 5.908797127468581, "step": 82280}, {"loss": 0.6359, "grad_norm": 1.0215164422988892, "learning_rate": 0.0002, "epoch": 5.909515260323159, "step": 82290}, {"loss": 0.631, "grad_norm": 1.1157845258712769, "learning_rate": 0.0002, "epoch": 5.910233393177738, "step": 82300}, {"loss": 0.5706, "grad_norm": 1.1490662097930908, "learning_rate": 0.0002, "epoch": 5.910951526032316, "step": 82310}, {"loss": 0.5932, "grad_norm": 0.7233976125717163, "learning_rate": 0.0002, "epoch": 5.911669658886894, "step": 82320}, {"loss": 0.6199, "grad_norm": 1.0053865909576416, "learning_rate": 0.0002, "epoch": 5.912387791741472, "step": 82330}, {"loss": 0.6283, "grad_norm": 0.9764766097068787, "learning_rate": 0.0002, "epoch": 5.91310592459605, "step": 82340}, {"loss": 0.5981, "grad_norm": 0.9492928385734558, "learning_rate": 0.0002, "epoch": 5.913824057450628, "step": 82350}, {"loss": 0.6234, "grad_norm": 0.9538891315460205, "learning_rate": 0.0002, "epoch": 5.914542190305206, "step": 82360}, {"loss": 0.6717, "grad_norm": 1.2620314359664917, "learning_rate": 0.0002, "epoch": 5.915260323159784, "step": 82370}, {"loss": 0.5956, "grad_norm": 0.9913349151611328, "learning_rate": 0.0002, "epoch": 5.915978456014363, "step": 82380}, {"loss": 0.5877, "grad_norm": 0.9712074995040894, "learning_rate": 0.0002, "epoch": 5.916696588868941, "step": 82390}, {"loss": 0.5935, "grad_norm": 1.1554654836654663, "learning_rate": 0.0002, "epoch": 5.917414721723519, "step": 82400}, {"loss": 0.5881, "grad_norm": 1.1418904066085815, "learning_rate": 0.0002, "epoch": 5.918132854578097, "step": 82410}, {"loss": 0.5472, "grad_norm": 0.9405845999717712, "learning_rate": 0.0002, "epoch": 5.918850987432675, "step": 82420}, {"loss": 0.606, "grad_norm": 1.0801819562911987, "learning_rate": 0.0002, "epoch": 5.919569120287253, "step": 82430}, {"loss": 0.5953, "grad_norm": 0.8643896579742432, "learning_rate": 0.0002, "epoch": 5.920287253141831, "step": 82440}, {"loss": 0.6042, "grad_norm": 1.106025218963623, "learning_rate": 0.0002, "epoch": 5.921005385996409, "step": 82450}, {"loss": 0.5879, "grad_norm": 1.0338234901428223, "learning_rate": 0.0002, "epoch": 5.921723518850987, "step": 82460}, {"loss": 0.6733, "grad_norm": 1.0648493766784668, "learning_rate": 0.0002, "epoch": 5.922441651705565, "step": 82470}, {"loss": 0.6233, "grad_norm": 1.1950433254241943, "learning_rate": 0.0002, "epoch": 5.923159784560143, "step": 82480}, {"loss": 0.6148, "grad_norm": 0.8730897903442383, "learning_rate": 0.0002, "epoch": 5.923877917414722, "step": 82490}, {"loss": 0.6138, "grad_norm": 1.2262312173843384, "learning_rate": 0.0002, "epoch": 5.9245960502693, "step": 82500}, {"loss": 0.616, "grad_norm": 0.9526116251945496, "learning_rate": 0.0002, "epoch": 5.925314183123878, "step": 82510}, {"loss": 0.6372, "grad_norm": 1.0540224313735962, "learning_rate": 0.0002, "epoch": 5.926032315978456, "step": 82520}, {"loss": 0.6102, "grad_norm": 1.0537306070327759, "learning_rate": 0.0002, "epoch": 5.926750448833034, "step": 82530}, {"loss": 0.5789, "grad_norm": 1.134207844734192, "learning_rate": 0.0002, "epoch": 5.927468581687612, "step": 82540}, {"loss": 0.622, "grad_norm": 0.9042250514030457, "learning_rate": 0.0002, "epoch": 5.92818671454219, "step": 82550}, {"loss": 0.6207, "grad_norm": 1.0424834489822388, "learning_rate": 0.0002, "epoch": 5.928904847396768, "step": 82560}, {"loss": 0.5334, "grad_norm": 1.1571602821350098, "learning_rate": 0.0002, "epoch": 5.929622980251347, "step": 82570}, {"loss": 0.6549, "grad_norm": 1.1033377647399902, "learning_rate": 0.0002, "epoch": 5.930341113105925, "step": 82580}, {"loss": 0.5819, "grad_norm": 0.9211772680282593, "learning_rate": 0.0002, "epoch": 5.931059245960503, "step": 82590}, {"loss": 0.591, "grad_norm": 1.0566459894180298, "learning_rate": 0.0002, "epoch": 5.931777378815081, "step": 82600}, {"loss": 0.6318, "grad_norm": 1.1773834228515625, "learning_rate": 0.0002, "epoch": 5.932495511669659, "step": 82610}, {"loss": 0.6067, "grad_norm": 1.193396806716919, "learning_rate": 0.0002, "epoch": 5.933213644524237, "step": 82620}, {"loss": 0.6105, "grad_norm": 1.1101785898208618, "learning_rate": 0.0002, "epoch": 5.933931777378815, "step": 82630}, {"loss": 0.5742, "grad_norm": 0.6988118886947632, "learning_rate": 0.0002, "epoch": 5.934649910233393, "step": 82640}, {"loss": 0.626, "grad_norm": 0.9590985774993896, "learning_rate": 0.0002, "epoch": 5.935368043087971, "step": 82650}, {"loss": 0.5909, "grad_norm": 0.8512062430381775, "learning_rate": 0.0002, "epoch": 5.936086175942549, "step": 82660}, {"loss": 0.539, "grad_norm": 1.0381710529327393, "learning_rate": 0.0002, "epoch": 5.936804308797128, "step": 82670}, {"loss": 0.5608, "grad_norm": 1.0816296339035034, "learning_rate": 0.0002, "epoch": 5.937522441651706, "step": 82680}, {"loss": 0.6087, "grad_norm": 1.0592364072799683, "learning_rate": 0.0002, "epoch": 5.938240574506284, "step": 82690}, {"loss": 0.5792, "grad_norm": 0.737452507019043, "learning_rate": 0.0002, "epoch": 5.938958707360862, "step": 82700}, {"loss": 0.6031, "grad_norm": 0.9019039869308472, "learning_rate": 0.0002, "epoch": 5.93967684021544, "step": 82710}, {"loss": 0.6153, "grad_norm": 1.0049666166305542, "learning_rate": 0.0002, "epoch": 5.940394973070018, "step": 82720}, {"loss": 0.619, "grad_norm": 1.0016309022903442, "learning_rate": 0.0002, "epoch": 5.941113105924596, "step": 82730}, {"loss": 0.5796, "grad_norm": 0.7967594861984253, "learning_rate": 0.0002, "epoch": 5.941831238779174, "step": 82740}, {"loss": 0.6418, "grad_norm": 0.8978520631790161, "learning_rate": 0.0002, "epoch": 5.942549371633753, "step": 82750}, {"loss": 0.6234, "grad_norm": 1.0101654529571533, "learning_rate": 0.0002, "epoch": 5.943267504488331, "step": 82760}, {"loss": 0.5813, "grad_norm": 1.1515586376190186, "learning_rate": 0.0002, "epoch": 5.943985637342909, "step": 82770}, {"loss": 0.6031, "grad_norm": 0.8666134476661682, "learning_rate": 0.0002, "epoch": 5.944703770197487, "step": 82780}, {"loss": 0.565, "grad_norm": 1.1365231275558472, "learning_rate": 0.0002, "epoch": 5.945421903052065, "step": 82790}, {"loss": 0.6122, "grad_norm": 1.211229920387268, "learning_rate": 0.0002, "epoch": 5.946140035906643, "step": 82800}, {"loss": 0.5815, "grad_norm": 0.9900869727134705, "learning_rate": 0.0002, "epoch": 5.946858168761221, "step": 82810}, {"loss": 0.5973, "grad_norm": 0.9555928111076355, "learning_rate": 0.0002, "epoch": 5.947576301615799, "step": 82820}, {"loss": 0.5667, "grad_norm": 0.8468470573425293, "learning_rate": 0.0002, "epoch": 5.948294434470377, "step": 82830}, {"loss": 0.5895, "grad_norm": 1.0280319452285767, "learning_rate": 0.0002, "epoch": 5.949012567324955, "step": 82840}, {"loss": 0.5663, "grad_norm": 0.930145800113678, "learning_rate": 0.0002, "epoch": 5.949730700179533, "step": 82850}, {"loss": 0.5482, "grad_norm": 1.0677028894424438, "learning_rate": 0.0002, "epoch": 5.950448833034112, "step": 82860}, {"loss": 0.6009, "grad_norm": 1.2035255432128906, "learning_rate": 0.0002, "epoch": 5.95116696588869, "step": 82870}, {"loss": 0.6207, "grad_norm": 0.897537887096405, "learning_rate": 0.0002, "epoch": 5.951885098743268, "step": 82880}, {"loss": 0.6383, "grad_norm": 1.2858690023422241, "learning_rate": 0.0002, "epoch": 5.952603231597846, "step": 82890}, {"loss": 0.6111, "grad_norm": 1.0300413370132446, "learning_rate": 0.0002, "epoch": 5.953321364452424, "step": 82900}, {"loss": 0.6469, "grad_norm": 0.9873301982879639, "learning_rate": 0.0002, "epoch": 5.954039497307002, "step": 82910}, {"loss": 0.6173, "grad_norm": 1.0315600633621216, "learning_rate": 0.0002, "epoch": 5.95475763016158, "step": 82920}, {"loss": 0.5566, "grad_norm": 1.0631790161132812, "learning_rate": 0.0002, "epoch": 5.955475763016158, "step": 82930}, {"loss": 0.6067, "grad_norm": 1.035544514656067, "learning_rate": 0.0002, "epoch": 5.9561938958707366, "step": 82940}, {"loss": 0.6311, "grad_norm": 1.0162041187286377, "learning_rate": 0.0002, "epoch": 5.956912028725315, "step": 82950}, {"loss": 0.6005, "grad_norm": 0.7858892679214478, "learning_rate": 0.0002, "epoch": 5.957630161579893, "step": 82960}, {"loss": 0.5961, "grad_norm": 1.0359784364700317, "learning_rate": 0.0002, "epoch": 5.958348294434471, "step": 82970}, {"loss": 0.5704, "grad_norm": 1.057173252105713, "learning_rate": 0.0002, "epoch": 5.959066427289049, "step": 82980}, {"loss": 0.6127, "grad_norm": 1.1017464399337769, "learning_rate": 0.0002, "epoch": 5.959784560143627, "step": 82990}, {"loss": 0.5455, "grad_norm": 1.0688945055007935, "learning_rate": 0.0002, "epoch": 5.960502692998205, "step": 83000}, {"loss": 0.5429, "grad_norm": 1.048864483833313, "learning_rate": 0.0002, "epoch": 5.961220825852783, "step": 83010}, {"loss": 0.5559, "grad_norm": 1.057308316230774, "learning_rate": 0.0002, "epoch": 5.961938958707361, "step": 83020}, {"loss": 0.5703, "grad_norm": 0.9014604687690735, "learning_rate": 0.0002, "epoch": 5.962657091561939, "step": 83030}, {"loss": 0.6029, "grad_norm": 0.9899709224700928, "learning_rate": 0.0002, "epoch": 5.963375224416517, "step": 83040}, {"loss": 0.6403, "grad_norm": 1.0675519704818726, "learning_rate": 0.0002, "epoch": 5.9640933572710955, "step": 83050}, {"loss": 0.6016, "grad_norm": 0.9497889876365662, "learning_rate": 0.0002, "epoch": 5.9648114901256735, "step": 83060}, {"loss": 0.5997, "grad_norm": 0.9149549603462219, "learning_rate": 0.0002, "epoch": 5.9655296229802515, "step": 83070}, {"loss": 0.6105, "grad_norm": 1.329373836517334, "learning_rate": 0.0002, "epoch": 5.9662477558348295, "step": 83080}, {"loss": 0.6077, "grad_norm": 1.0731712579727173, "learning_rate": 0.0002, "epoch": 5.9669658886894075, "step": 83090}, {"loss": 0.6269, "grad_norm": 0.9498835802078247, "learning_rate": 0.0002, "epoch": 5.9676840215439855, "step": 83100}, {"loss": 0.6196, "grad_norm": 1.1222829818725586, "learning_rate": 0.0002, "epoch": 5.9684021543985635, "step": 83110}, {"loss": 0.5784, "grad_norm": 0.9923429489135742, "learning_rate": 0.0002, "epoch": 5.9691202872531415, "step": 83120}, {"loss": 0.6223, "grad_norm": 0.9046645164489746, "learning_rate": 0.0002, "epoch": 5.96983842010772, "step": 83130}, {"loss": 0.6252, "grad_norm": 0.9259500503540039, "learning_rate": 0.0002, "epoch": 5.970556552962298, "step": 83140}, {"loss": 0.5849, "grad_norm": 1.0604174137115479, "learning_rate": 0.0002, "epoch": 5.971274685816876, "step": 83150}, {"loss": 0.5789, "grad_norm": 1.0391676425933838, "learning_rate": 0.0002, "epoch": 5.971992818671454, "step": 83160}, {"loss": 0.5861, "grad_norm": 0.8825796246528625, "learning_rate": 0.0002, "epoch": 5.972710951526032, "step": 83170}, {"loss": 0.6164, "grad_norm": 0.9687952399253845, "learning_rate": 0.0002, "epoch": 5.97342908438061, "step": 83180}, {"loss": 0.6127, "grad_norm": 0.9401392340660095, "learning_rate": 0.0002, "epoch": 5.974147217235188, "step": 83190}, {"loss": 0.572, "grad_norm": 1.0526834726333618, "learning_rate": 0.0002, "epoch": 5.974865350089766, "step": 83200}, {"loss": 0.6047, "grad_norm": 1.1882060766220093, "learning_rate": 0.0002, "epoch": 5.975583482944344, "step": 83210}, {"loss": 0.5731, "grad_norm": 0.9182824492454529, "learning_rate": 0.0002, "epoch": 5.976301615798922, "step": 83220}, {"loss": 0.6092, "grad_norm": 1.344875454902649, "learning_rate": 0.0002, "epoch": 5.977019748653501, "step": 83230}, {"loss": 0.6198, "grad_norm": 1.3868434429168701, "learning_rate": 0.0002, "epoch": 5.977737881508079, "step": 83240}, {"loss": 0.6187, "grad_norm": 1.2702280282974243, "learning_rate": 0.0002, "epoch": 5.978456014362657, "step": 83250}, {"loss": 0.6271, "grad_norm": 0.9808234572410583, "learning_rate": 0.0002, "epoch": 5.979174147217235, "step": 83260}, {"loss": 0.6027, "grad_norm": 0.9225142598152161, "learning_rate": 0.0002, "epoch": 5.979892280071813, "step": 83270}, {"loss": 0.626, "grad_norm": 1.1095874309539795, "learning_rate": 0.0002, "epoch": 5.980610412926391, "step": 83280}, {"loss": 0.5994, "grad_norm": 1.2650344371795654, "learning_rate": 0.0002, "epoch": 5.981328545780969, "step": 83290}, {"loss": 0.5808, "grad_norm": 0.8230084180831909, "learning_rate": 0.0002, "epoch": 5.982046678635547, "step": 83300}, {"loss": 0.6399, "grad_norm": 1.171427607536316, "learning_rate": 0.0002, "epoch": 5.982764811490125, "step": 83310}, {"loss": 0.6033, "grad_norm": 0.7458868026733398, "learning_rate": 0.0002, "epoch": 5.983482944344704, "step": 83320}, {"loss": 0.6235, "grad_norm": 0.9238616228103638, "learning_rate": 0.0002, "epoch": 5.984201077199282, "step": 83330}, {"loss": 0.6316, "grad_norm": 1.027495265007019, "learning_rate": 0.0002, "epoch": 5.98491921005386, "step": 83340}, {"loss": 0.6202, "grad_norm": 1.0694037675857544, "learning_rate": 0.0002, "epoch": 5.985637342908438, "step": 83350}, {"loss": 0.5883, "grad_norm": 0.9498767256736755, "learning_rate": 0.0002, "epoch": 5.986355475763016, "step": 83360}, {"loss": 0.6022, "grad_norm": 1.0524284839630127, "learning_rate": 0.0002, "epoch": 5.987073608617594, "step": 83370}, {"loss": 0.5695, "grad_norm": 1.07961905002594, "learning_rate": 0.0002, "epoch": 5.987791741472172, "step": 83380}, {"loss": 0.5835, "grad_norm": 1.1436965465545654, "learning_rate": 0.0002, "epoch": 5.98850987432675, "step": 83390}, {"loss": 0.5835, "grad_norm": 1.2610782384872437, "learning_rate": 0.0002, "epoch": 5.989228007181328, "step": 83400}, {"loss": 0.6018, "grad_norm": 1.1105682849884033, "learning_rate": 0.0002, "epoch": 5.989946140035906, "step": 83410}, {"loss": 0.5989, "grad_norm": 0.9900349378585815, "learning_rate": 0.0002, "epoch": 5.990664272890485, "step": 83420}, {"loss": 0.6492, "grad_norm": 0.8766723275184631, "learning_rate": 0.0002, "epoch": 5.991382405745063, "step": 83430}, {"loss": 0.5944, "grad_norm": 0.9532597661018372, "learning_rate": 0.0002, "epoch": 5.992100538599641, "step": 83440}, {"loss": 0.5903, "grad_norm": 1.016831398010254, "learning_rate": 0.0002, "epoch": 5.992818671454219, "step": 83450}, {"loss": 0.6159, "grad_norm": 0.9884716272354126, "learning_rate": 0.0002, "epoch": 5.993536804308797, "step": 83460}, {"loss": 0.5559, "grad_norm": 0.9415417909622192, "learning_rate": 0.0002, "epoch": 5.994254937163375, "step": 83470}, {"loss": 0.5644, "grad_norm": 0.8629752397537231, "learning_rate": 0.0002, "epoch": 5.994973070017953, "step": 83480}, {"loss": 0.5961, "grad_norm": 1.061378002166748, "learning_rate": 0.0002, "epoch": 5.995691202872531, "step": 83490}, {"loss": 0.6117, "grad_norm": 0.907195508480072, "learning_rate": 0.0002, "epoch": 5.99640933572711, "step": 83500}, {"loss": 0.6584, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 5.997127468581688, "step": 83510}, {"loss": 0.6009, "grad_norm": 0.9893278479576111, "learning_rate": 0.0002, "epoch": 5.997845601436266, "step": 83520}, {"loss": 0.609, "grad_norm": 1.1909127235412598, "learning_rate": 0.0002, "epoch": 5.998563734290844, "step": 83530}, {"loss": 0.5507, "grad_norm": 1.1800892353057861, "learning_rate": 0.0002, "epoch": 5.999281867145422, "step": 83540}, {"loss": 0.605, "grad_norm": 1.0822563171386719, "learning_rate": 0.0002, "epoch": 6.0, "step": 83550}]} +{"epoch": 7.0, "step": 97475, "epoch_duration": 14966.17798256874, "total_accumulated_duration": 108765.94532608986, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}, {"eval_loss": 1.1079821586608887, "eval_runtime": 55.1897, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 3.0, "step": 41775}, {"loss": 0.6472, "grad_norm": 0.8630077838897705, "learning_rate": 0.0002, "epoch": 3.000359066427289, "step": 41780}, {"loss": 0.5843, "grad_norm": 0.8901806473731995, "learning_rate": 0.0002, "epoch": 3.001077199281867, "step": 41790}, {"loss": 0.5789, "grad_norm": 0.8291767835617065, "learning_rate": 0.0002, "epoch": 3.0017953321364454, "step": 41800}, {"loss": 0.6049, "grad_norm": 0.792519211769104, "learning_rate": 0.0002, "epoch": 3.0025134649910235, "step": 41810}, {"loss": 0.6131, "grad_norm": 1.1330063343048096, "learning_rate": 0.0002, "epoch": 3.0032315978456015, "step": 41820}, {"loss": 0.6225, "grad_norm": 0.9401350617408752, "learning_rate": 0.0002, "epoch": 3.0039497307001795, "step": 41830}, {"loss": 0.5924, "grad_norm": 0.8065463304519653, "learning_rate": 0.0002, "epoch": 3.0046678635547575, "step": 41840}, {"loss": 0.6161, "grad_norm": 0.8309979438781738, "learning_rate": 0.0002, "epoch": 3.005385996409336, "step": 41850}, {"loss": 0.6099, "grad_norm": 0.7432689070701599, "learning_rate": 0.0002, "epoch": 3.006104129263914, "step": 41860}, {"loss": 0.5901, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 3.006822262118492, "step": 41870}, {"loss": 0.6211, "grad_norm": 1.4364255666732788, "learning_rate": 0.0002, "epoch": 3.00754039497307, "step": 41880}, {"loss": 0.5988, "grad_norm": 0.9023072123527527, "learning_rate": 0.0002, "epoch": 3.008258527827648, "step": 41890}, {"loss": 0.6296, "grad_norm": 0.7790587544441223, "learning_rate": 0.0002, "epoch": 3.0089766606822264, "step": 41900}, {"loss": 0.5908, "grad_norm": 0.9163706302642822, "learning_rate": 0.0002, "epoch": 3.0096947935368044, "step": 41910}, {"loss": 0.6216, "grad_norm": 0.8147963285446167, "learning_rate": 0.0002, "epoch": 3.0104129263913824, "step": 41920}, {"loss": 0.6546, "grad_norm": 0.8432748913764954, "learning_rate": 0.0002, "epoch": 3.0111310592459604, "step": 41930}, {"loss": 0.5815, "grad_norm": 0.9216182231903076, "learning_rate": 0.0002, "epoch": 3.011849192100539, "step": 41940}, {"loss": 0.6336, "grad_norm": 0.62154221534729, "learning_rate": 0.0002, "epoch": 3.012567324955117, "step": 41950}, {"loss": 0.5868, "grad_norm": 0.8902392387390137, "learning_rate": 0.0002, "epoch": 3.013285457809695, "step": 41960}, {"loss": 0.6205, "grad_norm": 0.9601083993911743, "learning_rate": 0.0002, "epoch": 3.014003590664273, "step": 41970}, {"loss": 0.6001, "grad_norm": 0.8938809037208557, "learning_rate": 0.0002, "epoch": 3.014721723518851, "step": 41980}, {"loss": 0.6215, "grad_norm": 1.0621999502182007, "learning_rate": 0.0002, "epoch": 3.0154398563734293, "step": 41990}, {"loss": 0.6453, "grad_norm": 0.7310585379600525, "learning_rate": 0.0002, "epoch": 3.0161579892280073, "step": 42000}, {"loss": 0.5674, "grad_norm": 0.8475853800773621, "learning_rate": 0.0002, "epoch": 3.0168761220825853, "step": 42010}, {"loss": 0.605, "grad_norm": 0.8509864807128906, "learning_rate": 0.0002, "epoch": 3.0175942549371633, "step": 42020}, {"loss": 0.6487, "grad_norm": 0.7461876273155212, "learning_rate": 0.0002, "epoch": 3.0183123877917413, "step": 42030}, {"loss": 0.6136, "grad_norm": 0.7734265327453613, "learning_rate": 0.0002, "epoch": 3.0190305206463197, "step": 42040}, {"loss": 0.6073, "grad_norm": 0.9056455492973328, "learning_rate": 0.0002, "epoch": 3.0197486535008977, "step": 42050}, {"loss": 0.6015, "grad_norm": 0.9183889031410217, "learning_rate": 0.0002, "epoch": 3.0204667863554757, "step": 42060}, {"loss": 0.6502, "grad_norm": 1.0777326822280884, "learning_rate": 0.0002, "epoch": 3.0211849192100537, "step": 42070}, {"loss": 0.6775, "grad_norm": 0.9217308163642883, "learning_rate": 0.0002, "epoch": 3.021903052064632, "step": 42080}, {"loss": 0.6157, "grad_norm": 0.8220202326774597, "learning_rate": 0.0002, "epoch": 3.02262118491921, "step": 42090}, {"loss": 0.5786, "grad_norm": 0.8454978466033936, "learning_rate": 0.0002, "epoch": 3.023339317773788, "step": 42100}, {"loss": 0.5653, "grad_norm": 0.8116370439529419, "learning_rate": 0.0002, "epoch": 3.024057450628366, "step": 42110}, {"loss": 0.6307, "grad_norm": 0.8064935207366943, "learning_rate": 0.0002, "epoch": 3.024775583482944, "step": 42120}, {"loss": 0.6567, "grad_norm": 0.9718650579452515, "learning_rate": 0.0002, "epoch": 3.0254937163375226, "step": 42130}, {"loss": 0.5936, "grad_norm": 0.8817588090896606, "learning_rate": 0.0002, "epoch": 3.0262118491921006, "step": 42140}, {"loss": 0.5625, "grad_norm": 0.7757318615913391, "learning_rate": 0.0002, "epoch": 3.0269299820466786, "step": 42150}, {"loss": 0.5704, "grad_norm": 0.7500545382499695, "learning_rate": 0.0002, "epoch": 3.0276481149012566, "step": 42160}, {"loss": 0.5635, "grad_norm": 0.72913658618927, "learning_rate": 0.0002, "epoch": 3.0283662477558346, "step": 42170}, {"loss": 0.6354, "grad_norm": 0.7641891837120056, "learning_rate": 0.0002, "epoch": 3.029084380610413, "step": 42180}, {"loss": 0.621, "grad_norm": 0.7682021856307983, "learning_rate": 0.0002, "epoch": 3.029802513464991, "step": 42190}, {"loss": 0.6377, "grad_norm": 0.8145958781242371, "learning_rate": 0.0002, "epoch": 3.030520646319569, "step": 42200}, {"loss": 0.6008, "grad_norm": 1.0546396970748901, "learning_rate": 0.0002, "epoch": 3.031238779174147, "step": 42210}, {"loss": 0.6177, "grad_norm": 0.8222804665565491, "learning_rate": 0.0002, "epoch": 3.0319569120287255, "step": 42220}, {"loss": 0.6264, "grad_norm": 0.8245829343795776, "learning_rate": 0.0002, "epoch": 3.0326750448833035, "step": 42230}, {"loss": 0.5828, "grad_norm": 0.9059963822364807, "learning_rate": 0.0002, "epoch": 3.0333931777378815, "step": 42240}, {"loss": 0.6373, "grad_norm": 1.026747465133667, "learning_rate": 0.0002, "epoch": 3.0341113105924595, "step": 42250}, {"loss": 0.636, "grad_norm": 0.9108404517173767, "learning_rate": 0.0002, "epoch": 3.0348294434470375, "step": 42260}, {"loss": 0.589, "grad_norm": 0.9828516840934753, "learning_rate": 0.0002, "epoch": 3.035547576301616, "step": 42270}, {"loss": 0.6558, "grad_norm": 0.9664266705513, "learning_rate": 0.0002, "epoch": 3.036265709156194, "step": 42280}, {"loss": 0.6157, "grad_norm": 0.7577654719352722, "learning_rate": 0.0002, "epoch": 3.036983842010772, "step": 42290}, {"loss": 0.5849, "grad_norm": 0.8331853151321411, "learning_rate": 0.0002, "epoch": 3.03770197486535, "step": 42300}, {"loss": 0.6335, "grad_norm": 0.8017228245735168, "learning_rate": 0.0002, "epoch": 3.038420107719928, "step": 42310}, {"loss": 0.6148, "grad_norm": 1.0316718816757202, "learning_rate": 0.0002, "epoch": 3.0391382405745064, "step": 42320}, {"loss": 0.5934, "grad_norm": 0.9379803538322449, "learning_rate": 0.0002, "epoch": 3.0398563734290844, "step": 42330}, {"loss": 0.6358, "grad_norm": 0.7554476857185364, "learning_rate": 0.0002, "epoch": 3.0405745062836624, "step": 42340}, {"loss": 0.5951, "grad_norm": 0.7377917766571045, "learning_rate": 0.0002, "epoch": 3.0412926391382404, "step": 42350}, {"loss": 0.5769, "grad_norm": 1.0655276775360107, "learning_rate": 0.0002, "epoch": 3.042010771992819, "step": 42360}, {"loss": 0.5892, "grad_norm": 0.7748511433601379, "learning_rate": 0.0002, "epoch": 3.042728904847397, "step": 42370}, {"loss": 0.6512, "grad_norm": 0.848649799823761, "learning_rate": 0.0002, "epoch": 3.043447037701975, "step": 42380}, {"loss": 0.6411, "grad_norm": 0.7754636406898499, "learning_rate": 0.0002, "epoch": 3.044165170556553, "step": 42390}, {"loss": 0.6665, "grad_norm": 0.8173656463623047, "learning_rate": 0.0002, "epoch": 3.044883303411131, "step": 42400}, {"loss": 0.5877, "grad_norm": 0.7881983518600464, "learning_rate": 0.0002, "epoch": 3.0456014362657093, "step": 42410}, {"loss": 0.5832, "grad_norm": 0.971072256565094, "learning_rate": 0.0002, "epoch": 3.0463195691202873, "step": 42420}, {"loss": 0.6303, "grad_norm": 0.8400143384933472, "learning_rate": 0.0002, "epoch": 3.0470377019748653, "step": 42430}, {"loss": 0.6557, "grad_norm": 1.0028647184371948, "learning_rate": 0.0002, "epoch": 3.0477558348294433, "step": 42440}, {"loss": 0.5949, "grad_norm": 0.9728034734725952, "learning_rate": 0.0002, "epoch": 3.0484739676840213, "step": 42450}, {"loss": 0.6222, "grad_norm": 0.937633752822876, "learning_rate": 0.0002, "epoch": 3.0491921005386, "step": 42460}, {"loss": 0.6254, "grad_norm": 1.0265642404556274, "learning_rate": 0.0002, "epoch": 3.049910233393178, "step": 42470}, {"loss": 0.6078, "grad_norm": 0.9733216762542725, "learning_rate": 0.0002, "epoch": 3.050628366247756, "step": 42480}, {"loss": 0.5766, "grad_norm": 0.7039174437522888, "learning_rate": 0.0002, "epoch": 3.051346499102334, "step": 42490}, {"loss": 0.6422, "grad_norm": 0.7515231370925903, "learning_rate": 0.0002, "epoch": 3.0520646319569122, "step": 42500}, {"loss": 0.5517, "grad_norm": 0.9115300178527832, "learning_rate": 0.0002, "epoch": 3.0527827648114902, "step": 42510}, {"loss": 0.6738, "grad_norm": 0.7403655648231506, "learning_rate": 0.0002, "epoch": 3.0535008976660682, "step": 42520}, {"loss": 0.5528, "grad_norm": 0.7826810479164124, "learning_rate": 0.0002, "epoch": 3.0542190305206462, "step": 42530}, {"loss": 0.6513, "grad_norm": 0.8007349371910095, "learning_rate": 0.0002, "epoch": 3.0549371633752243, "step": 42540}, {"loss": 0.6118, "grad_norm": 0.7975959777832031, "learning_rate": 0.0002, "epoch": 3.0556552962298027, "step": 42550}, {"loss": 0.6157, "grad_norm": 0.9665228128433228, "learning_rate": 0.0002, "epoch": 3.0563734290843807, "step": 42560}, {"loss": 0.6095, "grad_norm": 0.8386123180389404, "learning_rate": 0.0002, "epoch": 3.0570915619389587, "step": 42570}, {"loss": 0.64, "grad_norm": 0.7437782287597656, "learning_rate": 0.0002, "epoch": 3.0578096947935367, "step": 42580}, {"loss": 0.6399, "grad_norm": 0.8360698223114014, "learning_rate": 0.0002, "epoch": 3.0585278276481147, "step": 42590}, {"loss": 0.6259, "grad_norm": 0.8982073664665222, "learning_rate": 0.0002, "epoch": 3.059245960502693, "step": 42600}, {"loss": 0.6235, "grad_norm": 0.9425758719444275, "learning_rate": 0.0002, "epoch": 3.059964093357271, "step": 42610}, {"loss": 0.631, "grad_norm": 0.8567131161689758, "learning_rate": 0.0002, "epoch": 3.060682226211849, "step": 42620}, {"loss": 0.609, "grad_norm": 0.9322942495346069, "learning_rate": 0.0002, "epoch": 3.061400359066427, "step": 42630}, {"loss": 0.6384, "grad_norm": 0.8283235430717468, "learning_rate": 0.0002, "epoch": 3.0621184919210056, "step": 42640}, {"loss": 0.6345, "grad_norm": 0.8457967638969421, "learning_rate": 0.0002, "epoch": 3.0628366247755836, "step": 42650}, {"loss": 0.631, "grad_norm": 0.8205100893974304, "learning_rate": 0.0002, "epoch": 3.0635547576301616, "step": 42660}, {"loss": 0.6094, "grad_norm": 0.8385181427001953, "learning_rate": 0.0002, "epoch": 3.0642728904847396, "step": 42670}, {"loss": 0.6169, "grad_norm": 1.2959390878677368, "learning_rate": 0.0002, "epoch": 3.0649910233393176, "step": 42680}, {"loss": 0.6531, "grad_norm": 0.7150540351867676, "learning_rate": 0.0002, "epoch": 3.065709156193896, "step": 42690}, {"loss": 0.6456, "grad_norm": 0.6647360920906067, "learning_rate": 0.0002, "epoch": 3.066427289048474, "step": 42700}, {"loss": 0.6151, "grad_norm": 0.9148316979408264, "learning_rate": 0.0002, "epoch": 3.067145421903052, "step": 42710}, {"loss": 0.6298, "grad_norm": 0.8606209754943848, "learning_rate": 0.0002, "epoch": 3.06786355475763, "step": 42720}, {"loss": 0.636, "grad_norm": 1.4255632162094116, "learning_rate": 0.0002, "epoch": 3.068581687612208, "step": 42730}, {"loss": 0.6363, "grad_norm": 0.9131710529327393, "learning_rate": 0.0002, "epoch": 3.0692998204667865, "step": 42740}, {"loss": 0.6432, "grad_norm": 0.9560360908508301, "learning_rate": 0.0002, "epoch": 3.0700179533213645, "step": 42750}, {"loss": 0.6259, "grad_norm": 0.9278100728988647, "learning_rate": 0.0002, "epoch": 3.0707360861759425, "step": 42760}, {"loss": 0.6001, "grad_norm": 0.7258471846580505, "learning_rate": 0.0002, "epoch": 3.0714542190305205, "step": 42770}, {"loss": 0.6447, "grad_norm": 1.1537690162658691, "learning_rate": 0.0002, "epoch": 3.072172351885099, "step": 42780}, {"loss": 0.6237, "grad_norm": 0.8562588691711426, "learning_rate": 0.0002, "epoch": 3.072890484739677, "step": 42790}, {"loss": 0.645, "grad_norm": 1.0271626710891724, "learning_rate": 0.0002, "epoch": 3.073608617594255, "step": 42800}, {"loss": 0.6782, "grad_norm": 0.85148024559021, "learning_rate": 0.0002, "epoch": 3.074326750448833, "step": 42810}, {"loss": 0.5905, "grad_norm": 0.805772602558136, "learning_rate": 0.0002, "epoch": 3.075044883303411, "step": 42820}, {"loss": 0.623, "grad_norm": 0.8057122230529785, "learning_rate": 0.0002, "epoch": 3.0757630161579894, "step": 42830}, {"loss": 0.6391, "grad_norm": 0.7997274994850159, "learning_rate": 0.0002, "epoch": 3.0764811490125674, "step": 42840}, {"loss": 0.5965, "grad_norm": 0.8739321231842041, "learning_rate": 0.0002, "epoch": 3.0771992818671454, "step": 42850}, {"loss": 0.6027, "grad_norm": 0.833951473236084, "learning_rate": 0.0002, "epoch": 3.0779174147217234, "step": 42860}, {"loss": 0.6251, "grad_norm": 0.8813839554786682, "learning_rate": 0.0002, "epoch": 3.0786355475763014, "step": 42870}, {"loss": 0.6485, "grad_norm": 0.9020521640777588, "learning_rate": 0.0002, "epoch": 3.07935368043088, "step": 42880}, {"loss": 0.5719, "grad_norm": 0.888148844242096, "learning_rate": 0.0002, "epoch": 3.080071813285458, "step": 42890}, {"loss": 0.6715, "grad_norm": 0.8110589385032654, "learning_rate": 0.0002, "epoch": 3.080789946140036, "step": 42900}, {"loss": 0.5931, "grad_norm": 0.818738579750061, "learning_rate": 0.0002, "epoch": 3.081508078994614, "step": 42910}, {"loss": 0.6723, "grad_norm": 0.9607479572296143, "learning_rate": 0.0002, "epoch": 3.082226211849192, "step": 42920}, {"loss": 0.6045, "grad_norm": 0.8162698745727539, "learning_rate": 0.0002, "epoch": 3.0829443447037703, "step": 42930}, {"loss": 0.5975, "grad_norm": 0.8170801997184753, "learning_rate": 0.0002, "epoch": 3.0836624775583483, "step": 42940}, {"loss": 0.5748, "grad_norm": 0.9250763654708862, "learning_rate": 0.0002, "epoch": 3.0843806104129263, "step": 42950}, {"loss": 0.6651, "grad_norm": 0.898097813129425, "learning_rate": 0.0002, "epoch": 3.0850987432675043, "step": 42960}, {"loss": 0.6573, "grad_norm": 0.9398433566093445, "learning_rate": 0.0002, "epoch": 3.0858168761220828, "step": 42970}, {"loss": 0.6243, "grad_norm": 1.052808165550232, "learning_rate": 0.0002, "epoch": 3.0865350089766608, "step": 42980}, {"loss": 0.6622, "grad_norm": 0.8974723219871521, "learning_rate": 0.0002, "epoch": 3.087253141831239, "step": 42990}, {"loss": 0.6135, "grad_norm": 0.7517408728599548, "learning_rate": 0.0002, "epoch": 3.087971274685817, "step": 43000}, {"loss": 0.6185, "grad_norm": 0.8054485321044922, "learning_rate": 0.0002, "epoch": 3.088689407540395, "step": 43010}, {"loss": 0.6199, "grad_norm": 0.9896154999732971, "learning_rate": 0.0002, "epoch": 3.0894075403949732, "step": 43020}, {"loss": 0.6308, "grad_norm": 0.7887356281280518, "learning_rate": 0.0002, "epoch": 3.0901256732495512, "step": 43030}, {"loss": 0.6173, "grad_norm": 1.0119125843048096, "learning_rate": 0.0002, "epoch": 3.0908438061041292, "step": 43040}, {"loss": 0.6294, "grad_norm": 0.8753892779350281, "learning_rate": 0.0002, "epoch": 3.0915619389587072, "step": 43050}, {"loss": 0.6068, "grad_norm": 0.8322654962539673, "learning_rate": 0.0002, "epoch": 3.0922800718132857, "step": 43060}, {"loss": 0.6237, "grad_norm": 1.0605992078781128, "learning_rate": 0.0002, "epoch": 3.0929982046678637, "step": 43070}, {"loss": 0.6507, "grad_norm": 0.8783912062644958, "learning_rate": 0.0002, "epoch": 3.0937163375224417, "step": 43080}, {"loss": 0.6023, "grad_norm": 0.8839107751846313, "learning_rate": 0.0002, "epoch": 3.0944344703770197, "step": 43090}, {"loss": 0.6588, "grad_norm": 1.1655086278915405, "learning_rate": 0.0002, "epoch": 3.0951526032315977, "step": 43100}, {"loss": 0.6367, "grad_norm": 0.7051523327827454, "learning_rate": 0.0002, "epoch": 3.095870736086176, "step": 43110}, {"loss": 0.5941, "grad_norm": 0.7793807983398438, "learning_rate": 0.0002, "epoch": 3.096588868940754, "step": 43120}, {"loss": 0.6073, "grad_norm": 0.8352194428443909, "learning_rate": 0.0002, "epoch": 3.097307001795332, "step": 43130}, {"loss": 0.6087, "grad_norm": 0.9684847593307495, "learning_rate": 0.0002, "epoch": 3.09802513464991, "step": 43140}, {"loss": 0.6347, "grad_norm": 1.1106340885162354, "learning_rate": 0.0002, "epoch": 3.098743267504488, "step": 43150}, {"loss": 0.6395, "grad_norm": 0.7814911603927612, "learning_rate": 0.0002, "epoch": 3.0994614003590666, "step": 43160}, {"loss": 0.637, "grad_norm": 0.7923110723495483, "learning_rate": 0.0002, "epoch": 3.1001795332136446, "step": 43170}, {"loss": 0.6218, "grad_norm": 0.87022864818573, "learning_rate": 0.0002, "epoch": 3.1008976660682226, "step": 43180}, {"loss": 0.6246, "grad_norm": 0.9352855682373047, "learning_rate": 0.0002, "epoch": 3.1016157989228006, "step": 43190}, {"loss": 0.5943, "grad_norm": 0.8548445105552673, "learning_rate": 0.0002, "epoch": 3.1023339317773786, "step": 43200}, {"loss": 0.6106, "grad_norm": 0.9576025009155273, "learning_rate": 0.0002, "epoch": 3.103052064631957, "step": 43210}, {"loss": 0.6222, "grad_norm": 0.7430430054664612, "learning_rate": 0.0002, "epoch": 3.103770197486535, "step": 43220}, {"loss": 0.6223, "grad_norm": 0.9619144797325134, "learning_rate": 0.0002, "epoch": 3.104488330341113, "step": 43230}, {"loss": 0.6171, "grad_norm": 0.8622338771820068, "learning_rate": 0.0002, "epoch": 3.105206463195691, "step": 43240}, {"loss": 0.6336, "grad_norm": 0.853489339351654, "learning_rate": 0.0002, "epoch": 3.1059245960502695, "step": 43250}, {"loss": 0.635, "grad_norm": 0.9253206849098206, "learning_rate": 0.0002, "epoch": 3.1066427289048475, "step": 43260}, {"loss": 0.68, "grad_norm": 0.9700671434402466, "learning_rate": 0.0002, "epoch": 3.1073608617594255, "step": 43270}, {"loss": 0.6284, "grad_norm": 1.0550731420516968, "learning_rate": 0.0002, "epoch": 3.1080789946140035, "step": 43280}, {"loss": 0.6389, "grad_norm": 0.939452052116394, "learning_rate": 0.0002, "epoch": 3.1087971274685815, "step": 43290}, {"loss": 0.621, "grad_norm": 0.8855276107788086, "learning_rate": 0.0002, "epoch": 3.10951526032316, "step": 43300}, {"loss": 0.5814, "grad_norm": 0.92197185754776, "learning_rate": 0.0002, "epoch": 3.110233393177738, "step": 43310}, {"loss": 0.6341, "grad_norm": 0.8825578689575195, "learning_rate": 0.0002, "epoch": 3.110951526032316, "step": 43320}, {"loss": 0.6412, "grad_norm": 0.9964608550071716, "learning_rate": 0.0002, "epoch": 3.111669658886894, "step": 43330}, {"loss": 0.6074, "grad_norm": 0.9070520401000977, "learning_rate": 0.0002, "epoch": 3.1123877917414724, "step": 43340}, {"loss": 0.6503, "grad_norm": 0.9699633717536926, "learning_rate": 0.0002, "epoch": 3.1131059245960504, "step": 43350}, {"loss": 0.6545, "grad_norm": 0.7384091019630432, "learning_rate": 0.0002, "epoch": 3.1138240574506284, "step": 43360}, {"loss": 0.6644, "grad_norm": 0.9445326328277588, "learning_rate": 0.0002, "epoch": 3.1145421903052064, "step": 43370}, {"loss": 0.6088, "grad_norm": 0.8906524181365967, "learning_rate": 0.0002, "epoch": 3.1152603231597844, "step": 43380}, {"loss": 0.6213, "grad_norm": 0.8850129246711731, "learning_rate": 0.0002, "epoch": 3.115978456014363, "step": 43390}, {"loss": 0.6156, "grad_norm": 0.7091860771179199, "learning_rate": 0.0002, "epoch": 3.116696588868941, "step": 43400}, {"loss": 0.6056, "grad_norm": 0.8992764949798584, "learning_rate": 0.0002, "epoch": 3.117414721723519, "step": 43410}, {"loss": 0.6336, "grad_norm": 0.9166698455810547, "learning_rate": 0.0002, "epoch": 3.118132854578097, "step": 43420}, {"loss": 0.7011, "grad_norm": 1.1195749044418335, "learning_rate": 0.0002, "epoch": 3.118850987432675, "step": 43430}, {"loss": 0.6409, "grad_norm": 0.9414069652557373, "learning_rate": 0.0002, "epoch": 3.1195691202872533, "step": 43440}, {"loss": 0.6533, "grad_norm": 0.7641217112541199, "learning_rate": 0.0002, "epoch": 3.1202872531418313, "step": 43450}, {"loss": 0.6613, "grad_norm": 1.2659285068511963, "learning_rate": 0.0002, "epoch": 3.1210053859964093, "step": 43460}, {"loss": 0.631, "grad_norm": 0.9968213438987732, "learning_rate": 0.0002, "epoch": 3.1217235188509873, "step": 43470}, {"loss": 0.5833, "grad_norm": 0.8819042444229126, "learning_rate": 0.0002, "epoch": 3.1224416517055653, "step": 43480}, {"loss": 0.6819, "grad_norm": 0.9124775528907776, "learning_rate": 0.0002, "epoch": 3.1231597845601438, "step": 43490}, {"loss": 0.675, "grad_norm": 0.868354082107544, "learning_rate": 0.0002, "epoch": 3.1238779174147218, "step": 43500}, {"loss": 0.6348, "grad_norm": 0.7367526292800903, "learning_rate": 0.0002, "epoch": 3.1245960502692998, "step": 43510}, {"loss": 0.6068, "grad_norm": 0.7553679943084717, "learning_rate": 0.0002, "epoch": 3.1253141831238778, "step": 43520}, {"loss": 0.6346, "grad_norm": 0.7970008850097656, "learning_rate": 0.0002, "epoch": 3.126032315978456, "step": 43530}, {"loss": 0.6357, "grad_norm": 0.9117488861083984, "learning_rate": 0.0002, "epoch": 3.126750448833034, "step": 43540}, {"loss": 0.6609, "grad_norm": 0.8004103899002075, "learning_rate": 0.0002, "epoch": 3.127468581687612, "step": 43550}, {"loss": 0.596, "grad_norm": 0.736518919467926, "learning_rate": 0.0002, "epoch": 3.12818671454219, "step": 43560}, {"loss": 0.5945, "grad_norm": 0.8568395376205444, "learning_rate": 0.0002, "epoch": 3.128904847396768, "step": 43570}, {"loss": 0.665, "grad_norm": 0.9344052672386169, "learning_rate": 0.0002, "epoch": 3.1296229802513467, "step": 43580}, {"loss": 0.6403, "grad_norm": 0.7986525297164917, "learning_rate": 0.0002, "epoch": 3.1303411131059247, "step": 43590}, {"loss": 0.61, "grad_norm": 0.8283242583274841, "learning_rate": 0.0002, "epoch": 3.1310592459605027, "step": 43600}, {"loss": 0.6003, "grad_norm": 0.6534292101860046, "learning_rate": 0.0002, "epoch": 3.1317773788150807, "step": 43610}, {"loss": 0.6994, "grad_norm": 0.9585428833961487, "learning_rate": 0.0002, "epoch": 3.132495511669659, "step": 43620}, {"loss": 0.6007, "grad_norm": 0.8299157023429871, "learning_rate": 0.0002, "epoch": 3.133213644524237, "step": 43630}, {"loss": 0.6169, "grad_norm": 0.9050052762031555, "learning_rate": 0.0002, "epoch": 3.133931777378815, "step": 43640}, {"loss": 0.6217, "grad_norm": 1.0457062721252441, "learning_rate": 0.0002, "epoch": 3.134649910233393, "step": 43650}, {"loss": 0.6147, "grad_norm": 0.907691240310669, "learning_rate": 0.0002, "epoch": 3.135368043087971, "step": 43660}, {"loss": 0.5808, "grad_norm": 0.8868935108184814, "learning_rate": 0.0002, "epoch": 3.1360861759425496, "step": 43670}, {"loss": 0.6427, "grad_norm": 0.8585456609725952, "learning_rate": 0.0002, "epoch": 3.1368043087971276, "step": 43680}, {"loss": 0.6242, "grad_norm": 1.0402741432189941, "learning_rate": 0.0002, "epoch": 3.1375224416517056, "step": 43690}, {"loss": 0.641, "grad_norm": 1.0866798162460327, "learning_rate": 0.0002, "epoch": 3.1382405745062836, "step": 43700}, {"loss": 0.6082, "grad_norm": 0.7637296915054321, "learning_rate": 0.0002, "epoch": 3.1389587073608616, "step": 43710}, {"loss": 0.6256, "grad_norm": 0.755235493183136, "learning_rate": 0.0002, "epoch": 3.13967684021544, "step": 43720}, {"loss": 0.6441, "grad_norm": 0.7258853316307068, "learning_rate": 0.0002, "epoch": 3.140394973070018, "step": 43730}, {"loss": 0.5891, "grad_norm": 1.0425268411636353, "learning_rate": 0.0002, "epoch": 3.141113105924596, "step": 43740}, {"loss": 0.6527, "grad_norm": 0.9171959757804871, "learning_rate": 0.0002, "epoch": 3.141831238779174, "step": 43750}, {"loss": 0.6365, "grad_norm": 0.8900150656700134, "learning_rate": 0.0002, "epoch": 3.142549371633752, "step": 43760}, {"loss": 0.6324, "grad_norm": 0.9879246354103088, "learning_rate": 0.0002, "epoch": 3.1432675044883305, "step": 43770}, {"loss": 0.6624, "grad_norm": 0.7853389382362366, "learning_rate": 0.0002, "epoch": 3.1439856373429085, "step": 43780}, {"loss": 0.6259, "grad_norm": 1.0245232582092285, "learning_rate": 0.0002, "epoch": 3.1447037701974865, "step": 43790}, {"loss": 0.6278, "grad_norm": 0.8486390113830566, "learning_rate": 0.0002, "epoch": 3.1454219030520645, "step": 43800}, {"loss": 0.6175, "grad_norm": 0.8536406755447388, "learning_rate": 0.0002, "epoch": 3.146140035906643, "step": 43810}, {"loss": 0.5901, "grad_norm": 0.9653734564781189, "learning_rate": 0.0002, "epoch": 3.146858168761221, "step": 43820}, {"loss": 0.6041, "grad_norm": 0.8292608857154846, "learning_rate": 0.0002, "epoch": 3.147576301615799, "step": 43830}, {"loss": 0.6688, "grad_norm": 1.147524118423462, "learning_rate": 0.0002, "epoch": 3.148294434470377, "step": 43840}, {"loss": 0.6155, "grad_norm": 0.9317546486854553, "learning_rate": 0.0002, "epoch": 3.149012567324955, "step": 43850}, {"loss": 0.6305, "grad_norm": 0.8651045560836792, "learning_rate": 0.0002, "epoch": 3.1497307001795334, "step": 43860}, {"loss": 0.5985, "grad_norm": 0.8718969225883484, "learning_rate": 0.0002, "epoch": 3.1504488330341114, "step": 43870}, {"loss": 0.6206, "grad_norm": 1.0140702724456787, "learning_rate": 0.0002, "epoch": 3.1511669658886894, "step": 43880}, {"loss": 0.5941, "grad_norm": 0.75941401720047, "learning_rate": 0.0002, "epoch": 3.1518850987432674, "step": 43890}, {"loss": 0.5957, "grad_norm": 0.6618940234184265, "learning_rate": 0.0002, "epoch": 3.152603231597846, "step": 43900}, {"loss": 0.6262, "grad_norm": 1.0013338327407837, "learning_rate": 0.0002, "epoch": 3.153321364452424, "step": 43910}, {"loss": 0.6263, "grad_norm": 0.8735299706459045, "learning_rate": 0.0002, "epoch": 3.154039497307002, "step": 43920}, {"loss": 0.627, "grad_norm": 1.141914963722229, "learning_rate": 0.0002, "epoch": 3.15475763016158, "step": 43930}, {"loss": 0.6604, "grad_norm": 1.0916038751602173, "learning_rate": 0.0002, "epoch": 3.155475763016158, "step": 43940}, {"loss": 0.6228, "grad_norm": 0.7042547464370728, "learning_rate": 0.0002, "epoch": 3.1561938958707363, "step": 43950}, {"loss": 0.6069, "grad_norm": 0.9885236620903015, "learning_rate": 0.0002, "epoch": 3.1569120287253143, "step": 43960}, {"loss": 0.5973, "grad_norm": 0.8083009719848633, "learning_rate": 0.0002, "epoch": 3.1576301615798923, "step": 43970}, {"loss": 0.6416, "grad_norm": 1.082627296447754, "learning_rate": 0.0002, "epoch": 3.1583482944344703, "step": 43980}, {"loss": 0.624, "grad_norm": 0.9293290376663208, "learning_rate": 0.0002, "epoch": 3.1590664272890483, "step": 43990}, {"loss": 0.5665, "grad_norm": 0.861003041267395, "learning_rate": 0.0002, "epoch": 3.1597845601436267, "step": 44000}, {"loss": 0.6221, "grad_norm": 0.9565994143486023, "learning_rate": 0.0002, "epoch": 3.1605026929982047, "step": 44010}, {"loss": 0.7038, "grad_norm": 0.9609305262565613, "learning_rate": 0.0002, "epoch": 3.1612208258527827, "step": 44020}, {"loss": 0.6064, "grad_norm": 0.847830593585968, "learning_rate": 0.0002, "epoch": 3.1619389587073607, "step": 44030}, {"loss": 0.6299, "grad_norm": 0.852357804775238, "learning_rate": 0.0002, "epoch": 3.1626570915619387, "step": 44040}, {"loss": 0.5943, "grad_norm": 0.8634562492370605, "learning_rate": 0.0002, "epoch": 3.163375224416517, "step": 44050}, {"loss": 0.6011, "grad_norm": 1.0259950160980225, "learning_rate": 0.0002, "epoch": 3.164093357271095, "step": 44060}, {"loss": 0.7039, "grad_norm": 0.9615250825881958, "learning_rate": 0.0002, "epoch": 3.164811490125673, "step": 44070}, {"loss": 0.6179, "grad_norm": 0.9892165660858154, "learning_rate": 0.0002, "epoch": 3.165529622980251, "step": 44080}, {"loss": 0.6295, "grad_norm": 0.8827354907989502, "learning_rate": 0.0002, "epoch": 3.1662477558348296, "step": 44090}, {"loss": 0.6131, "grad_norm": 0.9258168339729309, "learning_rate": 0.0002, "epoch": 3.1669658886894076, "step": 44100}, {"loss": 0.5746, "grad_norm": 0.7983399033546448, "learning_rate": 0.0002, "epoch": 3.1676840215439857, "step": 44110}, {"loss": 0.6075, "grad_norm": 0.9917809963226318, "learning_rate": 0.0002, "epoch": 3.1684021543985637, "step": 44120}, {"loss": 0.6474, "grad_norm": 1.058927297592163, "learning_rate": 0.0002, "epoch": 3.1691202872531417, "step": 44130}, {"loss": 0.6211, "grad_norm": 1.0095895528793335, "learning_rate": 0.0002, "epoch": 3.16983842010772, "step": 44140}, {"loss": 0.6586, "grad_norm": 0.9032495617866516, "learning_rate": 0.0002, "epoch": 3.170556552962298, "step": 44150}, {"loss": 0.6356, "grad_norm": 0.9391272664070129, "learning_rate": 0.0002, "epoch": 3.171274685816876, "step": 44160}, {"loss": 0.6324, "grad_norm": 0.990755558013916, "learning_rate": 0.0002, "epoch": 3.171992818671454, "step": 44170}, {"loss": 0.5647, "grad_norm": 0.9310759902000427, "learning_rate": 0.0002, "epoch": 3.172710951526032, "step": 44180}, {"loss": 0.6802, "grad_norm": 0.7698856592178345, "learning_rate": 0.0002, "epoch": 3.1734290843806106, "step": 44190}, {"loss": 0.6109, "grad_norm": 0.7735867500305176, "learning_rate": 0.0002, "epoch": 3.1741472172351886, "step": 44200}, {"loss": 0.6252, "grad_norm": 1.1447525024414062, "learning_rate": 0.0002, "epoch": 3.1748653500897666, "step": 44210}, {"loss": 0.6268, "grad_norm": 0.8667060136795044, "learning_rate": 0.0002, "epoch": 3.1755834829443446, "step": 44220}, {"loss": 0.6066, "grad_norm": 0.8596829771995544, "learning_rate": 0.0002, "epoch": 3.176301615798923, "step": 44230}, {"loss": 0.6142, "grad_norm": 0.8607654571533203, "learning_rate": 0.0002, "epoch": 3.177019748653501, "step": 44240}, {"loss": 0.6358, "grad_norm": 0.9346948266029358, "learning_rate": 0.0002, "epoch": 3.177737881508079, "step": 44250}, {"loss": 0.6099, "grad_norm": 0.852344810962677, "learning_rate": 0.0002, "epoch": 3.178456014362657, "step": 44260}, {"loss": 0.5759, "grad_norm": 0.9260450005531311, "learning_rate": 0.0002, "epoch": 3.179174147217235, "step": 44270}, {"loss": 0.6419, "grad_norm": 0.924053430557251, "learning_rate": 0.0002, "epoch": 3.1798922800718135, "step": 44280}, {"loss": 0.6456, "grad_norm": 1.001965045928955, "learning_rate": 0.0002, "epoch": 3.1806104129263915, "step": 44290}, {"loss": 0.6211, "grad_norm": 0.943215012550354, "learning_rate": 0.0002, "epoch": 3.1813285457809695, "step": 44300}, {"loss": 0.6261, "grad_norm": 1.006977915763855, "learning_rate": 0.0002, "epoch": 3.1820466786355475, "step": 44310}, {"loss": 0.6684, "grad_norm": 0.9768950343132019, "learning_rate": 0.0002, "epoch": 3.1827648114901255, "step": 44320}, {"loss": 0.6334, "grad_norm": 0.9297489523887634, "learning_rate": 0.0002, "epoch": 3.183482944344704, "step": 44330}, {"loss": 0.6291, "grad_norm": 0.9110919237136841, "learning_rate": 0.0002, "epoch": 3.184201077199282, "step": 44340}, {"loss": 0.6389, "grad_norm": 0.9821381568908691, "learning_rate": 0.0002, "epoch": 3.18491921005386, "step": 44350}, {"loss": 0.6342, "grad_norm": 0.8451243042945862, "learning_rate": 0.0002, "epoch": 3.185637342908438, "step": 44360}, {"loss": 0.6709, "grad_norm": 0.9676638245582581, "learning_rate": 0.0002, "epoch": 3.1863554757630164, "step": 44370}, {"loss": 0.6506, "grad_norm": 0.9826035499572754, "learning_rate": 0.0002, "epoch": 3.1870736086175944, "step": 44380}, {"loss": 0.6425, "grad_norm": 0.9453121423721313, "learning_rate": 0.0002, "epoch": 3.1877917414721724, "step": 44390}, {"loss": 0.6481, "grad_norm": 0.7766330242156982, "learning_rate": 0.0002, "epoch": 3.1885098743267504, "step": 44400}, {"loss": 0.6369, "grad_norm": 0.9302349090576172, "learning_rate": 0.0002, "epoch": 3.1892280071813284, "step": 44410}, {"loss": 0.5586, "grad_norm": 0.8335331082344055, "learning_rate": 0.0002, "epoch": 3.189946140035907, "step": 44420}, {"loss": 0.673, "grad_norm": 0.6722736358642578, "learning_rate": 0.0002, "epoch": 3.190664272890485, "step": 44430}, {"loss": 0.6809, "grad_norm": 0.9047536849975586, "learning_rate": 0.0002, "epoch": 3.191382405745063, "step": 44440}, {"loss": 0.6085, "grad_norm": 0.9653822183609009, "learning_rate": 0.0002, "epoch": 3.192100538599641, "step": 44450}, {"loss": 0.6071, "grad_norm": 0.7750703692436218, "learning_rate": 0.0002, "epoch": 3.192818671454219, "step": 44460}, {"loss": 0.6323, "grad_norm": 0.7767539024353027, "learning_rate": 0.0002, "epoch": 3.1935368043087973, "step": 44470}, {"loss": 0.6471, "grad_norm": 0.8597778081893921, "learning_rate": 0.0002, "epoch": 3.1942549371633753, "step": 44480}, {"loss": 0.6804, "grad_norm": 1.1711493730545044, "learning_rate": 0.0002, "epoch": 3.1949730700179533, "step": 44490}, {"loss": 0.5917, "grad_norm": 0.9025220274925232, "learning_rate": 0.0002, "epoch": 3.1956912028725313, "step": 44500}, {"loss": 0.6445, "grad_norm": 0.8084979057312012, "learning_rate": 0.0002, "epoch": 3.1964093357271093, "step": 44510}, {"loss": 0.5943, "grad_norm": 0.8475074172019958, "learning_rate": 0.0002, "epoch": 3.1971274685816877, "step": 44520}, {"loss": 0.5959, "grad_norm": 0.9915644526481628, "learning_rate": 0.0002, "epoch": 3.1978456014362657, "step": 44530}, {"loss": 0.627, "grad_norm": 0.992231547832489, "learning_rate": 0.0002, "epoch": 3.1985637342908437, "step": 44540}, {"loss": 0.625, "grad_norm": 0.9804556369781494, "learning_rate": 0.0002, "epoch": 3.1992818671454217, "step": 44550}, {"loss": 0.6534, "grad_norm": 1.045558214187622, "learning_rate": 0.0002, "epoch": 3.2, "step": 44560}, {"loss": 0.6201, "grad_norm": 1.0880261659622192, "learning_rate": 0.0002, "epoch": 3.200718132854578, "step": 44570}, {"loss": 0.6471, "grad_norm": 0.9511138200759888, "learning_rate": 0.0002, "epoch": 3.201436265709156, "step": 44580}, {"loss": 0.5961, "grad_norm": 0.9115344882011414, "learning_rate": 0.0002, "epoch": 3.202154398563734, "step": 44590}, {"loss": 0.6504, "grad_norm": 1.0738362073898315, "learning_rate": 0.0002, "epoch": 3.202872531418312, "step": 44600}, {"loss": 0.6324, "grad_norm": 0.8209697604179382, "learning_rate": 0.0002, "epoch": 3.2035906642728906, "step": 44610}, {"loss": 0.6445, "grad_norm": 0.9220197796821594, "learning_rate": 0.0002, "epoch": 3.2043087971274686, "step": 44620}, {"loss": 0.5798, "grad_norm": 0.8859700560569763, "learning_rate": 0.0002, "epoch": 3.2050269299820466, "step": 44630}, {"loss": 0.6185, "grad_norm": 0.9772757291793823, "learning_rate": 0.0002, "epoch": 3.2057450628366246, "step": 44640}, {"loss": 0.6528, "grad_norm": 0.9385574460029602, "learning_rate": 0.0002, "epoch": 3.206463195691203, "step": 44650}, {"loss": 0.6098, "grad_norm": 0.839958906173706, "learning_rate": 0.0002, "epoch": 3.207181328545781, "step": 44660}, {"loss": 0.6803, "grad_norm": 0.860478401184082, "learning_rate": 0.0002, "epoch": 3.207899461400359, "step": 44670}, {"loss": 0.683, "grad_norm": 0.846886396408081, "learning_rate": 0.0002, "epoch": 3.208617594254937, "step": 44680}, {"loss": 0.6312, "grad_norm": 0.8591006398200989, "learning_rate": 0.0002, "epoch": 3.209335727109515, "step": 44690}, {"loss": 0.6173, "grad_norm": 0.9236023426055908, "learning_rate": 0.0002, "epoch": 3.2100538599640935, "step": 44700}, {"loss": 0.6471, "grad_norm": 0.7348999977111816, "learning_rate": 0.0002, "epoch": 3.2107719928186715, "step": 44710}, {"loss": 0.6239, "grad_norm": 1.0041730403900146, "learning_rate": 0.0002, "epoch": 3.2114901256732495, "step": 44720}, {"loss": 0.6612, "grad_norm": 0.8382687568664551, "learning_rate": 0.0002, "epoch": 3.2122082585278275, "step": 44730}, {"loss": 0.6026, "grad_norm": 0.8253511190414429, "learning_rate": 0.0002, "epoch": 3.2129263913824055, "step": 44740}, {"loss": 0.6129, "grad_norm": 0.9589242935180664, "learning_rate": 0.0002, "epoch": 3.213644524236984, "step": 44750}, {"loss": 0.6476, "grad_norm": 0.8938157558441162, "learning_rate": 0.0002, "epoch": 3.214362657091562, "step": 44760}, {"loss": 0.6811, "grad_norm": 1.0085135698318481, "learning_rate": 0.0002, "epoch": 3.21508078994614, "step": 44770}, {"loss": 0.646, "grad_norm": 0.8647134304046631, "learning_rate": 0.0002, "epoch": 3.215798922800718, "step": 44780}, {"loss": 0.6169, "grad_norm": 1.09453284740448, "learning_rate": 0.0002, "epoch": 3.216517055655296, "step": 44790}, {"loss": 0.6156, "grad_norm": 0.8710666298866272, "learning_rate": 0.0002, "epoch": 3.2172351885098744, "step": 44800}, {"loss": 0.662, "grad_norm": 0.8080880641937256, "learning_rate": 0.0002, "epoch": 3.2179533213644524, "step": 44810}, {"loss": 0.6039, "grad_norm": 1.0440675020217896, "learning_rate": 0.0002, "epoch": 3.2186714542190304, "step": 44820}, {"loss": 0.6629, "grad_norm": 1.1036376953125, "learning_rate": 0.0002, "epoch": 3.2193895870736084, "step": 44830}, {"loss": 0.6474, "grad_norm": 0.8783546686172485, "learning_rate": 0.0002, "epoch": 3.220107719928187, "step": 44840}, {"loss": 0.6286, "grad_norm": 0.7816855907440186, "learning_rate": 0.0002, "epoch": 3.220825852782765, "step": 44850}, {"loss": 0.622, "grad_norm": 1.0099157094955444, "learning_rate": 0.0002, "epoch": 3.221543985637343, "step": 44860}, {"loss": 0.6668, "grad_norm": 1.054928183555603, "learning_rate": 0.0002, "epoch": 3.222262118491921, "step": 44870}, {"loss": 0.6104, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 3.222980251346499, "step": 44880}, {"loss": 0.686, "grad_norm": 0.9730798602104187, "learning_rate": 0.0002, "epoch": 3.2236983842010773, "step": 44890}, {"loss": 0.6533, "grad_norm": 0.7911382913589478, "learning_rate": 0.0002, "epoch": 3.2244165170556554, "step": 44900}, {"loss": 0.6466, "grad_norm": 0.9574400782585144, "learning_rate": 0.0002, "epoch": 3.2251346499102334, "step": 44910}, {"loss": 0.693, "grad_norm": 0.8101068139076233, "learning_rate": 0.0002, "epoch": 3.2258527827648114, "step": 44920}, {"loss": 0.6605, "grad_norm": 0.754146933555603, "learning_rate": 0.0002, "epoch": 3.22657091561939, "step": 44930}, {"loss": 0.6317, "grad_norm": 0.7471939921379089, "learning_rate": 0.0002, "epoch": 3.227289048473968, "step": 44940}, {"loss": 0.6378, "grad_norm": 1.0040855407714844, "learning_rate": 0.0002, "epoch": 3.228007181328546, "step": 44950}, {"loss": 0.6496, "grad_norm": 1.0016074180603027, "learning_rate": 0.0002, "epoch": 3.228725314183124, "step": 44960}, {"loss": 0.6, "grad_norm": 1.0432976484298706, "learning_rate": 0.0002, "epoch": 3.229443447037702, "step": 44970}, {"loss": 0.635, "grad_norm": 0.8517055511474609, "learning_rate": 0.0002, "epoch": 3.2301615798922803, "step": 44980}, {"loss": 0.6168, "grad_norm": 0.9174178242683411, "learning_rate": 0.0002, "epoch": 3.2308797127468583, "step": 44990}, {"loss": 0.6325, "grad_norm": 0.9733774065971375, "learning_rate": 0.0002, "epoch": 3.2315978456014363, "step": 45000}, {"loss": 0.6743, "grad_norm": 0.9074714779853821, "learning_rate": 0.0002, "epoch": 3.2323159784560143, "step": 45010}, {"loss": 0.6372, "grad_norm": 0.8802759051322937, "learning_rate": 0.0002, "epoch": 3.2330341113105923, "step": 45020}, {"loss": 0.6189, "grad_norm": 1.0620871782302856, "learning_rate": 0.0002, "epoch": 3.2337522441651707, "step": 45030}, {"loss": 0.6201, "grad_norm": 0.8069542050361633, "learning_rate": 0.0002, "epoch": 3.2344703770197487, "step": 45040}, {"loss": 0.618, "grad_norm": 0.9139137864112854, "learning_rate": 0.0002, "epoch": 3.2351885098743267, "step": 45050}, {"loss": 0.6389, "grad_norm": 0.8936411142349243, "learning_rate": 0.0002, "epoch": 3.2359066427289047, "step": 45060}, {"loss": 0.6602, "grad_norm": 0.9098079204559326, "learning_rate": 0.0002, "epoch": 3.2366247755834827, "step": 45070}, {"loss": 0.6423, "grad_norm": 1.062953233718872, "learning_rate": 0.0002, "epoch": 3.237342908438061, "step": 45080}, {"loss": 0.6527, "grad_norm": 0.8656470775604248, "learning_rate": 0.0002, "epoch": 3.238061041292639, "step": 45090}, {"loss": 0.6362, "grad_norm": 0.9299449920654297, "learning_rate": 0.0002, "epoch": 3.238779174147217, "step": 45100}, {"loss": 0.6469, "grad_norm": 1.0102022886276245, "learning_rate": 0.0002, "epoch": 3.239497307001795, "step": 45110}, {"loss": 0.5984, "grad_norm": 0.8074561953544617, "learning_rate": 0.0002, "epoch": 3.2402154398563736, "step": 45120}, {"loss": 0.6196, "grad_norm": 1.044105887413025, "learning_rate": 0.0002, "epoch": 3.2409335727109516, "step": 45130}, {"loss": 0.6471, "grad_norm": 0.8742762207984924, "learning_rate": 0.0002, "epoch": 3.2416517055655296, "step": 45140}, {"loss": 0.648, "grad_norm": 0.8240015506744385, "learning_rate": 0.0002, "epoch": 3.2423698384201076, "step": 45150}, {"loss": 0.6599, "grad_norm": 0.8438951373100281, "learning_rate": 0.0002, "epoch": 3.2430879712746856, "step": 45160}, {"loss": 0.6406, "grad_norm": 1.02358877658844, "learning_rate": 0.0002, "epoch": 3.243806104129264, "step": 45170}, {"loss": 0.6581, "grad_norm": 0.8824774026870728, "learning_rate": 0.0002, "epoch": 3.244524236983842, "step": 45180}, {"loss": 0.658, "grad_norm": 0.971015989780426, "learning_rate": 0.0002, "epoch": 3.24524236983842, "step": 45190}, {"loss": 0.6473, "grad_norm": 0.9282383918762207, "learning_rate": 0.0002, "epoch": 3.245960502692998, "step": 45200}, {"loss": 0.6376, "grad_norm": 0.7908362746238708, "learning_rate": 0.0002, "epoch": 3.2466786355475765, "step": 45210}, {"loss": 0.6765, "grad_norm": 1.0721662044525146, "learning_rate": 0.0002, "epoch": 3.2473967684021545, "step": 45220}, {"loss": 0.7102, "grad_norm": 0.9516810774803162, "learning_rate": 0.0002, "epoch": 3.2481149012567325, "step": 45230}, {"loss": 0.6332, "grad_norm": 0.7914131283760071, "learning_rate": 0.0002, "epoch": 3.2488330341113105, "step": 45240}, {"loss": 0.6018, "grad_norm": 0.8492292761802673, "learning_rate": 0.0002, "epoch": 3.2495511669658885, "step": 45250}, {"loss": 0.6272, "grad_norm": 0.8880114555358887, "learning_rate": 0.0002, "epoch": 3.250269299820467, "step": 45260}, {"loss": 0.6394, "grad_norm": 0.7808310985565186, "learning_rate": 0.0002, "epoch": 3.250987432675045, "step": 45270}, {"loss": 0.6161, "grad_norm": 0.8566828966140747, "learning_rate": 0.0002, "epoch": 3.251705565529623, "step": 45280}, {"loss": 0.6408, "grad_norm": 0.7929658889770508, "learning_rate": 0.0002, "epoch": 3.252423698384201, "step": 45290}, {"loss": 0.6182, "grad_norm": 0.678207516670227, "learning_rate": 0.0002, "epoch": 3.253141831238779, "step": 45300}, {"loss": 0.6315, "grad_norm": 0.9963029623031616, "learning_rate": 0.0002, "epoch": 3.2538599640933574, "step": 45310}, {"loss": 0.6496, "grad_norm": 0.835304856300354, "learning_rate": 0.0002, "epoch": 3.2545780969479354, "step": 45320}, {"loss": 0.6099, "grad_norm": 0.7281617522239685, "learning_rate": 0.0002, "epoch": 3.2552962298025134, "step": 45330}, {"loss": 0.6224, "grad_norm": 1.244890570640564, "learning_rate": 0.0002, "epoch": 3.2560143626570914, "step": 45340}, {"loss": 0.6317, "grad_norm": 0.8372750282287598, "learning_rate": 0.0002, "epoch": 3.2567324955116694, "step": 45350}, {"loss": 0.604, "grad_norm": 1.0029667615890503, "learning_rate": 0.0002, "epoch": 3.257450628366248, "step": 45360}, {"loss": 0.596, "grad_norm": 0.8561908602714539, "learning_rate": 0.0002, "epoch": 3.258168761220826, "step": 45370}, {"loss": 0.6185, "grad_norm": 1.0058085918426514, "learning_rate": 0.0002, "epoch": 3.258886894075404, "step": 45380}, {"loss": 0.6415, "grad_norm": 0.7768221497535706, "learning_rate": 0.0002, "epoch": 3.259605026929982, "step": 45390}, {"loss": 0.635, "grad_norm": 0.8443793058395386, "learning_rate": 0.0002, "epoch": 3.2603231597845603, "step": 45400}, {"loss": 0.6579, "grad_norm": 1.0140392780303955, "learning_rate": 0.0002, "epoch": 3.2610412926391383, "step": 45410}, {"loss": 0.6434, "grad_norm": 0.8397058248519897, "learning_rate": 0.0002, "epoch": 3.2617594254937163, "step": 45420}, {"loss": 0.6361, "grad_norm": 0.9717063903808594, "learning_rate": 0.0002, "epoch": 3.2624775583482943, "step": 45430}, {"loss": 0.6837, "grad_norm": 1.0279473066329956, "learning_rate": 0.0002, "epoch": 3.2631956912028723, "step": 45440}, {"loss": 0.6274, "grad_norm": 1.207457184791565, "learning_rate": 0.0002, "epoch": 3.263913824057451, "step": 45450}, {"loss": 0.681, "grad_norm": 0.8121998906135559, "learning_rate": 0.0002, "epoch": 3.264631956912029, "step": 45460}, {"loss": 0.6202, "grad_norm": 1.037733554840088, "learning_rate": 0.0002, "epoch": 3.265350089766607, "step": 45470}, {"loss": 0.6146, "grad_norm": 0.9305754899978638, "learning_rate": 0.0002, "epoch": 3.266068222621185, "step": 45480}, {"loss": 0.6186, "grad_norm": 0.9733602404594421, "learning_rate": 0.0002, "epoch": 3.2667863554757632, "step": 45490}, {"loss": 0.6713, "grad_norm": 0.8345039486885071, "learning_rate": 0.0002, "epoch": 3.2675044883303412, "step": 45500}, {"loss": 0.6315, "grad_norm": 0.8601692318916321, "learning_rate": 0.0002, "epoch": 3.2682226211849192, "step": 45510}, {"loss": 0.5953, "grad_norm": 0.7921277284622192, "learning_rate": 0.0002, "epoch": 3.2689407540394972, "step": 45520}, {"loss": 0.6781, "grad_norm": 0.8324153423309326, "learning_rate": 0.0002, "epoch": 3.2696588868940752, "step": 45530}, {"loss": 0.6413, "grad_norm": 0.85141521692276, "learning_rate": 0.0002, "epoch": 3.2703770197486537, "step": 45540}, {"loss": 0.654, "grad_norm": 0.9399608373641968, "learning_rate": 0.0002, "epoch": 3.2710951526032317, "step": 45550}, {"loss": 0.6364, "grad_norm": 0.9829166531562805, "learning_rate": 0.0002, "epoch": 3.2718132854578097, "step": 45560}, {"loss": 0.627, "grad_norm": 0.9936266541481018, "learning_rate": 0.0002, "epoch": 3.2725314183123877, "step": 45570}, {"loss": 0.6465, "grad_norm": 1.036165714263916, "learning_rate": 0.0002, "epoch": 3.2732495511669657, "step": 45580}, {"loss": 0.6216, "grad_norm": 0.8988680243492126, "learning_rate": 0.0002, "epoch": 3.273967684021544, "step": 45590}, {"loss": 0.6368, "grad_norm": 0.9173405766487122, "learning_rate": 0.0002, "epoch": 3.274685816876122, "step": 45600}, {"loss": 0.6455, "grad_norm": 0.9967324733734131, "learning_rate": 0.0002, "epoch": 3.2754039497307, "step": 45610}, {"loss": 0.6236, "grad_norm": 0.9097777009010315, "learning_rate": 0.0002, "epoch": 3.276122082585278, "step": 45620}, {"loss": 0.632, "grad_norm": 1.0559430122375488, "learning_rate": 0.0002, "epoch": 3.276840215439856, "step": 45630}, {"loss": 0.5999, "grad_norm": 0.9583360552787781, "learning_rate": 0.0002, "epoch": 3.2775583482944346, "step": 45640}, {"loss": 0.6329, "grad_norm": 0.7630334496498108, "learning_rate": 0.0002, "epoch": 3.2782764811490126, "step": 45650}, {"loss": 0.6873, "grad_norm": 0.9955230355262756, "learning_rate": 0.0002, "epoch": 3.2789946140035906, "step": 45660}, {"loss": 0.6216, "grad_norm": 0.8685793876647949, "learning_rate": 0.0002, "epoch": 3.2797127468581686, "step": 45670}, {"loss": 0.6243, "grad_norm": 0.919913113117218, "learning_rate": 0.0002, "epoch": 3.280430879712747, "step": 45680}, {"loss": 0.6334, "grad_norm": 0.826144814491272, "learning_rate": 0.0002, "epoch": 3.281149012567325, "step": 45690}, {"loss": 0.6359, "grad_norm": 0.9750179052352905, "learning_rate": 0.0002, "epoch": 3.281867145421903, "step": 45700}, {"loss": 0.6589, "grad_norm": 0.7931897640228271, "learning_rate": 0.0002, "epoch": 3.282585278276481, "step": 45710}, {"loss": 0.6785, "grad_norm": 1.0380089282989502, "learning_rate": 0.0002, "epoch": 3.283303411131059, "step": 45720}, {"loss": 0.6219, "grad_norm": 0.8220566511154175, "learning_rate": 0.0002, "epoch": 3.2840215439856375, "step": 45730}, {"loss": 0.5737, "grad_norm": 0.9688239693641663, "learning_rate": 0.0002, "epoch": 3.2847396768402155, "step": 45740}, {"loss": 0.603, "grad_norm": 0.8760311603546143, "learning_rate": 0.0002, "epoch": 3.2854578096947935, "step": 45750}, {"loss": 0.6134, "grad_norm": 0.8103382587432861, "learning_rate": 0.0002, "epoch": 3.2861759425493715, "step": 45760}, {"loss": 0.6475, "grad_norm": 0.8835865259170532, "learning_rate": 0.0002, "epoch": 3.28689407540395, "step": 45770}, {"loss": 0.6423, "grad_norm": 0.9021160006523132, "learning_rate": 0.0002, "epoch": 3.287612208258528, "step": 45780}, {"loss": 0.6693, "grad_norm": 0.8182386159896851, "learning_rate": 0.0002, "epoch": 3.288330341113106, "step": 45790}, {"loss": 0.6408, "grad_norm": 0.8555024862289429, "learning_rate": 0.0002, "epoch": 3.289048473967684, "step": 45800}, {"loss": 0.6839, "grad_norm": 1.0982348918914795, "learning_rate": 0.0002, "epoch": 3.289766606822262, "step": 45810}, {"loss": 0.6323, "grad_norm": 1.06246817111969, "learning_rate": 0.0002, "epoch": 3.2904847396768404, "step": 45820}, {"loss": 0.5924, "grad_norm": 1.1727149486541748, "learning_rate": 0.0002, "epoch": 3.2912028725314184, "step": 45830}, {"loss": 0.624, "grad_norm": 0.8224700093269348, "learning_rate": 0.0002, "epoch": 3.2919210053859964, "step": 45840}, {"loss": 0.6445, "grad_norm": 0.8195698261260986, "learning_rate": 0.0002, "epoch": 3.2926391382405744, "step": 45850}, {"loss": 0.6106, "grad_norm": 0.8424476981163025, "learning_rate": 0.0002, "epoch": 3.2933572710951524, "step": 45860}, {"loss": 0.6705, "grad_norm": 0.9804632067680359, "learning_rate": 0.0002, "epoch": 3.294075403949731, "step": 45870}, {"loss": 0.6538, "grad_norm": 0.8701804876327515, "learning_rate": 0.0002, "epoch": 3.294793536804309, "step": 45880}, {"loss": 0.6264, "grad_norm": 0.8876864910125732, "learning_rate": 0.0002, "epoch": 3.295511669658887, "step": 45890}, {"loss": 0.6401, "grad_norm": 1.0105448961257935, "learning_rate": 0.0002, "epoch": 3.296229802513465, "step": 45900}, {"loss": 0.687, "grad_norm": 0.847017228603363, "learning_rate": 0.0002, "epoch": 3.296947935368043, "step": 45910}, {"loss": 0.6433, "grad_norm": 0.7610297799110413, "learning_rate": 0.0002, "epoch": 3.2976660682226213, "step": 45920}, {"loss": 0.6499, "grad_norm": 0.7272670269012451, "learning_rate": 0.0002, "epoch": 3.2983842010771993, "step": 45930}, {"loss": 0.6366, "grad_norm": 0.8243510127067566, "learning_rate": 0.0002, "epoch": 3.2991023339317773, "step": 45940}, {"loss": 0.6498, "grad_norm": 1.0113074779510498, "learning_rate": 0.0002, "epoch": 3.2998204667863553, "step": 45950}, {"loss": 0.6639, "grad_norm": 0.8578087687492371, "learning_rate": 0.0002, "epoch": 3.3005385996409338, "step": 45960}, {"loss": 0.6137, "grad_norm": 0.9511606097221375, "learning_rate": 0.0002, "epoch": 3.3012567324955118, "step": 45970}, {"loss": 0.6115, "grad_norm": 0.8612566590309143, "learning_rate": 0.0002, "epoch": 3.3019748653500898, "step": 45980}, {"loss": 0.6799, "grad_norm": 0.8702331185340881, "learning_rate": 0.0002, "epoch": 3.3026929982046678, "step": 45990}, {"loss": 0.6429, "grad_norm": 1.0229583978652954, "learning_rate": 0.0002, "epoch": 3.3034111310592458, "step": 46000}, {"loss": 0.6054, "grad_norm": 1.1775577068328857, "learning_rate": 0.0002, "epoch": 3.304129263913824, "step": 46010}, {"loss": 0.6958, "grad_norm": 0.9922171831130981, "learning_rate": 0.0002, "epoch": 3.3048473967684022, "step": 46020}, {"loss": 0.6642, "grad_norm": 0.8246880769729614, "learning_rate": 0.0002, "epoch": 3.3055655296229802, "step": 46030}, {"loss": 0.678, "grad_norm": 0.9351653456687927, "learning_rate": 0.0002, "epoch": 3.3062836624775582, "step": 46040}, {"loss": 0.649, "grad_norm": 0.9617429375648499, "learning_rate": 0.0002, "epoch": 3.3070017953321367, "step": 46050}, {"loss": 0.6314, "grad_norm": 0.9753885269165039, "learning_rate": 0.0002, "epoch": 3.3077199281867147, "step": 46060}, {"loss": 0.6434, "grad_norm": 0.8532425165176392, "learning_rate": 0.0002, "epoch": 3.3084380610412927, "step": 46070}, {"loss": 0.6312, "grad_norm": 0.9722012877464294, "learning_rate": 0.0002, "epoch": 3.3091561938958707, "step": 46080}, {"loss": 0.6629, "grad_norm": 0.8950021266937256, "learning_rate": 0.0002, "epoch": 3.3098743267504487, "step": 46090}, {"loss": 0.6278, "grad_norm": 0.8536333441734314, "learning_rate": 0.0002, "epoch": 3.3105924596050267, "step": 46100}, {"loss": 0.6359, "grad_norm": 0.9423946738243103, "learning_rate": 0.0002, "epoch": 3.311310592459605, "step": 46110}, {"loss": 0.6647, "grad_norm": 0.8573169112205505, "learning_rate": 0.0002, "epoch": 3.312028725314183, "step": 46120}, {"loss": 0.6127, "grad_norm": 1.0122376680374146, "learning_rate": 0.0002, "epoch": 3.312746858168761, "step": 46130}, {"loss": 0.6782, "grad_norm": 0.7492560744285583, "learning_rate": 0.0002, "epoch": 3.313464991023339, "step": 46140}, {"loss": 0.6315, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 3.3141831238779176, "step": 46150}, {"loss": 0.6051, "grad_norm": 1.1191970109939575, "learning_rate": 0.0002, "epoch": 3.3149012567324956, "step": 46160}, {"loss": 0.6247, "grad_norm": 0.9847373962402344, "learning_rate": 0.0002, "epoch": 3.3156193895870736, "step": 46170}, {"loss": 0.661, "grad_norm": 0.7315911054611206, "learning_rate": 0.0002, "epoch": 3.3163375224416516, "step": 46180}, {"loss": 0.6017, "grad_norm": 0.8267890214920044, "learning_rate": 0.0002, "epoch": 3.3170556552962296, "step": 46190}, {"loss": 0.6202, "grad_norm": 0.8898099064826965, "learning_rate": 0.0002, "epoch": 3.317773788150808, "step": 46200}, {"loss": 0.651, "grad_norm": 0.8525369167327881, "learning_rate": 0.0002, "epoch": 3.318491921005386, "step": 46210}, {"loss": 0.6705, "grad_norm": 0.8074760437011719, "learning_rate": 0.0002, "epoch": 3.319210053859964, "step": 46220}, {"loss": 0.641, "grad_norm": 0.8473616242408752, "learning_rate": 0.0002, "epoch": 3.319928186714542, "step": 46230}, {"loss": 0.6092, "grad_norm": 0.8678314089775085, "learning_rate": 0.0002, "epoch": 3.3206463195691205, "step": 46240}, {"loss": 0.655, "grad_norm": 0.8718782067298889, "learning_rate": 0.0002, "epoch": 3.3213644524236985, "step": 46250}, {"loss": 0.6266, "grad_norm": 0.9384858012199402, "learning_rate": 0.0002, "epoch": 3.3220825852782765, "step": 46260}, {"loss": 0.6393, "grad_norm": 0.9295032620429993, "learning_rate": 0.0002, "epoch": 3.3228007181328545, "step": 46270}, {"loss": 0.6824, "grad_norm": 0.9472482800483704, "learning_rate": 0.0002, "epoch": 3.3235188509874325, "step": 46280}, {"loss": 0.6177, "grad_norm": 0.7970638275146484, "learning_rate": 0.0002, "epoch": 3.324236983842011, "step": 46290}, {"loss": 0.6431, "grad_norm": 0.9508723020553589, "learning_rate": 0.0002, "epoch": 3.324955116696589, "step": 46300}, {"loss": 0.6126, "grad_norm": 0.9153636693954468, "learning_rate": 0.0002, "epoch": 3.325673249551167, "step": 46310}, {"loss": 0.6042, "grad_norm": 0.7890323400497437, "learning_rate": 0.0002, "epoch": 3.326391382405745, "step": 46320}, {"loss": 0.6525, "grad_norm": 0.8711825609207153, "learning_rate": 0.0002, "epoch": 3.3271095152603234, "step": 46330}, {"loss": 0.6253, "grad_norm": 0.9938926696777344, "learning_rate": 0.0002, "epoch": 3.3278276481149014, "step": 46340}, {"loss": 0.6227, "grad_norm": 0.8497524857521057, "learning_rate": 0.0002, "epoch": 3.3285457809694794, "step": 46350}, {"loss": 0.6472, "grad_norm": 0.9191650748252869, "learning_rate": 0.0002, "epoch": 3.3292639138240574, "step": 46360}, {"loss": 0.6385, "grad_norm": 0.8974085450172424, "learning_rate": 0.0002, "epoch": 3.3299820466786354, "step": 46370}, {"loss": 0.618, "grad_norm": 0.9928934574127197, "learning_rate": 0.0002, "epoch": 3.3307001795332134, "step": 46380}, {"loss": 0.6254, "grad_norm": 0.9011030197143555, "learning_rate": 0.0002, "epoch": 3.331418312387792, "step": 46390}, {"loss": 0.6146, "grad_norm": 0.898594856262207, "learning_rate": 0.0002, "epoch": 3.33213644524237, "step": 46400}, {"loss": 0.6321, "grad_norm": 0.7506672143936157, "learning_rate": 0.0002, "epoch": 3.332854578096948, "step": 46410}, {"loss": 0.6329, "grad_norm": 0.9239172339439392, "learning_rate": 0.0002, "epoch": 3.333572710951526, "step": 46420}, {"loss": 0.6278, "grad_norm": 1.0749682188034058, "learning_rate": 0.0002, "epoch": 3.3342908438061043, "step": 46430}, {"loss": 0.6568, "grad_norm": 0.9262617230415344, "learning_rate": 0.0002, "epoch": 3.3350089766606823, "step": 46440}, {"loss": 0.6034, "grad_norm": 0.8681274056434631, "learning_rate": 0.0002, "epoch": 3.3357271095152603, "step": 46450}, {"loss": 0.6261, "grad_norm": 0.9558620452880859, "learning_rate": 0.0002, "epoch": 3.3364452423698383, "step": 46460}, {"loss": 0.6087, "grad_norm": 0.8907097578048706, "learning_rate": 0.0002, "epoch": 3.3371633752244163, "step": 46470}, {"loss": 0.6356, "grad_norm": 1.0941565036773682, "learning_rate": 0.0002, "epoch": 3.3378815080789948, "step": 46480}, {"loss": 0.6536, "grad_norm": 0.8971590995788574, "learning_rate": 0.0002, "epoch": 3.3385996409335728, "step": 46490}, {"loss": 0.6252, "grad_norm": 1.0315606594085693, "learning_rate": 0.0002, "epoch": 3.3393177737881508, "step": 46500}, {"loss": 0.5819, "grad_norm": 0.7717124223709106, "learning_rate": 0.0002, "epoch": 3.3400359066427288, "step": 46510}, {"loss": 0.612, "grad_norm": 0.8060970902442932, "learning_rate": 0.0002, "epoch": 3.340754039497307, "step": 46520}, {"loss": 0.7036, "grad_norm": 0.969510018825531, "learning_rate": 0.0002, "epoch": 3.341472172351885, "step": 46530}, {"loss": 0.6163, "grad_norm": 0.8837248682975769, "learning_rate": 0.0002, "epoch": 3.342190305206463, "step": 46540}, {"loss": 0.6762, "grad_norm": 0.9561076164245605, "learning_rate": 0.0002, "epoch": 3.342908438061041, "step": 46550}, {"loss": 0.687, "grad_norm": 0.8529208898544312, "learning_rate": 0.0002, "epoch": 3.343626570915619, "step": 46560}, {"loss": 0.611, "grad_norm": 1.1300519704818726, "learning_rate": 0.0002, "epoch": 3.3443447037701977, "step": 46570}, {"loss": 0.6088, "grad_norm": 0.8330956101417542, "learning_rate": 0.0002, "epoch": 3.3450628366247757, "step": 46580}, {"loss": 0.6725, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 3.3457809694793537, "step": 46590}, {"loss": 0.6667, "grad_norm": 1.0470821857452393, "learning_rate": 0.0002, "epoch": 3.3464991023339317, "step": 46600}, {"loss": 0.6408, "grad_norm": 0.9933704137802124, "learning_rate": 0.0002, "epoch": 3.34721723518851, "step": 46610}, {"loss": 0.6416, "grad_norm": 0.8130798935890198, "learning_rate": 0.0002, "epoch": 3.347935368043088, "step": 46620}, {"loss": 0.6576, "grad_norm": 0.9746946692466736, "learning_rate": 0.0002, "epoch": 3.348653500897666, "step": 46630}, {"loss": 0.6254, "grad_norm": 0.8607267141342163, "learning_rate": 0.0002, "epoch": 3.349371633752244, "step": 46640}, {"loss": 0.6639, "grad_norm": 0.800335705280304, "learning_rate": 0.0002, "epoch": 3.350089766606822, "step": 46650}, {"loss": 0.6749, "grad_norm": 1.0083239078521729, "learning_rate": 0.0002, "epoch": 3.3508078994614, "step": 46660}, {"loss": 0.6606, "grad_norm": 1.0774433612823486, "learning_rate": 0.0002, "epoch": 3.3515260323159786, "step": 46670}, {"loss": 0.6408, "grad_norm": 0.9378824234008789, "learning_rate": 0.0002, "epoch": 3.3522441651705566, "step": 46680}, {"loss": 0.5879, "grad_norm": 0.8490564227104187, "learning_rate": 0.0002, "epoch": 3.3529622980251346, "step": 46690}, {"loss": 0.6364, "grad_norm": 1.0415582656860352, "learning_rate": 0.0002, "epoch": 3.3536804308797126, "step": 46700}, {"loss": 0.5813, "grad_norm": 0.8514367938041687, "learning_rate": 0.0002, "epoch": 3.354398563734291, "step": 46710}, {"loss": 0.6847, "grad_norm": 0.7691360712051392, "learning_rate": 0.0002, "epoch": 3.355116696588869, "step": 46720}, {"loss": 0.6295, "grad_norm": 0.8345438241958618, "learning_rate": 0.0002, "epoch": 3.355834829443447, "step": 46730}, {"loss": 0.6093, "grad_norm": 1.023492693901062, "learning_rate": 0.0002, "epoch": 3.356552962298025, "step": 46740}, {"loss": 0.5997, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 3.357271095152603, "step": 46750}, {"loss": 0.6379, "grad_norm": 0.9029248356819153, "learning_rate": 0.0002, "epoch": 3.3579892280071815, "step": 46760}, {"loss": 0.6551, "grad_norm": 0.9109513759613037, "learning_rate": 0.0002, "epoch": 3.3587073608617595, "step": 46770}, {"loss": 0.6616, "grad_norm": 0.7757390141487122, "learning_rate": 0.0002, "epoch": 3.3594254937163375, "step": 46780}, {"loss": 0.6088, "grad_norm": 0.794035792350769, "learning_rate": 0.0002, "epoch": 3.3601436265709155, "step": 46790}, {"loss": 0.6405, "grad_norm": 0.8211429715156555, "learning_rate": 0.0002, "epoch": 3.360861759425494, "step": 46800}, {"loss": 0.6359, "grad_norm": 0.8620322346687317, "learning_rate": 0.0002, "epoch": 3.361579892280072, "step": 46810}, {"loss": 0.6357, "grad_norm": 0.9392538070678711, "learning_rate": 0.0002, "epoch": 3.36229802513465, "step": 46820}, {"loss": 0.6225, "grad_norm": 0.8297873139381409, "learning_rate": 0.0002, "epoch": 3.363016157989228, "step": 46830}, {"loss": 0.639, "grad_norm": 0.9158190488815308, "learning_rate": 0.0002, "epoch": 3.363734290843806, "step": 46840}, {"loss": 0.6168, "grad_norm": 1.1449424028396606, "learning_rate": 0.0002, "epoch": 3.3644524236983844, "step": 46850}, {"loss": 0.6413, "grad_norm": 0.8718444108963013, "learning_rate": 0.0002, "epoch": 3.3651705565529624, "step": 46860}, {"loss": 0.624, "grad_norm": 0.7744014263153076, "learning_rate": 0.0002, "epoch": 3.3658886894075404, "step": 46870}, {"loss": 0.6238, "grad_norm": 0.8392460942268372, "learning_rate": 0.0002, "epoch": 3.3666068222621184, "step": 46880}, {"loss": 0.6753, "grad_norm": 1.0424989461898804, "learning_rate": 0.0002, "epoch": 3.367324955116697, "step": 46890}, {"loss": 0.6038, "grad_norm": 1.4696359634399414, "learning_rate": 0.0002, "epoch": 3.368043087971275, "step": 46900}, {"loss": 0.6525, "grad_norm": 0.9298201203346252, "learning_rate": 0.0002, "epoch": 3.368761220825853, "step": 46910}, {"loss": 0.6351, "grad_norm": 0.8965262770652771, "learning_rate": 0.0002, "epoch": 3.369479353680431, "step": 46920}, {"loss": 0.6505, "grad_norm": 0.9395381808280945, "learning_rate": 0.0002, "epoch": 3.370197486535009, "step": 46930}, {"loss": 0.6161, "grad_norm": 0.9069047570228577, "learning_rate": 0.0002, "epoch": 3.370915619389587, "step": 46940}, {"loss": 0.6576, "grad_norm": 0.9208605885505676, "learning_rate": 0.0002, "epoch": 3.3716337522441653, "step": 46950}, {"loss": 0.6456, "grad_norm": 0.9493077397346497, "learning_rate": 0.0002, "epoch": 3.3723518850987433, "step": 46960}, {"loss": 0.6609, "grad_norm": 1.0804208517074585, "learning_rate": 0.0002, "epoch": 3.3730700179533213, "step": 46970}, {"loss": 0.6267, "grad_norm": 0.9465714693069458, "learning_rate": 0.0002, "epoch": 3.3737881508078993, "step": 46980}, {"loss": 0.6633, "grad_norm": 0.9189882278442383, "learning_rate": 0.0002, "epoch": 3.3745062836624777, "step": 46990}, {"loss": 0.6518, "grad_norm": 1.0199357271194458, "learning_rate": 0.0002, "epoch": 3.3752244165170557, "step": 47000}, {"loss": 0.6645, "grad_norm": 0.8999426960945129, "learning_rate": 0.0002, "epoch": 3.3759425493716337, "step": 47010}, {"loss": 0.637, "grad_norm": 0.8923690319061279, "learning_rate": 0.0002, "epoch": 3.3766606822262117, "step": 47020}, {"loss": 0.6543, "grad_norm": 0.7459347248077393, "learning_rate": 0.0002, "epoch": 3.3773788150807897, "step": 47030}, {"loss": 0.6269, "grad_norm": 0.7702858448028564, "learning_rate": 0.0002, "epoch": 3.378096947935368, "step": 47040}, {"loss": 0.6399, "grad_norm": 0.8296625018119812, "learning_rate": 0.0002, "epoch": 3.378815080789946, "step": 47050}, {"loss": 0.6552, "grad_norm": 1.2952555418014526, "learning_rate": 0.0002, "epoch": 3.379533213644524, "step": 47060}, {"loss": 0.6264, "grad_norm": 0.7778869271278381, "learning_rate": 0.0002, "epoch": 3.380251346499102, "step": 47070}, {"loss": 0.6906, "grad_norm": 0.9151549339294434, "learning_rate": 0.0002, "epoch": 3.3809694793536806, "step": 47080}, {"loss": 0.6443, "grad_norm": 0.7883925437927246, "learning_rate": 0.0002, "epoch": 3.3816876122082586, "step": 47090}, {"loss": 0.6124, "grad_norm": 0.9602295756340027, "learning_rate": 0.0002, "epoch": 3.3824057450628366, "step": 47100}, {"loss": 0.651, "grad_norm": 0.7953121066093445, "learning_rate": 0.0002, "epoch": 3.3831238779174146, "step": 47110}, {"loss": 0.638, "grad_norm": 1.110148549079895, "learning_rate": 0.0002, "epoch": 3.3838420107719926, "step": 47120}, {"loss": 0.6386, "grad_norm": 0.9359608888626099, "learning_rate": 0.0002, "epoch": 3.384560143626571, "step": 47130}, {"loss": 0.6075, "grad_norm": 0.7877762317657471, "learning_rate": 0.0002, "epoch": 3.385278276481149, "step": 47140}, {"loss": 0.6657, "grad_norm": 0.8586933016777039, "learning_rate": 0.0002, "epoch": 3.385996409335727, "step": 47150}, {"loss": 0.6438, "grad_norm": 0.8920878767967224, "learning_rate": 0.0002, "epoch": 3.386714542190305, "step": 47160}, {"loss": 0.6584, "grad_norm": 0.9692603349685669, "learning_rate": 0.0002, "epoch": 3.3874326750448835, "step": 47170}, {"loss": 0.6643, "grad_norm": 0.9038610458374023, "learning_rate": 0.0002, "epoch": 3.3881508078994615, "step": 47180}, {"loss": 0.6002, "grad_norm": 1.6299188137054443, "learning_rate": 0.0002, "epoch": 3.3888689407540395, "step": 47190}, {"loss": 0.6423, "grad_norm": 0.9704291820526123, "learning_rate": 0.0002, "epoch": 3.3895870736086176, "step": 47200}, {"loss": 0.6808, "grad_norm": 0.9503401517868042, "learning_rate": 0.0002, "epoch": 3.3903052064631956, "step": 47210}, {"loss": 0.6871, "grad_norm": 1.0051378011703491, "learning_rate": 0.0002, "epoch": 3.3910233393177736, "step": 47220}, {"loss": 0.6207, "grad_norm": 0.7336357235908508, "learning_rate": 0.0002, "epoch": 3.391741472172352, "step": 47230}, {"loss": 0.6688, "grad_norm": 0.9847398996353149, "learning_rate": 0.0002, "epoch": 3.39245960502693, "step": 47240}, {"loss": 0.6305, "grad_norm": 0.8100917339324951, "learning_rate": 0.0002, "epoch": 3.393177737881508, "step": 47250}, {"loss": 0.6418, "grad_norm": 0.9752838611602783, "learning_rate": 0.0002, "epoch": 3.393895870736086, "step": 47260}, {"loss": 0.6237, "grad_norm": 0.9400623440742493, "learning_rate": 0.0002, "epoch": 3.3946140035906645, "step": 47270}, {"loss": 0.6321, "grad_norm": 0.7310057878494263, "learning_rate": 0.0002, "epoch": 3.3953321364452425, "step": 47280}, {"loss": 0.6209, "grad_norm": 0.8898789286613464, "learning_rate": 0.0002, "epoch": 3.3960502692998205, "step": 47290}, {"loss": 0.6496, "grad_norm": 1.0157585144042969, "learning_rate": 0.0002, "epoch": 3.3967684021543985, "step": 47300}, {"loss": 0.6497, "grad_norm": 0.9108527898788452, "learning_rate": 0.0002, "epoch": 3.3974865350089765, "step": 47310}, {"loss": 0.5928, "grad_norm": 0.9796249270439148, "learning_rate": 0.0002, "epoch": 3.398204667863555, "step": 47320}, {"loss": 0.6169, "grad_norm": 0.8176435232162476, "learning_rate": 0.0002, "epoch": 3.398922800718133, "step": 47330}, {"loss": 0.6279, "grad_norm": 0.9981188178062439, "learning_rate": 0.0002, "epoch": 3.399640933572711, "step": 47340}, {"loss": 0.6657, "grad_norm": 0.9774404764175415, "learning_rate": 0.0002, "epoch": 3.400359066427289, "step": 47350}, {"loss": 0.68, "grad_norm": 0.8624991774559021, "learning_rate": 0.0002, "epoch": 3.4010771992818674, "step": 47360}, {"loss": 0.6597, "grad_norm": 0.9191665053367615, "learning_rate": 0.0002, "epoch": 3.4017953321364454, "step": 47370}, {"loss": 0.6249, "grad_norm": 0.7971290946006775, "learning_rate": 0.0002, "epoch": 3.4025134649910234, "step": 47380}, {"loss": 0.617, "grad_norm": 0.8336732983589172, "learning_rate": 0.0002, "epoch": 3.4032315978456014, "step": 47390}, {"loss": 0.6435, "grad_norm": 0.7730334401130676, "learning_rate": 0.0002, "epoch": 3.4039497307001794, "step": 47400}, {"loss": 0.6348, "grad_norm": 0.8559145927429199, "learning_rate": 0.0002, "epoch": 3.404667863554758, "step": 47410}, {"loss": 0.6466, "grad_norm": 1.0261447429656982, "learning_rate": 0.0002, "epoch": 3.405385996409336, "step": 47420}, {"loss": 0.6556, "grad_norm": 0.9931781888008118, "learning_rate": 0.0002, "epoch": 3.406104129263914, "step": 47430}, {"loss": 0.6226, "grad_norm": 0.8971807360649109, "learning_rate": 0.0002, "epoch": 3.406822262118492, "step": 47440}, {"loss": 0.656, "grad_norm": 0.8886999487876892, "learning_rate": 0.0002, "epoch": 3.4075403949730703, "step": 47450}, {"loss": 0.6256, "grad_norm": 0.9551735520362854, "learning_rate": 0.0002, "epoch": 3.4082585278276483, "step": 47460}, {"loss": 0.6646, "grad_norm": 0.9066859483718872, "learning_rate": 0.0002, "epoch": 3.4089766606822263, "step": 47470}, {"loss": 0.6655, "grad_norm": 0.9192125201225281, "learning_rate": 0.0002, "epoch": 3.4096947935368043, "step": 47480}, {"loss": 0.6197, "grad_norm": 0.9332839250564575, "learning_rate": 0.0002, "epoch": 3.4104129263913823, "step": 47490}, {"loss": 0.6134, "grad_norm": 0.745563805103302, "learning_rate": 0.0002, "epoch": 3.4111310592459603, "step": 47500}, {"loss": 0.6206, "grad_norm": 0.6843905448913574, "learning_rate": 0.0002, "epoch": 3.4118491921005387, "step": 47510}, {"loss": 0.6742, "grad_norm": 0.8063111305236816, "learning_rate": 0.0002, "epoch": 3.4125673249551167, "step": 47520}, {"loss": 0.6138, "grad_norm": 0.9666593670845032, "learning_rate": 0.0002, "epoch": 3.4132854578096947, "step": 47530}, {"loss": 0.635, "grad_norm": 0.8112747073173523, "learning_rate": 0.0002, "epoch": 3.4140035906642727, "step": 47540}, {"loss": 0.6225, "grad_norm": 0.820807933807373, "learning_rate": 0.0002, "epoch": 3.414721723518851, "step": 47550}, {"loss": 0.6262, "grad_norm": 0.8476285338401794, "learning_rate": 0.0002, "epoch": 3.415439856373429, "step": 47560}, {"loss": 0.6134, "grad_norm": 1.0232552289962769, "learning_rate": 0.0002, "epoch": 3.416157989228007, "step": 47570}, {"loss": 0.604, "grad_norm": 0.8749372363090515, "learning_rate": 0.0002, "epoch": 3.416876122082585, "step": 47580}, {"loss": 0.6463, "grad_norm": 0.8117937445640564, "learning_rate": 0.0002, "epoch": 3.417594254937163, "step": 47590}, {"loss": 0.623, "grad_norm": 0.9010460376739502, "learning_rate": 0.0002, "epoch": 3.4183123877917416, "step": 47600}, {"loss": 0.6676, "grad_norm": 0.8955527544021606, "learning_rate": 0.0002, "epoch": 3.4190305206463196, "step": 47610}, {"loss": 0.6424, "grad_norm": 0.884186327457428, "learning_rate": 0.0002, "epoch": 3.4197486535008976, "step": 47620}, {"loss": 0.6377, "grad_norm": 0.8995241522789001, "learning_rate": 0.0002, "epoch": 3.4204667863554756, "step": 47630}, {"loss": 0.651, "grad_norm": 1.0627013444900513, "learning_rate": 0.0002, "epoch": 3.421184919210054, "step": 47640}, {"loss": 0.6338, "grad_norm": 0.8619979619979858, "learning_rate": 0.0002, "epoch": 3.421903052064632, "step": 47650}, {"loss": 0.6483, "grad_norm": 0.9682498574256897, "learning_rate": 0.0002, "epoch": 3.42262118491921, "step": 47660}, {"loss": 0.6006, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "epoch": 3.423339317773788, "step": 47670}, {"loss": 0.6088, "grad_norm": 0.7986962795257568, "learning_rate": 0.0002, "epoch": 3.424057450628366, "step": 47680}, {"loss": 0.6056, "grad_norm": 0.8255957961082458, "learning_rate": 0.0002, "epoch": 3.4247755834829445, "step": 47690}, {"loss": 0.663, "grad_norm": 0.9139757752418518, "learning_rate": 0.0002, "epoch": 3.4254937163375225, "step": 47700}, {"loss": 0.61, "grad_norm": 0.8086292743682861, "learning_rate": 0.0002, "epoch": 3.4262118491921005, "step": 47710}, {"loss": 0.6604, "grad_norm": 0.8852273225784302, "learning_rate": 0.0002, "epoch": 3.4269299820466785, "step": 47720}, {"loss": 0.6168, "grad_norm": 0.7568784356117249, "learning_rate": 0.0002, "epoch": 3.427648114901257, "step": 47730}, {"loss": 0.6559, "grad_norm": 0.8933039903640747, "learning_rate": 0.0002, "epoch": 3.428366247755835, "step": 47740}, {"loss": 0.6406, "grad_norm": 0.8101669549942017, "learning_rate": 0.0002, "epoch": 3.429084380610413, "step": 47750}, {"loss": 0.6287, "grad_norm": 0.7021054625511169, "learning_rate": 0.0002, "epoch": 3.429802513464991, "step": 47760}, {"loss": 0.6159, "grad_norm": 0.8282538652420044, "learning_rate": 0.0002, "epoch": 3.430520646319569, "step": 47770}, {"loss": 0.6439, "grad_norm": 0.8168348670005798, "learning_rate": 0.0002, "epoch": 3.431238779174147, "step": 47780}, {"loss": 0.6265, "grad_norm": 0.9504001140594482, "learning_rate": 0.0002, "epoch": 3.4319569120287254, "step": 47790}, {"loss": 0.6688, "grad_norm": 0.7500190734863281, "learning_rate": 0.0002, "epoch": 3.4326750448833034, "step": 47800}, {"loss": 0.6818, "grad_norm": 0.8645710945129395, "learning_rate": 0.0002, "epoch": 3.4333931777378814, "step": 47810}, {"loss": 0.6268, "grad_norm": 0.8088704943656921, "learning_rate": 0.0002, "epoch": 3.4341113105924594, "step": 47820}, {"loss": 0.6795, "grad_norm": 0.9981673955917358, "learning_rate": 0.0002, "epoch": 3.434829443447038, "step": 47830}, {"loss": 0.6615, "grad_norm": 0.9363315105438232, "learning_rate": 0.0002, "epoch": 3.435547576301616, "step": 47840}, {"loss": 0.6028, "grad_norm": 0.8471030592918396, "learning_rate": 0.0002, "epoch": 3.436265709156194, "step": 47850}, {"loss": 0.6658, "grad_norm": 0.9447668790817261, "learning_rate": 0.0002, "epoch": 3.436983842010772, "step": 47860}, {"loss": 0.6511, "grad_norm": 0.9494127631187439, "learning_rate": 0.0002, "epoch": 3.43770197486535, "step": 47870}, {"loss": 0.6134, "grad_norm": 0.8340432643890381, "learning_rate": 0.0002, "epoch": 3.4384201077199283, "step": 47880}, {"loss": 0.6731, "grad_norm": 0.8466387987136841, "learning_rate": 0.0002, "epoch": 3.4391382405745063, "step": 47890}, {"loss": 0.6552, "grad_norm": 0.9498962759971619, "learning_rate": 0.0002, "epoch": 3.4398563734290843, "step": 47900}, {"loss": 0.6593, "grad_norm": 0.8490501046180725, "learning_rate": 0.0002, "epoch": 3.4405745062836623, "step": 47910}, {"loss": 0.6038, "grad_norm": 0.9506490230560303, "learning_rate": 0.0002, "epoch": 3.441292639138241, "step": 47920}, {"loss": 0.6317, "grad_norm": 0.7944257855415344, "learning_rate": 0.0002, "epoch": 3.442010771992819, "step": 47930}, {"loss": 0.6193, "grad_norm": 0.9725518226623535, "learning_rate": 0.0002, "epoch": 3.442728904847397, "step": 47940}, {"loss": 0.635, "grad_norm": 0.7823024392127991, "learning_rate": 0.0002, "epoch": 3.443447037701975, "step": 47950}, {"loss": 0.6221, "grad_norm": 0.810565173625946, "learning_rate": 0.0002, "epoch": 3.444165170556553, "step": 47960}, {"loss": 0.6519, "grad_norm": 0.9809024333953857, "learning_rate": 0.0002, "epoch": 3.4448833034111312, "step": 47970}, {"loss": 0.6441, "grad_norm": 0.8818578720092773, "learning_rate": 0.0002, "epoch": 3.4456014362657092, "step": 47980}, {"loss": 0.6452, "grad_norm": 0.9843092560768127, "learning_rate": 0.0002, "epoch": 3.4463195691202873, "step": 47990}, {"loss": 0.6076, "grad_norm": 0.916313886642456, "learning_rate": 0.0002, "epoch": 3.4470377019748653, "step": 48000}, {"loss": 0.6399, "grad_norm": 0.908442497253418, "learning_rate": 0.0002, "epoch": 3.4477558348294433, "step": 48010}, {"loss": 0.6263, "grad_norm": 0.9880178570747375, "learning_rate": 0.0002, "epoch": 3.4484739676840217, "step": 48020}, {"loss": 0.6802, "grad_norm": 0.9276854991912842, "learning_rate": 0.0002, "epoch": 3.4491921005385997, "step": 48030}, {"loss": 0.6522, "grad_norm": 1.0879448652267456, "learning_rate": 0.0002, "epoch": 3.4499102333931777, "step": 48040}, {"loss": 0.6362, "grad_norm": 0.7430389523506165, "learning_rate": 0.0002, "epoch": 3.4506283662477557, "step": 48050}, {"loss": 0.6064, "grad_norm": 1.0880072116851807, "learning_rate": 0.0002, "epoch": 3.4513464991023337, "step": 48060}, {"loss": 0.6152, "grad_norm": 1.0424141883850098, "learning_rate": 0.0002, "epoch": 3.452064631956912, "step": 48070}, {"loss": 0.6485, "grad_norm": 0.926330029964447, "learning_rate": 0.0002, "epoch": 3.45278276481149, "step": 48080}, {"loss": 0.6261, "grad_norm": 0.8911219239234924, "learning_rate": 0.0002, "epoch": 3.453500897666068, "step": 48090}, {"loss": 0.6883, "grad_norm": 0.8727201223373413, "learning_rate": 0.0002, "epoch": 3.454219030520646, "step": 48100}, {"loss": 0.6473, "grad_norm": 0.8573940396308899, "learning_rate": 0.0002, "epoch": 3.4549371633752246, "step": 48110}, {"loss": 0.6645, "grad_norm": 1.0427064895629883, "learning_rate": 0.0002, "epoch": 3.4556552962298026, "step": 48120}, {"loss": 0.6489, "grad_norm": 0.8688231706619263, "learning_rate": 0.0002, "epoch": 3.4563734290843806, "step": 48130}, {"loss": 0.5947, "grad_norm": 0.8856009244918823, "learning_rate": 0.0002, "epoch": 3.4570915619389586, "step": 48140}, {"loss": 0.6482, "grad_norm": 0.9535353183746338, "learning_rate": 0.0002, "epoch": 3.4578096947935366, "step": 48150}, {"loss": 0.6435, "grad_norm": 0.9466010928153992, "learning_rate": 0.0002, "epoch": 3.458527827648115, "step": 48160}, {"loss": 0.6231, "grad_norm": 0.9783535599708557, "learning_rate": 0.0002, "epoch": 3.459245960502693, "step": 48170}, {"loss": 0.6926, "grad_norm": 0.8010456562042236, "learning_rate": 0.0002, "epoch": 3.459964093357271, "step": 48180}, {"loss": 0.6141, "grad_norm": 0.8928955793380737, "learning_rate": 0.0002, "epoch": 3.460682226211849, "step": 48190}, {"loss": 0.6699, "grad_norm": 0.7565838694572449, "learning_rate": 0.0002, "epoch": 3.4614003590664275, "step": 48200}, {"loss": 0.6218, "grad_norm": 1.0044180154800415, "learning_rate": 0.0002, "epoch": 3.4621184919210055, "step": 48210}, {"loss": 0.6182, "grad_norm": 0.8161038160324097, "learning_rate": 0.0002, "epoch": 3.4628366247755835, "step": 48220}, {"loss": 0.6869, "grad_norm": 1.1000211238861084, "learning_rate": 0.0002, "epoch": 3.4635547576301615, "step": 48230}, {"loss": 0.7141, "grad_norm": 0.7942240238189697, "learning_rate": 0.0002, "epoch": 3.4642728904847395, "step": 48240}, {"loss": 0.6247, "grad_norm": 0.7546432018280029, "learning_rate": 0.0002, "epoch": 3.464991023339318, "step": 48250}, {"loss": 0.6319, "grad_norm": 0.7705255150794983, "learning_rate": 0.0002, "epoch": 3.465709156193896, "step": 48260}, {"loss": 0.6414, "grad_norm": 0.7958067059516907, "learning_rate": 0.0002, "epoch": 3.466427289048474, "step": 48270}, {"loss": 0.6526, "grad_norm": 0.9199120402336121, "learning_rate": 0.0002, "epoch": 3.467145421903052, "step": 48280}, {"loss": 0.6476, "grad_norm": 1.118672251701355, "learning_rate": 0.0002, "epoch": 3.46786355475763, "step": 48290}, {"loss": 0.6543, "grad_norm": 0.9161015748977661, "learning_rate": 0.0002, "epoch": 3.4685816876122084, "step": 48300}, {"loss": 0.6767, "grad_norm": 1.1086218357086182, "learning_rate": 0.0002, "epoch": 3.4692998204667864, "step": 48310}, {"loss": 0.5917, "grad_norm": 1.0123368501663208, "learning_rate": 0.0002, "epoch": 3.4700179533213644, "step": 48320}, {"loss": 0.6277, "grad_norm": 0.7380602359771729, "learning_rate": 0.0002, "epoch": 3.4707360861759424, "step": 48330}, {"loss": 0.6407, "grad_norm": 0.8967105150222778, "learning_rate": 0.0002, "epoch": 3.4714542190305204, "step": 48340}, {"loss": 0.6526, "grad_norm": 1.0134044885635376, "learning_rate": 0.0002, "epoch": 3.472172351885099, "step": 48350}, {"loss": 0.6436, "grad_norm": 1.080815076828003, "learning_rate": 0.0002, "epoch": 3.472890484739677, "step": 48360}, {"loss": 0.6644, "grad_norm": 1.151721477508545, "learning_rate": 0.0002, "epoch": 3.473608617594255, "step": 48370}, {"loss": 0.6612, "grad_norm": 0.9436505436897278, "learning_rate": 0.0002, "epoch": 3.474326750448833, "step": 48380}, {"loss": 0.6503, "grad_norm": 0.9154609441757202, "learning_rate": 0.0002, "epoch": 3.4750448833034113, "step": 48390}, {"loss": 0.6151, "grad_norm": 0.8943037986755371, "learning_rate": 0.0002, "epoch": 3.4757630161579893, "step": 48400}, {"loss": 0.6316, "grad_norm": 0.936988115310669, "learning_rate": 0.0002, "epoch": 3.4764811490125673, "step": 48410}, {"loss": 0.6638, "grad_norm": 0.826960027217865, "learning_rate": 0.0002, "epoch": 3.4771992818671453, "step": 48420}, {"loss": 0.6242, "grad_norm": 1.0487587451934814, "learning_rate": 0.0002, "epoch": 3.4779174147217233, "step": 48430}, {"loss": 0.6302, "grad_norm": 0.729163646697998, "learning_rate": 0.0002, "epoch": 3.478635547576302, "step": 48440}, {"loss": 0.6115, "grad_norm": 0.8156948089599609, "learning_rate": 0.0002, "epoch": 3.47935368043088, "step": 48450}, {"loss": 0.6455, "grad_norm": 0.8004332184791565, "learning_rate": 0.0002, "epoch": 3.480071813285458, "step": 48460}, {"loss": 0.621, "grad_norm": 0.9632692337036133, "learning_rate": 0.0002, "epoch": 3.480789946140036, "step": 48470}, {"loss": 0.6214, "grad_norm": 1.0950212478637695, "learning_rate": 0.0002, "epoch": 3.4815080789946142, "step": 48480}, {"loss": 0.6659, "grad_norm": 0.8574318885803223, "learning_rate": 0.0002, "epoch": 3.4822262118491922, "step": 48490}, {"loss": 0.6969, "grad_norm": 0.8552606701850891, "learning_rate": 0.0002, "epoch": 3.4829443447037702, "step": 48500}, {"loss": 0.6253, "grad_norm": 0.9698445200920105, "learning_rate": 0.0002, "epoch": 3.4836624775583482, "step": 48510}, {"loss": 0.6844, "grad_norm": 0.9427815675735474, "learning_rate": 0.0002, "epoch": 3.4843806104129262, "step": 48520}, {"loss": 0.6722, "grad_norm": 0.7902070879936218, "learning_rate": 0.0002, "epoch": 3.4850987432675042, "step": 48530}, {"loss": 0.6708, "grad_norm": 1.0300066471099854, "learning_rate": 0.0002, "epoch": 3.4858168761220827, "step": 48540}, {"loss": 0.6113, "grad_norm": 1.1688778400421143, "learning_rate": 0.0002, "epoch": 3.4865350089766607, "step": 48550}, {"loss": 0.5956, "grad_norm": 1.0012071132659912, "learning_rate": 0.0002, "epoch": 3.4872531418312387, "step": 48560}, {"loss": 0.6536, "grad_norm": 1.112094759941101, "learning_rate": 0.0002, "epoch": 3.4879712746858167, "step": 48570}, {"loss": 0.6625, "grad_norm": 0.8547284603118896, "learning_rate": 0.0002, "epoch": 3.488689407540395, "step": 48580}, {"loss": 0.6488, "grad_norm": 0.8827278017997742, "learning_rate": 0.0002, "epoch": 3.489407540394973, "step": 48590}, {"loss": 0.6437, "grad_norm": 0.9255490303039551, "learning_rate": 0.0002, "epoch": 3.490125673249551, "step": 48600}, {"loss": 0.6089, "grad_norm": 0.8000030517578125, "learning_rate": 0.0002, "epoch": 3.490843806104129, "step": 48610}, {"loss": 0.647, "grad_norm": 0.9327391386032104, "learning_rate": 0.0002, "epoch": 3.491561938958707, "step": 48620}, {"loss": 0.6678, "grad_norm": 0.9004138708114624, "learning_rate": 0.0002, "epoch": 3.4922800718132856, "step": 48630}, {"loss": 0.6145, "grad_norm": 0.9886971116065979, "learning_rate": 0.0002, "epoch": 3.4929982046678636, "step": 48640}, {"loss": 0.6309, "grad_norm": 0.9890487194061279, "learning_rate": 0.0002, "epoch": 3.4937163375224416, "step": 48650}, {"loss": 0.655, "grad_norm": 0.7024438977241516, "learning_rate": 0.0002, "epoch": 3.4944344703770196, "step": 48660}, {"loss": 0.6313, "grad_norm": 0.8397303223609924, "learning_rate": 0.0002, "epoch": 3.495152603231598, "step": 48670}, {"loss": 0.6429, "grad_norm": 0.9120950698852539, "learning_rate": 0.0002, "epoch": 3.495870736086176, "step": 48680}, {"loss": 0.631, "grad_norm": 1.057299017906189, "learning_rate": 0.0002, "epoch": 3.496588868940754, "step": 48690}, {"loss": 0.6459, "grad_norm": 0.821325957775116, "learning_rate": 0.0002, "epoch": 3.497307001795332, "step": 48700}, {"loss": 0.6174, "grad_norm": 1.0029970407485962, "learning_rate": 0.0002, "epoch": 3.49802513464991, "step": 48710}, {"loss": 0.6374, "grad_norm": 0.9483712911605835, "learning_rate": 0.0002, "epoch": 3.4987432675044885, "step": 48720}, {"loss": 0.6472, "grad_norm": 0.9637855291366577, "learning_rate": 0.0002, "epoch": 3.4994614003590665, "step": 48730}, {"loss": 0.6639, "grad_norm": 0.6848894357681274, "learning_rate": 0.0002, "epoch": 3.5001795332136445, "step": 48740}, {"loss": 0.6129, "grad_norm": 0.7848573327064514, "learning_rate": 0.0002, "epoch": 3.5008976660682225, "step": 48750}, {"loss": 0.6306, "grad_norm": 1.0341308116912842, "learning_rate": 0.0002, "epoch": 3.501615798922801, "step": 48760}, {"loss": 0.6063, "grad_norm": 0.8858218193054199, "learning_rate": 0.0002, "epoch": 3.502333931777379, "step": 48770}, {"loss": 0.6729, "grad_norm": 0.8366939425468445, "learning_rate": 0.0002, "epoch": 3.503052064631957, "step": 48780}, {"loss": 0.6736, "grad_norm": 0.7926092147827148, "learning_rate": 0.0002, "epoch": 3.503770197486535, "step": 48790}, {"loss": 0.6279, "grad_norm": 0.8503843545913696, "learning_rate": 0.0002, "epoch": 3.504488330341113, "step": 48800}, {"loss": 0.6162, "grad_norm": 0.8867869973182678, "learning_rate": 0.0002, "epoch": 3.505206463195691, "step": 48810}, {"loss": 0.6987, "grad_norm": 1.0336930751800537, "learning_rate": 0.0002, "epoch": 3.5059245960502694, "step": 48820}, {"loss": 0.6333, "grad_norm": 0.8564051985740662, "learning_rate": 0.0002, "epoch": 3.5066427289048474, "step": 48830}, {"loss": 0.6574, "grad_norm": 0.9202605485916138, "learning_rate": 0.0002, "epoch": 3.5073608617594254, "step": 48840}, {"loss": 0.6457, "grad_norm": 0.8838639855384827, "learning_rate": 0.0002, "epoch": 3.508078994614004, "step": 48850}, {"loss": 0.631, "grad_norm": 0.8975196480751038, "learning_rate": 0.0002, "epoch": 3.508797127468582, "step": 48860}, {"loss": 0.6335, "grad_norm": 0.8842370510101318, "learning_rate": 0.0002, "epoch": 3.50951526032316, "step": 48870}, {"loss": 0.6569, "grad_norm": 0.9195886254310608, "learning_rate": 0.0002, "epoch": 3.510233393177738, "step": 48880}, {"loss": 0.6647, "grad_norm": 0.986130952835083, "learning_rate": 0.0002, "epoch": 3.510951526032316, "step": 48890}, {"loss": 0.6676, "grad_norm": 0.8119593858718872, "learning_rate": 0.0002, "epoch": 3.511669658886894, "step": 48900}, {"loss": 0.653, "grad_norm": 0.9027136564254761, "learning_rate": 0.0002, "epoch": 3.5123877917414723, "step": 48910}, {"loss": 0.6731, "grad_norm": 0.8560537099838257, "learning_rate": 0.0002, "epoch": 3.5131059245960503, "step": 48920}, {"loss": 0.7032, "grad_norm": 0.7073559165000916, "learning_rate": 0.0002, "epoch": 3.5138240574506283, "step": 48930}, {"loss": 0.6738, "grad_norm": 0.8753304481506348, "learning_rate": 0.0002, "epoch": 3.5145421903052063, "step": 48940}, {"loss": 0.6366, "grad_norm": 0.9151145815849304, "learning_rate": 0.0002, "epoch": 3.5152603231597848, "step": 48950}, {"loss": 0.6135, "grad_norm": 0.7794315814971924, "learning_rate": 0.0002, "epoch": 3.5159784560143628, "step": 48960}, {"loss": 0.658, "grad_norm": 0.9226023554801941, "learning_rate": 0.0002, "epoch": 3.5166965888689408, "step": 48970}, {"loss": 0.6473, "grad_norm": 0.8442051410675049, "learning_rate": 0.0002, "epoch": 3.5174147217235188, "step": 48980}, {"loss": 0.6267, "grad_norm": 0.9769423007965088, "learning_rate": 0.0002, "epoch": 3.5181328545780968, "step": 48990}, {"loss": 0.6333, "grad_norm": 0.740347146987915, "learning_rate": 0.0002, "epoch": 3.5188509874326748, "step": 49000}, {"loss": 0.6652, "grad_norm": 0.8963457345962524, "learning_rate": 0.0002, "epoch": 3.519569120287253, "step": 49010}, {"loss": 0.6782, "grad_norm": 0.8410176634788513, "learning_rate": 0.0002, "epoch": 3.520287253141831, "step": 49020}, {"loss": 0.6496, "grad_norm": 1.0486022233963013, "learning_rate": 0.0002, "epoch": 3.521005385996409, "step": 49030}, {"loss": 0.6275, "grad_norm": 0.95393967628479, "learning_rate": 0.0002, "epoch": 3.5217235188509877, "step": 49040}, {"loss": 0.6328, "grad_norm": 0.8261157274246216, "learning_rate": 0.0002, "epoch": 3.5224416517055657, "step": 49050}, {"loss": 0.6441, "grad_norm": 0.9321704506874084, "learning_rate": 0.0002, "epoch": 3.5231597845601437, "step": 49060}, {"loss": 0.6202, "grad_norm": 1.2596088647842407, "learning_rate": 0.0002, "epoch": 3.5238779174147217, "step": 49070}, {"loss": 0.6596, "grad_norm": 0.8584637641906738, "learning_rate": 0.0002, "epoch": 3.5245960502692997, "step": 49080}, {"loss": 0.6708, "grad_norm": 0.850520670413971, "learning_rate": 0.0002, "epoch": 3.5253141831238777, "step": 49090}, {"loss": 0.6543, "grad_norm": 0.8915920257568359, "learning_rate": 0.0002, "epoch": 3.526032315978456, "step": 49100}, {"loss": 0.6558, "grad_norm": 0.9070239067077637, "learning_rate": 0.0002, "epoch": 3.526750448833034, "step": 49110}, {"loss": 0.6128, "grad_norm": 0.699878990650177, "learning_rate": 0.0002, "epoch": 3.527468581687612, "step": 49120}, {"loss": 0.6454, "grad_norm": 0.9003779888153076, "learning_rate": 0.0002, "epoch": 3.5281867145421906, "step": 49130}, {"loss": 0.6177, "grad_norm": 0.7886711955070496, "learning_rate": 0.0002, "epoch": 3.5289048473967686, "step": 49140}, {"loss": 0.6499, "grad_norm": 0.7368922233581543, "learning_rate": 0.0002, "epoch": 3.5296229802513466, "step": 49150}, {"loss": 0.6382, "grad_norm": 0.8585197329521179, "learning_rate": 0.0002, "epoch": 3.5303411131059246, "step": 49160}, {"loss": 0.6761, "grad_norm": 1.0205435752868652, "learning_rate": 0.0002, "epoch": 3.5310592459605026, "step": 49170}, {"loss": 0.6544, "grad_norm": 0.8756650686264038, "learning_rate": 0.0002, "epoch": 3.5317773788150806, "step": 49180}, {"loss": 0.6592, "grad_norm": 1.0278643369674683, "learning_rate": 0.0002, "epoch": 3.532495511669659, "step": 49190}, {"loss": 0.6682, "grad_norm": 0.8641911745071411, "learning_rate": 0.0002, "epoch": 3.533213644524237, "step": 49200}, {"loss": 0.6531, "grad_norm": 0.8730159401893616, "learning_rate": 0.0002, "epoch": 3.533931777378815, "step": 49210}, {"loss": 0.636, "grad_norm": 0.918637216091156, "learning_rate": 0.0002, "epoch": 3.534649910233393, "step": 49220}, {"loss": 0.6815, "grad_norm": 1.0467222929000854, "learning_rate": 0.0002, "epoch": 3.5353680430879715, "step": 49230}, {"loss": 0.6554, "grad_norm": 1.005009412765503, "learning_rate": 0.0002, "epoch": 3.5360861759425495, "step": 49240}, {"loss": 0.649, "grad_norm": 0.9775063395500183, "learning_rate": 0.0002, "epoch": 3.5368043087971275, "step": 49250}, {"loss": 0.6527, "grad_norm": 0.8198322057723999, "learning_rate": 0.0002, "epoch": 3.5375224416517055, "step": 49260}, {"loss": 0.664, "grad_norm": 0.8184829354286194, "learning_rate": 0.0002, "epoch": 3.5382405745062835, "step": 49270}, {"loss": 0.6493, "grad_norm": 0.9520270824432373, "learning_rate": 0.0002, "epoch": 3.5389587073608615, "step": 49280}, {"loss": 0.5935, "grad_norm": 0.7816803455352783, "learning_rate": 0.0002, "epoch": 3.53967684021544, "step": 49290}, {"loss": 0.6424, "grad_norm": 0.6915702819824219, "learning_rate": 0.0002, "epoch": 3.540394973070018, "step": 49300}, {"loss": 0.6447, "grad_norm": 0.8282375931739807, "learning_rate": 0.0002, "epoch": 3.541113105924596, "step": 49310}, {"loss": 0.6164, "grad_norm": 1.0797513723373413, "learning_rate": 0.0002, "epoch": 3.5418312387791744, "step": 49320}, {"loss": 0.6836, "grad_norm": 0.868671715259552, "learning_rate": 0.0002, "epoch": 3.5425493716337524, "step": 49330}, {"loss": 0.6453, "grad_norm": 0.8534455895423889, "learning_rate": 0.0002, "epoch": 3.5432675044883304, "step": 49340}, {"loss": 0.6706, "grad_norm": 0.816411554813385, "learning_rate": 0.0002, "epoch": 3.5439856373429084, "step": 49350}, {"loss": 0.6101, "grad_norm": 0.7813423275947571, "learning_rate": 0.0002, "epoch": 3.5447037701974864, "step": 49360}, {"loss": 0.6617, "grad_norm": 0.8002013564109802, "learning_rate": 0.0002, "epoch": 3.5454219030520644, "step": 49370}, {"loss": 0.6667, "grad_norm": 0.9740113615989685, "learning_rate": 0.0002, "epoch": 3.546140035906643, "step": 49380}, {"loss": 0.6938, "grad_norm": 0.9046127200126648, "learning_rate": 0.0002, "epoch": 3.546858168761221, "step": 49390}, {"loss": 0.6444, "grad_norm": 0.8635150194168091, "learning_rate": 0.0002, "epoch": 3.547576301615799, "step": 49400}, {"loss": 0.6273, "grad_norm": 0.9488558769226074, "learning_rate": 0.0002, "epoch": 3.5482944344703773, "step": 49410}, {"loss": 0.6542, "grad_norm": 0.9637090563774109, "learning_rate": 0.0002, "epoch": 3.5490125673249553, "step": 49420}, {"loss": 0.6468, "grad_norm": 1.042245626449585, "learning_rate": 0.0002, "epoch": 3.5497307001795333, "step": 49430}, {"loss": 0.6999, "grad_norm": 0.9076175689697266, "learning_rate": 0.0002, "epoch": 3.5504488330341113, "step": 49440}, {"loss": 0.6192, "grad_norm": 0.8480596542358398, "learning_rate": 0.0002, "epoch": 3.5511669658886893, "step": 49450}, {"loss": 0.6835, "grad_norm": 0.8483007550239563, "learning_rate": 0.0002, "epoch": 3.5518850987432673, "step": 49460}, {"loss": 0.6607, "grad_norm": 0.7855815887451172, "learning_rate": 0.0002, "epoch": 3.5526032315978457, "step": 49470}, {"loss": 0.6364, "grad_norm": 0.8435823917388916, "learning_rate": 0.0002, "epoch": 3.5533213644524237, "step": 49480}, {"loss": 0.6674, "grad_norm": 0.8613026142120361, "learning_rate": 0.0002, "epoch": 3.5540394973070017, "step": 49490}, {"loss": 0.6651, "grad_norm": 0.9654812812805176, "learning_rate": 0.0002, "epoch": 3.5547576301615798, "step": 49500}, {"loss": 0.6471, "grad_norm": 0.8888838887214661, "learning_rate": 0.0002, "epoch": 3.555475763016158, "step": 49510}, {"loss": 0.622, "grad_norm": 0.7718146443367004, "learning_rate": 0.0002, "epoch": 3.556193895870736, "step": 49520}, {"loss": 0.6297, "grad_norm": 0.9487382173538208, "learning_rate": 0.0002, "epoch": 3.556912028725314, "step": 49530}, {"loss": 0.6516, "grad_norm": 0.9256559610366821, "learning_rate": 0.0002, "epoch": 3.557630161579892, "step": 49540}, {"loss": 0.6461, "grad_norm": 0.8879945874214172, "learning_rate": 0.0002, "epoch": 3.55834829443447, "step": 49550}, {"loss": 0.6367, "grad_norm": 0.8498744368553162, "learning_rate": 0.0002, "epoch": 3.559066427289048, "step": 49560}, {"loss": 0.6274, "grad_norm": 0.9550948143005371, "learning_rate": 0.0002, "epoch": 3.5597845601436267, "step": 49570}, {"loss": 0.635, "grad_norm": 0.8386164903640747, "learning_rate": 0.0002, "epoch": 3.5605026929982047, "step": 49580}, {"loss": 0.6495, "grad_norm": 0.925573468208313, "learning_rate": 0.0002, "epoch": 3.5612208258527827, "step": 49590}, {"loss": 0.676, "grad_norm": 0.8867112398147583, "learning_rate": 0.0002, "epoch": 3.561938958707361, "step": 49600}, {"loss": 0.6156, "grad_norm": 0.7638537883758545, "learning_rate": 0.0002, "epoch": 3.562657091561939, "step": 49610}, {"loss": 0.6597, "grad_norm": 0.9491845965385437, "learning_rate": 0.0002, "epoch": 3.563375224416517, "step": 49620}, {"loss": 0.6237, "grad_norm": 0.8384189605712891, "learning_rate": 0.0002, "epoch": 3.564093357271095, "step": 49630}, {"loss": 0.6102, "grad_norm": 0.8850575089454651, "learning_rate": 0.0002, "epoch": 3.564811490125673, "step": 49640}, {"loss": 0.6517, "grad_norm": 1.020916223526001, "learning_rate": 0.0002, "epoch": 3.565529622980251, "step": 49650}, {"loss": 0.6569, "grad_norm": 0.9298280477523804, "learning_rate": 0.0002, "epoch": 3.5662477558348296, "step": 49660}, {"loss": 0.6094, "grad_norm": 0.9795742034912109, "learning_rate": 0.0002, "epoch": 3.5669658886894076, "step": 49670}, {"loss": 0.6147, "grad_norm": 0.9401193261146545, "learning_rate": 0.0002, "epoch": 3.5676840215439856, "step": 49680}, {"loss": 0.622, "grad_norm": 1.0383585691452026, "learning_rate": 0.0002, "epoch": 3.568402154398564, "step": 49690}, {"loss": 0.6304, "grad_norm": 0.8370866179466248, "learning_rate": 0.0002, "epoch": 3.569120287253142, "step": 49700}, {"loss": 0.6356, "grad_norm": 0.8207486271858215, "learning_rate": 0.0002, "epoch": 3.56983842010772, "step": 49710}, {"loss": 0.6328, "grad_norm": 0.8551223278045654, "learning_rate": 0.0002, "epoch": 3.570556552962298, "step": 49720}, {"loss": 0.621, "grad_norm": 0.8041176199913025, "learning_rate": 0.0002, "epoch": 3.571274685816876, "step": 49730}, {"loss": 0.5818, "grad_norm": 0.9862527847290039, "learning_rate": 0.0002, "epoch": 3.571992818671454, "step": 49740}, {"loss": 0.6448, "grad_norm": 0.7557165622711182, "learning_rate": 0.0002, "epoch": 3.5727109515260325, "step": 49750}, {"loss": 0.6484, "grad_norm": 1.0908563137054443, "learning_rate": 0.0002, "epoch": 3.5734290843806105, "step": 49760}, {"loss": 0.6497, "grad_norm": 0.7245369553565979, "learning_rate": 0.0002, "epoch": 3.5741472172351885, "step": 49770}, {"loss": 0.6315, "grad_norm": 0.7851184010505676, "learning_rate": 0.0002, "epoch": 3.5748653500897665, "step": 49780}, {"loss": 0.6245, "grad_norm": 0.9443599581718445, "learning_rate": 0.0002, "epoch": 3.575583482944345, "step": 49790}, {"loss": 0.6481, "grad_norm": 1.021196961402893, "learning_rate": 0.0002, "epoch": 3.576301615798923, "step": 49800}, {"loss": 0.6368, "grad_norm": 0.9099196195602417, "learning_rate": 0.0002, "epoch": 3.577019748653501, "step": 49810}, {"loss": 0.6372, "grad_norm": 0.9397716522216797, "learning_rate": 0.0002, "epoch": 3.577737881508079, "step": 49820}, {"loss": 0.6208, "grad_norm": 0.9214922785758972, "learning_rate": 0.0002, "epoch": 3.578456014362657, "step": 49830}, {"loss": 0.6219, "grad_norm": 1.0053879022598267, "learning_rate": 0.0002, "epoch": 3.579174147217235, "step": 49840}, {"loss": 0.6283, "grad_norm": 0.9415460228919983, "learning_rate": 0.0002, "epoch": 3.5798922800718134, "step": 49850}, {"loss": 0.6759, "grad_norm": 1.0807833671569824, "learning_rate": 0.0002, "epoch": 3.5806104129263914, "step": 49860}, {"loss": 0.6404, "grad_norm": 1.0070871114730835, "learning_rate": 0.0002, "epoch": 3.5813285457809694, "step": 49870}, {"loss": 0.6411, "grad_norm": 0.9707024693489075, "learning_rate": 0.0002, "epoch": 3.582046678635548, "step": 49880}, {"loss": 0.6852, "grad_norm": 0.9979593753814697, "learning_rate": 0.0002, "epoch": 3.582764811490126, "step": 49890}, {"loss": 0.6519, "grad_norm": 0.7238648533821106, "learning_rate": 0.0002, "epoch": 3.583482944344704, "step": 49900}, {"loss": 0.6452, "grad_norm": 0.8168631792068481, "learning_rate": 0.0002, "epoch": 3.584201077199282, "step": 49910}, {"loss": 0.6174, "grad_norm": 0.8156409859657288, "learning_rate": 0.0002, "epoch": 3.58491921005386, "step": 49920}, {"loss": 0.6248, "grad_norm": 0.9256414175033569, "learning_rate": 0.0002, "epoch": 3.585637342908438, "step": 49930}, {"loss": 0.6077, "grad_norm": 1.0090070962905884, "learning_rate": 0.0002, "epoch": 3.5863554757630163, "step": 49940}, {"loss": 0.6016, "grad_norm": 0.8257701992988586, "learning_rate": 0.0002, "epoch": 3.5870736086175943, "step": 49950}, {"loss": 0.6996, "grad_norm": 0.9189013242721558, "learning_rate": 0.0002, "epoch": 3.5877917414721723, "step": 49960}, {"loss": 0.661, "grad_norm": 0.8497788310050964, "learning_rate": 0.0002, "epoch": 3.5885098743267507, "step": 49970}, {"loss": 0.6335, "grad_norm": 0.9596505761146545, "learning_rate": 0.0002, "epoch": 3.5892280071813287, "step": 49980}, {"loss": 0.697, "grad_norm": 0.8773331642150879, "learning_rate": 0.0002, "epoch": 3.5899461400359067, "step": 49990}, {"loss": 0.6259, "grad_norm": 0.8952302932739258, "learning_rate": 0.0002, "epoch": 3.5906642728904847, "step": 50000}, {"loss": 0.6152, "grad_norm": 0.7713809609413147, "learning_rate": 0.0002, "epoch": 3.5913824057450627, "step": 50010}, {"loss": 0.6127, "grad_norm": 1.0151346921920776, "learning_rate": 0.0002, "epoch": 3.5921005385996407, "step": 50020}, {"loss": 0.6093, "grad_norm": 0.8793733716011047, "learning_rate": 0.0002, "epoch": 3.592818671454219, "step": 50030}, {"loss": 0.5986, "grad_norm": 0.8881325721740723, "learning_rate": 0.0002, "epoch": 3.593536804308797, "step": 50040}, {"loss": 0.6351, "grad_norm": 0.9346749782562256, "learning_rate": 0.0002, "epoch": 3.594254937163375, "step": 50050}, {"loss": 0.6501, "grad_norm": 0.8705052137374878, "learning_rate": 0.0002, "epoch": 3.594973070017953, "step": 50060}, {"loss": 0.6753, "grad_norm": 1.039197564125061, "learning_rate": 0.0002, "epoch": 3.5956912028725316, "step": 50070}, {"loss": 0.6565, "grad_norm": 0.7053273320198059, "learning_rate": 0.0002, "epoch": 3.5964093357271096, "step": 50080}, {"loss": 0.6546, "grad_norm": 0.8268665671348572, "learning_rate": 0.0002, "epoch": 3.5971274685816876, "step": 50090}, {"loss": 0.6637, "grad_norm": 0.8921764492988586, "learning_rate": 0.0002, "epoch": 3.5978456014362656, "step": 50100}, {"loss": 0.6827, "grad_norm": 0.9756084680557251, "learning_rate": 0.0002, "epoch": 3.5985637342908436, "step": 50110}, {"loss": 0.6746, "grad_norm": 0.9275530576705933, "learning_rate": 0.0002, "epoch": 3.5992818671454216, "step": 50120}, {"loss": 0.6709, "grad_norm": 0.9030009508132935, "learning_rate": 0.0002, "epoch": 3.6, "step": 50130}, {"loss": 0.6344, "grad_norm": 0.7805638909339905, "learning_rate": 0.0002, "epoch": 3.600718132854578, "step": 50140}, {"loss": 0.6437, "grad_norm": 0.7627325057983398, "learning_rate": 0.0002, "epoch": 3.601436265709156, "step": 50150}, {"loss": 0.6523, "grad_norm": 0.7809714078903198, "learning_rate": 0.0002, "epoch": 3.6021543985637345, "step": 50160}, {"loss": 0.6578, "grad_norm": 0.7910378575325012, "learning_rate": 0.0002, "epoch": 3.6028725314183125, "step": 50170}, {"loss": 0.6522, "grad_norm": 1.004438042640686, "learning_rate": 0.0002, "epoch": 3.6035906642728905, "step": 50180}, {"loss": 0.6657, "grad_norm": 0.825969934463501, "learning_rate": 0.0002, "epoch": 3.6043087971274685, "step": 50190}, {"loss": 0.6788, "grad_norm": 0.8866565227508545, "learning_rate": 0.0002, "epoch": 3.6050269299820465, "step": 50200}, {"loss": 0.6643, "grad_norm": 0.8920543193817139, "learning_rate": 0.0002, "epoch": 3.6057450628366245, "step": 50210}, {"loss": 0.668, "grad_norm": 1.106584906578064, "learning_rate": 0.0002, "epoch": 3.606463195691203, "step": 50220}, {"loss": 0.6878, "grad_norm": 0.916607677936554, "learning_rate": 0.0002, "epoch": 3.607181328545781, "step": 50230}, {"loss": 0.6084, "grad_norm": 0.8014767169952393, "learning_rate": 0.0002, "epoch": 3.607899461400359, "step": 50240}, {"loss": 0.6718, "grad_norm": 0.9556822776794434, "learning_rate": 0.0002, "epoch": 3.608617594254937, "step": 50250}, {"loss": 0.6896, "grad_norm": 0.9630016684532166, "learning_rate": 0.0002, "epoch": 3.6093357271095154, "step": 50260}, {"loss": 0.692, "grad_norm": 0.9862125515937805, "learning_rate": 0.0002, "epoch": 3.6100538599640934, "step": 50270}, {"loss": 0.5981, "grad_norm": 1.0043333768844604, "learning_rate": 0.0002, "epoch": 3.6107719928186714, "step": 50280}, {"loss": 0.6243, "grad_norm": 0.9255319833755493, "learning_rate": 0.0002, "epoch": 3.6114901256732495, "step": 50290}, {"loss": 0.6374, "grad_norm": 1.012023687362671, "learning_rate": 0.0002, "epoch": 3.6122082585278275, "step": 50300}, {"loss": 0.6896, "grad_norm": 1.0701122283935547, "learning_rate": 0.0002, "epoch": 3.612926391382406, "step": 50310}, {"loss": 0.6474, "grad_norm": 0.8270810842514038, "learning_rate": 0.0002, "epoch": 3.613644524236984, "step": 50320}, {"loss": 0.6667, "grad_norm": 0.8881328105926514, "learning_rate": 0.0002, "epoch": 3.614362657091562, "step": 50330}, {"loss": 0.6517, "grad_norm": 0.9536844491958618, "learning_rate": 0.0002, "epoch": 3.61508078994614, "step": 50340}, {"loss": 0.62, "grad_norm": 0.8044326305389404, "learning_rate": 0.0002, "epoch": 3.6157989228007184, "step": 50350}, {"loss": 0.6259, "grad_norm": 0.834591805934906, "learning_rate": 0.0002, "epoch": 3.6165170556552964, "step": 50360}, {"loss": 0.7173, "grad_norm": 0.903752863407135, "learning_rate": 0.0002, "epoch": 3.6172351885098744, "step": 50370}, {"loss": 0.6305, "grad_norm": 0.9148632884025574, "learning_rate": 0.0002, "epoch": 3.6179533213644524, "step": 50380}, {"loss": 0.6624, "grad_norm": 0.9280176162719727, "learning_rate": 0.0002, "epoch": 3.6186714542190304, "step": 50390}, {"loss": 0.6457, "grad_norm": 0.9524136781692505, "learning_rate": 0.0002, "epoch": 3.6193895870736084, "step": 50400}, {"loss": 0.6918, "grad_norm": 1.1751197576522827, "learning_rate": 0.0002, "epoch": 3.620107719928187, "step": 50410}, {"loss": 0.6161, "grad_norm": 1.032279133796692, "learning_rate": 0.0002, "epoch": 3.620825852782765, "step": 50420}, {"loss": 0.6347, "grad_norm": 0.790741503238678, "learning_rate": 0.0002, "epoch": 3.621543985637343, "step": 50430}, {"loss": 0.695, "grad_norm": 0.9584221243858337, "learning_rate": 0.0002, "epoch": 3.6222621184919213, "step": 50440}, {"loss": 0.6393, "grad_norm": 0.7792508006095886, "learning_rate": 0.0002, "epoch": 3.6229802513464993, "step": 50450}, {"loss": 0.6398, "grad_norm": 0.8273448944091797, "learning_rate": 0.0002, "epoch": 3.6236983842010773, "step": 50460}, {"loss": 0.6436, "grad_norm": 0.8001132607460022, "learning_rate": 0.0002, "epoch": 3.6244165170556553, "step": 50470}, {"loss": 0.6499, "grad_norm": 1.077109694480896, "learning_rate": 0.0002, "epoch": 3.6251346499102333, "step": 50480}, {"loss": 0.6587, "grad_norm": 1.111274003982544, "learning_rate": 0.0002, "epoch": 3.6258527827648113, "step": 50490}, {"loss": 0.6842, "grad_norm": 0.7757347822189331, "learning_rate": 0.0002, "epoch": 3.6265709156193897, "step": 50500}, {"loss": 0.6887, "grad_norm": 0.9217049479484558, "learning_rate": 0.0002, "epoch": 3.6272890484739677, "step": 50510}, {"loss": 0.6903, "grad_norm": 0.9362251162528992, "learning_rate": 0.0002, "epoch": 3.6280071813285457, "step": 50520}, {"loss": 0.625, "grad_norm": 0.9435479044914246, "learning_rate": 0.0002, "epoch": 3.6287253141831237, "step": 50530}, {"loss": 0.5869, "grad_norm": 0.7748915553092957, "learning_rate": 0.0002, "epoch": 3.629443447037702, "step": 50540}, {"loss": 0.637, "grad_norm": 0.8238945007324219, "learning_rate": 0.0002, "epoch": 3.63016157989228, "step": 50550}, {"loss": 0.6251, "grad_norm": 0.8421505093574524, "learning_rate": 0.0002, "epoch": 3.630879712746858, "step": 50560}, {"loss": 0.6544, "grad_norm": 1.0272293090820312, "learning_rate": 0.0002, "epoch": 3.631597845601436, "step": 50570}, {"loss": 0.6467, "grad_norm": 0.7643818259239197, "learning_rate": 0.0002, "epoch": 3.632315978456014, "step": 50580}, {"loss": 0.6716, "grad_norm": 0.9756225347518921, "learning_rate": 0.0002, "epoch": 3.6330341113105926, "step": 50590}, {"loss": 0.6534, "grad_norm": 0.9311570525169373, "learning_rate": 0.0002, "epoch": 3.6337522441651706, "step": 50600}, {"loss": 0.6465, "grad_norm": 0.8829827904701233, "learning_rate": 0.0002, "epoch": 3.6344703770197486, "step": 50610}, {"loss": 0.626, "grad_norm": 0.9473454356193542, "learning_rate": 0.0002, "epoch": 3.6351885098743266, "step": 50620}, {"loss": 0.713, "grad_norm": 1.1023668050765991, "learning_rate": 0.0002, "epoch": 3.635906642728905, "step": 50630}, {"loss": 0.6287, "grad_norm": 0.8490299582481384, "learning_rate": 0.0002, "epoch": 3.636624775583483, "step": 50640}, {"loss": 0.6373, "grad_norm": 1.1129392385482788, "learning_rate": 0.0002, "epoch": 3.637342908438061, "step": 50650}, {"loss": 0.7351, "grad_norm": 1.0334501266479492, "learning_rate": 0.0002, "epoch": 3.638061041292639, "step": 50660}, {"loss": 0.69, "grad_norm": 0.8397296667098999, "learning_rate": 0.0002, "epoch": 3.638779174147217, "step": 50670}, {"loss": 0.6075, "grad_norm": 0.7984256744384766, "learning_rate": 0.0002, "epoch": 3.639497307001795, "step": 50680}, {"loss": 0.651, "grad_norm": 1.1182054281234741, "learning_rate": 0.0002, "epoch": 3.6402154398563735, "step": 50690}, {"loss": 0.6511, "grad_norm": 0.8743279576301575, "learning_rate": 0.0002, "epoch": 3.6409335727109515, "step": 50700}, {"loss": 0.6894, "grad_norm": 0.9101628661155701, "learning_rate": 0.0002, "epoch": 3.6416517055655295, "step": 50710}, {"loss": 0.6591, "grad_norm": 0.8866934180259705, "learning_rate": 0.0002, "epoch": 3.642369838420108, "step": 50720}, {"loss": 0.6483, "grad_norm": 0.863945484161377, "learning_rate": 0.0002, "epoch": 3.643087971274686, "step": 50730}, {"loss": 0.6443, "grad_norm": 1.0845744609832764, "learning_rate": 0.0002, "epoch": 3.643806104129264, "step": 50740}, {"loss": 0.6611, "grad_norm": 0.8610911965370178, "learning_rate": 0.0002, "epoch": 3.644524236983842, "step": 50750}, {"loss": 0.6617, "grad_norm": 0.8502625226974487, "learning_rate": 0.0002, "epoch": 3.64524236983842, "step": 50760}, {"loss": 0.6283, "grad_norm": 0.847372829914093, "learning_rate": 0.0002, "epoch": 3.645960502692998, "step": 50770}, {"loss": 0.5724, "grad_norm": 0.8649292588233948, "learning_rate": 0.0002, "epoch": 3.6466786355475764, "step": 50780}, {"loss": 0.6253, "grad_norm": 0.8742905855178833, "learning_rate": 0.0002, "epoch": 3.6473967684021544, "step": 50790}, {"loss": 0.68, "grad_norm": 0.9546048641204834, "learning_rate": 0.0002, "epoch": 3.6481149012567324, "step": 50800}, {"loss": 0.6212, "grad_norm": 0.7893161773681641, "learning_rate": 0.0002, "epoch": 3.6488330341113104, "step": 50810}, {"loss": 0.6328, "grad_norm": 0.9350247979164124, "learning_rate": 0.0002, "epoch": 3.649551166965889, "step": 50820}, {"loss": 0.6893, "grad_norm": 0.772149384021759, "learning_rate": 0.0002, "epoch": 3.650269299820467, "step": 50830}, {"loss": 0.6107, "grad_norm": 0.8281718492507935, "learning_rate": 0.0002, "epoch": 3.650987432675045, "step": 50840}, {"loss": 0.6136, "grad_norm": 0.8063850402832031, "learning_rate": 0.0002, "epoch": 3.651705565529623, "step": 50850}, {"loss": 0.6416, "grad_norm": 0.8101351261138916, "learning_rate": 0.0002, "epoch": 3.652423698384201, "step": 50860}, {"loss": 0.6636, "grad_norm": 0.8747833371162415, "learning_rate": 0.0002, "epoch": 3.6531418312387793, "step": 50870}, {"loss": 0.6575, "grad_norm": 0.9634656310081482, "learning_rate": 0.0002, "epoch": 3.6538599640933573, "step": 50880}, {"loss": 0.6227, "grad_norm": 1.1646045446395874, "learning_rate": 0.0002, "epoch": 3.6545780969479353, "step": 50890}, {"loss": 0.6628, "grad_norm": 0.8538454174995422, "learning_rate": 0.0002, "epoch": 3.6552962298025133, "step": 50900}, {"loss": 0.6488, "grad_norm": 0.7639184594154358, "learning_rate": 0.0002, "epoch": 3.656014362657092, "step": 50910}, {"loss": 0.6495, "grad_norm": 0.8750212788581848, "learning_rate": 0.0002, "epoch": 3.65673249551167, "step": 50920}, {"loss": 0.6601, "grad_norm": 0.9161198735237122, "learning_rate": 0.0002, "epoch": 3.657450628366248, "step": 50930}, {"loss": 0.6809, "grad_norm": 0.7987924814224243, "learning_rate": 0.0002, "epoch": 3.658168761220826, "step": 50940}, {"loss": 0.6228, "grad_norm": 0.8939290642738342, "learning_rate": 0.0002, "epoch": 3.658886894075404, "step": 50950}, {"loss": 0.687, "grad_norm": 0.9803797602653503, "learning_rate": 0.0002, "epoch": 3.659605026929982, "step": 50960}, {"loss": 0.6368, "grad_norm": 1.2423512935638428, "learning_rate": 0.0002, "epoch": 3.6603231597845602, "step": 50970}, {"loss": 0.6477, "grad_norm": 1.0023225545883179, "learning_rate": 0.0002, "epoch": 3.6610412926391382, "step": 50980}, {"loss": 0.6659, "grad_norm": 0.9066677689552307, "learning_rate": 0.0002, "epoch": 3.6617594254937162, "step": 50990}, {"loss": 0.6348, "grad_norm": 0.8906226754188538, "learning_rate": 0.0002, "epoch": 3.6624775583482947, "step": 51000}, {"loss": 0.5967, "grad_norm": 0.7449954152107239, "learning_rate": 0.0002, "epoch": 3.6631956912028727, "step": 51010}, {"loss": 0.6167, "grad_norm": 0.812612771987915, "learning_rate": 0.0002, "epoch": 3.6639138240574507, "step": 51020}, {"loss": 0.6414, "grad_norm": 0.861818253993988, "learning_rate": 0.0002, "epoch": 3.6646319569120287, "step": 51030}, {"loss": 0.6418, "grad_norm": 0.849726676940918, "learning_rate": 0.0002, "epoch": 3.6653500897666067, "step": 51040}, {"loss": 0.6613, "grad_norm": 0.9738494753837585, "learning_rate": 0.0002, "epoch": 3.6660682226211847, "step": 51050}, {"loss": 0.6094, "grad_norm": 0.928989827632904, "learning_rate": 0.0002, "epoch": 3.666786355475763, "step": 51060}, {"loss": 0.623, "grad_norm": 0.9725563526153564, "learning_rate": 0.0002, "epoch": 3.667504488330341, "step": 51070}, {"loss": 0.5967, "grad_norm": 0.9366095066070557, "learning_rate": 0.0002, "epoch": 3.668222621184919, "step": 51080}, {"loss": 0.6175, "grad_norm": 0.8012986779212952, "learning_rate": 0.0002, "epoch": 3.668940754039497, "step": 51090}, {"loss": 0.6428, "grad_norm": 1.0646892786026, "learning_rate": 0.0002, "epoch": 3.6696588868940756, "step": 51100}, {"loss": 0.6333, "grad_norm": 0.7245157361030579, "learning_rate": 0.0002, "epoch": 3.6703770197486536, "step": 51110}, {"loss": 0.6618, "grad_norm": 0.6938936114311218, "learning_rate": 0.0002, "epoch": 3.6710951526032316, "step": 51120}, {"loss": 0.6511, "grad_norm": 0.8461366295814514, "learning_rate": 0.0002, "epoch": 3.6718132854578096, "step": 51130}, {"loss": 0.6168, "grad_norm": 0.8392583131790161, "learning_rate": 0.0002, "epoch": 3.6725314183123876, "step": 51140}, {"loss": 0.6616, "grad_norm": 0.7245259284973145, "learning_rate": 0.0002, "epoch": 3.673249551166966, "step": 51150}, {"loss": 0.6165, "grad_norm": 1.0742167234420776, "learning_rate": 0.0002, "epoch": 3.673967684021544, "step": 51160}, {"loss": 0.6805, "grad_norm": 0.9553889036178589, "learning_rate": 0.0002, "epoch": 3.674685816876122, "step": 51170}, {"loss": 0.6065, "grad_norm": 0.8713715672492981, "learning_rate": 0.0002, "epoch": 3.6754039497307, "step": 51180}, {"loss": 0.599, "grad_norm": 0.7499800324440002, "learning_rate": 0.0002, "epoch": 3.6761220825852785, "step": 51190}, {"loss": 0.7143, "grad_norm": 1.1118139028549194, "learning_rate": 0.0002, "epoch": 3.6768402154398565, "step": 51200}, {"loss": 0.6694, "grad_norm": 0.8146613836288452, "learning_rate": 0.0002, "epoch": 3.6775583482944345, "step": 51210}, {"loss": 0.6528, "grad_norm": 0.9331285357475281, "learning_rate": 0.0002, "epoch": 3.6782764811490125, "step": 51220}, {"loss": 0.6429, "grad_norm": 1.0497597455978394, "learning_rate": 0.0002, "epoch": 3.6789946140035905, "step": 51230}, {"loss": 0.6404, "grad_norm": 0.879814863204956, "learning_rate": 0.0002, "epoch": 3.6797127468581685, "step": 51240}, {"loss": 0.6617, "grad_norm": 0.9896606802940369, "learning_rate": 0.0002, "epoch": 3.680430879712747, "step": 51250}, {"loss": 0.6461, "grad_norm": 0.928236186504364, "learning_rate": 0.0002, "epoch": 3.681149012567325, "step": 51260}, {"loss": 0.6516, "grad_norm": 0.8436732292175293, "learning_rate": 0.0002, "epoch": 3.681867145421903, "step": 51270}, {"loss": 0.6428, "grad_norm": 0.93634432554245, "learning_rate": 0.0002, "epoch": 3.6825852782764814, "step": 51280}, {"loss": 0.6081, "grad_norm": 0.8477143049240112, "learning_rate": 0.0002, "epoch": 3.6833034111310594, "step": 51290}, {"loss": 0.6536, "grad_norm": 0.8720934987068176, "learning_rate": 0.0002, "epoch": 3.6840215439856374, "step": 51300}, {"loss": 0.6523, "grad_norm": 0.7322931289672852, "learning_rate": 0.0002, "epoch": 3.6847396768402154, "step": 51310}, {"loss": 0.6475, "grad_norm": 1.0064427852630615, "learning_rate": 0.0002, "epoch": 3.6854578096947934, "step": 51320}, {"loss": 0.681, "grad_norm": 1.0197817087173462, "learning_rate": 0.0002, "epoch": 3.6861759425493714, "step": 51330}, {"loss": 0.5904, "grad_norm": 0.8764060139656067, "learning_rate": 0.0002, "epoch": 3.68689407540395, "step": 51340}, {"loss": 0.625, "grad_norm": 0.9763964414596558, "learning_rate": 0.0002, "epoch": 3.687612208258528, "step": 51350}, {"loss": 0.6299, "grad_norm": 0.8389105200767517, "learning_rate": 0.0002, "epoch": 3.688330341113106, "step": 51360}, {"loss": 0.6885, "grad_norm": 0.9215750694274902, "learning_rate": 0.0002, "epoch": 3.689048473967684, "step": 51370}, {"loss": 0.6325, "grad_norm": 0.8444913625717163, "learning_rate": 0.0002, "epoch": 3.6897666068222623, "step": 51380}, {"loss": 0.657, "grad_norm": 0.9635153412818909, "learning_rate": 0.0002, "epoch": 3.6904847396768403, "step": 51390}, {"loss": 0.7045, "grad_norm": 1.0397378206253052, "learning_rate": 0.0002, "epoch": 3.6912028725314183, "step": 51400}, {"loss": 0.6635, "grad_norm": 0.9154748320579529, "learning_rate": 0.0002, "epoch": 3.6919210053859963, "step": 51410}, {"loss": 0.6757, "grad_norm": 0.906445324420929, "learning_rate": 0.0002, "epoch": 3.6926391382405743, "step": 51420}, {"loss": 0.6533, "grad_norm": 0.9237992763519287, "learning_rate": 0.0002, "epoch": 3.6933572710951523, "step": 51430}, {"loss": 0.6257, "grad_norm": 0.8796338438987732, "learning_rate": 0.0002, "epoch": 3.6940754039497308, "step": 51440}, {"loss": 0.7063, "grad_norm": 0.8613203763961792, "learning_rate": 0.0002, "epoch": 3.6947935368043088, "step": 51450}, {"loss": 0.6455, "grad_norm": 0.7957607507705688, "learning_rate": 0.0002, "epoch": 3.6955116696588868, "step": 51460}, {"loss": 0.6328, "grad_norm": 0.9183711409568787, "learning_rate": 0.0002, "epoch": 3.6962298025134652, "step": 51470}, {"loss": 0.6289, "grad_norm": 1.0108308792114258, "learning_rate": 0.0002, "epoch": 3.6969479353680432, "step": 51480}, {"loss": 0.668, "grad_norm": 0.7768247127532959, "learning_rate": 0.0002, "epoch": 3.6976660682226212, "step": 51490}, {"loss": 0.6483, "grad_norm": 1.0051485300064087, "learning_rate": 0.0002, "epoch": 3.6983842010771992, "step": 51500}, {"loss": 0.6268, "grad_norm": 0.82451993227005, "learning_rate": 0.0002, "epoch": 3.6991023339317772, "step": 51510}, {"loss": 0.6258, "grad_norm": 0.9542286992073059, "learning_rate": 0.0002, "epoch": 3.6998204667863552, "step": 51520}, {"loss": 0.6415, "grad_norm": 0.693890392780304, "learning_rate": 0.0002, "epoch": 3.7005385996409337, "step": 51530}, {"loss": 0.6445, "grad_norm": 0.9068924784660339, "learning_rate": 0.0002, "epoch": 3.7012567324955117, "step": 51540}, {"loss": 0.6386, "grad_norm": 0.8694922924041748, "learning_rate": 0.0002, "epoch": 3.7019748653500897, "step": 51550}, {"loss": 0.6563, "grad_norm": 0.941081702709198, "learning_rate": 0.0002, "epoch": 3.702692998204668, "step": 51560}, {"loss": 0.6068, "grad_norm": 0.7385984659194946, "learning_rate": 0.0002, "epoch": 3.703411131059246, "step": 51570}, {"loss": 0.6243, "grad_norm": 1.0399216413497925, "learning_rate": 0.0002, "epoch": 3.704129263913824, "step": 51580}, {"loss": 0.6776, "grad_norm": 0.9802294969558716, "learning_rate": 0.0002, "epoch": 3.704847396768402, "step": 51590}, {"loss": 0.6243, "grad_norm": 1.0409669876098633, "learning_rate": 0.0002, "epoch": 3.70556552962298, "step": 51600}, {"loss": 0.6812, "grad_norm": 0.8972786068916321, "learning_rate": 0.0002, "epoch": 3.706283662477558, "step": 51610}, {"loss": 0.5993, "grad_norm": 1.1916245222091675, "learning_rate": 0.0002, "epoch": 3.7070017953321366, "step": 51620}, {"loss": 0.6566, "grad_norm": 0.9545385241508484, "learning_rate": 0.0002, "epoch": 3.7077199281867146, "step": 51630}, {"loss": 0.6497, "grad_norm": 1.0773427486419678, "learning_rate": 0.0002, "epoch": 3.7084380610412926, "step": 51640}, {"loss": 0.6768, "grad_norm": 1.0856024026870728, "learning_rate": 0.0002, "epoch": 3.7091561938958706, "step": 51650}, {"loss": 0.6404, "grad_norm": 0.7678500413894653, "learning_rate": 0.0002, "epoch": 3.709874326750449, "step": 51660}, {"loss": 0.6571, "grad_norm": 0.7276270985603333, "learning_rate": 0.0002, "epoch": 3.710592459605027, "step": 51670}, {"loss": 0.6498, "grad_norm": 0.8859017491340637, "learning_rate": 0.0002, "epoch": 3.711310592459605, "step": 51680}, {"loss": 0.6602, "grad_norm": 0.9037614464759827, "learning_rate": 0.0002, "epoch": 3.712028725314183, "step": 51690}, {"loss": 0.685, "grad_norm": 0.9223412275314331, "learning_rate": 0.0002, "epoch": 3.712746858168761, "step": 51700}, {"loss": 0.647, "grad_norm": 0.8812923431396484, "learning_rate": 0.0002, "epoch": 3.713464991023339, "step": 51710}, {"loss": 0.6546, "grad_norm": 0.8242456912994385, "learning_rate": 0.0002, "epoch": 3.7141831238779175, "step": 51720}, {"loss": 0.6462, "grad_norm": 0.8368834257125854, "learning_rate": 0.0002, "epoch": 3.7149012567324955, "step": 51730}, {"loss": 0.6432, "grad_norm": 0.8624704480171204, "learning_rate": 0.0002, "epoch": 3.7156193895870735, "step": 51740}, {"loss": 0.6367, "grad_norm": 0.9138273596763611, "learning_rate": 0.0002, "epoch": 3.716337522441652, "step": 51750}, {"loss": 0.6717, "grad_norm": 0.8088571429252625, "learning_rate": 0.0002, "epoch": 3.71705565529623, "step": 51760}, {"loss": 0.658, "grad_norm": 0.882808268070221, "learning_rate": 0.0002, "epoch": 3.717773788150808, "step": 51770}, {"loss": 0.6686, "grad_norm": 0.9368035197257996, "learning_rate": 0.0002, "epoch": 3.718491921005386, "step": 51780}, {"loss": 0.6482, "grad_norm": 0.8341794013977051, "learning_rate": 0.0002, "epoch": 3.719210053859964, "step": 51790}, {"loss": 0.6486, "grad_norm": 0.8692073225975037, "learning_rate": 0.0002, "epoch": 3.719928186714542, "step": 51800}, {"loss": 0.6591, "grad_norm": 0.7566918730735779, "learning_rate": 0.0002, "epoch": 3.7206463195691204, "step": 51810}, {"loss": 0.707, "grad_norm": 1.113138198852539, "learning_rate": 0.0002, "epoch": 3.7213644524236984, "step": 51820}, {"loss": 0.6683, "grad_norm": 0.8793158531188965, "learning_rate": 0.0002, "epoch": 3.7220825852782764, "step": 51830}, {"loss": 0.6343, "grad_norm": 0.8856439590454102, "learning_rate": 0.0002, "epoch": 3.722800718132855, "step": 51840}, {"loss": 0.6238, "grad_norm": 1.0182029008865356, "learning_rate": 0.0002, "epoch": 3.723518850987433, "step": 51850}, {"loss": 0.6743, "grad_norm": 1.1177181005477905, "learning_rate": 0.0002, "epoch": 3.724236983842011, "step": 51860}, {"loss": 0.6477, "grad_norm": 0.6600990295410156, "learning_rate": 0.0002, "epoch": 3.724955116696589, "step": 51870}, {"loss": 0.6532, "grad_norm": 1.0563536882400513, "learning_rate": 0.0002, "epoch": 3.725673249551167, "step": 51880}, {"loss": 0.6648, "grad_norm": 1.1067734956741333, "learning_rate": 0.0002, "epoch": 3.726391382405745, "step": 51890}, {"loss": 0.6547, "grad_norm": 1.0204616785049438, "learning_rate": 0.0002, "epoch": 3.7271095152603233, "step": 51900}, {"loss": 0.685, "grad_norm": 0.8647155165672302, "learning_rate": 0.0002, "epoch": 3.7278276481149013, "step": 51910}, {"loss": 0.739, "grad_norm": 1.0754971504211426, "learning_rate": 0.0002, "epoch": 3.7285457809694793, "step": 51920}, {"loss": 0.6535, "grad_norm": 1.0448992252349854, "learning_rate": 0.0002, "epoch": 3.7292639138240573, "step": 51930}, {"loss": 0.6802, "grad_norm": 0.963434100151062, "learning_rate": 0.0002, "epoch": 3.7299820466786358, "step": 51940}, {"loss": 0.6367, "grad_norm": 0.8112701773643494, "learning_rate": 0.0002, "epoch": 3.7307001795332138, "step": 51950}, {"loss": 0.6785, "grad_norm": 0.7975119948387146, "learning_rate": 0.0002, "epoch": 3.7314183123877918, "step": 51960}, {"loss": 0.6748, "grad_norm": 0.7953376173973083, "learning_rate": 0.0002, "epoch": 3.7321364452423698, "step": 51970}, {"loss": 0.6464, "grad_norm": 0.9519981741905212, "learning_rate": 0.0002, "epoch": 3.7328545780969478, "step": 51980}, {"loss": 0.6247, "grad_norm": 0.8705791234970093, "learning_rate": 0.0002, "epoch": 3.7335727109515258, "step": 51990}, {"loss": 0.6876, "grad_norm": 0.870205283164978, "learning_rate": 0.0002, "epoch": 3.734290843806104, "step": 52000}, {"loss": 0.6681, "grad_norm": 0.9558930993080139, "learning_rate": 0.0002, "epoch": 3.735008976660682, "step": 52010}, {"loss": 0.6772, "grad_norm": 0.9330434799194336, "learning_rate": 0.0002, "epoch": 3.73572710951526, "step": 52020}, {"loss": 0.6365, "grad_norm": 0.783620297908783, "learning_rate": 0.0002, "epoch": 3.7364452423698387, "step": 52030}, {"loss": 0.6275, "grad_norm": 0.7575166821479797, "learning_rate": 0.0002, "epoch": 3.7371633752244167, "step": 52040}, {"loss": 0.6859, "grad_norm": 1.0592705011367798, "learning_rate": 0.0002, "epoch": 3.7378815080789947, "step": 52050}, {"loss": 0.6704, "grad_norm": 0.9309433102607727, "learning_rate": 0.0002, "epoch": 3.7385996409335727, "step": 52060}, {"loss": 0.6607, "grad_norm": 0.972861647605896, "learning_rate": 0.0002, "epoch": 3.7393177737881507, "step": 52070}, {"loss": 0.6267, "grad_norm": 0.9318740963935852, "learning_rate": 0.0002, "epoch": 3.7400359066427287, "step": 52080}, {"loss": 0.6404, "grad_norm": 0.7938477396965027, "learning_rate": 0.0002, "epoch": 3.740754039497307, "step": 52090}, {"loss": 0.6451, "grad_norm": 1.1515966653823853, "learning_rate": 0.0002, "epoch": 3.741472172351885, "step": 52100}, {"loss": 0.6179, "grad_norm": 1.076869010925293, "learning_rate": 0.0002, "epoch": 3.742190305206463, "step": 52110}, {"loss": 0.6477, "grad_norm": 0.8516066670417786, "learning_rate": 0.0002, "epoch": 3.7429084380610416, "step": 52120}, {"loss": 0.6741, "grad_norm": 0.6853429079055786, "learning_rate": 0.0002, "epoch": 3.7436265709156196, "step": 52130}, {"loss": 0.6392, "grad_norm": 0.8179695010185242, "learning_rate": 0.0002, "epoch": 3.7443447037701976, "step": 52140}, {"loss": 0.6692, "grad_norm": 0.8395232558250427, "learning_rate": 0.0002, "epoch": 3.7450628366247756, "step": 52150}, {"loss": 0.6902, "grad_norm": 1.0178003311157227, "learning_rate": 0.0002, "epoch": 3.7457809694793536, "step": 52160}, {"loss": 0.6726, "grad_norm": 1.1801023483276367, "learning_rate": 0.0002, "epoch": 3.7464991023339316, "step": 52170}, {"loss": 0.6334, "grad_norm": 0.8215751647949219, "learning_rate": 0.0002, "epoch": 3.74721723518851, "step": 52180}, {"loss": 0.5992, "grad_norm": 1.17083740234375, "learning_rate": 0.0002, "epoch": 3.747935368043088, "step": 52190}, {"loss": 0.6219, "grad_norm": 0.9230290651321411, "learning_rate": 0.0002, "epoch": 3.748653500897666, "step": 52200}, {"loss": 0.6503, "grad_norm": 0.8431521058082581, "learning_rate": 0.0002, "epoch": 3.749371633752244, "step": 52210}, {"loss": 0.6983, "grad_norm": 0.9690840244293213, "learning_rate": 0.0002, "epoch": 3.7500897666068225, "step": 52220}, {"loss": 0.6204, "grad_norm": 1.0022395849227905, "learning_rate": 0.0002, "epoch": 3.7508078994614005, "step": 52230}, {"loss": 0.6683, "grad_norm": 1.0489065647125244, "learning_rate": 0.0002, "epoch": 3.7515260323159785, "step": 52240}, {"loss": 0.6439, "grad_norm": 0.7880696058273315, "learning_rate": 0.0002, "epoch": 3.7522441651705565, "step": 52250}, {"loss": 0.6933, "grad_norm": 1.0255829095840454, "learning_rate": 0.0002, "epoch": 3.7529622980251345, "step": 52260}, {"loss": 0.6631, "grad_norm": 0.8470141291618347, "learning_rate": 0.0002, "epoch": 3.7536804308797125, "step": 52270}, {"loss": 0.5956, "grad_norm": 0.9040523171424866, "learning_rate": 0.0002, "epoch": 3.754398563734291, "step": 52280}, {"loss": 0.6759, "grad_norm": 0.9564392566680908, "learning_rate": 0.0002, "epoch": 3.755116696588869, "step": 52290}, {"loss": 0.6717, "grad_norm": 0.907857358455658, "learning_rate": 0.0002, "epoch": 3.755834829443447, "step": 52300}, {"loss": 0.6821, "grad_norm": 0.8929873704910278, "learning_rate": 0.0002, "epoch": 3.7565529622980254, "step": 52310}, {"loss": 0.655, "grad_norm": 0.854434072971344, "learning_rate": 0.0002, "epoch": 3.7572710951526034, "step": 52320}, {"loss": 0.6668, "grad_norm": 0.8744779229164124, "learning_rate": 0.0002, "epoch": 3.7579892280071814, "step": 52330}, {"loss": 0.6628, "grad_norm": 0.9022667407989502, "learning_rate": 0.0002, "epoch": 3.7587073608617594, "step": 52340}, {"loss": 0.6275, "grad_norm": 0.8884857892990112, "learning_rate": 0.0002, "epoch": 3.7594254937163374, "step": 52350}, {"loss": 0.6585, "grad_norm": 1.0228430032730103, "learning_rate": 0.0002, "epoch": 3.7601436265709154, "step": 52360}, {"loss": 0.6092, "grad_norm": 0.8593528270721436, "learning_rate": 0.0002, "epoch": 3.760861759425494, "step": 52370}, {"loss": 0.664, "grad_norm": 0.9435563087463379, "learning_rate": 0.0002, "epoch": 3.761579892280072, "step": 52380}, {"loss": 0.6326, "grad_norm": 0.7545679807662964, "learning_rate": 0.0002, "epoch": 3.76229802513465, "step": 52390}, {"loss": 0.6628, "grad_norm": 0.9411585927009583, "learning_rate": 0.0002, "epoch": 3.7630161579892283, "step": 52400}, {"loss": 0.62, "grad_norm": 0.9764377474784851, "learning_rate": 0.0002, "epoch": 3.7637342908438063, "step": 52410}, {"loss": 0.671, "grad_norm": 1.0718384981155396, "learning_rate": 0.0002, "epoch": 3.7644524236983843, "step": 52420}, {"loss": 0.6654, "grad_norm": 0.8765230774879456, "learning_rate": 0.0002, "epoch": 3.7651705565529623, "step": 52430}, {"loss": 0.6602, "grad_norm": 0.9275036454200745, "learning_rate": 0.0002, "epoch": 3.7658886894075403, "step": 52440}, {"loss": 0.6098, "grad_norm": 0.967410147190094, "learning_rate": 0.0002, "epoch": 3.7666068222621183, "step": 52450}, {"loss": 0.6195, "grad_norm": 0.7738949060440063, "learning_rate": 0.0002, "epoch": 3.7673249551166967, "step": 52460}, {"loss": 0.6054, "grad_norm": 1.0828070640563965, "learning_rate": 0.0002, "epoch": 3.7680430879712747, "step": 52470}, {"loss": 0.6208, "grad_norm": 0.9570213556289673, "learning_rate": 0.0002, "epoch": 3.7687612208258527, "step": 52480}, {"loss": 0.6703, "grad_norm": 1.0688215494155884, "learning_rate": 0.0002, "epoch": 3.7694793536804307, "step": 52490}, {"loss": 0.5993, "grad_norm": 0.7970073223114014, "learning_rate": 0.0002, "epoch": 3.770197486535009, "step": 52500}, {"loss": 0.6537, "grad_norm": 0.7132976651191711, "learning_rate": 0.0002, "epoch": 3.770915619389587, "step": 52510}, {"loss": 0.6571, "grad_norm": 1.152268648147583, "learning_rate": 0.0002, "epoch": 3.771633752244165, "step": 52520}, {"loss": 0.6548, "grad_norm": 0.8645235896110535, "learning_rate": 0.0002, "epoch": 3.772351885098743, "step": 52530}, {"loss": 0.6918, "grad_norm": 0.7725570201873779, "learning_rate": 0.0002, "epoch": 3.773070017953321, "step": 52540}, {"loss": 0.6796, "grad_norm": 0.9718102812767029, "learning_rate": 0.0002, "epoch": 3.773788150807899, "step": 52550}, {"loss": 0.6298, "grad_norm": 0.7568017840385437, "learning_rate": 0.0002, "epoch": 3.7745062836624776, "step": 52560}, {"loss": 0.6652, "grad_norm": 0.9578912854194641, "learning_rate": 0.0002, "epoch": 3.7752244165170556, "step": 52570}, {"loss": 0.6417, "grad_norm": 0.8657314777374268, "learning_rate": 0.0002, "epoch": 3.7759425493716336, "step": 52580}, {"loss": 0.6552, "grad_norm": 0.7564393281936646, "learning_rate": 0.0002, "epoch": 3.776660682226212, "step": 52590}, {"loss": 0.69, "grad_norm": 0.7631160616874695, "learning_rate": 0.0002, "epoch": 3.77737881508079, "step": 52600}, {"loss": 0.6427, "grad_norm": 1.1852056980133057, "learning_rate": 0.0002, "epoch": 3.778096947935368, "step": 52610}, {"loss": 0.6369, "grad_norm": 1.0620790719985962, "learning_rate": 0.0002, "epoch": 3.778815080789946, "step": 52620}, {"loss": 0.6782, "grad_norm": 0.8677777647972107, "learning_rate": 0.0002, "epoch": 3.779533213644524, "step": 52630}, {"loss": 0.6249, "grad_norm": 0.9913218021392822, "learning_rate": 0.0002, "epoch": 3.780251346499102, "step": 52640}, {"loss": 0.625, "grad_norm": 0.9868429899215698, "learning_rate": 0.0002, "epoch": 3.7809694793536806, "step": 52650}, {"loss": 0.6252, "grad_norm": 0.8791782259941101, "learning_rate": 0.0002, "epoch": 3.7816876122082586, "step": 52660}, {"loss": 0.6675, "grad_norm": 0.9503955245018005, "learning_rate": 0.0002, "epoch": 3.7824057450628366, "step": 52670}, {"loss": 0.6406, "grad_norm": 0.8647131323814392, "learning_rate": 0.0002, "epoch": 3.7831238779174146, "step": 52680}, {"loss": 0.6654, "grad_norm": 0.9819629788398743, "learning_rate": 0.0002, "epoch": 3.783842010771993, "step": 52690}, {"loss": 0.593, "grad_norm": 0.8548610210418701, "learning_rate": 0.0002, "epoch": 3.784560143626571, "step": 52700}, {"loss": 0.6614, "grad_norm": 0.8706230521202087, "learning_rate": 0.0002, "epoch": 3.785278276481149, "step": 52710}, {"loss": 0.6326, "grad_norm": 1.0032461881637573, "learning_rate": 0.0002, "epoch": 3.785996409335727, "step": 52720}, {"loss": 0.6172, "grad_norm": 1.0578246116638184, "learning_rate": 0.0002, "epoch": 3.786714542190305, "step": 52730}, {"loss": 0.6392, "grad_norm": 0.9854007363319397, "learning_rate": 0.0002, "epoch": 3.7874326750448835, "step": 52740}, {"loss": 0.6462, "grad_norm": 0.8389187455177307, "learning_rate": 0.0002, "epoch": 3.7881508078994615, "step": 52750}, {"loss": 0.6515, "grad_norm": 0.9192399978637695, "learning_rate": 0.0002, "epoch": 3.7888689407540395, "step": 52760}, {"loss": 0.6436, "grad_norm": 0.9518283605575562, "learning_rate": 0.0002, "epoch": 3.7895870736086175, "step": 52770}, {"loss": 0.6548, "grad_norm": 1.1296825408935547, "learning_rate": 0.0002, "epoch": 3.790305206463196, "step": 52780}, {"loss": 0.6073, "grad_norm": 1.0589144229888916, "learning_rate": 0.0002, "epoch": 3.791023339317774, "step": 52790}, {"loss": 0.6593, "grad_norm": 0.8954343199729919, "learning_rate": 0.0002, "epoch": 3.791741472172352, "step": 52800}, {"loss": 0.6678, "grad_norm": 0.8283370733261108, "learning_rate": 0.0002, "epoch": 3.79245960502693, "step": 52810}, {"loss": 0.6865, "grad_norm": 0.910642683506012, "learning_rate": 0.0002, "epoch": 3.793177737881508, "step": 52820}, {"loss": 0.6672, "grad_norm": 0.9255108833312988, "learning_rate": 0.0002, "epoch": 3.793895870736086, "step": 52830}, {"loss": 0.6836, "grad_norm": 0.8773723244667053, "learning_rate": 0.0002, "epoch": 3.7946140035906644, "step": 52840}, {"loss": 0.6815, "grad_norm": 0.8454240560531616, "learning_rate": 0.0002, "epoch": 3.7953321364452424, "step": 52850}, {"loss": 0.6594, "grad_norm": 0.7636052966117859, "learning_rate": 0.0002, "epoch": 3.7960502692998204, "step": 52860}, {"loss": 0.6663, "grad_norm": 0.9358382821083069, "learning_rate": 0.0002, "epoch": 3.796768402154399, "step": 52870}, {"loss": 0.6761, "grad_norm": 0.9662801623344421, "learning_rate": 0.0002, "epoch": 3.797486535008977, "step": 52880}, {"loss": 0.6749, "grad_norm": 0.995907187461853, "learning_rate": 0.0002, "epoch": 3.798204667863555, "step": 52890}, {"loss": 0.6715, "grad_norm": 0.8700127005577087, "learning_rate": 0.0002, "epoch": 3.798922800718133, "step": 52900}, {"loss": 0.6554, "grad_norm": 0.8987792134284973, "learning_rate": 0.0002, "epoch": 3.799640933572711, "step": 52910}, {"loss": 0.6655, "grad_norm": 0.9753904938697815, "learning_rate": 0.0002, "epoch": 3.800359066427289, "step": 52920}, {"loss": 0.6536, "grad_norm": 0.7873555421829224, "learning_rate": 0.0002, "epoch": 3.8010771992818673, "step": 52930}, {"loss": 0.6233, "grad_norm": 0.8177929520606995, "learning_rate": 0.0002, "epoch": 3.8017953321364453, "step": 52940}, {"loss": 0.6508, "grad_norm": 0.8865532279014587, "learning_rate": 0.0002, "epoch": 3.8025134649910233, "step": 52950}, {"loss": 0.6922, "grad_norm": 0.9113775491714478, "learning_rate": 0.0002, "epoch": 3.8032315978456013, "step": 52960}, {"loss": 0.6382, "grad_norm": 0.9424585700035095, "learning_rate": 0.0002, "epoch": 3.8039497307001797, "step": 52970}, {"loss": 0.6694, "grad_norm": 0.8347237706184387, "learning_rate": 0.0002, "epoch": 3.8046678635547577, "step": 52980}, {"loss": 0.643, "grad_norm": 0.826863169670105, "learning_rate": 0.0002, "epoch": 3.8053859964093357, "step": 52990}, {"loss": 0.639, "grad_norm": 0.7313310503959656, "learning_rate": 0.0002, "epoch": 3.8061041292639137, "step": 53000}, {"loss": 0.6831, "grad_norm": 0.8352667093276978, "learning_rate": 0.0002, "epoch": 3.8068222621184917, "step": 53010}, {"loss": 0.6265, "grad_norm": 0.748461127281189, "learning_rate": 0.0002, "epoch": 3.80754039497307, "step": 53020}, {"loss": 0.6433, "grad_norm": 0.943256139755249, "learning_rate": 0.0002, "epoch": 3.808258527827648, "step": 53030}, {"loss": 0.6702, "grad_norm": 1.0448410511016846, "learning_rate": 0.0002, "epoch": 3.808976660682226, "step": 53040}, {"loss": 0.6901, "grad_norm": 0.9047636985778809, "learning_rate": 0.0002, "epoch": 3.809694793536804, "step": 53050}, {"loss": 0.6774, "grad_norm": 0.8594381213188171, "learning_rate": 0.0002, "epoch": 3.8104129263913826, "step": 53060}, {"loss": 0.6664, "grad_norm": 0.7593536972999573, "learning_rate": 0.0002, "epoch": 3.8111310592459606, "step": 53070}, {"loss": 0.6651, "grad_norm": 0.7189019918441772, "learning_rate": 0.0002, "epoch": 3.8118491921005386, "step": 53080}, {"loss": 0.6657, "grad_norm": 0.8569809198379517, "learning_rate": 0.0002, "epoch": 3.8125673249551166, "step": 53090}, {"loss": 0.6689, "grad_norm": 0.923378050327301, "learning_rate": 0.0002, "epoch": 3.8132854578096946, "step": 53100}, {"loss": 0.6168, "grad_norm": 0.9088824391365051, "learning_rate": 0.0002, "epoch": 3.8140035906642726, "step": 53110}, {"loss": 0.6514, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 3.814721723518851, "step": 53120}, {"loss": 0.6182, "grad_norm": 0.8389552235603333, "learning_rate": 0.0002, "epoch": 3.815439856373429, "step": 53130}, {"loss": 0.6779, "grad_norm": 0.7940975427627563, "learning_rate": 0.0002, "epoch": 3.816157989228007, "step": 53140}, {"loss": 0.6825, "grad_norm": 0.8389907479286194, "learning_rate": 0.0002, "epoch": 3.8168761220825855, "step": 53150}, {"loss": 0.6763, "grad_norm": 0.774206280708313, "learning_rate": 0.0002, "epoch": 3.8175942549371635, "step": 53160}, {"loss": 0.7011, "grad_norm": 1.189447283744812, "learning_rate": 0.0002, "epoch": 3.8183123877917415, "step": 53170}, {"loss": 0.6206, "grad_norm": 0.9875882863998413, "learning_rate": 0.0002, "epoch": 3.8190305206463195, "step": 53180}, {"loss": 0.6254, "grad_norm": 0.9205945134162903, "learning_rate": 0.0002, "epoch": 3.8197486535008975, "step": 53190}, {"loss": 0.5845, "grad_norm": 0.8312796354293823, "learning_rate": 0.0002, "epoch": 3.8204667863554755, "step": 53200}, {"loss": 0.6415, "grad_norm": 0.9755756855010986, "learning_rate": 0.0002, "epoch": 3.821184919210054, "step": 53210}, {"loss": 0.6657, "grad_norm": 1.0722965002059937, "learning_rate": 0.0002, "epoch": 3.821903052064632, "step": 53220}, {"loss": 0.6547, "grad_norm": 0.7720510959625244, "learning_rate": 0.0002, "epoch": 3.82262118491921, "step": 53230}, {"loss": 0.6383, "grad_norm": 1.020147681236267, "learning_rate": 0.0002, "epoch": 3.823339317773788, "step": 53240}, {"loss": 0.6491, "grad_norm": 0.8241816759109497, "learning_rate": 0.0002, "epoch": 3.8240574506283664, "step": 53250}, {"loss": 0.6914, "grad_norm": 0.8939895629882812, "learning_rate": 0.0002, "epoch": 3.8247755834829444, "step": 53260}, {"loss": 0.6725, "grad_norm": 1.010852336883545, "learning_rate": 0.0002, "epoch": 3.8254937163375224, "step": 53270}, {"loss": 0.6841, "grad_norm": 0.8201420307159424, "learning_rate": 0.0002, "epoch": 3.8262118491921004, "step": 53280}, {"loss": 0.6739, "grad_norm": 0.8797973990440369, "learning_rate": 0.0002, "epoch": 3.8269299820466784, "step": 53290}, {"loss": 0.658, "grad_norm": 0.9034950137138367, "learning_rate": 0.0002, "epoch": 3.827648114901257, "step": 53300}, {"loss": 0.6314, "grad_norm": 0.926802933216095, "learning_rate": 0.0002, "epoch": 3.828366247755835, "step": 53310}, {"loss": 0.6526, "grad_norm": 1.0205509662628174, "learning_rate": 0.0002, "epoch": 3.829084380610413, "step": 53320}, {"loss": 0.6596, "grad_norm": 0.9524099230766296, "learning_rate": 0.0002, "epoch": 3.829802513464991, "step": 53330}, {"loss": 0.6796, "grad_norm": 0.9692625999450684, "learning_rate": 0.0002, "epoch": 3.8305206463195693, "step": 53340}, {"loss": 0.628, "grad_norm": 0.7255275845527649, "learning_rate": 0.0002, "epoch": 3.8312387791741473, "step": 53350}, {"loss": 0.6104, "grad_norm": 0.7199059724807739, "learning_rate": 0.0002, "epoch": 3.8319569120287253, "step": 53360}, {"loss": 0.6703, "grad_norm": 1.004464864730835, "learning_rate": 0.0002, "epoch": 3.8326750448833034, "step": 53370}, {"loss": 0.7032, "grad_norm": 0.9092583060264587, "learning_rate": 0.0002, "epoch": 3.8333931777378814, "step": 53380}, {"loss": 0.6811, "grad_norm": 0.945091724395752, "learning_rate": 0.0002, "epoch": 3.8341113105924594, "step": 53390}, {"loss": 0.611, "grad_norm": 0.7980135679244995, "learning_rate": 0.0002, "epoch": 3.834829443447038, "step": 53400}, {"loss": 0.6604, "grad_norm": 0.7812868356704712, "learning_rate": 0.0002, "epoch": 3.835547576301616, "step": 53410}, {"loss": 0.6104, "grad_norm": 0.8957077860832214, "learning_rate": 0.0002, "epoch": 3.836265709156194, "step": 53420}, {"loss": 0.6754, "grad_norm": 0.9119600653648376, "learning_rate": 0.0002, "epoch": 3.8369838420107722, "step": 53430}, {"loss": 0.7346, "grad_norm": 0.8208187222480774, "learning_rate": 0.0002, "epoch": 3.8377019748653503, "step": 53440}, {"loss": 0.6549, "grad_norm": 0.7930439114570618, "learning_rate": 0.0002, "epoch": 3.8384201077199283, "step": 53450}, {"loss": 0.6192, "grad_norm": 0.8937777280807495, "learning_rate": 0.0002, "epoch": 3.8391382405745063, "step": 53460}, {"loss": 0.5954, "grad_norm": 0.7583796977996826, "learning_rate": 0.0002, "epoch": 3.8398563734290843, "step": 53470}, {"loss": 0.6217, "grad_norm": 1.0735969543457031, "learning_rate": 0.0002, "epoch": 3.8405745062836623, "step": 53480}, {"loss": 0.6472, "grad_norm": 1.1106033325195312, "learning_rate": 0.0002, "epoch": 3.8412926391382407, "step": 53490}, {"loss": 0.6813, "grad_norm": 1.092631220817566, "learning_rate": 0.0002, "epoch": 3.8420107719928187, "step": 53500}, {"loss": 0.6437, "grad_norm": 0.9961787462234497, "learning_rate": 0.0002, "epoch": 3.8427289048473967, "step": 53510}, {"loss": 0.6382, "grad_norm": 0.833831250667572, "learning_rate": 0.0002, "epoch": 3.8434470377019747, "step": 53520}, {"loss": 0.6403, "grad_norm": 1.0000009536743164, "learning_rate": 0.0002, "epoch": 3.844165170556553, "step": 53530}, {"loss": 0.6824, "grad_norm": 0.9784213304519653, "learning_rate": 0.0002, "epoch": 3.844883303411131, "step": 53540}, {"loss": 0.6816, "grad_norm": 0.8582558035850525, "learning_rate": 0.0002, "epoch": 3.845601436265709, "step": 53550}, {"loss": 0.5944, "grad_norm": 0.8267415761947632, "learning_rate": 0.0002, "epoch": 3.846319569120287, "step": 53560}, {"loss": 0.6562, "grad_norm": 0.8783000111579895, "learning_rate": 0.0002, "epoch": 3.847037701974865, "step": 53570}, {"loss": 0.6795, "grad_norm": 0.9866999983787537, "learning_rate": 0.0002, "epoch": 3.8477558348294436, "step": 53580}, {"loss": 0.7222, "grad_norm": 0.8459296226501465, "learning_rate": 0.0002, "epoch": 3.8484739676840216, "step": 53590}, {"loss": 0.6748, "grad_norm": 0.9804834723472595, "learning_rate": 0.0002, "epoch": 3.8491921005385996, "step": 53600}, {"loss": 0.6115, "grad_norm": 0.951074481010437, "learning_rate": 0.0002, "epoch": 3.8499102333931776, "step": 53610}, {"loss": 0.5914, "grad_norm": 0.8020104169845581, "learning_rate": 0.0002, "epoch": 3.850628366247756, "step": 53620}, {"loss": 0.6237, "grad_norm": 0.9296963214874268, "learning_rate": 0.0002, "epoch": 3.851346499102334, "step": 53630}, {"loss": 0.6384, "grad_norm": 0.8983652591705322, "learning_rate": 0.0002, "epoch": 3.852064631956912, "step": 53640}, {"loss": 0.6855, "grad_norm": 1.031858205795288, "learning_rate": 0.0002, "epoch": 3.85278276481149, "step": 53650}, {"loss": 0.622, "grad_norm": 0.8943952918052673, "learning_rate": 0.0002, "epoch": 3.853500897666068, "step": 53660}, {"loss": 0.6745, "grad_norm": 1.0072312355041504, "learning_rate": 0.0002, "epoch": 3.854219030520646, "step": 53670}, {"loss": 0.677, "grad_norm": 1.0604884624481201, "learning_rate": 0.0002, "epoch": 3.8549371633752245, "step": 53680}, {"loss": 0.5873, "grad_norm": 0.834223210811615, "learning_rate": 0.0002, "epoch": 3.8556552962298025, "step": 53690}, {"loss": 0.665, "grad_norm": 0.9872867465019226, "learning_rate": 0.0002, "epoch": 3.8563734290843805, "step": 53700}, {"loss": 0.6689, "grad_norm": 0.7999459505081177, "learning_rate": 0.0002, "epoch": 3.857091561938959, "step": 53710}, {"loss": 0.6744, "grad_norm": 0.717722475528717, "learning_rate": 0.0002, "epoch": 3.857809694793537, "step": 53720}, {"loss": 0.6348, "grad_norm": 1.0675442218780518, "learning_rate": 0.0002, "epoch": 3.858527827648115, "step": 53730}, {"loss": 0.6141, "grad_norm": 0.9789777398109436, "learning_rate": 0.0002, "epoch": 3.859245960502693, "step": 53740}, {"loss": 0.6455, "grad_norm": 0.9318669438362122, "learning_rate": 0.0002, "epoch": 3.859964093357271, "step": 53750}, {"loss": 0.6587, "grad_norm": 0.9848631024360657, "learning_rate": 0.0002, "epoch": 3.860682226211849, "step": 53760}, {"loss": 0.6202, "grad_norm": 0.8754391670227051, "learning_rate": 0.0002, "epoch": 3.8614003590664274, "step": 53770}, {"loss": 0.6411, "grad_norm": 0.9024585485458374, "learning_rate": 0.0002, "epoch": 3.8621184919210054, "step": 53780}, {"loss": 0.6643, "grad_norm": 0.8974794745445251, "learning_rate": 0.0002, "epoch": 3.8628366247755834, "step": 53790}, {"loss": 0.6729, "grad_norm": 0.8342790603637695, "learning_rate": 0.0002, "epoch": 3.8635547576301614, "step": 53800}, {"loss": 0.6322, "grad_norm": 0.8177682757377625, "learning_rate": 0.0002, "epoch": 3.86427289048474, "step": 53810}, {"loss": 0.6525, "grad_norm": 1.0259089469909668, "learning_rate": 0.0002, "epoch": 3.864991023339318, "step": 53820}, {"loss": 0.6508, "grad_norm": 1.042290210723877, "learning_rate": 0.0002, "epoch": 3.865709156193896, "step": 53830}, {"loss": 0.6963, "grad_norm": 0.7316540479660034, "learning_rate": 0.0002, "epoch": 3.866427289048474, "step": 53840}, {"loss": 0.6491, "grad_norm": 0.9384970664978027, "learning_rate": 0.0002, "epoch": 3.867145421903052, "step": 53850}, {"loss": 0.6689, "grad_norm": 0.9273143410682678, "learning_rate": 0.0002, "epoch": 3.86786355475763, "step": 53860}, {"loss": 0.6443, "grad_norm": 1.1183570623397827, "learning_rate": 0.0002, "epoch": 3.8685816876122083, "step": 53870}, {"loss": 0.6712, "grad_norm": 0.9455275535583496, "learning_rate": 0.0002, "epoch": 3.8692998204667863, "step": 53880}, {"loss": 0.6662, "grad_norm": 0.8702114820480347, "learning_rate": 0.0002, "epoch": 3.8700179533213643, "step": 53890}, {"loss": 0.7032, "grad_norm": 0.8751053214073181, "learning_rate": 0.0002, "epoch": 3.870736086175943, "step": 53900}, {"loss": 0.6398, "grad_norm": 0.9793110489845276, "learning_rate": 0.0002, "epoch": 3.871454219030521, "step": 53910}, {"loss": 0.6577, "grad_norm": 0.9705014824867249, "learning_rate": 0.0002, "epoch": 3.872172351885099, "step": 53920}, {"loss": 0.751, "grad_norm": 1.051504373550415, "learning_rate": 0.0002, "epoch": 3.872890484739677, "step": 53930}, {"loss": 0.6606, "grad_norm": 0.8590622544288635, "learning_rate": 0.0002, "epoch": 3.873608617594255, "step": 53940}, {"loss": 0.6495, "grad_norm": 0.7828099727630615, "learning_rate": 0.0002, "epoch": 3.874326750448833, "step": 53950}, {"loss": 0.6294, "grad_norm": 0.86341792345047, "learning_rate": 0.0002, "epoch": 3.8750448833034112, "step": 53960}, {"loss": 0.6677, "grad_norm": 1.114670991897583, "learning_rate": 0.0002, "epoch": 3.8757630161579892, "step": 53970}, {"loss": 0.6533, "grad_norm": 0.8559519052505493, "learning_rate": 0.0002, "epoch": 3.8764811490125672, "step": 53980}, {"loss": 0.6517, "grad_norm": 1.0518953800201416, "learning_rate": 0.0002, "epoch": 3.8771992818671457, "step": 53990}, {"loss": 0.6359, "grad_norm": 0.7157500982284546, "learning_rate": 0.0002, "epoch": 3.8779174147217237, "step": 54000}, {"loss": 0.6847, "grad_norm": 0.8390372395515442, "learning_rate": 0.0002, "epoch": 3.8786355475763017, "step": 54010}, {"loss": 0.6376, "grad_norm": 0.8486756086349487, "learning_rate": 0.0002, "epoch": 3.8793536804308797, "step": 54020}, {"loss": 0.6184, "grad_norm": 0.8361587524414062, "learning_rate": 0.0002, "epoch": 3.8800718132854577, "step": 54030}, {"loss": 0.6552, "grad_norm": 0.9490554928779602, "learning_rate": 0.0002, "epoch": 3.8807899461400357, "step": 54040}, {"loss": 0.6653, "grad_norm": 1.0311323404312134, "learning_rate": 0.0002, "epoch": 3.881508078994614, "step": 54050}, {"loss": 0.6484, "grad_norm": 0.84800124168396, "learning_rate": 0.0002, "epoch": 3.882226211849192, "step": 54060}, {"loss": 0.6995, "grad_norm": 0.8940879702568054, "learning_rate": 0.0002, "epoch": 3.88294434470377, "step": 54070}, {"loss": 0.6157, "grad_norm": 0.985542356967926, "learning_rate": 0.0002, "epoch": 3.883662477558348, "step": 54080}, {"loss": 0.6221, "grad_norm": 0.8846475481987, "learning_rate": 0.0002, "epoch": 3.8843806104129266, "step": 54090}, {"loss": 0.6656, "grad_norm": 0.9186338186264038, "learning_rate": 0.0002, "epoch": 3.8850987432675046, "step": 54100}, {"loss": 0.6367, "grad_norm": 1.106598973274231, "learning_rate": 0.0002, "epoch": 3.8858168761220826, "step": 54110}, {"loss": 0.6311, "grad_norm": 0.8167300224304199, "learning_rate": 0.0002, "epoch": 3.8865350089766606, "step": 54120}, {"loss": 0.694, "grad_norm": 0.9153622984886169, "learning_rate": 0.0002, "epoch": 3.8872531418312386, "step": 54130}, {"loss": 0.6669, "grad_norm": 0.8464475274085999, "learning_rate": 0.0002, "epoch": 3.8879712746858166, "step": 54140}, {"loss": 0.6658, "grad_norm": 0.8889452815055847, "learning_rate": 0.0002, "epoch": 3.888689407540395, "step": 54150}, {"loss": 0.6291, "grad_norm": 0.7861065864562988, "learning_rate": 0.0002, "epoch": 3.889407540394973, "step": 54160}, {"loss": 0.6315, "grad_norm": 0.882674515247345, "learning_rate": 0.0002, "epoch": 3.890125673249551, "step": 54170}, {"loss": 0.6223, "grad_norm": 0.8503835201263428, "learning_rate": 0.0002, "epoch": 3.8908438061041295, "step": 54180}, {"loss": 0.6176, "grad_norm": 0.888455331325531, "learning_rate": 0.0002, "epoch": 3.8915619389587075, "step": 54190}, {"loss": 0.6985, "grad_norm": 1.0473699569702148, "learning_rate": 0.0002, "epoch": 3.8922800718132855, "step": 54200}, {"loss": 0.6513, "grad_norm": 0.9548208713531494, "learning_rate": 0.0002, "epoch": 3.8929982046678635, "step": 54210}, {"loss": 0.6089, "grad_norm": 0.9158754944801331, "learning_rate": 0.0002, "epoch": 3.8937163375224415, "step": 54220}, {"loss": 0.6352, "grad_norm": 0.9001154899597168, "learning_rate": 0.0002, "epoch": 3.8944344703770195, "step": 54230}, {"loss": 0.6657, "grad_norm": 0.9736626148223877, "learning_rate": 0.0002, "epoch": 3.895152603231598, "step": 54240}, {"loss": 0.7248, "grad_norm": 0.8809846043586731, "learning_rate": 0.0002, "epoch": 3.895870736086176, "step": 54250}, {"loss": 0.6364, "grad_norm": 0.887583315372467, "learning_rate": 0.0002, "epoch": 3.896588868940754, "step": 54260}, {"loss": 0.6252, "grad_norm": 0.8395712971687317, "learning_rate": 0.0002, "epoch": 3.8973070017953324, "step": 54270}, {"loss": 0.681, "grad_norm": 0.8391315937042236, "learning_rate": 0.0002, "epoch": 3.8980251346499104, "step": 54280}, {"loss": 0.6352, "grad_norm": 0.8210049271583557, "learning_rate": 0.0002, "epoch": 3.8987432675044884, "step": 54290}, {"loss": 0.6484, "grad_norm": 1.1364530324935913, "learning_rate": 0.0002, "epoch": 3.8994614003590664, "step": 54300}, {"loss": 0.6383, "grad_norm": 0.7712056636810303, "learning_rate": 0.0002, "epoch": 3.9001795332136444, "step": 54310}, {"loss": 0.6516, "grad_norm": 0.9466049671173096, "learning_rate": 0.0002, "epoch": 3.9008976660682224, "step": 54320}, {"loss": 0.6938, "grad_norm": 1.0367140769958496, "learning_rate": 0.0002, "epoch": 3.901615798922801, "step": 54330}, {"loss": 0.672, "grad_norm": 1.0168321132659912, "learning_rate": 0.0002, "epoch": 3.902333931777379, "step": 54340}, {"loss": 0.6306, "grad_norm": 0.7830407619476318, "learning_rate": 0.0002, "epoch": 3.903052064631957, "step": 54350}, {"loss": 0.7198, "grad_norm": 0.9649789333343506, "learning_rate": 0.0002, "epoch": 3.903770197486535, "step": 54360}, {"loss": 0.6644, "grad_norm": 0.681077778339386, "learning_rate": 0.0002, "epoch": 3.9044883303411133, "step": 54370}, {"loss": 0.6677, "grad_norm": 0.8970136046409607, "learning_rate": 0.0002, "epoch": 3.9052064631956913, "step": 54380}, {"loss": 0.6581, "grad_norm": 0.9155173301696777, "learning_rate": 0.0002, "epoch": 3.9059245960502693, "step": 54390}, {"loss": 0.6711, "grad_norm": 1.0447794198989868, "learning_rate": 0.0002, "epoch": 3.9066427289048473, "step": 54400}, {"loss": 0.6883, "grad_norm": 0.7823813557624817, "learning_rate": 0.0002, "epoch": 3.9073608617594253, "step": 54410}, {"loss": 0.6688, "grad_norm": 0.9289445877075195, "learning_rate": 0.0002, "epoch": 3.9080789946140033, "step": 54420}, {"loss": 0.7024, "grad_norm": 0.9983111619949341, "learning_rate": 0.0002, "epoch": 3.9087971274685818, "step": 54430}, {"loss": 0.6687, "grad_norm": 0.7952495813369751, "learning_rate": 0.0002, "epoch": 3.9095152603231598, "step": 54440}, {"loss": 0.6118, "grad_norm": 0.8045601844787598, "learning_rate": 0.0002, "epoch": 3.9102333931777378, "step": 54450}, {"loss": 0.6388, "grad_norm": 0.936585009098053, "learning_rate": 0.0002, "epoch": 3.910951526032316, "step": 54460}, {"loss": 0.6217, "grad_norm": 0.745793879032135, "learning_rate": 0.0002, "epoch": 3.911669658886894, "step": 54470}, {"loss": 0.6814, "grad_norm": 0.9137616157531738, "learning_rate": 0.0002, "epoch": 3.912387791741472, "step": 54480}, {"loss": 0.6792, "grad_norm": 0.826316237449646, "learning_rate": 0.0002, "epoch": 3.9131059245960502, "step": 54490}, {"loss": 0.6914, "grad_norm": 0.94313645362854, "learning_rate": 0.0002, "epoch": 3.9138240574506282, "step": 54500}, {"loss": 0.62, "grad_norm": 1.045893907546997, "learning_rate": 0.0002, "epoch": 3.9145421903052062, "step": 54510}, {"loss": 0.5841, "grad_norm": 0.9122704863548279, "learning_rate": 0.0002, "epoch": 3.9152603231597847, "step": 54520}, {"loss": 0.7029, "grad_norm": 1.0999689102172852, "learning_rate": 0.0002, "epoch": 3.9159784560143627, "step": 54530}, {"loss": 0.6387, "grad_norm": 0.9281555414199829, "learning_rate": 0.0002, "epoch": 3.9166965888689407, "step": 54540}, {"loss": 0.6227, "grad_norm": 1.1439622640609741, "learning_rate": 0.0002, "epoch": 3.917414721723519, "step": 54550}, {"loss": 0.6733, "grad_norm": 0.9375617504119873, "learning_rate": 0.0002, "epoch": 3.918132854578097, "step": 54560}, {"loss": 0.6503, "grad_norm": 0.92906653881073, "learning_rate": 0.0002, "epoch": 3.918850987432675, "step": 54570}, {"loss": 0.6361, "grad_norm": 1.0840893983840942, "learning_rate": 0.0002, "epoch": 3.919569120287253, "step": 54580}, {"loss": 0.6476, "grad_norm": 0.8145509362220764, "learning_rate": 0.0002, "epoch": 3.920287253141831, "step": 54590}, {"loss": 0.6826, "grad_norm": 0.973737895488739, "learning_rate": 0.0002, "epoch": 3.921005385996409, "step": 54600}, {"loss": 0.6822, "grad_norm": 0.9302353858947754, "learning_rate": 0.0002, "epoch": 3.9217235188509876, "step": 54610}, {"loss": 0.6522, "grad_norm": 0.9167897701263428, "learning_rate": 0.0002, "epoch": 3.9224416517055656, "step": 54620}, {"loss": 0.6783, "grad_norm": 0.8096851706504822, "learning_rate": 0.0002, "epoch": 3.9231597845601436, "step": 54630}, {"loss": 0.6369, "grad_norm": 0.8006368279457092, "learning_rate": 0.0002, "epoch": 3.9238779174147216, "step": 54640}, {"loss": 0.6533, "grad_norm": 0.7800863981246948, "learning_rate": 0.0002, "epoch": 3.9245960502693, "step": 54650}, {"loss": 0.6518, "grad_norm": 1.0331560373306274, "learning_rate": 0.0002, "epoch": 3.925314183123878, "step": 54660}, {"loss": 0.6764, "grad_norm": 1.0057517290115356, "learning_rate": 0.0002, "epoch": 3.926032315978456, "step": 54670}, {"loss": 0.6636, "grad_norm": 0.8920564651489258, "learning_rate": 0.0002, "epoch": 3.926750448833034, "step": 54680}, {"loss": 0.6432, "grad_norm": 0.7704599499702454, "learning_rate": 0.0002, "epoch": 3.927468581687612, "step": 54690}, {"loss": 0.6532, "grad_norm": 0.827032208442688, "learning_rate": 0.0002, "epoch": 3.92818671454219, "step": 54700}, {"loss": 0.7083, "grad_norm": 1.0019268989562988, "learning_rate": 0.0002, "epoch": 3.9289048473967685, "step": 54710}, {"loss": 0.6026, "grad_norm": 0.862033486366272, "learning_rate": 0.0002, "epoch": 3.9296229802513465, "step": 54720}, {"loss": 0.599, "grad_norm": 0.8965592980384827, "learning_rate": 0.0002, "epoch": 3.9303411131059245, "step": 54730}, {"loss": 0.6739, "grad_norm": 0.7689077854156494, "learning_rate": 0.0002, "epoch": 3.931059245960503, "step": 54740}, {"loss": 0.6401, "grad_norm": 0.846276581287384, "learning_rate": 0.0002, "epoch": 3.931777378815081, "step": 54750}, {"loss": 0.6942, "grad_norm": 0.8932713866233826, "learning_rate": 0.0002, "epoch": 3.932495511669659, "step": 54760}, {"loss": 0.6697, "grad_norm": 0.9711386561393738, "learning_rate": 0.0002, "epoch": 3.933213644524237, "step": 54770}, {"loss": 0.6672, "grad_norm": 0.9290250539779663, "learning_rate": 0.0002, "epoch": 3.933931777378815, "step": 54780}, {"loss": 0.6365, "grad_norm": 1.0897367000579834, "learning_rate": 0.0002, "epoch": 3.934649910233393, "step": 54790}, {"loss": 0.6647, "grad_norm": 0.8451842665672302, "learning_rate": 0.0002, "epoch": 3.9353680430879714, "step": 54800}, {"loss": 0.6705, "grad_norm": 0.8400090336799622, "learning_rate": 0.0002, "epoch": 3.9360861759425494, "step": 54810}, {"loss": 0.6577, "grad_norm": 0.951383650302887, "learning_rate": 0.0002, "epoch": 3.9368043087971274, "step": 54820}, {"loss": 0.654, "grad_norm": 0.848838210105896, "learning_rate": 0.0002, "epoch": 3.937522441651706, "step": 54830}, {"loss": 0.6852, "grad_norm": 0.735763669013977, "learning_rate": 0.0002, "epoch": 3.938240574506284, "step": 54840}, {"loss": 0.6574, "grad_norm": 0.979037344455719, "learning_rate": 0.0002, "epoch": 3.938958707360862, "step": 54850}, {"loss": 0.5851, "grad_norm": 0.933674693107605, "learning_rate": 0.0002, "epoch": 3.93967684021544, "step": 54860}, {"loss": 0.6931, "grad_norm": 0.835593044757843, "learning_rate": 0.0002, "epoch": 3.940394973070018, "step": 54870}, {"loss": 0.6967, "grad_norm": 1.0034281015396118, "learning_rate": 0.0002, "epoch": 3.941113105924596, "step": 54880}, {"loss": 0.6442, "grad_norm": 0.9732975959777832, "learning_rate": 0.0002, "epoch": 3.9418312387791743, "step": 54890}, {"loss": 0.6657, "grad_norm": 0.9666336178779602, "learning_rate": 0.0002, "epoch": 3.9425493716337523, "step": 54900}, {"loss": 0.6521, "grad_norm": 0.755310595035553, "learning_rate": 0.0002, "epoch": 3.9432675044883303, "step": 54910}, {"loss": 0.6562, "grad_norm": 0.8732092976570129, "learning_rate": 0.0002, "epoch": 3.9439856373429083, "step": 54920}, {"loss": 0.6486, "grad_norm": 1.139453649520874, "learning_rate": 0.0002, "epoch": 3.9447037701974867, "step": 54930}, {"loss": 0.6609, "grad_norm": 0.9044837951660156, "learning_rate": 0.0002, "epoch": 3.9454219030520647, "step": 54940}, {"loss": 0.6344, "grad_norm": 1.0496679544448853, "learning_rate": 0.0002, "epoch": 3.9461400359066428, "step": 54950}, {"loss": 0.6471, "grad_norm": 1.0099035501480103, "learning_rate": 0.0002, "epoch": 3.9468581687612208, "step": 54960}, {"loss": 0.6143, "grad_norm": 1.0694963932037354, "learning_rate": 0.0002, "epoch": 3.9475763016157988, "step": 54970}, {"loss": 0.6209, "grad_norm": 1.0012997388839722, "learning_rate": 0.0002, "epoch": 3.9482944344703768, "step": 54980}, {"loss": 0.7379, "grad_norm": 0.8910513520240784, "learning_rate": 0.0002, "epoch": 3.949012567324955, "step": 54990}, {"loss": 0.7184, "grad_norm": 1.0267579555511475, "learning_rate": 0.0002, "epoch": 3.949730700179533, "step": 55000}, {"loss": 0.6844, "grad_norm": 0.9786432385444641, "learning_rate": 0.0002, "epoch": 3.950448833034111, "step": 55010}, {"loss": 0.6499, "grad_norm": 0.8703538775444031, "learning_rate": 0.0002, "epoch": 3.9511669658886897, "step": 55020}, {"loss": 0.5989, "grad_norm": 0.8970484137535095, "learning_rate": 0.0002, "epoch": 3.9518850987432677, "step": 55030}, {"loss": 0.659, "grad_norm": 0.8781577944755554, "learning_rate": 0.0002, "epoch": 3.9526032315978457, "step": 55040}, {"loss": 0.6944, "grad_norm": 0.8040280938148499, "learning_rate": 0.0002, "epoch": 3.9533213644524237, "step": 55050}, {"loss": 0.6359, "grad_norm": 0.851926326751709, "learning_rate": 0.0002, "epoch": 3.9540394973070017, "step": 55060}, {"loss": 0.6806, "grad_norm": 0.8597240447998047, "learning_rate": 0.0002, "epoch": 3.9547576301615797, "step": 55070}, {"loss": 0.6499, "grad_norm": 0.9461944699287415, "learning_rate": 0.0002, "epoch": 3.955475763016158, "step": 55080}, {"loss": 0.6222, "grad_norm": 0.7576611042022705, "learning_rate": 0.0002, "epoch": 3.956193895870736, "step": 55090}, {"loss": 0.6735, "grad_norm": 0.9484710693359375, "learning_rate": 0.0002, "epoch": 3.956912028725314, "step": 55100}, {"loss": 0.6586, "grad_norm": 0.9487117528915405, "learning_rate": 0.0002, "epoch": 3.957630161579892, "step": 55110}, {"loss": 0.6632, "grad_norm": 0.870090663433075, "learning_rate": 0.0002, "epoch": 3.9583482944344706, "step": 55120}, {"loss": 0.6786, "grad_norm": 0.8496458530426025, "learning_rate": 0.0002, "epoch": 3.9590664272890486, "step": 55130}, {"loss": 0.6631, "grad_norm": 1.0121779441833496, "learning_rate": 0.0002, "epoch": 3.9597845601436266, "step": 55140}, {"loss": 0.7005, "grad_norm": 0.8912323713302612, "learning_rate": 0.0002, "epoch": 3.9605026929982046, "step": 55150}, {"loss": 0.6398, "grad_norm": 0.8398444652557373, "learning_rate": 0.0002, "epoch": 3.9612208258527826, "step": 55160}, {"loss": 0.6183, "grad_norm": 0.8046348690986633, "learning_rate": 0.0002, "epoch": 3.961938958707361, "step": 55170}, {"loss": 0.6357, "grad_norm": 1.0369254350662231, "learning_rate": 0.0002, "epoch": 3.962657091561939, "step": 55180}, {"loss": 0.6053, "grad_norm": 1.172431230545044, "learning_rate": 0.0002, "epoch": 3.963375224416517, "step": 55190}, {"loss": 0.643, "grad_norm": 0.8093554377555847, "learning_rate": 0.0002, "epoch": 3.964093357271095, "step": 55200}, {"loss": 0.6416, "grad_norm": 0.8851078748703003, "learning_rate": 0.0002, "epoch": 3.9648114901256735, "step": 55210}, {"loss": 0.6516, "grad_norm": 0.7494266033172607, "learning_rate": 0.0002, "epoch": 3.9655296229802515, "step": 55220}, {"loss": 0.629, "grad_norm": 0.9556898474693298, "learning_rate": 0.0002, "epoch": 3.9662477558348295, "step": 55230}, {"loss": 0.6481, "grad_norm": 1.016017198562622, "learning_rate": 0.0002, "epoch": 3.9669658886894075, "step": 55240}, {"loss": 0.7185, "grad_norm": 0.8425998091697693, "learning_rate": 0.0002, "epoch": 3.9676840215439855, "step": 55250}, {"loss": 0.6609, "grad_norm": 0.717673122882843, "learning_rate": 0.0002, "epoch": 3.9684021543985635, "step": 55260}, {"loss": 0.6453, "grad_norm": 0.8366572856903076, "learning_rate": 0.0002, "epoch": 3.969120287253142, "step": 55270}, {"loss": 0.6841, "grad_norm": 0.8981583118438721, "learning_rate": 0.0002, "epoch": 3.96983842010772, "step": 55280}, {"loss": 0.6351, "grad_norm": 0.8868781328201294, "learning_rate": 0.0002, "epoch": 3.970556552962298, "step": 55290}, {"loss": 0.6755, "grad_norm": 1.0632785558700562, "learning_rate": 0.0002, "epoch": 3.9712746858168764, "step": 55300}, {"loss": 0.6433, "grad_norm": 0.8813109993934631, "learning_rate": 0.0002, "epoch": 3.9719928186714544, "step": 55310}, {"loss": 0.5699, "grad_norm": 0.8225542306900024, "learning_rate": 0.0002, "epoch": 3.9727109515260324, "step": 55320}, {"loss": 0.6591, "grad_norm": 1.1391420364379883, "learning_rate": 0.0002, "epoch": 3.9734290843806104, "step": 55330}, {"loss": 0.6551, "grad_norm": 1.0371832847595215, "learning_rate": 0.0002, "epoch": 3.9741472172351884, "step": 55340}, {"loss": 0.7538, "grad_norm": 1.0542186498641968, "learning_rate": 0.0002, "epoch": 3.9748653500897664, "step": 55350}, {"loss": 0.6799, "grad_norm": 1.0178009271621704, "learning_rate": 0.0002, "epoch": 3.975583482944345, "step": 55360}, {"loss": 0.6394, "grad_norm": 0.7927802205085754, "learning_rate": 0.0002, "epoch": 3.976301615798923, "step": 55370}, {"loss": 0.6632, "grad_norm": 0.9350495934486389, "learning_rate": 0.0002, "epoch": 3.977019748653501, "step": 55380}, {"loss": 0.6889, "grad_norm": 1.0240116119384766, "learning_rate": 0.0002, "epoch": 3.977737881508079, "step": 55390}, {"loss": 0.6756, "grad_norm": 1.0279067754745483, "learning_rate": 0.0002, "epoch": 3.9784560143626573, "step": 55400}, {"loss": 0.6979, "grad_norm": 1.1228227615356445, "learning_rate": 0.0002, "epoch": 3.9791741472172353, "step": 55410}, {"loss": 0.6595, "grad_norm": 0.9500134587287903, "learning_rate": 0.0002, "epoch": 3.9798922800718133, "step": 55420}, {"loss": 0.6875, "grad_norm": 0.9229732155799866, "learning_rate": 0.0002, "epoch": 3.9806104129263913, "step": 55430}, {"loss": 0.6742, "grad_norm": 0.7946729063987732, "learning_rate": 0.0002, "epoch": 3.9813285457809693, "step": 55440}, {"loss": 0.6643, "grad_norm": 0.9987489581108093, "learning_rate": 0.0002, "epoch": 3.9820466786355477, "step": 55450}, {"loss": 0.6642, "grad_norm": 0.9670467972755432, "learning_rate": 0.0002, "epoch": 3.9827648114901257, "step": 55460}, {"loss": 0.6603, "grad_norm": 0.835028350353241, "learning_rate": 0.0002, "epoch": 3.9834829443447037, "step": 55470}, {"loss": 0.6198, "grad_norm": 0.8678702712059021, "learning_rate": 0.0002, "epoch": 3.9842010771992817, "step": 55480}, {"loss": 0.6581, "grad_norm": 0.8581197261810303, "learning_rate": 0.0002, "epoch": 3.98491921005386, "step": 55490}, {"loss": 0.614, "grad_norm": 0.779848039150238, "learning_rate": 0.0002, "epoch": 3.985637342908438, "step": 55500}, {"loss": 0.634, "grad_norm": 0.8827589154243469, "learning_rate": 0.0002, "epoch": 3.986355475763016, "step": 55510}, {"loss": 0.624, "grad_norm": 1.0108301639556885, "learning_rate": 0.0002, "epoch": 3.987073608617594, "step": 55520}, {"loss": 0.6553, "grad_norm": 0.8506004214286804, "learning_rate": 0.0002, "epoch": 3.987791741472172, "step": 55530}, {"loss": 0.6229, "grad_norm": 1.0297727584838867, "learning_rate": 0.0002, "epoch": 3.98850987432675, "step": 55540}, {"loss": 0.6551, "grad_norm": 0.8579224944114685, "learning_rate": 0.0002, "epoch": 3.9892280071813286, "step": 55550}, {"loss": 0.6491, "grad_norm": 0.8503788113594055, "learning_rate": 0.0002, "epoch": 3.9899461400359066, "step": 55560}, {"loss": 0.6941, "grad_norm": 1.1144801378250122, "learning_rate": 0.0002, "epoch": 3.9906642728904846, "step": 55570}, {"loss": 0.6956, "grad_norm": 0.8418305516242981, "learning_rate": 0.0002, "epoch": 3.991382405745063, "step": 55580}, {"loss": 0.6226, "grad_norm": 1.0065871477127075, "learning_rate": 0.0002, "epoch": 3.992100538599641, "step": 55590}, {"loss": 0.6775, "grad_norm": 0.8160259127616882, "learning_rate": 0.0002, "epoch": 3.992818671454219, "step": 55600}, {"loss": 0.624, "grad_norm": 0.8678009510040283, "learning_rate": 0.0002, "epoch": 3.993536804308797, "step": 55610}, {"loss": 0.6552, "grad_norm": 0.863465428352356, "learning_rate": 0.0002, "epoch": 3.994254937163375, "step": 55620}, {"loss": 0.6764, "grad_norm": 0.9242135286331177, "learning_rate": 0.0002, "epoch": 3.994973070017953, "step": 55630}, {"loss": 0.6774, "grad_norm": 1.0285470485687256, "learning_rate": 0.0002, "epoch": 3.9956912028725315, "step": 55640}, {"loss": 0.6882, "grad_norm": 0.8953320384025574, "learning_rate": 0.0002, "epoch": 3.9964093357271095, "step": 55650}, {"loss": 0.6935, "grad_norm": 0.915892481803894, "learning_rate": 0.0002, "epoch": 3.9971274685816875, "step": 55660}, {"loss": 0.641, "grad_norm": 0.8235118985176086, "learning_rate": 0.0002, "epoch": 3.9978456014362656, "step": 55670}, {"loss": 0.6417, "grad_norm": 1.0178656578063965, "learning_rate": 0.0002, "epoch": 3.998563734290844, "step": 55680}, {"loss": 0.6635, "grad_norm": 0.9926803708076477, "learning_rate": 0.0002, "epoch": 3.999281867145422, "step": 55690}, {"loss": 0.6476, "grad_norm": 0.9213629961013794, "learning_rate": 0.0002, "epoch": 4.0, "step": 55700}, {"eval_loss": 1.1152480840682983, "eval_runtime": 55.2237, "eval_samples_per_second": 13.273, "eval_steps_per_second": 1.666, "epoch": 4.0, "step": 55700}, {"loss": 0.6085, "grad_norm": 1.0820496082305908, "learning_rate": 0.0002, "epoch": 4.000718132854578, "step": 55710}, {"loss": 0.5506, "grad_norm": 0.9036441445350647, "learning_rate": 0.0002, "epoch": 4.001436265709156, "step": 55720}, {"loss": 0.5924, "grad_norm": 1.102754831314087, "learning_rate": 0.0002, "epoch": 4.002154398563734, "step": 55730}, {"loss": 0.6192, "grad_norm": 0.98259437084198, "learning_rate": 0.0002, "epoch": 4.002872531418312, "step": 55740}, {"loss": 0.567, "grad_norm": 1.1935845613479614, "learning_rate": 0.0002, "epoch": 4.003590664272891, "step": 55750}, {"loss": 0.6205, "grad_norm": 0.9925830960273743, "learning_rate": 0.0002, "epoch": 4.004308797127469, "step": 55760}, {"loss": 0.5545, "grad_norm": 1.075087070465088, "learning_rate": 0.0002, "epoch": 4.005026929982047, "step": 55770}, {"loss": 0.5591, "grad_norm": 0.8746396899223328, "learning_rate": 0.0002, "epoch": 4.005745062836625, "step": 55780}, {"loss": 0.5745, "grad_norm": 0.7635995745658875, "learning_rate": 0.0002, "epoch": 4.006463195691203, "step": 55790}, {"loss": 0.599, "grad_norm": 0.9064885377883911, "learning_rate": 0.0002, "epoch": 4.007181328545781, "step": 55800}, {"loss": 0.5668, "grad_norm": 1.018478274345398, "learning_rate": 0.0002, "epoch": 4.007899461400359, "step": 55810}, {"loss": 0.5573, "grad_norm": 0.9797589778900146, "learning_rate": 0.0002, "epoch": 4.008617594254937, "step": 55820}, {"loss": 0.5784, "grad_norm": 0.7867457866668701, "learning_rate": 0.0002, "epoch": 4.009335727109515, "step": 55830}, {"loss": 0.5607, "grad_norm": 0.9998070597648621, "learning_rate": 0.0002, "epoch": 4.010053859964093, "step": 55840}, {"loss": 0.5655, "grad_norm": 0.8656311631202698, "learning_rate": 0.0002, "epoch": 4.010771992818672, "step": 55850}, {"loss": 0.533, "grad_norm": 0.945469081401825, "learning_rate": 0.0002, "epoch": 4.01149012567325, "step": 55860}, {"loss": 0.625, "grad_norm": 0.8809926509857178, "learning_rate": 0.0002, "epoch": 4.012208258527828, "step": 55870}, {"loss": 0.5795, "grad_norm": 0.8047897219657898, "learning_rate": 0.0002, "epoch": 4.012926391382406, "step": 55880}, {"loss": 0.5322, "grad_norm": 1.0563900470733643, "learning_rate": 0.0002, "epoch": 4.013644524236984, "step": 55890}, {"loss": 0.5597, "grad_norm": 0.8578300476074219, "learning_rate": 0.0002, "epoch": 4.014362657091562, "step": 55900}, {"loss": 0.5634, "grad_norm": 1.0304765701293945, "learning_rate": 0.0002, "epoch": 4.01508078994614, "step": 55910}, {"loss": 0.558, "grad_norm": 0.8087666034698486, "learning_rate": 0.0002, "epoch": 4.015798922800718, "step": 55920}, {"loss": 0.5557, "grad_norm": 1.0192348957061768, "learning_rate": 0.0002, "epoch": 4.016517055655296, "step": 55930}, {"loss": 0.6269, "grad_norm": 1.061194658279419, "learning_rate": 0.0002, "epoch": 4.017235188509875, "step": 55940}, {"loss": 0.5812, "grad_norm": 0.93668133020401, "learning_rate": 0.0002, "epoch": 4.017953321364453, "step": 55950}, {"loss": 0.6104, "grad_norm": 1.1569286584854126, "learning_rate": 0.0002, "epoch": 4.018671454219031, "step": 55960}, {"loss": 0.5832, "grad_norm": 0.9853817224502563, "learning_rate": 0.0002, "epoch": 4.019389587073609, "step": 55970}, {"loss": 0.6154, "grad_norm": 0.851109504699707, "learning_rate": 0.0002, "epoch": 4.020107719928187, "step": 55980}, {"loss": 0.5993, "grad_norm": 1.053525447845459, "learning_rate": 0.0002, "epoch": 4.020825852782765, "step": 55990}, {"loss": 0.571, "grad_norm": 0.8307225704193115, "learning_rate": 0.0002, "epoch": 4.021543985637343, "step": 56000}, {"loss": 0.5419, "grad_norm": 1.2741150856018066, "learning_rate": 0.0002, "epoch": 4.022262118491921, "step": 56010}, {"loss": 0.6001, "grad_norm": 0.9708344340324402, "learning_rate": 0.0002, "epoch": 4.022980251346499, "step": 56020}, {"loss": 0.5989, "grad_norm": 1.265034556388855, "learning_rate": 0.0002, "epoch": 4.023698384201078, "step": 56030}, {"loss": 0.5852, "grad_norm": 0.9364367723464966, "learning_rate": 0.0002, "epoch": 4.024416517055656, "step": 56040}, {"loss": 0.6108, "grad_norm": 0.8643592000007629, "learning_rate": 0.0002, "epoch": 4.025134649910234, "step": 56050}, {"loss": 0.6074, "grad_norm": 0.9742133021354675, "learning_rate": 0.0002, "epoch": 4.025852782764812, "step": 56060}, {"loss": 0.5699, "grad_norm": 1.1793473958969116, "learning_rate": 0.0002, "epoch": 4.02657091561939, "step": 56070}, {"loss": 0.5911, "grad_norm": 0.9641149044036865, "learning_rate": 0.0002, "epoch": 4.027289048473968, "step": 56080}, {"loss": 0.6083, "grad_norm": 0.9426136016845703, "learning_rate": 0.0002, "epoch": 4.028007181328546, "step": 56090}, {"loss": 0.5692, "grad_norm": 0.9211869835853577, "learning_rate": 0.0002, "epoch": 4.028725314183124, "step": 56100}, {"loss": 0.6109, "grad_norm": 1.1576565504074097, "learning_rate": 0.0002, "epoch": 4.029443447037702, "step": 56110}, {"loss": 0.5684, "grad_norm": 1.0014013051986694, "learning_rate": 0.0002, "epoch": 4.03016157989228, "step": 56120}, {"loss": 0.6017, "grad_norm": 0.9307010769844055, "learning_rate": 0.0002, "epoch": 4.0308797127468585, "step": 56130}, {"loss": 0.5582, "grad_norm": 0.8290148377418518, "learning_rate": 0.0002, "epoch": 4.0315978456014365, "step": 56140}, {"loss": 0.5921, "grad_norm": 1.0648446083068848, "learning_rate": 0.0002, "epoch": 4.0323159784560145, "step": 56150}, {"loss": 0.6116, "grad_norm": 1.1545547246932983, "learning_rate": 0.0002, "epoch": 4.0330341113105925, "step": 56160}, {"loss": 0.6301, "grad_norm": 0.9643545150756836, "learning_rate": 0.0002, "epoch": 4.0337522441651705, "step": 56170}, {"loss": 0.5655, "grad_norm": 0.8913900256156921, "learning_rate": 0.0002, "epoch": 4.0344703770197485, "step": 56180}, {"loss": 0.5897, "grad_norm": 0.9445754289627075, "learning_rate": 0.0002, "epoch": 4.0351885098743265, "step": 56190}, {"loss": 0.6204, "grad_norm": 0.9353124499320984, "learning_rate": 0.0002, "epoch": 4.0359066427289045, "step": 56200}, {"loss": 0.6017, "grad_norm": 1.1780431270599365, "learning_rate": 0.0002, "epoch": 4.0366247755834825, "step": 56210}, {"loss": 0.5767, "grad_norm": 0.9208880662918091, "learning_rate": 0.0002, "epoch": 4.037342908438061, "step": 56220}, {"loss": 0.5367, "grad_norm": 0.9475517272949219, "learning_rate": 0.0002, "epoch": 4.038061041292639, "step": 56230}, {"loss": 0.576, "grad_norm": 0.7478583455085754, "learning_rate": 0.0002, "epoch": 4.038779174147217, "step": 56240}, {"loss": 0.5616, "grad_norm": 1.0026403665542603, "learning_rate": 0.0002, "epoch": 4.039497307001795, "step": 56250}, {"loss": 0.6031, "grad_norm": 0.9664973020553589, "learning_rate": 0.0002, "epoch": 4.040215439856373, "step": 56260}, {"loss": 0.5764, "grad_norm": 1.0655616521835327, "learning_rate": 0.0002, "epoch": 4.040933572710951, "step": 56270}, {"loss": 0.5862, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.041651705565529, "step": 56280}, {"loss": 0.5828, "grad_norm": 0.7982191443443298, "learning_rate": 0.0002, "epoch": 4.042369838420107, "step": 56290}, {"loss": 0.5637, "grad_norm": 0.8304495215415955, "learning_rate": 0.0002, "epoch": 4.043087971274685, "step": 56300}, {"loss": 0.5974, "grad_norm": 0.95123291015625, "learning_rate": 0.0002, "epoch": 4.043806104129264, "step": 56310}, {"loss": 0.617, "grad_norm": 0.9504102468490601, "learning_rate": 0.0002, "epoch": 4.044524236983842, "step": 56320}, {"loss": 0.6143, "grad_norm": 0.7432710528373718, "learning_rate": 0.0002, "epoch": 4.04524236983842, "step": 56330}, {"loss": 0.6157, "grad_norm": 0.9327874183654785, "learning_rate": 0.0002, "epoch": 4.045960502692998, "step": 56340}, {"loss": 0.591, "grad_norm": 0.9161670804023743, "learning_rate": 0.0002, "epoch": 4.046678635547576, "step": 56350}, {"loss": 0.6111, "grad_norm": 0.9371771812438965, "learning_rate": 0.0002, "epoch": 4.047396768402154, "step": 56360}, {"loss": 0.6101, "grad_norm": 1.0332437753677368, "learning_rate": 0.0002, "epoch": 4.048114901256732, "step": 56370}, {"loss": 0.5451, "grad_norm": 0.7346320748329163, "learning_rate": 0.0002, "epoch": 4.04883303411131, "step": 56380}, {"loss": 0.6416, "grad_norm": 0.8247857689857483, "learning_rate": 0.0002, "epoch": 4.049551166965888, "step": 56390}, {"loss": 0.6208, "grad_norm": 0.925325334072113, "learning_rate": 0.0002, "epoch": 4.050269299820466, "step": 56400}, {"loss": 0.558, "grad_norm": 0.7344088554382324, "learning_rate": 0.0002, "epoch": 4.050987432675045, "step": 56410}, {"loss": 0.5978, "grad_norm": 0.9204918146133423, "learning_rate": 0.0002, "epoch": 4.051705565529623, "step": 56420}, {"loss": 0.5788, "grad_norm": 0.8273472785949707, "learning_rate": 0.0002, "epoch": 4.052423698384201, "step": 56430}, {"loss": 0.5551, "grad_norm": 0.9524998068809509, "learning_rate": 0.0002, "epoch": 4.053141831238779, "step": 56440}, {"loss": 0.5836, "grad_norm": 0.9168205857276917, "learning_rate": 0.0002, "epoch": 4.053859964093357, "step": 56450}, {"loss": 0.6035, "grad_norm": 0.9634994864463806, "learning_rate": 0.0002, "epoch": 4.054578096947935, "step": 56460}, {"loss": 0.5907, "grad_norm": 1.2027593851089478, "learning_rate": 0.0002, "epoch": 4.055296229802513, "step": 56470}, {"loss": 0.5691, "grad_norm": 1.2347805500030518, "learning_rate": 0.0002, "epoch": 4.056014362657091, "step": 56480}, {"loss": 0.5789, "grad_norm": 0.8621458411216736, "learning_rate": 0.0002, "epoch": 4.056732495511669, "step": 56490}, {"loss": 0.6082, "grad_norm": 0.9194608330726624, "learning_rate": 0.0002, "epoch": 4.057450628366248, "step": 56500}, {"loss": 0.5667, "grad_norm": 1.0153663158416748, "learning_rate": 0.0002, "epoch": 4.058168761220826, "step": 56510}, {"loss": 0.5908, "grad_norm": 0.9170986413955688, "learning_rate": 0.0002, "epoch": 4.058886894075404, "step": 56520}, {"loss": 0.5672, "grad_norm": 1.033057689666748, "learning_rate": 0.0002, "epoch": 4.059605026929982, "step": 56530}, {"loss": 0.5577, "grad_norm": 1.0125197172164917, "learning_rate": 0.0002, "epoch": 4.06032315978456, "step": 56540}, {"loss": 0.5821, "grad_norm": 0.9429898262023926, "learning_rate": 0.0002, "epoch": 4.061041292639138, "step": 56550}, {"loss": 0.5655, "grad_norm": 0.9242179989814758, "learning_rate": 0.0002, "epoch": 4.061759425493716, "step": 56560}, {"loss": 0.5568, "grad_norm": 0.9365091323852539, "learning_rate": 0.0002, "epoch": 4.062477558348294, "step": 56570}, {"loss": 0.6104, "grad_norm": 0.9148455858230591, "learning_rate": 0.0002, "epoch": 4.063195691202872, "step": 56580}, {"loss": 0.5891, "grad_norm": 0.8546709418296814, "learning_rate": 0.0002, "epoch": 4.063913824057451, "step": 56590}, {"loss": 0.6079, "grad_norm": 0.9743902087211609, "learning_rate": 0.0002, "epoch": 4.064631956912029, "step": 56600}, {"loss": 0.6109, "grad_norm": 1.0599974393844604, "learning_rate": 0.0002, "epoch": 4.065350089766607, "step": 56610}, {"loss": 0.5746, "grad_norm": 0.9677841067314148, "learning_rate": 0.0002, "epoch": 4.066068222621185, "step": 56620}, {"loss": 0.5957, "grad_norm": 0.8892754316329956, "learning_rate": 0.0002, "epoch": 4.066786355475763, "step": 56630}, {"loss": 0.5899, "grad_norm": 0.8837814331054688, "learning_rate": 0.0002, "epoch": 4.067504488330341, "step": 56640}, {"loss": 0.5784, "grad_norm": 0.9284095764160156, "learning_rate": 0.0002, "epoch": 4.068222621184919, "step": 56650}, {"loss": 0.5829, "grad_norm": 1.0163567066192627, "learning_rate": 0.0002, "epoch": 4.068940754039497, "step": 56660}, {"loss": 0.5349, "grad_norm": 0.8713456988334656, "learning_rate": 0.0002, "epoch": 4.069658886894075, "step": 56670}, {"loss": 0.5345, "grad_norm": 0.8356686234474182, "learning_rate": 0.0002, "epoch": 4.070377019748653, "step": 56680}, {"loss": 0.5473, "grad_norm": 0.8998766541481018, "learning_rate": 0.0002, "epoch": 4.071095152603232, "step": 56690}, {"loss": 0.5896, "grad_norm": 1.0441967248916626, "learning_rate": 0.0002, "epoch": 4.07181328545781, "step": 56700}, {"loss": 0.5817, "grad_norm": 0.9313125610351562, "learning_rate": 0.0002, "epoch": 4.072531418312388, "step": 56710}, {"loss": 0.5477, "grad_norm": 0.9912964701652527, "learning_rate": 0.0002, "epoch": 4.073249551166966, "step": 56720}, {"loss": 0.5974, "grad_norm": 0.9048459529876709, "learning_rate": 0.0002, "epoch": 4.073967684021544, "step": 56730}, {"loss": 0.5927, "grad_norm": 1.0248944759368896, "learning_rate": 0.0002, "epoch": 4.074685816876122, "step": 56740}, {"loss": 0.6019, "grad_norm": 1.4526786804199219, "learning_rate": 0.0002, "epoch": 4.0754039497307, "step": 56750}, {"loss": 0.6267, "grad_norm": 0.9813178181648254, "learning_rate": 0.0002, "epoch": 4.076122082585278, "step": 56760}, {"loss": 0.5707, "grad_norm": 1.0686813592910767, "learning_rate": 0.0002, "epoch": 4.076840215439856, "step": 56770}, {"loss": 0.5857, "grad_norm": 1.1093482971191406, "learning_rate": 0.0002, "epoch": 4.077558348294435, "step": 56780}, {"loss": 0.5768, "grad_norm": 0.9377819895744324, "learning_rate": 0.0002, "epoch": 4.078276481149013, "step": 56790}, {"loss": 0.6342, "grad_norm": 0.8043649196624756, "learning_rate": 0.0002, "epoch": 4.078994614003591, "step": 56800}, {"loss": 0.6005, "grad_norm": 0.7995415925979614, "learning_rate": 0.0002, "epoch": 4.079712746858169, "step": 56810}, {"loss": 0.5466, "grad_norm": 1.0076148509979248, "learning_rate": 0.0002, "epoch": 4.080430879712747, "step": 56820}, {"loss": 0.6021, "grad_norm": 0.8192076683044434, "learning_rate": 0.0002, "epoch": 4.081149012567325, "step": 56830}, {"loss": 0.5439, "grad_norm": 0.9226266145706177, "learning_rate": 0.0002, "epoch": 4.081867145421903, "step": 56840}, {"loss": 0.5893, "grad_norm": 0.8877972960472107, "learning_rate": 0.0002, "epoch": 4.082585278276481, "step": 56850}, {"loss": 0.5774, "grad_norm": 0.9578937888145447, "learning_rate": 0.0002, "epoch": 4.083303411131059, "step": 56860}, {"loss": 0.5946, "grad_norm": 0.8929167985916138, "learning_rate": 0.0002, "epoch": 4.084021543985638, "step": 56870}, {"loss": 0.5226, "grad_norm": 1.0015977621078491, "learning_rate": 0.0002, "epoch": 4.084739676840216, "step": 56880}, {"loss": 0.5931, "grad_norm": 0.9768750667572021, "learning_rate": 0.0002, "epoch": 4.085457809694794, "step": 56890}, {"loss": 0.5983, "grad_norm": 1.0834569931030273, "learning_rate": 0.0002, "epoch": 4.086175942549372, "step": 56900}, {"loss": 0.5786, "grad_norm": 0.8761230707168579, "learning_rate": 0.0002, "epoch": 4.08689407540395, "step": 56910}, {"loss": 0.5708, "grad_norm": 1.027064323425293, "learning_rate": 0.0002, "epoch": 4.087612208258528, "step": 56920}, {"loss": 0.601, "grad_norm": 1.130336880683899, "learning_rate": 0.0002, "epoch": 4.088330341113106, "step": 56930}, {"loss": 0.5664, "grad_norm": 0.8157579898834229, "learning_rate": 0.0002, "epoch": 4.089048473967684, "step": 56940}, {"loss": 0.5789, "grad_norm": 1.071175217628479, "learning_rate": 0.0002, "epoch": 4.089766606822262, "step": 56950}, {"loss": 0.5942, "grad_norm": 0.9534492492675781, "learning_rate": 0.0002, "epoch": 4.09048473967684, "step": 56960}, {"loss": 0.5803, "grad_norm": 0.9584037661552429, "learning_rate": 0.0002, "epoch": 4.091202872531419, "step": 56970}, {"loss": 0.5647, "grad_norm": 1.1513131856918335, "learning_rate": 0.0002, "epoch": 4.091921005385997, "step": 56980}, {"loss": 0.5971, "grad_norm": 1.0167666673660278, "learning_rate": 0.0002, "epoch": 4.092639138240575, "step": 56990}, {"loss": 0.5981, "grad_norm": 1.0630987882614136, "learning_rate": 0.0002, "epoch": 4.093357271095153, "step": 57000}, {"loss": 0.5734, "grad_norm": 1.0326893329620361, "learning_rate": 0.0002, "epoch": 4.094075403949731, "step": 57010}, {"loss": 0.572, "grad_norm": 0.9701678156852722, "learning_rate": 0.0002, "epoch": 4.094793536804309, "step": 57020}, {"loss": 0.5815, "grad_norm": 0.839935302734375, "learning_rate": 0.0002, "epoch": 4.095511669658887, "step": 57030}, {"loss": 0.6051, "grad_norm": 0.8995838761329651, "learning_rate": 0.0002, "epoch": 4.096229802513465, "step": 57040}, {"loss": 0.6037, "grad_norm": 0.8039916157722473, "learning_rate": 0.0002, "epoch": 4.096947935368043, "step": 57050}, {"loss": 0.5597, "grad_norm": 1.126122236251831, "learning_rate": 0.0002, "epoch": 4.097666068222622, "step": 57060}, {"loss": 0.5943, "grad_norm": 0.8749837875366211, "learning_rate": 0.0002, "epoch": 4.0983842010772, "step": 57070}, {"loss": 0.6017, "grad_norm": 0.8630341291427612, "learning_rate": 0.0002, "epoch": 4.099102333931778, "step": 57080}, {"loss": 0.6083, "grad_norm": 0.8889496922492981, "learning_rate": 0.0002, "epoch": 4.099820466786356, "step": 57090}, {"loss": 0.5727, "grad_norm": 0.9050310254096985, "learning_rate": 0.0002, "epoch": 4.100538599640934, "step": 57100}, {"loss": 0.5824, "grad_norm": 0.943072497844696, "learning_rate": 0.0002, "epoch": 4.101256732495512, "step": 57110}, {"loss": 0.6036, "grad_norm": 0.9031552672386169, "learning_rate": 0.0002, "epoch": 4.10197486535009, "step": 57120}, {"loss": 0.5913, "grad_norm": 0.939862847328186, "learning_rate": 0.0002, "epoch": 4.102692998204668, "step": 57130}, {"loss": 0.5738, "grad_norm": 0.8080634474754333, "learning_rate": 0.0002, "epoch": 4.103411131059246, "step": 57140}, {"loss": 0.5841, "grad_norm": 0.9181693196296692, "learning_rate": 0.0002, "epoch": 4.1041292639138245, "step": 57150}, {"loss": 0.5561, "grad_norm": 0.9609217643737793, "learning_rate": 0.0002, "epoch": 4.1048473967684025, "step": 57160}, {"loss": 0.5572, "grad_norm": 1.1246516704559326, "learning_rate": 0.0002, "epoch": 4.1055655296229805, "step": 57170}, {"loss": 0.5886, "grad_norm": 1.0616880655288696, "learning_rate": 0.0002, "epoch": 4.1062836624775585, "step": 57180}, {"loss": 0.5579, "grad_norm": 0.9954505562782288, "learning_rate": 0.0002, "epoch": 4.1070017953321365, "step": 57190}, {"loss": 0.5899, "grad_norm": 1.0602279901504517, "learning_rate": 0.0002, "epoch": 4.1077199281867145, "step": 57200}, {"loss": 0.5747, "grad_norm": 0.8984764814376831, "learning_rate": 0.0002, "epoch": 4.1084380610412925, "step": 57210}, {"loss": 0.5502, "grad_norm": 0.845167875289917, "learning_rate": 0.0002, "epoch": 4.1091561938958705, "step": 57220}, {"loss": 0.6147, "grad_norm": 0.7901500463485718, "learning_rate": 0.0002, "epoch": 4.1098743267504485, "step": 57230}, {"loss": 0.5883, "grad_norm": 1.0462526082992554, "learning_rate": 0.0002, "epoch": 4.1105924596050265, "step": 57240}, {"loss": 0.6334, "grad_norm": 0.9098827838897705, "learning_rate": 0.0002, "epoch": 4.111310592459605, "step": 57250}, {"loss": 0.5794, "grad_norm": 0.9234077334403992, "learning_rate": 0.0002, "epoch": 4.112028725314183, "step": 57260}, {"loss": 0.623, "grad_norm": 1.0033560991287231, "learning_rate": 0.0002, "epoch": 4.112746858168761, "step": 57270}, {"loss": 0.5392, "grad_norm": 1.0620051622390747, "learning_rate": 0.0002, "epoch": 4.113464991023339, "step": 57280}, {"loss": 0.6144, "grad_norm": 0.8679345846176147, "learning_rate": 0.0002, "epoch": 4.114183123877917, "step": 57290}, {"loss": 0.5951, "grad_norm": 0.7557345628738403, "learning_rate": 0.0002, "epoch": 4.114901256732495, "step": 57300}, {"loss": 0.575, "grad_norm": 0.8970935344696045, "learning_rate": 0.0002, "epoch": 4.115619389587073, "step": 57310}, {"loss": 0.5595, "grad_norm": 1.0779842138290405, "learning_rate": 0.0002, "epoch": 4.116337522441651, "step": 57320}, {"loss": 0.5532, "grad_norm": 1.2036106586456299, "learning_rate": 0.0002, "epoch": 4.117055655296229, "step": 57330}, {"loss": 0.5959, "grad_norm": 0.8337953686714172, "learning_rate": 0.0002, "epoch": 4.117773788150808, "step": 57340}, {"loss": 0.6128, "grad_norm": 0.9850410223007202, "learning_rate": 0.0002, "epoch": 4.118491921005386, "step": 57350}, {"loss": 0.5676, "grad_norm": 0.8028770685195923, "learning_rate": 0.0002, "epoch": 4.119210053859964, "step": 57360}, {"loss": 0.5693, "grad_norm": 0.8693217039108276, "learning_rate": 0.0002, "epoch": 4.119928186714542, "step": 57370}, {"loss": 0.5897, "grad_norm": 0.8795534372329712, "learning_rate": 0.0002, "epoch": 4.12064631956912, "step": 57380}, {"loss": 0.5692, "grad_norm": 1.0081543922424316, "learning_rate": 0.0002, "epoch": 4.121364452423698, "step": 57390}, {"loss": 0.6027, "grad_norm": 0.8776742219924927, "learning_rate": 0.0002, "epoch": 4.122082585278276, "step": 57400}, {"loss": 0.6418, "grad_norm": 0.8247824311256409, "learning_rate": 0.0002, "epoch": 4.122800718132854, "step": 57410}, {"loss": 0.5537, "grad_norm": 1.1346335411071777, "learning_rate": 0.0002, "epoch": 4.123518850987432, "step": 57420}, {"loss": 0.5949, "grad_norm": 1.0671089887619019, "learning_rate": 0.0002, "epoch": 4.124236983842011, "step": 57430}, {"loss": 0.5908, "grad_norm": 0.8548333048820496, "learning_rate": 0.0002, "epoch": 4.124955116696589, "step": 57440}, {"loss": 0.5967, "grad_norm": 1.0221573114395142, "learning_rate": 0.0002, "epoch": 4.125673249551167, "step": 57450}, {"loss": 0.6238, "grad_norm": 0.9746617674827576, "learning_rate": 0.0002, "epoch": 4.126391382405745, "step": 57460}, {"loss": 0.5855, "grad_norm": 0.8104965090751648, "learning_rate": 0.0002, "epoch": 4.127109515260323, "step": 57470}, {"loss": 0.5724, "grad_norm": 1.0401487350463867, "learning_rate": 0.0002, "epoch": 4.127827648114901, "step": 57480}, {"loss": 0.5956, "grad_norm": 0.8828882575035095, "learning_rate": 0.0002, "epoch": 4.128545780969479, "step": 57490}, {"loss": 0.5851, "grad_norm": 1.0121098756790161, "learning_rate": 0.0002, "epoch": 4.129263913824057, "step": 57500}, {"loss": 0.5923, "grad_norm": 0.8789737820625305, "learning_rate": 0.0002, "epoch": 4.129982046678635, "step": 57510}, {"loss": 0.5929, "grad_norm": 1.0386744737625122, "learning_rate": 0.0002, "epoch": 4.130700179533213, "step": 57520}, {"loss": 0.6104, "grad_norm": 1.0092610120773315, "learning_rate": 0.0002, "epoch": 4.131418312387792, "step": 57530}, {"loss": 0.5974, "grad_norm": 0.8706282377243042, "learning_rate": 0.0002, "epoch": 4.13213644524237, "step": 57540}, {"loss": 0.5829, "grad_norm": 0.9270507097244263, "learning_rate": 0.0002, "epoch": 4.132854578096948, "step": 57550}, {"loss": 0.5826, "grad_norm": 1.0303068161010742, "learning_rate": 0.0002, "epoch": 4.133572710951526, "step": 57560}, {"loss": 0.5515, "grad_norm": 1.1169062852859497, "learning_rate": 0.0002, "epoch": 4.134290843806104, "step": 57570}, {"loss": 0.5848, "grad_norm": 0.8530599474906921, "learning_rate": 0.0002, "epoch": 4.135008976660682, "step": 57580}, {"loss": 0.6231, "grad_norm": 1.1395039558410645, "learning_rate": 0.0002, "epoch": 4.13572710951526, "step": 57590}, {"loss": 0.5739, "grad_norm": 0.8944115042686462, "learning_rate": 0.0002, "epoch": 4.136445242369838, "step": 57600}, {"loss": 0.6212, "grad_norm": 1.137966275215149, "learning_rate": 0.0002, "epoch": 4.137163375224416, "step": 57610}, {"loss": 0.6041, "grad_norm": 0.8244962692260742, "learning_rate": 0.0002, "epoch": 4.137881508078995, "step": 57620}, {"loss": 0.6078, "grad_norm": 1.1935817003250122, "learning_rate": 0.0002, "epoch": 4.138599640933573, "step": 57630}, {"loss": 0.5939, "grad_norm": 0.9774235486984253, "learning_rate": 0.0002, "epoch": 4.139317773788151, "step": 57640}, {"loss": 0.5963, "grad_norm": 1.066219449043274, "learning_rate": 0.0002, "epoch": 4.140035906642729, "step": 57650}, {"loss": 0.6008, "grad_norm": 0.8631396293640137, "learning_rate": 0.0002, "epoch": 4.140754039497307, "step": 57660}, {"loss": 0.5622, "grad_norm": 0.888410747051239, "learning_rate": 0.0002, "epoch": 4.141472172351885, "step": 57670}, {"loss": 0.5675, "grad_norm": 1.002642035484314, "learning_rate": 0.0002, "epoch": 4.142190305206463, "step": 57680}, {"loss": 0.5269, "grad_norm": 1.0092825889587402, "learning_rate": 0.0002, "epoch": 4.142908438061041, "step": 57690}, {"loss": 0.588, "grad_norm": 0.9126971364021301, "learning_rate": 0.0002, "epoch": 4.143626570915619, "step": 57700}, {"loss": 0.5593, "grad_norm": 1.0303562879562378, "learning_rate": 0.0002, "epoch": 4.144344703770198, "step": 57710}, {"loss": 0.6183, "grad_norm": 1.1230897903442383, "learning_rate": 0.0002, "epoch": 4.145062836624776, "step": 57720}, {"loss": 0.5934, "grad_norm": 1.0494099855422974, "learning_rate": 0.0002, "epoch": 4.145780969479354, "step": 57730}, {"loss": 0.6022, "grad_norm": 0.9555442333221436, "learning_rate": 0.0002, "epoch": 4.146499102333932, "step": 57740}, {"loss": 0.609, "grad_norm": 0.8255124092102051, "learning_rate": 0.0002, "epoch": 4.14721723518851, "step": 57750}, {"loss": 0.5659, "grad_norm": 1.097853660583496, "learning_rate": 0.0002, "epoch": 4.147935368043088, "step": 57760}, {"loss": 0.5698, "grad_norm": 1.0272663831710815, "learning_rate": 0.0002, "epoch": 4.148653500897666, "step": 57770}, {"loss": 0.5701, "grad_norm": 1.022571086883545, "learning_rate": 0.0002, "epoch": 4.149371633752244, "step": 57780}, {"loss": 0.579, "grad_norm": 0.964543342590332, "learning_rate": 0.0002, "epoch": 4.150089766606822, "step": 57790}, {"loss": 0.6175, "grad_norm": 0.9251219034194946, "learning_rate": 0.0002, "epoch": 4.1508078994614, "step": 57800}, {"loss": 0.564, "grad_norm": 1.081840991973877, "learning_rate": 0.0002, "epoch": 4.151526032315979, "step": 57810}, {"loss": 0.5956, "grad_norm": 0.8989445567131042, "learning_rate": 0.0002, "epoch": 4.152244165170557, "step": 57820}, {"loss": 0.5849, "grad_norm": 0.903629720211029, "learning_rate": 0.0002, "epoch": 4.152962298025135, "step": 57830}, {"loss": 0.6202, "grad_norm": 0.8985397219657898, "learning_rate": 0.0002, "epoch": 4.153680430879713, "step": 57840}, {"loss": 0.5629, "grad_norm": 1.047778844833374, "learning_rate": 0.0002, "epoch": 4.154398563734291, "step": 57850}, {"loss": 0.6045, "grad_norm": 0.9804165363311768, "learning_rate": 0.0002, "epoch": 4.155116696588869, "step": 57860}, {"loss": 0.5815, "grad_norm": 1.187309980392456, "learning_rate": 0.0002, "epoch": 4.155834829443447, "step": 57870}, {"loss": 0.6304, "grad_norm": 0.9854836463928223, "learning_rate": 0.0002, "epoch": 4.156552962298025, "step": 57880}, {"loss": 0.6076, "grad_norm": 0.8494308590888977, "learning_rate": 0.0002, "epoch": 4.157271095152603, "step": 57890}, {"loss": 0.6033, "grad_norm": 0.9359684586524963, "learning_rate": 0.0002, "epoch": 4.157989228007182, "step": 57900}, {"loss": 0.5546, "grad_norm": 0.8971988558769226, "learning_rate": 0.0002, "epoch": 4.15870736086176, "step": 57910}, {"loss": 0.5934, "grad_norm": 0.8848021030426025, "learning_rate": 0.0002, "epoch": 4.159425493716338, "step": 57920}, {"loss": 0.6102, "grad_norm": 0.982877790927887, "learning_rate": 0.0002, "epoch": 4.160143626570916, "step": 57930}, {"loss": 0.6091, "grad_norm": 0.8668819069862366, "learning_rate": 0.0002, "epoch": 4.160861759425494, "step": 57940}, {"loss": 0.5969, "grad_norm": 1.06569504737854, "learning_rate": 0.0002, "epoch": 4.161579892280072, "step": 57950}, {"loss": 0.5799, "grad_norm": 1.165740728378296, "learning_rate": 0.0002, "epoch": 4.16229802513465, "step": 57960}, {"loss": 0.6038, "grad_norm": 1.0534512996673584, "learning_rate": 0.0002, "epoch": 4.163016157989228, "step": 57970}, {"loss": 0.594, "grad_norm": 0.8785330653190613, "learning_rate": 0.0002, "epoch": 4.163734290843806, "step": 57980}, {"loss": 0.5981, "grad_norm": 1.1244874000549316, "learning_rate": 0.0002, "epoch": 4.164452423698384, "step": 57990}, {"loss": 0.6456, "grad_norm": 0.8839399218559265, "learning_rate": 0.0002, "epoch": 4.165170556552963, "step": 58000}, {"loss": 0.5767, "grad_norm": 1.0603798627853394, "learning_rate": 0.0002, "epoch": 4.165888689407541, "step": 58010}, {"loss": 0.6334, "grad_norm": 0.9737853407859802, "learning_rate": 0.0002, "epoch": 4.166606822262119, "step": 58020}, {"loss": 0.5901, "grad_norm": 1.0650558471679688, "learning_rate": 0.0002, "epoch": 4.167324955116697, "step": 58030}, {"loss": 0.6549, "grad_norm": 0.7528959512710571, "learning_rate": 0.0002, "epoch": 4.168043087971275, "step": 58040}, {"loss": 0.5593, "grad_norm": 0.9286156892776489, "learning_rate": 0.0002, "epoch": 4.168761220825853, "step": 58050}, {"loss": 0.6093, "grad_norm": 1.0225880146026611, "learning_rate": 0.0002, "epoch": 4.169479353680431, "step": 58060}, {"loss": 0.5993, "grad_norm": 0.9990654587745667, "learning_rate": 0.0002, "epoch": 4.170197486535009, "step": 58070}, {"loss": 0.6002, "grad_norm": 1.052057147026062, "learning_rate": 0.0002, "epoch": 4.170915619389587, "step": 58080}, {"loss": 0.5911, "grad_norm": 0.7366801500320435, "learning_rate": 0.0002, "epoch": 4.1716337522441655, "step": 58090}, {"loss": 0.6273, "grad_norm": 1.0943711996078491, "learning_rate": 0.0002, "epoch": 4.1723518850987436, "step": 58100}, {"loss": 0.6095, "grad_norm": 1.1297656297683716, "learning_rate": 0.0002, "epoch": 4.1730700179533216, "step": 58110}, {"loss": 0.6123, "grad_norm": 0.7861461639404297, "learning_rate": 0.0002, "epoch": 4.1737881508078996, "step": 58120}, {"loss": 0.6188, "grad_norm": 0.8643335103988647, "learning_rate": 0.0002, "epoch": 4.174506283662478, "step": 58130}, {"loss": 0.6103, "grad_norm": 0.957288384437561, "learning_rate": 0.0002, "epoch": 4.175224416517056, "step": 58140}, {"loss": 0.5636, "grad_norm": 0.9175366759300232, "learning_rate": 0.0002, "epoch": 4.175942549371634, "step": 58150}, {"loss": 0.6288, "grad_norm": 1.129935622215271, "learning_rate": 0.0002, "epoch": 4.176660682226212, "step": 58160}, {"loss": 0.5969, "grad_norm": 0.9683087468147278, "learning_rate": 0.0002, "epoch": 4.17737881508079, "step": 58170}, {"loss": 0.6249, "grad_norm": 1.045171856880188, "learning_rate": 0.0002, "epoch": 4.1780969479353685, "step": 58180}, {"loss": 0.5611, "grad_norm": 0.9858742952346802, "learning_rate": 0.0002, "epoch": 4.1788150807899465, "step": 58190}, {"loss": 0.5946, "grad_norm": 0.8513413071632385, "learning_rate": 0.0002, "epoch": 4.1795332136445245, "step": 58200}, {"loss": 0.5928, "grad_norm": 0.9584265947341919, "learning_rate": 0.0002, "epoch": 4.1802513464991025, "step": 58210}, {"loss": 0.5864, "grad_norm": 0.8828920722007751, "learning_rate": 0.0002, "epoch": 4.1809694793536805, "step": 58220}, {"loss": 0.5745, "grad_norm": 0.9849961400032043, "learning_rate": 0.0002, "epoch": 4.1816876122082585, "step": 58230}, {"loss": 0.5355, "grad_norm": 1.0601637363433838, "learning_rate": 0.0002, "epoch": 4.1824057450628365, "step": 58240}, {"loss": 0.6063, "grad_norm": 1.2206604480743408, "learning_rate": 0.0002, "epoch": 4.1831238779174145, "step": 58250}, {"loss": 0.6176, "grad_norm": 1.1768009662628174, "learning_rate": 0.0002, "epoch": 4.1838420107719925, "step": 58260}, {"loss": 0.5572, "grad_norm": 0.9521295428276062, "learning_rate": 0.0002, "epoch": 4.184560143626571, "step": 58270}, {"loss": 0.5978, "grad_norm": 0.892971932888031, "learning_rate": 0.0002, "epoch": 4.185278276481149, "step": 58280}, {"loss": 0.5727, "grad_norm": 0.8712016940116882, "learning_rate": 0.0002, "epoch": 4.185996409335727, "step": 58290}, {"loss": 0.6124, "grad_norm": 1.0190843343734741, "learning_rate": 0.0002, "epoch": 4.186714542190305, "step": 58300}, {"loss": 0.6324, "grad_norm": 1.0149270296096802, "learning_rate": 0.0002, "epoch": 4.187432675044883, "step": 58310}, {"loss": 0.6337, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 4.188150807899461, "step": 58320}, {"loss": 0.5588, "grad_norm": 0.7892335653305054, "learning_rate": 0.0002, "epoch": 4.188868940754039, "step": 58330}, {"loss": 0.6132, "grad_norm": 0.9792808890342712, "learning_rate": 0.0002, "epoch": 4.189587073608617, "step": 58340}, {"loss": 0.5841, "grad_norm": 0.9946883320808411, "learning_rate": 0.0002, "epoch": 4.190305206463195, "step": 58350}, {"loss": 0.6043, "grad_norm": 1.0363789796829224, "learning_rate": 0.0002, "epoch": 4.191023339317773, "step": 58360}, {"loss": 0.5843, "grad_norm": 0.9285917282104492, "learning_rate": 0.0002, "epoch": 4.191741472172352, "step": 58370}, {"loss": 0.6042, "grad_norm": 0.9461679458618164, "learning_rate": 0.0002, "epoch": 4.19245960502693, "step": 58380}, {"loss": 0.5666, "grad_norm": 1.0344175100326538, "learning_rate": 0.0002, "epoch": 4.193177737881508, "step": 58390}, {"loss": 0.6032, "grad_norm": 0.9530242085456848, "learning_rate": 0.0002, "epoch": 4.193895870736086, "step": 58400}, {"loss": 0.5887, "grad_norm": 0.9171900749206543, "learning_rate": 0.0002, "epoch": 4.194614003590664, "step": 58410}, {"loss": 0.6116, "grad_norm": 0.8094898462295532, "learning_rate": 0.0002, "epoch": 4.195332136445242, "step": 58420}, {"loss": 0.5268, "grad_norm": 0.921981930732727, "learning_rate": 0.0002, "epoch": 4.19605026929982, "step": 58430}, {"loss": 0.551, "grad_norm": 0.9783532023429871, "learning_rate": 0.0002, "epoch": 4.196768402154398, "step": 58440}, {"loss": 0.5774, "grad_norm": 1.017805576324463, "learning_rate": 0.0002, "epoch": 4.197486535008976, "step": 58450}, {"loss": 0.6261, "grad_norm": 0.9244308471679688, "learning_rate": 0.0002, "epoch": 4.198204667863555, "step": 58460}, {"loss": 0.6247, "grad_norm": 0.9942585229873657, "learning_rate": 0.0002, "epoch": 4.198922800718133, "step": 58470}, {"loss": 0.5803, "grad_norm": 1.1045037508010864, "learning_rate": 0.0002, "epoch": 4.199640933572711, "step": 58480}, {"loss": 0.5846, "grad_norm": 0.9483149647712708, "learning_rate": 0.0002, "epoch": 4.200359066427289, "step": 58490}, {"loss": 0.5997, "grad_norm": 1.0807271003723145, "learning_rate": 0.0002, "epoch": 4.201077199281867, "step": 58500}, {"loss": 0.5474, "grad_norm": 0.7697445750236511, "learning_rate": 0.0002, "epoch": 4.201795332136445, "step": 58510}, {"loss": 0.5692, "grad_norm": 1.0761178731918335, "learning_rate": 0.0002, "epoch": 4.202513464991023, "step": 58520}, {"loss": 0.5667, "grad_norm": 0.9992024898529053, "learning_rate": 0.0002, "epoch": 4.203231597845601, "step": 58530}, {"loss": 0.5606, "grad_norm": 0.8741498589515686, "learning_rate": 0.0002, "epoch": 4.203949730700179, "step": 58540}, {"loss": 0.6012, "grad_norm": 0.8557528853416443, "learning_rate": 0.0002, "epoch": 4.204667863554757, "step": 58550}, {"loss": 0.5191, "grad_norm": 0.8853630423545837, "learning_rate": 0.0002, "epoch": 4.205385996409336, "step": 58560}, {"loss": 0.5806, "grad_norm": 0.9858933687210083, "learning_rate": 0.0002, "epoch": 4.206104129263914, "step": 58570}, {"loss": 0.5908, "grad_norm": 1.104732871055603, "learning_rate": 0.0002, "epoch": 4.206822262118492, "step": 58580}, {"loss": 0.5993, "grad_norm": 0.9345462322235107, "learning_rate": 0.0002, "epoch": 4.20754039497307, "step": 58590}, {"loss": 0.6101, "grad_norm": 0.9620407819747925, "learning_rate": 0.0002, "epoch": 4.208258527827648, "step": 58600}, {"loss": 0.5848, "grad_norm": 0.8546963334083557, "learning_rate": 0.0002, "epoch": 4.208976660682226, "step": 58610}, {"loss": 0.5747, "grad_norm": 0.8125145435333252, "learning_rate": 0.0002, "epoch": 4.209694793536804, "step": 58620}, {"loss": 0.604, "grad_norm": 0.8481138944625854, "learning_rate": 0.0002, "epoch": 4.210412926391382, "step": 58630}, {"loss": 0.5928, "grad_norm": 0.8884692788124084, "learning_rate": 0.0002, "epoch": 4.21113105924596, "step": 58640}, {"loss": 0.5612, "grad_norm": 1.09279465675354, "learning_rate": 0.0002, "epoch": 4.211849192100539, "step": 58650}, {"loss": 0.644, "grad_norm": 0.9806583523750305, "learning_rate": 0.0002, "epoch": 4.212567324955117, "step": 58660}, {"loss": 0.5737, "grad_norm": 0.9510366916656494, "learning_rate": 0.0002, "epoch": 4.213285457809695, "step": 58670}, {"loss": 0.5996, "grad_norm": 0.7517459988594055, "learning_rate": 0.0002, "epoch": 4.214003590664273, "step": 58680}, {"loss": 0.6274, "grad_norm": 1.1134123802185059, "learning_rate": 0.0002, "epoch": 4.214721723518851, "step": 58690}, {"loss": 0.5842, "grad_norm": 0.8307328820228577, "learning_rate": 0.0002, "epoch": 4.215439856373429, "step": 58700}, {"loss": 0.5795, "grad_norm": 0.8211639523506165, "learning_rate": 0.0002, "epoch": 4.216157989228007, "step": 58710}, {"loss": 0.5613, "grad_norm": 1.0749584436416626, "learning_rate": 0.0002, "epoch": 4.216876122082585, "step": 58720}, {"loss": 0.5956, "grad_norm": 1.1394833326339722, "learning_rate": 0.0002, "epoch": 4.217594254937163, "step": 58730}, {"loss": 0.609, "grad_norm": 1.05130934715271, "learning_rate": 0.0002, "epoch": 4.218312387791742, "step": 58740}, {"loss": 0.6294, "grad_norm": 0.7949456572532654, "learning_rate": 0.0002, "epoch": 4.21903052064632, "step": 58750}, {"loss": 0.6148, "grad_norm": 0.906506359577179, "learning_rate": 0.0002, "epoch": 4.219748653500898, "step": 58760}, {"loss": 0.5778, "grad_norm": 0.8338989615440369, "learning_rate": 0.0002, "epoch": 4.220466786355476, "step": 58770}, {"loss": 0.5402, "grad_norm": 0.9325370788574219, "learning_rate": 0.0002, "epoch": 4.221184919210054, "step": 58780}, {"loss": 0.5657, "grad_norm": 1.0208096504211426, "learning_rate": 0.0002, "epoch": 4.221903052064632, "step": 58790}, {"loss": 0.6523, "grad_norm": 1.0075920820236206, "learning_rate": 0.0002, "epoch": 4.22262118491921, "step": 58800}, {"loss": 0.5545, "grad_norm": 0.9858701229095459, "learning_rate": 0.0002, "epoch": 4.223339317773788, "step": 58810}, {"loss": 0.6343, "grad_norm": 1.0010110139846802, "learning_rate": 0.0002, "epoch": 4.224057450628366, "step": 58820}, {"loss": 0.5991, "grad_norm": 0.9360540509223938, "learning_rate": 0.0002, "epoch": 4.224775583482945, "step": 58830}, {"loss": 0.5887, "grad_norm": 0.9021786451339722, "learning_rate": 0.0002, "epoch": 4.225493716337523, "step": 58840}, {"loss": 0.6132, "grad_norm": 1.1778476238250732, "learning_rate": 0.0002, "epoch": 4.226211849192101, "step": 58850}, {"loss": 0.5956, "grad_norm": 1.0061023235321045, "learning_rate": 0.0002, "epoch": 4.226929982046679, "step": 58860}, {"loss": 0.5846, "grad_norm": 0.8839752674102783, "learning_rate": 0.0002, "epoch": 4.227648114901257, "step": 58870}, {"loss": 0.6129, "grad_norm": 1.0078870058059692, "learning_rate": 0.0002, "epoch": 4.228366247755835, "step": 58880}, {"loss": 0.6403, "grad_norm": 0.8926451206207275, "learning_rate": 0.0002, "epoch": 4.229084380610413, "step": 58890}, {"loss": 0.5987, "grad_norm": 1.4018772840499878, "learning_rate": 0.0002, "epoch": 4.229802513464991, "step": 58900}, {"loss": 0.5925, "grad_norm": 0.9911289215087891, "learning_rate": 0.0002, "epoch": 4.230520646319569, "step": 58910}, {"loss": 0.5846, "grad_norm": 0.9374576807022095, "learning_rate": 0.0002, "epoch": 4.231238779174147, "step": 58920}, {"loss": 0.5856, "grad_norm": 1.179650068283081, "learning_rate": 0.0002, "epoch": 4.231956912028726, "step": 58930}, {"loss": 0.601, "grad_norm": 0.9434911012649536, "learning_rate": 0.0002, "epoch": 4.232675044883304, "step": 58940}, {"loss": 0.6137, "grad_norm": 1.0061911344528198, "learning_rate": 0.0002, "epoch": 4.233393177737882, "step": 58950}, {"loss": 0.5847, "grad_norm": 0.9663233757019043, "learning_rate": 0.0002, "epoch": 4.23411131059246, "step": 58960}, {"loss": 0.5748, "grad_norm": 0.8897581696510315, "learning_rate": 0.0002, "epoch": 4.234829443447038, "step": 58970}, {"loss": 0.5586, "grad_norm": 0.873281717300415, "learning_rate": 0.0002, "epoch": 4.235547576301616, "step": 58980}, {"loss": 0.6027, "grad_norm": 0.9146949052810669, "learning_rate": 0.0002, "epoch": 4.236265709156194, "step": 58990}, {"loss": 0.6356, "grad_norm": 0.9381195306777954, "learning_rate": 0.0002, "epoch": 4.236983842010772, "step": 59000}, {"loss": 0.5641, "grad_norm": 0.9700697064399719, "learning_rate": 0.0002, "epoch": 4.23770197486535, "step": 59010}, {"loss": 0.6099, "grad_norm": 0.9050154685974121, "learning_rate": 0.0002, "epoch": 4.238420107719929, "step": 59020}, {"loss": 0.552, "grad_norm": 0.9901503324508667, "learning_rate": 0.0002, "epoch": 4.239138240574507, "step": 59030}, {"loss": 0.6333, "grad_norm": 0.9009594321250916, "learning_rate": 0.0002, "epoch": 4.239856373429085, "step": 59040}, {"loss": 0.6104, "grad_norm": 1.0924968719482422, "learning_rate": 0.0002, "epoch": 4.240574506283663, "step": 59050}, {"loss": 0.6269, "grad_norm": 0.9939947724342346, "learning_rate": 0.0002, "epoch": 4.241292639138241, "step": 59060}, {"loss": 0.6039, "grad_norm": 1.0577857494354248, "learning_rate": 0.0002, "epoch": 4.242010771992819, "step": 59070}, {"loss": 0.5992, "grad_norm": 1.0836747884750366, "learning_rate": 0.0002, "epoch": 4.242728904847397, "step": 59080}, {"loss": 0.6518, "grad_norm": 0.97043377161026, "learning_rate": 0.0002, "epoch": 4.243447037701975, "step": 59090}, {"loss": 0.5877, "grad_norm": 0.7711901664733887, "learning_rate": 0.0002, "epoch": 4.244165170556553, "step": 59100}, {"loss": 0.6017, "grad_norm": 1.0143170356750488, "learning_rate": 0.0002, "epoch": 4.244883303411131, "step": 59110}, {"loss": 0.6245, "grad_norm": 0.9151925444602966, "learning_rate": 0.0002, "epoch": 4.2456014362657095, "step": 59120}, {"loss": 0.6436, "grad_norm": 0.9252700209617615, "learning_rate": 0.0002, "epoch": 4.2463195691202875, "step": 59130}, {"loss": 0.5696, "grad_norm": 0.8429408073425293, "learning_rate": 0.0002, "epoch": 4.2470377019748655, "step": 59140}, {"loss": 0.5737, "grad_norm": 0.9645987153053284, "learning_rate": 0.0002, "epoch": 4.2477558348294435, "step": 59150}, {"loss": 0.6045, "grad_norm": 0.9949791431427002, "learning_rate": 0.0002, "epoch": 4.2484739676840215, "step": 59160}, {"loss": 0.6069, "grad_norm": 0.9128350615501404, "learning_rate": 0.0002, "epoch": 4.2491921005385995, "step": 59170}, {"loss": 0.596, "grad_norm": 0.7406911849975586, "learning_rate": 0.0002, "epoch": 4.2499102333931775, "step": 59180}, {"loss": 0.5796, "grad_norm": 1.0237419605255127, "learning_rate": 0.0002, "epoch": 4.2506283662477555, "step": 59190}, {"loss": 0.631, "grad_norm": 0.805459201335907, "learning_rate": 0.0002, "epoch": 4.2513464991023335, "step": 59200}, {"loss": 0.6104, "grad_norm": 0.8477254509925842, "learning_rate": 0.0002, "epoch": 4.252064631956912, "step": 59210}, {"loss": 0.5608, "grad_norm": 0.984023928642273, "learning_rate": 0.0002, "epoch": 4.25278276481149, "step": 59220}, {"loss": 0.6185, "grad_norm": 1.0667484998703003, "learning_rate": 0.0002, "epoch": 4.253500897666068, "step": 59230}, {"loss": 0.5596, "grad_norm": 0.7192284464836121, "learning_rate": 0.0002, "epoch": 4.254219030520646, "step": 59240}, {"loss": 0.5971, "grad_norm": 0.9557451009750366, "learning_rate": 0.0002, "epoch": 4.254937163375224, "step": 59250}, {"loss": 0.6012, "grad_norm": 0.9209784865379333, "learning_rate": 0.0002, "epoch": 4.255655296229802, "step": 59260}, {"loss": 0.67, "grad_norm": 0.9785363674163818, "learning_rate": 0.0002, "epoch": 4.25637342908438, "step": 59270}, {"loss": 0.6185, "grad_norm": 0.910214364528656, "learning_rate": 0.0002, "epoch": 4.257091561938958, "step": 59280}, {"loss": 0.6451, "grad_norm": 0.8945858478546143, "learning_rate": 0.0002, "epoch": 4.257809694793536, "step": 59290}, {"loss": 0.5876, "grad_norm": 1.0984420776367188, "learning_rate": 0.0002, "epoch": 4.258527827648114, "step": 59300}, {"loss": 0.5616, "grad_norm": 1.0256640911102295, "learning_rate": 0.0002, "epoch": 4.259245960502693, "step": 59310}, {"loss": 0.5825, "grad_norm": 0.978397786617279, "learning_rate": 0.0002, "epoch": 4.259964093357271, "step": 59320}, {"loss": 0.6043, "grad_norm": 0.7587000727653503, "learning_rate": 0.0002, "epoch": 4.260682226211849, "step": 59330}, {"loss": 0.5616, "grad_norm": 0.9384620785713196, "learning_rate": 0.0002, "epoch": 4.261400359066427, "step": 59340}, {"loss": 0.6669, "grad_norm": 0.893992006778717, "learning_rate": 0.0002, "epoch": 4.262118491921005, "step": 59350}, {"loss": 0.561, "grad_norm": 1.0231536626815796, "learning_rate": 0.0002, "epoch": 4.262836624775583, "step": 59360}, {"loss": 0.5912, "grad_norm": 0.9810128211975098, "learning_rate": 0.0002, "epoch": 4.263554757630161, "step": 59370}, {"loss": 0.5871, "grad_norm": 1.0868116617202759, "learning_rate": 0.0002, "epoch": 4.264272890484739, "step": 59380}, {"loss": 0.5986, "grad_norm": 1.1433676481246948, "learning_rate": 0.0002, "epoch": 4.264991023339318, "step": 59390}, {"loss": 0.6306, "grad_norm": 0.9836946725845337, "learning_rate": 0.0002, "epoch": 4.265709156193896, "step": 59400}, {"loss": 0.5854, "grad_norm": 0.9473603963851929, "learning_rate": 0.0002, "epoch": 4.266427289048474, "step": 59410}, {"loss": 0.6095, "grad_norm": 0.9066835641860962, "learning_rate": 0.0002, "epoch": 4.267145421903052, "step": 59420}, {"loss": 0.656, "grad_norm": 1.0534718036651611, "learning_rate": 0.0002, "epoch": 4.26786355475763, "step": 59430}, {"loss": 0.5624, "grad_norm": 1.0392775535583496, "learning_rate": 0.0002, "epoch": 4.268581687612208, "step": 59440}, {"loss": 0.5697, "grad_norm": 1.011472463607788, "learning_rate": 0.0002, "epoch": 4.269299820466786, "step": 59450}, {"loss": 0.5971, "grad_norm": 1.0704147815704346, "learning_rate": 0.0002, "epoch": 4.270017953321364, "step": 59460}, {"loss": 0.5719, "grad_norm": 0.9349238872528076, "learning_rate": 0.0002, "epoch": 4.270736086175942, "step": 59470}, {"loss": 0.5637, "grad_norm": 0.8745087385177612, "learning_rate": 0.0002, "epoch": 4.27145421903052, "step": 59480}, {"loss": 0.6246, "grad_norm": 0.8823763728141785, "learning_rate": 0.0002, "epoch": 4.272172351885099, "step": 59490}, {"loss": 0.6021, "grad_norm": 1.110912799835205, "learning_rate": 0.0002, "epoch": 4.272890484739677, "step": 59500}, {"loss": 0.5939, "grad_norm": 1.0000925064086914, "learning_rate": 0.0002, "epoch": 4.273608617594255, "step": 59510}, {"loss": 0.5531, "grad_norm": 1.1578227281570435, "learning_rate": 0.0002, "epoch": 4.274326750448833, "step": 59520}, {"loss": 0.6372, "grad_norm": 0.875720202922821, "learning_rate": 0.0002, "epoch": 4.275044883303411, "step": 59530}, {"loss": 0.5956, "grad_norm": 0.9562238454818726, "learning_rate": 0.0002, "epoch": 4.275763016157989, "step": 59540}, {"loss": 0.5996, "grad_norm": 0.8384222388267517, "learning_rate": 0.0002, "epoch": 4.276481149012567, "step": 59550}, {"loss": 0.6001, "grad_norm": 1.2719428539276123, "learning_rate": 0.0002, "epoch": 4.277199281867145, "step": 59560}, {"loss": 0.6286, "grad_norm": 1.0656434297561646, "learning_rate": 0.0002, "epoch": 4.277917414721723, "step": 59570}, {"loss": 0.5895, "grad_norm": 1.0766716003417969, "learning_rate": 0.0002, "epoch": 4.278635547576302, "step": 59580}, {"loss": 0.5831, "grad_norm": 0.8892807960510254, "learning_rate": 0.0002, "epoch": 4.27935368043088, "step": 59590}, {"loss": 0.5717, "grad_norm": 0.8956300020217896, "learning_rate": 0.0002, "epoch": 4.280071813285458, "step": 59600}, {"loss": 0.5965, "grad_norm": 0.9562926888465881, "learning_rate": 0.0002, "epoch": 4.280789946140036, "step": 59610}, {"loss": 0.5487, "grad_norm": 1.009141445159912, "learning_rate": 0.0002, "epoch": 4.281508078994614, "step": 59620}, {"loss": 0.6337, "grad_norm": 1.0546064376831055, "learning_rate": 0.0002, "epoch": 4.282226211849192, "step": 59630}, {"loss": 0.5771, "grad_norm": 0.8831254243850708, "learning_rate": 0.0002, "epoch": 4.28294434470377, "step": 59640}, {"loss": 0.6241, "grad_norm": 0.9560053944587708, "learning_rate": 0.0002, "epoch": 4.283662477558348, "step": 59650}, {"loss": 0.6012, "grad_norm": 1.030339241027832, "learning_rate": 0.0002, "epoch": 4.284380610412926, "step": 59660}, {"loss": 0.6174, "grad_norm": 1.00662100315094, "learning_rate": 0.0002, "epoch": 4.285098743267504, "step": 59670}, {"loss": 0.5802, "grad_norm": 1.0759116411209106, "learning_rate": 0.0002, "epoch": 4.285816876122083, "step": 59680}, {"loss": 0.6429, "grad_norm": 0.9985393285751343, "learning_rate": 0.0002, "epoch": 4.286535008976661, "step": 59690}, {"loss": 0.5992, "grad_norm": 0.9044474959373474, "learning_rate": 0.0002, "epoch": 4.287253141831239, "step": 59700}, {"loss": 0.6263, "grad_norm": 1.1224442720413208, "learning_rate": 0.0002, "epoch": 4.287971274685817, "step": 59710}, {"loss": 0.6118, "grad_norm": 0.8436414003372192, "learning_rate": 0.0002, "epoch": 4.288689407540395, "step": 59720}, {"loss": 0.5881, "grad_norm": 1.0695041418075562, "learning_rate": 0.0002, "epoch": 4.289407540394973, "step": 59730}, {"loss": 0.5994, "grad_norm": 0.8809951543807983, "learning_rate": 0.0002, "epoch": 4.290125673249551, "step": 59740}, {"loss": 0.6508, "grad_norm": 1.0213792324066162, "learning_rate": 0.0002, "epoch": 4.290843806104129, "step": 59750}, {"loss": 0.5851, "grad_norm": 0.9660196900367737, "learning_rate": 0.0002, "epoch": 4.291561938958707, "step": 59760}, {"loss": 0.6582, "grad_norm": 0.8005787134170532, "learning_rate": 0.0002, "epoch": 4.292280071813286, "step": 59770}, {"loss": 0.6504, "grad_norm": 1.0016109943389893, "learning_rate": 0.0002, "epoch": 4.292998204667864, "step": 59780}, {"loss": 0.5765, "grad_norm": 0.9112903475761414, "learning_rate": 0.0002, "epoch": 4.293716337522442, "step": 59790}, {"loss": 0.5925, "grad_norm": 0.9999852180480957, "learning_rate": 0.0002, "epoch": 4.29443447037702, "step": 59800}, {"loss": 0.636, "grad_norm": 0.9323953986167908, "learning_rate": 0.0002, "epoch": 4.295152603231598, "step": 59810}, {"loss": 0.5743, "grad_norm": 0.903037965297699, "learning_rate": 0.0002, "epoch": 4.295870736086176, "step": 59820}, {"loss": 0.6008, "grad_norm": 1.2462431192398071, "learning_rate": 0.0002, "epoch": 4.296588868940754, "step": 59830}, {"loss": 0.6126, "grad_norm": 1.2322230339050293, "learning_rate": 0.0002, "epoch": 4.297307001795332, "step": 59840}, {"loss": 0.6029, "grad_norm": 0.9584668278694153, "learning_rate": 0.0002, "epoch": 4.29802513464991, "step": 59850}, {"loss": 0.6179, "grad_norm": 0.9664767980575562, "learning_rate": 0.0002, "epoch": 4.298743267504488, "step": 59860}, {"loss": 0.5909, "grad_norm": 0.8860437273979187, "learning_rate": 0.0002, "epoch": 4.299461400359067, "step": 59870}, {"loss": 0.5708, "grad_norm": 1.0825127363204956, "learning_rate": 0.0002, "epoch": 4.300179533213645, "step": 59880}, {"loss": 0.6338, "grad_norm": 1.1312100887298584, "learning_rate": 0.0002, "epoch": 4.300897666068223, "step": 59890}, {"loss": 0.6362, "grad_norm": 0.8289751410484314, "learning_rate": 0.0002, "epoch": 4.301615798922801, "step": 59900}, {"loss": 0.6061, "grad_norm": 0.8990927934646606, "learning_rate": 0.0002, "epoch": 4.302333931777379, "step": 59910}, {"loss": 0.5993, "grad_norm": 0.9667525887489319, "learning_rate": 0.0002, "epoch": 4.303052064631957, "step": 59920}, {"loss": 0.5756, "grad_norm": 0.8656060695648193, "learning_rate": 0.0002, "epoch": 4.303770197486535, "step": 59930}, {"loss": 0.6271, "grad_norm": 0.8909396529197693, "learning_rate": 0.0002, "epoch": 4.304488330341113, "step": 59940}, {"loss": 0.5918, "grad_norm": 0.9533283114433289, "learning_rate": 0.0002, "epoch": 4.305206463195692, "step": 59950}, {"loss": 0.6146, "grad_norm": 0.9090739488601685, "learning_rate": 0.0002, "epoch": 4.30592459605027, "step": 59960}, {"loss": 0.5949, "grad_norm": 1.096656322479248, "learning_rate": 0.0002, "epoch": 4.306642728904848, "step": 59970}, {"loss": 0.582, "grad_norm": 1.0392465591430664, "learning_rate": 0.0002, "epoch": 4.307360861759426, "step": 59980}, {"loss": 0.6552, "grad_norm": 0.8733913898468018, "learning_rate": 0.0002, "epoch": 4.308078994614004, "step": 59990}, {"loss": 0.5771, "grad_norm": 0.8287094235420227, "learning_rate": 0.0002, "epoch": 4.308797127468582, "step": 60000}, {"loss": 0.6157, "grad_norm": 0.9267017245292664, "learning_rate": 0.0002, "epoch": 4.30951526032316, "step": 60010}, {"loss": 0.6402, "grad_norm": 0.9969515800476074, "learning_rate": 0.0002, "epoch": 4.310233393177738, "step": 60020}, {"loss": 0.541, "grad_norm": 1.0005015134811401, "learning_rate": 0.0002, "epoch": 4.310951526032316, "step": 60030}, {"loss": 0.6295, "grad_norm": 1.1215369701385498, "learning_rate": 0.0002, "epoch": 4.311669658886894, "step": 60040}, {"loss": 0.6225, "grad_norm": 1.0434890985488892, "learning_rate": 0.0002, "epoch": 4.312387791741473, "step": 60050}, {"loss": 0.5962, "grad_norm": 0.967989981174469, "learning_rate": 0.0002, "epoch": 4.313105924596051, "step": 60060}, {"loss": 0.5862, "grad_norm": 1.007599115371704, "learning_rate": 0.0002, "epoch": 4.313824057450629, "step": 60070}, {"loss": 0.6233, "grad_norm": 0.9356340765953064, "learning_rate": 0.0002, "epoch": 4.314542190305207, "step": 60080}, {"loss": 0.5642, "grad_norm": 0.9566757678985596, "learning_rate": 0.0002, "epoch": 4.315260323159785, "step": 60090}, {"loss": 0.6142, "grad_norm": 1.1066830158233643, "learning_rate": 0.0002, "epoch": 4.315978456014363, "step": 60100}, {"loss": 0.5432, "grad_norm": 0.9895772933959961, "learning_rate": 0.0002, "epoch": 4.316696588868941, "step": 60110}, {"loss": 0.5542, "grad_norm": 1.07423734664917, "learning_rate": 0.0002, "epoch": 4.317414721723519, "step": 60120}, {"loss": 0.5975, "grad_norm": 1.0777037143707275, "learning_rate": 0.0002, "epoch": 4.318132854578097, "step": 60130}, {"loss": 0.6168, "grad_norm": 1.1475656032562256, "learning_rate": 0.0002, "epoch": 4.3188509874326755, "step": 60140}, {"loss": 0.6038, "grad_norm": 1.0705864429473877, "learning_rate": 0.0002, "epoch": 4.3195691202872535, "step": 60150}, {"loss": 0.6032, "grad_norm": 0.8676854968070984, "learning_rate": 0.0002, "epoch": 4.3202872531418315, "step": 60160}, {"loss": 0.632, "grad_norm": 0.9488174319267273, "learning_rate": 0.0002, "epoch": 4.3210053859964095, "step": 60170}, {"loss": 0.6137, "grad_norm": 1.1171153783798218, "learning_rate": 0.0002, "epoch": 4.3217235188509875, "step": 60180}, {"loss": 0.6477, "grad_norm": 1.091435194015503, "learning_rate": 0.0002, "epoch": 4.3224416517055655, "step": 60190}, {"loss": 0.6105, "grad_norm": 0.880944013595581, "learning_rate": 0.0002, "epoch": 4.3231597845601435, "step": 60200}, {"loss": 0.5736, "grad_norm": 0.8458809852600098, "learning_rate": 0.0002, "epoch": 4.3238779174147215, "step": 60210}, {"loss": 0.6211, "grad_norm": 0.7900225520133972, "learning_rate": 0.0002, "epoch": 4.3245960502692995, "step": 60220}, {"loss": 0.6205, "grad_norm": 0.966742753982544, "learning_rate": 0.0002, "epoch": 4.3253141831238775, "step": 60230}, {"loss": 0.6178, "grad_norm": 0.8948110342025757, "learning_rate": 0.0002, "epoch": 4.326032315978456, "step": 60240}, {"loss": 0.6176, "grad_norm": 0.8598700165748596, "learning_rate": 0.0002, "epoch": 4.326750448833034, "step": 60250}, {"loss": 0.6373, "grad_norm": 1.127610206604004, "learning_rate": 0.0002, "epoch": 4.327468581687612, "step": 60260}, {"loss": 0.6081, "grad_norm": 0.8357340693473816, "learning_rate": 0.0002, "epoch": 4.32818671454219, "step": 60270}, {"loss": 0.5839, "grad_norm": 0.8771896362304688, "learning_rate": 0.0002, "epoch": 4.328904847396768, "step": 60280}, {"loss": 0.5959, "grad_norm": 0.9202101826667786, "learning_rate": 0.0002, "epoch": 4.329622980251346, "step": 60290}, {"loss": 0.6387, "grad_norm": 1.1427538394927979, "learning_rate": 0.0002, "epoch": 4.330341113105924, "step": 60300}, {"loss": 0.6306, "grad_norm": 0.8711863160133362, "learning_rate": 0.0002, "epoch": 4.331059245960502, "step": 60310}, {"loss": 0.6011, "grad_norm": 0.972723662853241, "learning_rate": 0.0002, "epoch": 4.33177737881508, "step": 60320}, {"loss": 0.5761, "grad_norm": 1.1496877670288086, "learning_rate": 0.0002, "epoch": 4.332495511669659, "step": 60330}, {"loss": 0.6472, "grad_norm": 1.008581519126892, "learning_rate": 0.0002, "epoch": 4.333213644524237, "step": 60340}, {"loss": 0.6479, "grad_norm": 1.0802706480026245, "learning_rate": 0.0002, "epoch": 4.333931777378815, "step": 60350}, {"loss": 0.6105, "grad_norm": 0.8394291996955872, "learning_rate": 0.0002, "epoch": 4.334649910233393, "step": 60360}, {"loss": 0.6241, "grad_norm": 0.8355905413627625, "learning_rate": 0.0002, "epoch": 4.335368043087971, "step": 60370}, {"loss": 0.6282, "grad_norm": 0.9583960175514221, "learning_rate": 0.0002, "epoch": 4.336086175942549, "step": 60380}, {"loss": 0.6436, "grad_norm": 1.138934850692749, "learning_rate": 0.0002, "epoch": 4.336804308797127, "step": 60390}, {"loss": 0.587, "grad_norm": 1.0334709882736206, "learning_rate": 0.0002, "epoch": 4.337522441651705, "step": 60400}, {"loss": 0.5596, "grad_norm": 0.729686439037323, "learning_rate": 0.0002, "epoch": 4.338240574506283, "step": 60410}, {"loss": 0.5863, "grad_norm": 0.8735929727554321, "learning_rate": 0.0002, "epoch": 4.338958707360861, "step": 60420}, {"loss": 0.5732, "grad_norm": 0.9617681503295898, "learning_rate": 0.0002, "epoch": 4.33967684021544, "step": 60430}, {"loss": 0.5865, "grad_norm": 0.9439655542373657, "learning_rate": 0.0002, "epoch": 4.340394973070018, "step": 60440}, {"loss": 0.5959, "grad_norm": 0.9275408387184143, "learning_rate": 0.0002, "epoch": 4.341113105924596, "step": 60450}, {"loss": 0.6295, "grad_norm": 1.0693308115005493, "learning_rate": 0.0002, "epoch": 4.341831238779174, "step": 60460}, {"loss": 0.6455, "grad_norm": 0.9234438538551331, "learning_rate": 0.0002, "epoch": 4.342549371633752, "step": 60470}, {"loss": 0.6308, "grad_norm": 1.1376168727874756, "learning_rate": 0.0002, "epoch": 4.34326750448833, "step": 60480}, {"loss": 0.623, "grad_norm": 0.9218108654022217, "learning_rate": 0.0002, "epoch": 4.343985637342908, "step": 60490}, {"loss": 0.6291, "grad_norm": 1.1467362642288208, "learning_rate": 0.0002, "epoch": 4.344703770197486, "step": 60500}, {"loss": 0.5757, "grad_norm": 0.9459165930747986, "learning_rate": 0.0002, "epoch": 4.345421903052064, "step": 60510}, {"loss": 0.5963, "grad_norm": 0.9460827708244324, "learning_rate": 0.0002, "epoch": 4.346140035906643, "step": 60520}, {"loss": 0.5822, "grad_norm": 1.0845041275024414, "learning_rate": 0.0002, "epoch": 4.346858168761221, "step": 60530}, {"loss": 0.6326, "grad_norm": 1.082675576210022, "learning_rate": 0.0002, "epoch": 4.347576301615799, "step": 60540}, {"loss": 0.5419, "grad_norm": 0.8443698883056641, "learning_rate": 0.0002, "epoch": 4.348294434470377, "step": 60550}, {"loss": 0.5634, "grad_norm": 1.018393874168396, "learning_rate": 0.0002, "epoch": 4.349012567324955, "step": 60560}, {"loss": 0.6447, "grad_norm": 0.8796373009681702, "learning_rate": 0.0002, "epoch": 4.349730700179533, "step": 60570}, {"loss": 0.6108, "grad_norm": 1.097942590713501, "learning_rate": 0.0002, "epoch": 4.350448833034111, "step": 60580}, {"loss": 0.6161, "grad_norm": 0.8750485181808472, "learning_rate": 0.0002, "epoch": 4.351166965888689, "step": 60590}, {"loss": 0.5849, "grad_norm": 1.0339995622634888, "learning_rate": 0.0002, "epoch": 4.351885098743267, "step": 60600}, {"loss": 0.6097, "grad_norm": 0.9077731966972351, "learning_rate": 0.0002, "epoch": 4.352603231597846, "step": 60610}, {"loss": 0.5657, "grad_norm": 1.051321029663086, "learning_rate": 0.0002, "epoch": 4.353321364452424, "step": 60620}, {"loss": 0.6089, "grad_norm": 1.0018669366836548, "learning_rate": 0.0002, "epoch": 4.354039497307002, "step": 60630}, {"loss": 0.5957, "grad_norm": 1.0349196195602417, "learning_rate": 0.0002, "epoch": 4.35475763016158, "step": 60640}, {"loss": 0.6212, "grad_norm": 1.009589672088623, "learning_rate": 0.0002, "epoch": 4.355475763016158, "step": 60650}, {"loss": 0.5542, "grad_norm": 1.0463480949401855, "learning_rate": 0.0002, "epoch": 4.356193895870736, "step": 60660}, {"loss": 0.5797, "grad_norm": 0.9815132021903992, "learning_rate": 0.0002, "epoch": 4.356912028725314, "step": 60670}, {"loss": 0.6089, "grad_norm": 1.0977262258529663, "learning_rate": 0.0002, "epoch": 4.357630161579892, "step": 60680}, {"loss": 0.6061, "grad_norm": 0.8450005054473877, "learning_rate": 0.0002, "epoch": 4.35834829443447, "step": 60690}, {"loss": 0.5913, "grad_norm": 1.0959078073501587, "learning_rate": 0.0002, "epoch": 4.359066427289049, "step": 60700}, {"loss": 0.5957, "grad_norm": 0.9155098795890808, "learning_rate": 0.0002, "epoch": 4.359784560143627, "step": 60710}, {"loss": 0.6084, "grad_norm": 0.9267987012863159, "learning_rate": 0.0002, "epoch": 4.360502692998205, "step": 60720}, {"loss": 0.5974, "grad_norm": 1.177472472190857, "learning_rate": 0.0002, "epoch": 4.361220825852783, "step": 60730}, {"loss": 0.5911, "grad_norm": 0.8615312576293945, "learning_rate": 0.0002, "epoch": 4.361938958707361, "step": 60740}, {"loss": 0.5819, "grad_norm": 1.0939710140228271, "learning_rate": 0.0002, "epoch": 4.362657091561939, "step": 60750}, {"loss": 0.6263, "grad_norm": 1.0928049087524414, "learning_rate": 0.0002, "epoch": 4.363375224416517, "step": 60760}, {"loss": 0.5772, "grad_norm": 1.0796833038330078, "learning_rate": 0.0002, "epoch": 4.364093357271095, "step": 60770}, {"loss": 0.5879, "grad_norm": 0.9768339991569519, "learning_rate": 0.0002, "epoch": 4.364811490125673, "step": 60780}, {"loss": 0.6335, "grad_norm": 0.9082722067832947, "learning_rate": 0.0002, "epoch": 4.365529622980251, "step": 60790}, {"loss": 0.6037, "grad_norm": 0.9614832997322083, "learning_rate": 0.0002, "epoch": 4.36624775583483, "step": 60800}, {"loss": 0.6185, "grad_norm": 0.8874651789665222, "learning_rate": 0.0002, "epoch": 4.366965888689408, "step": 60810}, {"loss": 0.6524, "grad_norm": 0.8810178637504578, "learning_rate": 0.0002, "epoch": 4.367684021543986, "step": 60820}, {"loss": 0.5908, "grad_norm": 1.0893806219100952, "learning_rate": 0.0002, "epoch": 4.368402154398564, "step": 60830}, {"loss": 0.5782, "grad_norm": 0.9042278528213501, "learning_rate": 0.0002, "epoch": 4.369120287253142, "step": 60840}, {"loss": 0.5798, "grad_norm": 1.0832217931747437, "learning_rate": 0.0002, "epoch": 4.36983842010772, "step": 60850}, {"loss": 0.6235, "grad_norm": 0.9431114792823792, "learning_rate": 0.0002, "epoch": 4.370556552962298, "step": 60860}, {"loss": 0.5869, "grad_norm": 1.031553030014038, "learning_rate": 0.0002, "epoch": 4.371274685816876, "step": 60870}, {"loss": 0.5839, "grad_norm": 0.8702824711799622, "learning_rate": 0.0002, "epoch": 4.371992818671454, "step": 60880}, {"loss": 0.6028, "grad_norm": 1.1109199523925781, "learning_rate": 0.0002, "epoch": 4.372710951526033, "step": 60890}, {"loss": 0.6423, "grad_norm": 0.8369361162185669, "learning_rate": 0.0002, "epoch": 4.373429084380611, "step": 60900}, {"loss": 0.6011, "grad_norm": 0.988915205001831, "learning_rate": 0.0002, "epoch": 4.374147217235189, "step": 60910}, {"loss": 0.6266, "grad_norm": 0.9365919232368469, "learning_rate": 0.0002, "epoch": 4.374865350089767, "step": 60920}, {"loss": 0.5786, "grad_norm": 0.9789398908615112, "learning_rate": 0.0002, "epoch": 4.375583482944345, "step": 60930}, {"loss": 0.6459, "grad_norm": 0.8786931037902832, "learning_rate": 0.0002, "epoch": 4.376301615798923, "step": 60940}, {"loss": 0.631, "grad_norm": 0.8891511559486389, "learning_rate": 0.0002, "epoch": 4.377019748653501, "step": 60950}, {"loss": 0.5909, "grad_norm": 0.9561707377433777, "learning_rate": 0.0002, "epoch": 4.377737881508079, "step": 60960}, {"loss": 0.5815, "grad_norm": 0.8674200177192688, "learning_rate": 0.0002, "epoch": 4.378456014362657, "step": 60970}, {"loss": 0.5664, "grad_norm": 0.9285916090011597, "learning_rate": 0.0002, "epoch": 4.379174147217235, "step": 60980}, {"loss": 0.5727, "grad_norm": 0.9185547232627869, "learning_rate": 0.0002, "epoch": 4.379892280071814, "step": 60990}, {"loss": 0.6296, "grad_norm": 1.081664800643921, "learning_rate": 0.0002, "epoch": 4.380610412926392, "step": 61000}, {"loss": 0.6346, "grad_norm": 1.0475854873657227, "learning_rate": 0.0002, "epoch": 4.38132854578097, "step": 61010}, {"loss": 0.6394, "grad_norm": 1.1519653797149658, "learning_rate": 0.0002, "epoch": 4.382046678635548, "step": 61020}, {"loss": 0.6437, "grad_norm": 0.8757607936859131, "learning_rate": 0.0002, "epoch": 4.382764811490126, "step": 61030}, {"loss": 0.6143, "grad_norm": 0.8707934021949768, "learning_rate": 0.0002, "epoch": 4.383482944344704, "step": 61040}, {"loss": 0.5782, "grad_norm": 1.1807516813278198, "learning_rate": 0.0002, "epoch": 4.384201077199282, "step": 61050}, {"loss": 0.5901, "grad_norm": 1.0674688816070557, "learning_rate": 0.0002, "epoch": 4.38491921005386, "step": 61060}, {"loss": 0.6247, "grad_norm": 0.9321209788322449, "learning_rate": 0.0002, "epoch": 4.385637342908438, "step": 61070}, {"loss": 0.5882, "grad_norm": 1.0786446332931519, "learning_rate": 0.0002, "epoch": 4.3863554757630165, "step": 61080}, {"loss": 0.5966, "grad_norm": 0.9733907580375671, "learning_rate": 0.0002, "epoch": 4.3870736086175945, "step": 61090}, {"loss": 0.5826, "grad_norm": 0.9476010203361511, "learning_rate": 0.0002, "epoch": 4.3877917414721725, "step": 61100}, {"loss": 0.6204, "grad_norm": 1.1321563720703125, "learning_rate": 0.0002, "epoch": 4.3885098743267505, "step": 61110}, {"loss": 0.5908, "grad_norm": 0.9379117488861084, "learning_rate": 0.0002, "epoch": 4.3892280071813286, "step": 61120}, {"loss": 0.586, "grad_norm": 0.8409728407859802, "learning_rate": 0.0002, "epoch": 4.3899461400359066, "step": 61130}, {"loss": 0.614, "grad_norm": 0.8309189081192017, "learning_rate": 0.0002, "epoch": 4.3906642728904846, "step": 61140}, {"loss": 0.6284, "grad_norm": 0.8922196626663208, "learning_rate": 0.0002, "epoch": 4.391382405745063, "step": 61150}, {"loss": 0.6358, "grad_norm": 0.8274614214897156, "learning_rate": 0.0002, "epoch": 4.392100538599641, "step": 61160}, {"loss": 0.5827, "grad_norm": 1.0928618907928467, "learning_rate": 0.0002, "epoch": 4.392818671454219, "step": 61170}, {"loss": 0.616, "grad_norm": 0.9771125316619873, "learning_rate": 0.0002, "epoch": 4.3935368043087974, "step": 61180}, {"loss": 0.6238, "grad_norm": 0.8844535946846008, "learning_rate": 0.0002, "epoch": 4.3942549371633755, "step": 61190}, {"loss": 0.5974, "grad_norm": 1.0498822927474976, "learning_rate": 0.0002, "epoch": 4.3949730700179535, "step": 61200}, {"loss": 0.596, "grad_norm": 0.9882155060768127, "learning_rate": 0.0002, "epoch": 4.3956912028725315, "step": 61210}, {"loss": 0.6385, "grad_norm": 1.090356707572937, "learning_rate": 0.0002, "epoch": 4.3964093357271095, "step": 61220}, {"loss": 0.6298, "grad_norm": 1.0908088684082031, "learning_rate": 0.0002, "epoch": 4.3971274685816875, "step": 61230}, {"loss": 0.6405, "grad_norm": 1.0013501644134521, "learning_rate": 0.0002, "epoch": 4.3978456014362655, "step": 61240}, {"loss": 0.5995, "grad_norm": 1.0916062593460083, "learning_rate": 0.0002, "epoch": 4.3985637342908435, "step": 61250}, {"loss": 0.5938, "grad_norm": 1.0817667245864868, "learning_rate": 0.0002, "epoch": 4.399281867145422, "step": 61260}, {"loss": 0.604, "grad_norm": 0.9745162129402161, "learning_rate": 0.0002, "epoch": 4.4, "step": 61270}, {"loss": 0.6028, "grad_norm": 1.0653400421142578, "learning_rate": 0.0002, "epoch": 4.400718132854578, "step": 61280}, {"loss": 0.6064, "grad_norm": 1.0082067251205444, "learning_rate": 0.0002, "epoch": 4.401436265709156, "step": 61290}, {"loss": 0.5719, "grad_norm": 0.7963659167289734, "learning_rate": 0.0002, "epoch": 4.402154398563734, "step": 61300}, {"loss": 0.6724, "grad_norm": 1.0428845882415771, "learning_rate": 0.0002, "epoch": 4.402872531418312, "step": 61310}, {"loss": 0.5991, "grad_norm": 0.9205707311630249, "learning_rate": 0.0002, "epoch": 4.40359066427289, "step": 61320}, {"loss": 0.6169, "grad_norm": 1.0103533267974854, "learning_rate": 0.0002, "epoch": 4.404308797127468, "step": 61330}, {"loss": 0.6284, "grad_norm": 1.113547682762146, "learning_rate": 0.0002, "epoch": 4.405026929982046, "step": 61340}, {"loss": 0.6071, "grad_norm": 1.137488842010498, "learning_rate": 0.0002, "epoch": 4.405745062836624, "step": 61350}, {"loss": 0.6303, "grad_norm": 1.1284101009368896, "learning_rate": 0.0002, "epoch": 4.406463195691203, "step": 61360}, {"loss": 0.5613, "grad_norm": 0.8010451197624207, "learning_rate": 0.0002, "epoch": 4.407181328545781, "step": 61370}, {"loss": 0.5963, "grad_norm": 0.8893977403640747, "learning_rate": 0.0002, "epoch": 4.407899461400359, "step": 61380}, {"loss": 0.6154, "grad_norm": 0.9098272323608398, "learning_rate": 0.0002, "epoch": 4.408617594254937, "step": 61390}, {"loss": 0.6091, "grad_norm": 1.0613329410552979, "learning_rate": 0.0002, "epoch": 4.409335727109515, "step": 61400}, {"loss": 0.6222, "grad_norm": 1.0070269107818604, "learning_rate": 0.0002, "epoch": 4.410053859964093, "step": 61410}, {"loss": 0.5894, "grad_norm": 0.8632227778434753, "learning_rate": 0.0002, "epoch": 4.410771992818671, "step": 61420}, {"loss": 0.6412, "grad_norm": 1.0183731317520142, "learning_rate": 0.0002, "epoch": 4.411490125673249, "step": 61430}, {"loss": 0.596, "grad_norm": 0.9049941897392273, "learning_rate": 0.0002, "epoch": 4.412208258527827, "step": 61440}, {"loss": 0.5991, "grad_norm": 1.0184082984924316, "learning_rate": 0.0002, "epoch": 4.412926391382406, "step": 61450}, {"loss": 0.5758, "grad_norm": 0.9994277358055115, "learning_rate": 0.0002, "epoch": 4.413644524236984, "step": 61460}, {"loss": 0.6009, "grad_norm": 1.0112420320510864, "learning_rate": 0.0002, "epoch": 4.414362657091562, "step": 61470}, {"loss": 0.584, "grad_norm": 0.9751759171485901, "learning_rate": 0.0002, "epoch": 4.41508078994614, "step": 61480}, {"loss": 0.6307, "grad_norm": 1.047135591506958, "learning_rate": 0.0002, "epoch": 4.415798922800718, "step": 61490}, {"loss": 0.6645, "grad_norm": 0.886282742023468, "learning_rate": 0.0002, "epoch": 4.416517055655296, "step": 61500}, {"loss": 0.6168, "grad_norm": 0.971964418888092, "learning_rate": 0.0002, "epoch": 4.417235188509874, "step": 61510}, {"loss": 0.5822, "grad_norm": 0.9603846073150635, "learning_rate": 0.0002, "epoch": 4.417953321364452, "step": 61520}, {"loss": 0.6349, "grad_norm": 1.060042142868042, "learning_rate": 0.0002, "epoch": 4.41867145421903, "step": 61530}, {"loss": 0.6223, "grad_norm": 1.1231369972229004, "learning_rate": 0.0002, "epoch": 4.419389587073608, "step": 61540}, {"loss": 0.6175, "grad_norm": 0.8269591331481934, "learning_rate": 0.0002, "epoch": 4.420107719928187, "step": 61550}, {"loss": 0.6285, "grad_norm": 1.0341241359710693, "learning_rate": 0.0002, "epoch": 4.420825852782765, "step": 61560}, {"loss": 0.6054, "grad_norm": 0.7276636958122253, "learning_rate": 0.0002, "epoch": 4.421543985637343, "step": 61570}, {"loss": 0.6321, "grad_norm": 1.0663669109344482, "learning_rate": 0.0002, "epoch": 4.422262118491921, "step": 61580}, {"loss": 0.5944, "grad_norm": 0.9764387011528015, "learning_rate": 0.0002, "epoch": 4.422980251346499, "step": 61590}, {"loss": 0.6065, "grad_norm": 1.0953258275985718, "learning_rate": 0.0002, "epoch": 4.423698384201077, "step": 61600}, {"loss": 0.5815, "grad_norm": 0.8877012729644775, "learning_rate": 0.0002, "epoch": 4.424416517055655, "step": 61610}, {"loss": 0.5798, "grad_norm": 0.8781440854072571, "learning_rate": 0.0002, "epoch": 4.425134649910233, "step": 61620}, {"loss": 0.6223, "grad_norm": 0.8333432674407959, "learning_rate": 0.0002, "epoch": 4.425852782764811, "step": 61630}, {"loss": 0.5949, "grad_norm": 0.9647989869117737, "learning_rate": 0.0002, "epoch": 4.42657091561939, "step": 61640}, {"loss": 0.6135, "grad_norm": 1.0801783800125122, "learning_rate": 0.0002, "epoch": 4.427289048473968, "step": 61650}, {"loss": 0.6065, "grad_norm": 0.8215882778167725, "learning_rate": 0.0002, "epoch": 4.428007181328546, "step": 61660}, {"loss": 0.5851, "grad_norm": 0.9853931665420532, "learning_rate": 0.0002, "epoch": 4.428725314183124, "step": 61670}, {"loss": 0.5942, "grad_norm": 0.8658010959625244, "learning_rate": 0.0002, "epoch": 4.429443447037702, "step": 61680}, {"loss": 0.6413, "grad_norm": 1.124064326286316, "learning_rate": 0.0002, "epoch": 4.43016157989228, "step": 61690}, {"loss": 0.6021, "grad_norm": 1.009340763092041, "learning_rate": 0.0002, "epoch": 4.430879712746858, "step": 61700}, {"loss": 0.6127, "grad_norm": 0.8705293536186218, "learning_rate": 0.0002, "epoch": 4.431597845601436, "step": 61710}, {"loss": 0.5971, "grad_norm": 1.1323511600494385, "learning_rate": 0.0002, "epoch": 4.432315978456014, "step": 61720}, {"loss": 0.5985, "grad_norm": 1.1203019618988037, "learning_rate": 0.0002, "epoch": 4.433034111310592, "step": 61730}, {"loss": 0.6178, "grad_norm": 1.1683770418167114, "learning_rate": 0.0002, "epoch": 4.433752244165171, "step": 61740}, {"loss": 0.6132, "grad_norm": 1.0735899209976196, "learning_rate": 0.0002, "epoch": 4.434470377019749, "step": 61750}, {"loss": 0.5664, "grad_norm": 1.142496109008789, "learning_rate": 0.0002, "epoch": 4.435188509874327, "step": 61760}, {"loss": 0.6276, "grad_norm": 1.1157732009887695, "learning_rate": 0.0002, "epoch": 4.435906642728905, "step": 61770}, {"loss": 0.6237, "grad_norm": 0.8845949172973633, "learning_rate": 0.0002, "epoch": 4.436624775583483, "step": 61780}, {"loss": 0.5964, "grad_norm": 1.1212759017944336, "learning_rate": 0.0002, "epoch": 4.437342908438061, "step": 61790}, {"loss": 0.6185, "grad_norm": 0.8832488656044006, "learning_rate": 0.0002, "epoch": 4.438061041292639, "step": 61800}, {"loss": 0.6264, "grad_norm": 0.9059590101242065, "learning_rate": 0.0002, "epoch": 4.438779174147217, "step": 61810}, {"loss": 0.6303, "grad_norm": 1.0625685453414917, "learning_rate": 0.0002, "epoch": 4.439497307001796, "step": 61820}, {"loss": 0.5795, "grad_norm": 0.9565598368644714, "learning_rate": 0.0002, "epoch": 4.440215439856374, "step": 61830}, {"loss": 0.6027, "grad_norm": 0.8975377082824707, "learning_rate": 0.0002, "epoch": 4.440933572710952, "step": 61840}, {"loss": 0.6334, "grad_norm": 1.0412718057632446, "learning_rate": 0.0002, "epoch": 4.44165170556553, "step": 61850}, {"loss": 0.6455, "grad_norm": 0.9923529624938965, "learning_rate": 0.0002, "epoch": 4.442369838420108, "step": 61860}, {"loss": 0.5931, "grad_norm": 1.3025734424591064, "learning_rate": 0.0002, "epoch": 4.443087971274686, "step": 61870}, {"loss": 0.5804, "grad_norm": 1.0031960010528564, "learning_rate": 0.0002, "epoch": 4.443806104129264, "step": 61880}, {"loss": 0.602, "grad_norm": 1.0974701642990112, "learning_rate": 0.0002, "epoch": 4.444524236983842, "step": 61890}, {"loss": 0.6078, "grad_norm": 1.1044024229049683, "learning_rate": 0.0002, "epoch": 4.44524236983842, "step": 61900}, {"loss": 0.6454, "grad_norm": 1.0782772302627563, "learning_rate": 0.0002, "epoch": 4.445960502692998, "step": 61910}, {"loss": 0.6453, "grad_norm": 1.006304383277893, "learning_rate": 0.0002, "epoch": 4.446678635547577, "step": 61920}, {"loss": 0.5449, "grad_norm": 0.9258833527565002, "learning_rate": 0.0002, "epoch": 4.447396768402155, "step": 61930}, {"loss": 0.5744, "grad_norm": 0.9888426065444946, "learning_rate": 0.0002, "epoch": 4.448114901256733, "step": 61940}, {"loss": 0.5853, "grad_norm": 0.9592963457107544, "learning_rate": 0.0002, "epoch": 4.448833034111311, "step": 61950}, {"loss": 0.6142, "grad_norm": 1.0527986288070679, "learning_rate": 0.0002, "epoch": 4.449551166965889, "step": 61960}, {"loss": 0.5829, "grad_norm": 0.8613291382789612, "learning_rate": 0.0002, "epoch": 4.450269299820467, "step": 61970}, {"loss": 0.6176, "grad_norm": 1.1083767414093018, "learning_rate": 0.0002, "epoch": 4.450987432675045, "step": 61980}, {"loss": 0.5768, "grad_norm": 0.772679328918457, "learning_rate": 0.0002, "epoch": 4.451705565529623, "step": 61990}, {"loss": 0.6348, "grad_norm": 0.9052274227142334, "learning_rate": 0.0002, "epoch": 4.452423698384201, "step": 62000}, {"loss": 0.6202, "grad_norm": 1.129667043685913, "learning_rate": 0.0002, "epoch": 4.45314183123878, "step": 62010}, {"loss": 0.6265, "grad_norm": 0.9994529485702515, "learning_rate": 0.0002, "epoch": 4.453859964093358, "step": 62020}, {"loss": 0.6249, "grad_norm": 0.982155978679657, "learning_rate": 0.0002, "epoch": 4.454578096947936, "step": 62030}, {"loss": 0.6255, "grad_norm": 0.9139904975891113, "learning_rate": 0.0002, "epoch": 4.455296229802514, "step": 62040}, {"loss": 0.6237, "grad_norm": 1.0877810716629028, "learning_rate": 0.0002, "epoch": 4.456014362657092, "step": 62050}, {"loss": 0.6105, "grad_norm": 1.0535308122634888, "learning_rate": 0.0002, "epoch": 4.45673249551167, "step": 62060}, {"loss": 0.6084, "grad_norm": 1.0225313901901245, "learning_rate": 0.0002, "epoch": 4.457450628366248, "step": 62070}, {"loss": 0.6239, "grad_norm": 0.8443132042884827, "learning_rate": 0.0002, "epoch": 4.458168761220826, "step": 62080}, {"loss": 0.5895, "grad_norm": 1.0426654815673828, "learning_rate": 0.0002, "epoch": 4.458886894075404, "step": 62090}, {"loss": 0.6022, "grad_norm": 1.1110700368881226, "learning_rate": 0.0002, "epoch": 4.459605026929982, "step": 62100}, {"loss": 0.6436, "grad_norm": 1.0200893878936768, "learning_rate": 0.0002, "epoch": 4.4603231597845605, "step": 62110}, {"loss": 0.628, "grad_norm": 0.9102830290794373, "learning_rate": 0.0002, "epoch": 4.4610412926391385, "step": 62120}, {"loss": 0.5894, "grad_norm": 1.1395094394683838, "learning_rate": 0.0002, "epoch": 4.4617594254937165, "step": 62130}, {"loss": 0.5765, "grad_norm": 1.1202316284179688, "learning_rate": 0.0002, "epoch": 4.4624775583482945, "step": 62140}, {"loss": 0.6238, "grad_norm": 1.142580509185791, "learning_rate": 0.0002, "epoch": 4.4631956912028725, "step": 62150}, {"loss": 0.6502, "grad_norm": 0.9843677878379822, "learning_rate": 0.0002, "epoch": 4.4639138240574505, "step": 62160}, {"loss": 0.6734, "grad_norm": 1.0351676940917969, "learning_rate": 0.0002, "epoch": 4.4646319569120285, "step": 62170}, {"loss": 0.6371, "grad_norm": 0.9365093111991882, "learning_rate": 0.0002, "epoch": 4.4653500897666065, "step": 62180}, {"loss": 0.5827, "grad_norm": 1.041193962097168, "learning_rate": 0.0002, "epoch": 4.4660682226211845, "step": 62190}, {"loss": 0.555, "grad_norm": 0.9686329960823059, "learning_rate": 0.0002, "epoch": 4.466786355475763, "step": 62200}, {"loss": 0.6405, "grad_norm": 1.028622031211853, "learning_rate": 0.0002, "epoch": 4.467504488330341, "step": 62210}, {"loss": 0.5928, "grad_norm": 0.9717516899108887, "learning_rate": 0.0002, "epoch": 4.468222621184919, "step": 62220}, {"loss": 0.6028, "grad_norm": 1.0467450618743896, "learning_rate": 0.0002, "epoch": 4.468940754039497, "step": 62230}, {"loss": 0.593, "grad_norm": 0.943717896938324, "learning_rate": 0.0002, "epoch": 4.469658886894075, "step": 62240}, {"loss": 0.5861, "grad_norm": 0.909429132938385, "learning_rate": 0.0002, "epoch": 4.470377019748653, "step": 62250}, {"loss": 0.6211, "grad_norm": 1.0294792652130127, "learning_rate": 0.0002, "epoch": 4.471095152603231, "step": 62260}, {"loss": 0.6215, "grad_norm": 1.1044281721115112, "learning_rate": 0.0002, "epoch": 4.471813285457809, "step": 62270}, {"loss": 0.6147, "grad_norm": 1.1555784940719604, "learning_rate": 0.0002, "epoch": 4.472531418312387, "step": 62280}, {"loss": 0.627, "grad_norm": 0.9441297650337219, "learning_rate": 0.0002, "epoch": 4.473249551166965, "step": 62290}, {"loss": 0.6205, "grad_norm": 0.9164380431175232, "learning_rate": 0.0002, "epoch": 4.473967684021544, "step": 62300}, {"loss": 0.6413, "grad_norm": 1.1139159202575684, "learning_rate": 0.0002, "epoch": 4.474685816876122, "step": 62310}, {"loss": 0.6013, "grad_norm": 1.0201882123947144, "learning_rate": 0.0002, "epoch": 4.4754039497307, "step": 62320}, {"loss": 0.6127, "grad_norm": 1.1471681594848633, "learning_rate": 0.0002, "epoch": 4.476122082585278, "step": 62330}, {"loss": 0.6322, "grad_norm": 1.0333549976348877, "learning_rate": 0.0002, "epoch": 4.476840215439856, "step": 62340}, {"loss": 0.654, "grad_norm": 0.8929767608642578, "learning_rate": 0.0002, "epoch": 4.477558348294434, "step": 62350}, {"loss": 0.6325, "grad_norm": 0.9465752840042114, "learning_rate": 0.0002, "epoch": 4.478276481149012, "step": 62360}, {"loss": 0.619, "grad_norm": 1.2155033349990845, "learning_rate": 0.0002, "epoch": 4.47899461400359, "step": 62370}, {"loss": 0.5538, "grad_norm": 0.7181217074394226, "learning_rate": 0.0002, "epoch": 4.479712746858169, "step": 62380}, {"loss": 0.6236, "grad_norm": 1.0052744150161743, "learning_rate": 0.0002, "epoch": 4.480430879712747, "step": 62390}, {"loss": 0.6443, "grad_norm": 0.8522219061851501, "learning_rate": 0.0002, "epoch": 4.481149012567325, "step": 62400}, {"loss": 0.6073, "grad_norm": 0.8844723105430603, "learning_rate": 0.0002, "epoch": 4.481867145421903, "step": 62410}, {"loss": 0.6193, "grad_norm": 0.9542465209960938, "learning_rate": 0.0002, "epoch": 4.482585278276481, "step": 62420}, {"loss": 0.6099, "grad_norm": 0.8963674306869507, "learning_rate": 0.0002, "epoch": 4.483303411131059, "step": 62430}, {"loss": 0.5826, "grad_norm": 0.8105363845825195, "learning_rate": 0.0002, "epoch": 4.484021543985637, "step": 62440}, {"loss": 0.6688, "grad_norm": 0.9618421196937561, "learning_rate": 0.0002, "epoch": 4.484739676840215, "step": 62450}, {"loss": 0.6042, "grad_norm": 1.1931076049804688, "learning_rate": 0.0002, "epoch": 4.485457809694793, "step": 62460}, {"loss": 0.5869, "grad_norm": 0.7406999468803406, "learning_rate": 0.0002, "epoch": 4.486175942549371, "step": 62470}, {"loss": 0.604, "grad_norm": 0.7698216438293457, "learning_rate": 0.0002, "epoch": 4.48689407540395, "step": 62480}, {"loss": 0.6062, "grad_norm": 0.862271249294281, "learning_rate": 0.0002, "epoch": 4.487612208258528, "step": 62490}, {"loss": 0.645, "grad_norm": 1.0025171041488647, "learning_rate": 0.0002, "epoch": 4.488330341113106, "step": 62500}, {"loss": 0.5727, "grad_norm": 0.8474493622779846, "learning_rate": 0.0002, "epoch": 4.489048473967684, "step": 62510}, {"loss": 0.6907, "grad_norm": 0.8965697884559631, "learning_rate": 0.0002, "epoch": 4.489766606822262, "step": 62520}, {"loss": 0.5846, "grad_norm": 1.1276488304138184, "learning_rate": 0.0002, "epoch": 4.49048473967684, "step": 62530}, {"loss": 0.6018, "grad_norm": 1.0253537893295288, "learning_rate": 0.0002, "epoch": 4.491202872531418, "step": 62540}, {"loss": 0.5831, "grad_norm": 1.1750596761703491, "learning_rate": 0.0002, "epoch": 4.491921005385996, "step": 62550}, {"loss": 0.6272, "grad_norm": 0.9951794147491455, "learning_rate": 0.0002, "epoch": 4.492639138240574, "step": 62560}, {"loss": 0.5931, "grad_norm": 1.2510017156600952, "learning_rate": 0.0002, "epoch": 4.493357271095153, "step": 62570}, {"loss": 0.6268, "grad_norm": 1.4066375494003296, "learning_rate": 0.0002, "epoch": 4.494075403949731, "step": 62580}, {"loss": 0.6274, "grad_norm": 0.988175094127655, "learning_rate": 0.0002, "epoch": 4.494793536804309, "step": 62590}, {"loss": 0.607, "grad_norm": 1.2049115896224976, "learning_rate": 0.0002, "epoch": 4.495511669658887, "step": 62600}, {"loss": 0.6384, "grad_norm": 0.962464451789856, "learning_rate": 0.0002, "epoch": 4.496229802513465, "step": 62610}, {"loss": 0.6436, "grad_norm": 0.9324793815612793, "learning_rate": 0.0002, "epoch": 4.496947935368043, "step": 62620}, {"loss": 0.6568, "grad_norm": 0.9174214005470276, "learning_rate": 0.0002, "epoch": 4.497666068222621, "step": 62630}, {"loss": 0.6146, "grad_norm": 0.9729902148246765, "learning_rate": 0.0002, "epoch": 4.498384201077199, "step": 62640}, {"loss": 0.6564, "grad_norm": 1.0190484523773193, "learning_rate": 0.0002, "epoch": 4.499102333931777, "step": 62650}, {"loss": 0.6571, "grad_norm": 1.1473679542541504, "learning_rate": 0.0002, "epoch": 4.499820466786355, "step": 62660}, {"loss": 0.6115, "grad_norm": 1.0160558223724365, "learning_rate": 0.0002, "epoch": 4.500538599640934, "step": 62670}, {"loss": 0.6206, "grad_norm": 0.8083887100219727, "learning_rate": 0.0002, "epoch": 4.501256732495512, "step": 62680}, {"loss": 0.6107, "grad_norm": 0.941933274269104, "learning_rate": 0.0002, "epoch": 4.50197486535009, "step": 62690}, {"loss": 0.6181, "grad_norm": 0.9962822794914246, "learning_rate": 0.0002, "epoch": 4.502692998204668, "step": 62700}, {"loss": 0.6364, "grad_norm": 0.8993943333625793, "learning_rate": 0.0002, "epoch": 4.503411131059246, "step": 62710}, {"loss": 0.6141, "grad_norm": 0.9438319206237793, "learning_rate": 0.0002, "epoch": 4.504129263913824, "step": 62720}, {"loss": 0.6453, "grad_norm": 0.7951892018318176, "learning_rate": 0.0002, "epoch": 4.504847396768402, "step": 62730}, {"loss": 0.616, "grad_norm": 0.8875413537025452, "learning_rate": 0.0002, "epoch": 4.50556552962298, "step": 62740}, {"loss": 0.5702, "grad_norm": 0.993819534778595, "learning_rate": 0.0002, "epoch": 4.506283662477558, "step": 62750}, {"loss": 0.6427, "grad_norm": 0.9177559018135071, "learning_rate": 0.0002, "epoch": 4.507001795332137, "step": 62760}, {"loss": 0.6278, "grad_norm": 0.8632771968841553, "learning_rate": 0.0002, "epoch": 4.507719928186715, "step": 62770}, {"loss": 0.6665, "grad_norm": 0.943778395652771, "learning_rate": 0.0002, "epoch": 4.508438061041293, "step": 62780}, {"loss": 0.6068, "grad_norm": 0.8754997849464417, "learning_rate": 0.0002, "epoch": 4.509156193895871, "step": 62790}, {"loss": 0.6345, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 4.509874326750449, "step": 62800}, {"loss": 0.6057, "grad_norm": 1.1156457662582397, "learning_rate": 0.0002, "epoch": 4.510592459605027, "step": 62810}, {"loss": 0.5915, "grad_norm": 0.9178887009620667, "learning_rate": 0.0002, "epoch": 4.511310592459605, "step": 62820}, {"loss": 0.6081, "grad_norm": 0.9520689249038696, "learning_rate": 0.0002, "epoch": 4.512028725314183, "step": 62830}, {"loss": 0.6434, "grad_norm": 0.8880525231361389, "learning_rate": 0.0002, "epoch": 4.512746858168761, "step": 62840}, {"loss": 0.6895, "grad_norm": 0.9541497826576233, "learning_rate": 0.0002, "epoch": 4.513464991023339, "step": 62850}, {"loss": 0.6675, "grad_norm": 1.003766417503357, "learning_rate": 0.0002, "epoch": 4.514183123877918, "step": 62860}, {"loss": 0.6412, "grad_norm": 0.8844705820083618, "learning_rate": 0.0002, "epoch": 4.514901256732496, "step": 62870}, {"loss": 0.6289, "grad_norm": 1.1870828866958618, "learning_rate": 0.0002, "epoch": 4.515619389587074, "step": 62880}, {"loss": 0.6611, "grad_norm": 0.863487184047699, "learning_rate": 0.0002, "epoch": 4.516337522441652, "step": 62890}, {"loss": 0.59, "grad_norm": 0.997770369052887, "learning_rate": 0.0002, "epoch": 4.51705565529623, "step": 62900}, {"loss": 0.6476, "grad_norm": 0.9708612561225891, "learning_rate": 0.0002, "epoch": 4.517773788150808, "step": 62910}, {"loss": 0.6084, "grad_norm": 1.1381206512451172, "learning_rate": 0.0002, "epoch": 4.518491921005386, "step": 62920}, {"loss": 0.5739, "grad_norm": 1.0386693477630615, "learning_rate": 0.0002, "epoch": 4.519210053859964, "step": 62930}, {"loss": 0.6038, "grad_norm": 1.1711705923080444, "learning_rate": 0.0002, "epoch": 4.519928186714543, "step": 62940}, {"loss": 0.6276, "grad_norm": 0.8727447390556335, "learning_rate": 0.0002, "epoch": 4.520646319569121, "step": 62950}, {"loss": 0.6298, "grad_norm": 0.9215193390846252, "learning_rate": 0.0002, "epoch": 4.521364452423699, "step": 62960}, {"loss": 0.6199, "grad_norm": 1.005467176437378, "learning_rate": 0.0002, "epoch": 4.522082585278277, "step": 62970}, {"loss": 0.6324, "grad_norm": 0.8761187791824341, "learning_rate": 0.0002, "epoch": 4.522800718132855, "step": 62980}, {"loss": 0.6152, "grad_norm": 0.957848310470581, "learning_rate": 0.0002, "epoch": 4.523518850987433, "step": 62990}, {"loss": 0.5752, "grad_norm": 0.8634148836135864, "learning_rate": 0.0002, "epoch": 4.524236983842011, "step": 63000}, {"loss": 0.6127, "grad_norm": 0.9557477235794067, "learning_rate": 0.0002, "epoch": 4.524955116696589, "step": 63010}, {"loss": 0.5708, "grad_norm": 1.017720341682434, "learning_rate": 0.0002, "epoch": 4.525673249551167, "step": 63020}, {"loss": 0.6186, "grad_norm": 1.0281825065612793, "learning_rate": 0.0002, "epoch": 4.526391382405745, "step": 63030}, {"loss": 0.6221, "grad_norm": 1.253974437713623, "learning_rate": 0.0002, "epoch": 4.527109515260323, "step": 63040}, {"loss": 0.6381, "grad_norm": 0.8489068150520325, "learning_rate": 0.0002, "epoch": 4.527827648114902, "step": 63050}, {"loss": 0.6022, "grad_norm": 0.9681686162948608, "learning_rate": 0.0002, "epoch": 4.52854578096948, "step": 63060}, {"loss": 0.6166, "grad_norm": 1.10277259349823, "learning_rate": 0.0002, "epoch": 4.529263913824058, "step": 63070}, {"loss": 0.5838, "grad_norm": 0.9469163417816162, "learning_rate": 0.0002, "epoch": 4.529982046678636, "step": 63080}, {"loss": 0.6323, "grad_norm": 1.1228134632110596, "learning_rate": 0.0002, "epoch": 4.530700179533214, "step": 63090}, {"loss": 0.6143, "grad_norm": 0.9673212170600891, "learning_rate": 0.0002, "epoch": 4.531418312387792, "step": 63100}, {"loss": 0.713, "grad_norm": 1.0221107006072998, "learning_rate": 0.0002, "epoch": 4.53213644524237, "step": 63110}, {"loss": 0.6099, "grad_norm": 0.826372504234314, "learning_rate": 0.0002, "epoch": 4.532854578096948, "step": 63120}, {"loss": 0.6487, "grad_norm": 1.1805331707000732, "learning_rate": 0.0002, "epoch": 4.5335727109515265, "step": 63130}, {"loss": 0.6088, "grad_norm": 0.9645666480064392, "learning_rate": 0.0002, "epoch": 4.5342908438061045, "step": 63140}, {"loss": 0.6049, "grad_norm": 1.0838309526443481, "learning_rate": 0.0002, "epoch": 4.5350089766606825, "step": 63150}, {"loss": 0.5972, "grad_norm": 1.061414361000061, "learning_rate": 0.0002, "epoch": 4.5357271095152605, "step": 63160}, {"loss": 0.5706, "grad_norm": 0.841961145401001, "learning_rate": 0.0002, "epoch": 4.5364452423698385, "step": 63170}, {"loss": 0.6168, "grad_norm": 1.1220186948776245, "learning_rate": 0.0002, "epoch": 4.5371633752244165, "step": 63180}, {"loss": 0.6055, "grad_norm": 1.036441445350647, "learning_rate": 0.0002, "epoch": 4.5378815080789945, "step": 63190}, {"loss": 0.619, "grad_norm": 0.9089716076850891, "learning_rate": 0.0002, "epoch": 4.5385996409335725, "step": 63200}, {"loss": 0.6373, "grad_norm": 0.8699982762336731, "learning_rate": 0.0002, "epoch": 4.5393177737881505, "step": 63210}, {"loss": 0.6082, "grad_norm": 0.8489565253257751, "learning_rate": 0.0002, "epoch": 4.5400359066427285, "step": 63220}, {"loss": 0.5957, "grad_norm": 0.7778416275978088, "learning_rate": 0.0002, "epoch": 4.540754039497307, "step": 63230}, {"loss": 0.6109, "grad_norm": 1.0625852346420288, "learning_rate": 0.0002, "epoch": 4.541472172351885, "step": 63240}, {"loss": 0.6039, "grad_norm": 0.8515732884407043, "learning_rate": 0.0002, "epoch": 4.542190305206463, "step": 63250}, {"loss": 0.5827, "grad_norm": 0.7679561376571655, "learning_rate": 0.0002, "epoch": 4.542908438061041, "step": 63260}, {"loss": 0.5948, "grad_norm": 0.7358446717262268, "learning_rate": 0.0002, "epoch": 4.543626570915619, "step": 63270}, {"loss": 0.6265, "grad_norm": 1.0866128206253052, "learning_rate": 0.0002, "epoch": 4.544344703770197, "step": 63280}, {"loss": 0.6622, "grad_norm": 1.0870225429534912, "learning_rate": 0.0002, "epoch": 4.545062836624775, "step": 63290}, {"loss": 0.5859, "grad_norm": 0.951095461845398, "learning_rate": 0.0002, "epoch": 4.545780969479353, "step": 63300}, {"loss": 0.6252, "grad_norm": 1.0914306640625, "learning_rate": 0.0002, "epoch": 4.546499102333931, "step": 63310}, {"loss": 0.6504, "grad_norm": 0.8676106333732605, "learning_rate": 0.0002, "epoch": 4.54721723518851, "step": 63320}, {"loss": 0.6088, "grad_norm": 1.0129096508026123, "learning_rate": 0.0002, "epoch": 4.547935368043088, "step": 63330}, {"loss": 0.617, "grad_norm": 0.8710526823997498, "learning_rate": 0.0002, "epoch": 4.548653500897666, "step": 63340}, {"loss": 0.6336, "grad_norm": 0.7014815807342529, "learning_rate": 0.0002, "epoch": 4.549371633752244, "step": 63350}, {"loss": 0.5758, "grad_norm": 1.1546777486801147, "learning_rate": 0.0002, "epoch": 4.550089766606822, "step": 63360}, {"loss": 0.5976, "grad_norm": 0.7464957237243652, "learning_rate": 0.0002, "epoch": 4.5508078994614, "step": 63370}, {"loss": 0.6016, "grad_norm": 0.9976209998130798, "learning_rate": 0.0002, "epoch": 4.551526032315978, "step": 63380}, {"loss": 0.5784, "grad_norm": 0.9543681740760803, "learning_rate": 0.0002, "epoch": 4.552244165170556, "step": 63390}, {"loss": 0.5873, "grad_norm": 1.1498578786849976, "learning_rate": 0.0002, "epoch": 4.552962298025134, "step": 63400}, {"loss": 0.6445, "grad_norm": 1.0162293910980225, "learning_rate": 0.0002, "epoch": 4.553680430879712, "step": 63410}, {"loss": 0.5677, "grad_norm": 0.9015304446220398, "learning_rate": 0.0002, "epoch": 4.554398563734291, "step": 63420}, {"loss": 0.6257, "grad_norm": 1.1639831066131592, "learning_rate": 0.0002, "epoch": 4.555116696588869, "step": 63430}, {"loss": 0.6763, "grad_norm": 0.9494703412055969, "learning_rate": 0.0002, "epoch": 4.555834829443447, "step": 63440}, {"loss": 0.5955, "grad_norm": 1.0555956363677979, "learning_rate": 0.0002, "epoch": 4.556552962298025, "step": 63450}, {"loss": 0.6634, "grad_norm": 0.8513827919960022, "learning_rate": 0.0002, "epoch": 4.557271095152603, "step": 63460}, {"loss": 0.6507, "grad_norm": 1.0614275932312012, "learning_rate": 0.0002, "epoch": 4.557989228007181, "step": 63470}, {"loss": 0.5619, "grad_norm": 0.8341137766838074, "learning_rate": 0.0002, "epoch": 4.558707360861759, "step": 63480}, {"loss": 0.6147, "grad_norm": 1.2136222124099731, "learning_rate": 0.0002, "epoch": 4.559425493716337, "step": 63490}, {"loss": 0.6313, "grad_norm": 0.8806019425392151, "learning_rate": 0.0002, "epoch": 4.560143626570916, "step": 63500}, {"loss": 0.6012, "grad_norm": 1.2548854351043701, "learning_rate": 0.0002, "epoch": 4.560861759425494, "step": 63510}, {"loss": 0.5995, "grad_norm": 1.0162668228149414, "learning_rate": 0.0002, "epoch": 4.561579892280072, "step": 63520}, {"loss": 0.5895, "grad_norm": 1.0487624406814575, "learning_rate": 0.0002, "epoch": 4.56229802513465, "step": 63530}, {"loss": 0.5997, "grad_norm": 1.2505502700805664, "learning_rate": 0.0002, "epoch": 4.563016157989228, "step": 63540}, {"loss": 0.618, "grad_norm": 0.9930511713027954, "learning_rate": 0.0002, "epoch": 4.563734290843806, "step": 63550}, {"loss": 0.6695, "grad_norm": 0.8132568001747131, "learning_rate": 0.0002, "epoch": 4.564452423698384, "step": 63560}, {"loss": 0.6221, "grad_norm": 1.0129177570343018, "learning_rate": 0.0002, "epoch": 4.565170556552962, "step": 63570}, {"loss": 0.6463, "grad_norm": 0.9011693596839905, "learning_rate": 0.0002, "epoch": 4.56588868940754, "step": 63580}, {"loss": 0.6046, "grad_norm": 0.9161545634269714, "learning_rate": 0.0002, "epoch": 4.566606822262118, "step": 63590}, {"loss": 0.6413, "grad_norm": 0.8852348327636719, "learning_rate": 0.0002, "epoch": 4.567324955116696, "step": 63600}, {"loss": 0.6282, "grad_norm": 0.8579391837120056, "learning_rate": 0.0002, "epoch": 4.568043087971275, "step": 63610}, {"loss": 0.6041, "grad_norm": 0.9271050095558167, "learning_rate": 0.0002, "epoch": 4.568761220825853, "step": 63620}, {"loss": 0.6156, "grad_norm": 0.9881834983825684, "learning_rate": 0.0002, "epoch": 4.569479353680431, "step": 63630}, {"loss": 0.6164, "grad_norm": 1.0255686044692993, "learning_rate": 0.0002, "epoch": 4.570197486535009, "step": 63640}, {"loss": 0.6416, "grad_norm": 0.8758876919746399, "learning_rate": 0.0002, "epoch": 4.570915619389587, "step": 63650}, {"loss": 0.6787, "grad_norm": 1.0134185552597046, "learning_rate": 0.0002, "epoch": 4.571633752244165, "step": 63660}, {"loss": 0.6245, "grad_norm": 0.8535705208778381, "learning_rate": 0.0002, "epoch": 4.572351885098743, "step": 63670}, {"loss": 0.6282, "grad_norm": 0.9614834785461426, "learning_rate": 0.0002, "epoch": 4.573070017953321, "step": 63680}, {"loss": 0.6461, "grad_norm": 0.9004243612289429, "learning_rate": 0.0002, "epoch": 4.5737881508079, "step": 63690}, {"loss": 0.6172, "grad_norm": 0.9563080072402954, "learning_rate": 0.0002, "epoch": 4.574506283662478, "step": 63700}, {"loss": 0.6059, "grad_norm": 1.024857521057129, "learning_rate": 0.0002, "epoch": 4.575224416517056, "step": 63710}, {"loss": 0.6188, "grad_norm": 0.9345638155937195, "learning_rate": 0.0002, "epoch": 4.575942549371634, "step": 63720}, {"loss": 0.6814, "grad_norm": 1.27083158493042, "learning_rate": 0.0002, "epoch": 4.576660682226212, "step": 63730}, {"loss": 0.5987, "grad_norm": 1.0866559743881226, "learning_rate": 0.0002, "epoch": 4.57737881508079, "step": 63740}, {"loss": 0.5738, "grad_norm": 0.9253925681114197, "learning_rate": 0.0002, "epoch": 4.578096947935368, "step": 63750}, {"loss": 0.5981, "grad_norm": 0.8127399682998657, "learning_rate": 0.0002, "epoch": 4.578815080789946, "step": 63760}, {"loss": 0.6321, "grad_norm": 1.0453993082046509, "learning_rate": 0.0002, "epoch": 4.579533213644524, "step": 63770}, {"loss": 0.6423, "grad_norm": 1.2227544784545898, "learning_rate": 0.0002, "epoch": 4.580251346499102, "step": 63780}, {"loss": 0.6405, "grad_norm": 1.0207865238189697, "learning_rate": 0.0002, "epoch": 4.580969479353681, "step": 63790}, {"loss": 0.6268, "grad_norm": 1.030447244644165, "learning_rate": 0.0002, "epoch": 4.581687612208259, "step": 63800}, {"loss": 0.6014, "grad_norm": 1.0855677127838135, "learning_rate": 0.0002, "epoch": 4.582405745062837, "step": 63810}, {"loss": 0.6204, "grad_norm": 0.9572556018829346, "learning_rate": 0.0002, "epoch": 4.583123877917415, "step": 63820}, {"loss": 0.6094, "grad_norm": 0.9061040282249451, "learning_rate": 0.0002, "epoch": 4.583842010771993, "step": 63830}, {"loss": 0.6074, "grad_norm": 0.9267677068710327, "learning_rate": 0.0002, "epoch": 4.584560143626571, "step": 63840}, {"loss": 0.6525, "grad_norm": 1.070076823234558, "learning_rate": 0.0002, "epoch": 4.585278276481149, "step": 63850}, {"loss": 0.6074, "grad_norm": 1.045881748199463, "learning_rate": 0.0002, "epoch": 4.585996409335727, "step": 63860}, {"loss": 0.6106, "grad_norm": 0.9190576672554016, "learning_rate": 0.0002, "epoch": 4.586714542190305, "step": 63870}, {"loss": 0.6213, "grad_norm": 0.9263932704925537, "learning_rate": 0.0002, "epoch": 4.587432675044884, "step": 63880}, {"loss": 0.6077, "grad_norm": 1.0217589139938354, "learning_rate": 0.0002, "epoch": 4.588150807899462, "step": 63890}, {"loss": 0.5798, "grad_norm": 0.9200088381767273, "learning_rate": 0.0002, "epoch": 4.58886894075404, "step": 63900}, {"loss": 0.6311, "grad_norm": 0.9877251386642456, "learning_rate": 0.0002, "epoch": 4.589587073608618, "step": 63910}, {"loss": 0.5981, "grad_norm": 1.0059093236923218, "learning_rate": 0.0002, "epoch": 4.590305206463196, "step": 63920}, {"loss": 0.6265, "grad_norm": 1.2618095874786377, "learning_rate": 0.0002, "epoch": 4.591023339317774, "step": 63930}, {"loss": 0.583, "grad_norm": 1.1779268980026245, "learning_rate": 0.0002, "epoch": 4.591741472172352, "step": 63940}, {"loss": 0.6232, "grad_norm": 1.2339502573013306, "learning_rate": 0.0002, "epoch": 4.59245960502693, "step": 63950}, {"loss": 0.5985, "grad_norm": 0.7488788366317749, "learning_rate": 0.0002, "epoch": 4.593177737881508, "step": 63960}, {"loss": 0.5991, "grad_norm": 0.8366380929946899, "learning_rate": 0.0002, "epoch": 4.593895870736086, "step": 63970}, {"loss": 0.5864, "grad_norm": 1.0292677879333496, "learning_rate": 0.0002, "epoch": 4.594614003590665, "step": 63980}, {"loss": 0.666, "grad_norm": 0.7938551306724548, "learning_rate": 0.0002, "epoch": 4.595332136445243, "step": 63990}, {"loss": 0.6202, "grad_norm": 0.7958516478538513, "learning_rate": 0.0002, "epoch": 4.596050269299821, "step": 64000}, {"loss": 0.5868, "grad_norm": 0.9613908529281616, "learning_rate": 0.0002, "epoch": 4.596768402154399, "step": 64010}, {"loss": 0.6299, "grad_norm": 1.0253773927688599, "learning_rate": 0.0002, "epoch": 4.597486535008977, "step": 64020}, {"loss": 0.5964, "grad_norm": 1.0560888051986694, "learning_rate": 0.0002, "epoch": 4.598204667863555, "step": 64030}, {"loss": 0.6681, "grad_norm": 1.1093556880950928, "learning_rate": 0.0002, "epoch": 4.598922800718133, "step": 64040}, {"loss": 0.6097, "grad_norm": 0.8492098450660706, "learning_rate": 0.0002, "epoch": 4.599640933572711, "step": 64050}, {"loss": 0.6029, "grad_norm": 1.0070436000823975, "learning_rate": 0.0002, "epoch": 4.6003590664272895, "step": 64060}, {"loss": 0.6392, "grad_norm": 0.9774282574653625, "learning_rate": 0.0002, "epoch": 4.6010771992818675, "step": 64070}, {"loss": 0.6397, "grad_norm": 1.0744960308074951, "learning_rate": 0.0002, "epoch": 4.6017953321364455, "step": 64080}, {"loss": 0.6491, "grad_norm": 1.0101491212844849, "learning_rate": 0.0002, "epoch": 4.6025134649910235, "step": 64090}, {"loss": 0.594, "grad_norm": 1.2306591272354126, "learning_rate": 0.0002, "epoch": 4.6032315978456015, "step": 64100}, {"loss": 0.5783, "grad_norm": 0.9187033176422119, "learning_rate": 0.0002, "epoch": 4.6039497307001795, "step": 64110}, {"loss": 0.5982, "grad_norm": 0.9178676605224609, "learning_rate": 0.0002, "epoch": 4.6046678635547575, "step": 64120}, {"loss": 0.6074, "grad_norm": 1.006374716758728, "learning_rate": 0.0002, "epoch": 4.6053859964093355, "step": 64130}, {"loss": 0.6402, "grad_norm": 1.0774449110031128, "learning_rate": 0.0002, "epoch": 4.6061041292639135, "step": 64140}, {"loss": 0.6076, "grad_norm": 1.0360658168792725, "learning_rate": 0.0002, "epoch": 4.6068222621184916, "step": 64150}, {"loss": 0.6259, "grad_norm": 1.1061090230941772, "learning_rate": 0.0002, "epoch": 4.6075403949730696, "step": 64160}, {"loss": 0.6304, "grad_norm": 1.0320971012115479, "learning_rate": 0.0002, "epoch": 4.608258527827648, "step": 64170}, {"loss": 0.6182, "grad_norm": 0.8596988916397095, "learning_rate": 0.0002, "epoch": 4.6089766606822264, "step": 64180}, {"loss": 0.5646, "grad_norm": 1.1665741205215454, "learning_rate": 0.0002, "epoch": 4.6096947935368044, "step": 64190}, {"loss": 0.6219, "grad_norm": 0.857207715511322, "learning_rate": 0.0002, "epoch": 4.6104129263913824, "step": 64200}, {"loss": 0.6271, "grad_norm": 1.0088987350463867, "learning_rate": 0.0002, "epoch": 4.6111310592459605, "step": 64210}, {"loss": 0.6209, "grad_norm": 1.0985605716705322, "learning_rate": 0.0002, "epoch": 4.6118491921005385, "step": 64220}, {"loss": 0.6455, "grad_norm": 0.9504913687705994, "learning_rate": 0.0002, "epoch": 4.6125673249551165, "step": 64230}, {"loss": 0.6054, "grad_norm": 0.8415018916130066, "learning_rate": 0.0002, "epoch": 4.6132854578096945, "step": 64240}, {"loss": 0.5975, "grad_norm": 0.9857034087181091, "learning_rate": 0.0002, "epoch": 4.614003590664273, "step": 64250}, {"loss": 0.6347, "grad_norm": 1.0164235830307007, "learning_rate": 0.0002, "epoch": 4.614721723518851, "step": 64260}, {"loss": 0.5877, "grad_norm": 0.949481725692749, "learning_rate": 0.0002, "epoch": 4.615439856373429, "step": 64270}, {"loss": 0.5737, "grad_norm": 0.9526455998420715, "learning_rate": 0.0002, "epoch": 4.616157989228007, "step": 64280}, {"loss": 0.6134, "grad_norm": 1.1121242046356201, "learning_rate": 0.0002, "epoch": 4.616876122082585, "step": 64290}, {"loss": 0.6152, "grad_norm": 0.9598871469497681, "learning_rate": 0.0002, "epoch": 4.617594254937163, "step": 64300}, {"loss": 0.6405, "grad_norm": 1.0406304597854614, "learning_rate": 0.0002, "epoch": 4.618312387791741, "step": 64310}, {"loss": 0.5971, "grad_norm": 1.1816964149475098, "learning_rate": 0.0002, "epoch": 4.619030520646319, "step": 64320}, {"loss": 0.6483, "grad_norm": 0.9818326830863953, "learning_rate": 0.0002, "epoch": 4.619748653500897, "step": 64330}, {"loss": 0.6141, "grad_norm": 0.952017605304718, "learning_rate": 0.0002, "epoch": 4.620466786355475, "step": 64340}, {"loss": 0.6146, "grad_norm": 1.1263453960418701, "learning_rate": 0.0002, "epoch": 4.621184919210053, "step": 64350}, {"loss": 0.5973, "grad_norm": 1.1158473491668701, "learning_rate": 0.0002, "epoch": 4.621903052064632, "step": 64360}, {"loss": 0.6029, "grad_norm": 0.9056766033172607, "learning_rate": 0.0002, "epoch": 4.62262118491921, "step": 64370}, {"loss": 0.6488, "grad_norm": 0.8113203048706055, "learning_rate": 0.0002, "epoch": 4.623339317773788, "step": 64380}, {"loss": 0.6391, "grad_norm": 0.8646712899208069, "learning_rate": 0.0002, "epoch": 4.624057450628366, "step": 64390}, {"loss": 0.6191, "grad_norm": 1.0064425468444824, "learning_rate": 0.0002, "epoch": 4.624775583482944, "step": 64400}, {"loss": 0.5826, "grad_norm": 0.9867565631866455, "learning_rate": 0.0002, "epoch": 4.625493716337522, "step": 64410}, {"loss": 0.6409, "grad_norm": 1.018764615058899, "learning_rate": 0.0002, "epoch": 4.6262118491921, "step": 64420}, {"loss": 0.5992, "grad_norm": 1.0607863664627075, "learning_rate": 0.0002, "epoch": 4.626929982046678, "step": 64430}, {"loss": 0.6502, "grad_norm": 1.012825846672058, "learning_rate": 0.0002, "epoch": 4.627648114901257, "step": 64440}, {"loss": 0.6074, "grad_norm": 0.8441653847694397, "learning_rate": 0.0002, "epoch": 4.628366247755835, "step": 64450}, {"loss": 0.6462, "grad_norm": 0.9819194674491882, "learning_rate": 0.0002, "epoch": 4.629084380610413, "step": 64460}, {"loss": 0.5983, "grad_norm": 0.925519585609436, "learning_rate": 0.0002, "epoch": 4.629802513464991, "step": 64470}, {"loss": 0.5959, "grad_norm": 0.9409030079841614, "learning_rate": 0.0002, "epoch": 4.630520646319569, "step": 64480}, {"loss": 0.6265, "grad_norm": 1.148024559020996, "learning_rate": 0.0002, "epoch": 4.631238779174147, "step": 64490}, {"loss": 0.6556, "grad_norm": 0.8225533962249756, "learning_rate": 0.0002, "epoch": 4.631956912028725, "step": 64500}, {"loss": 0.5922, "grad_norm": 0.8806734681129456, "learning_rate": 0.0002, "epoch": 4.632675044883303, "step": 64510}, {"loss": 0.6202, "grad_norm": 0.9656694531440735, "learning_rate": 0.0002, "epoch": 4.633393177737881, "step": 64520}, {"loss": 0.6044, "grad_norm": 0.9977783560752869, "learning_rate": 0.0002, "epoch": 4.634111310592459, "step": 64530}, {"loss": 0.5741, "grad_norm": 0.9259420037269592, "learning_rate": 0.0002, "epoch": 4.634829443447038, "step": 64540}, {"loss": 0.5801, "grad_norm": 1.0215885639190674, "learning_rate": 0.0002, "epoch": 4.635547576301616, "step": 64550}, {"loss": 0.6492, "grad_norm": 1.1082557439804077, "learning_rate": 0.0002, "epoch": 4.636265709156194, "step": 64560}, {"loss": 0.6285, "grad_norm": 1.1183207035064697, "learning_rate": 0.0002, "epoch": 4.636983842010772, "step": 64570}, {"loss": 0.6216, "grad_norm": 0.9914339184761047, "learning_rate": 0.0002, "epoch": 4.63770197486535, "step": 64580}, {"loss": 0.6416, "grad_norm": 0.8065831661224365, "learning_rate": 0.0002, "epoch": 4.638420107719928, "step": 64590}, {"loss": 0.6078, "grad_norm": 1.1546721458435059, "learning_rate": 0.0002, "epoch": 4.639138240574506, "step": 64600}, {"loss": 0.6219, "grad_norm": 1.0395900011062622, "learning_rate": 0.0002, "epoch": 4.639856373429084, "step": 64610}, {"loss": 0.5939, "grad_norm": 0.9957455992698669, "learning_rate": 0.0002, "epoch": 4.640574506283663, "step": 64620}, {"loss": 0.6653, "grad_norm": 1.069557785987854, "learning_rate": 0.0002, "epoch": 4.641292639138241, "step": 64630}, {"loss": 0.6546, "grad_norm": 1.005236268043518, "learning_rate": 0.0002, "epoch": 4.642010771992819, "step": 64640}, {"loss": 0.6262, "grad_norm": 1.0216304063796997, "learning_rate": 0.0002, "epoch": 4.642728904847397, "step": 64650}, {"loss": 0.6756, "grad_norm": 0.8567317128181458, "learning_rate": 0.0002, "epoch": 4.643447037701975, "step": 64660}, {"loss": 0.5997, "grad_norm": 1.0386067628860474, "learning_rate": 0.0002, "epoch": 4.644165170556553, "step": 64670}, {"loss": 0.6471, "grad_norm": 0.9566055536270142, "learning_rate": 0.0002, "epoch": 4.644883303411131, "step": 64680}, {"loss": 0.6601, "grad_norm": 1.0990564823150635, "learning_rate": 0.0002, "epoch": 4.645601436265709, "step": 64690}, {"loss": 0.6418, "grad_norm": 0.9962695240974426, "learning_rate": 0.0002, "epoch": 4.646319569120287, "step": 64700}, {"loss": 0.6442, "grad_norm": 0.9041377305984497, "learning_rate": 0.0002, "epoch": 4.647037701974865, "step": 64710}, {"loss": 0.6276, "grad_norm": 0.8611233234405518, "learning_rate": 0.0002, "epoch": 4.647755834829443, "step": 64720}, {"loss": 0.6015, "grad_norm": 1.1569812297821045, "learning_rate": 0.0002, "epoch": 4.648473967684022, "step": 64730}, {"loss": 0.6169, "grad_norm": 0.7946197390556335, "learning_rate": 0.0002, "epoch": 4.6491921005386, "step": 64740}, {"loss": 0.668, "grad_norm": 0.9612061381340027, "learning_rate": 0.0002, "epoch": 4.649910233393178, "step": 64750}, {"loss": 0.6741, "grad_norm": 0.9669303297996521, "learning_rate": 0.0002, "epoch": 4.650628366247756, "step": 64760}, {"loss": 0.593, "grad_norm": 0.8117775321006775, "learning_rate": 0.0002, "epoch": 4.651346499102334, "step": 64770}, {"loss": 0.6915, "grad_norm": 1.2326241731643677, "learning_rate": 0.0002, "epoch": 4.652064631956912, "step": 64780}, {"loss": 0.6076, "grad_norm": 0.7494568228721619, "learning_rate": 0.0002, "epoch": 4.65278276481149, "step": 64790}, {"loss": 0.58, "grad_norm": 0.8145379424095154, "learning_rate": 0.0002, "epoch": 4.653500897666068, "step": 64800}, {"loss": 0.6351, "grad_norm": 1.0139610767364502, "learning_rate": 0.0002, "epoch": 4.654219030520647, "step": 64810}, {"loss": 0.6575, "grad_norm": 0.9887115359306335, "learning_rate": 0.0002, "epoch": 4.654937163375225, "step": 64820}, {"loss": 0.6338, "grad_norm": 0.9565147161483765, "learning_rate": 0.0002, "epoch": 4.655655296229803, "step": 64830}, {"loss": 0.6212, "grad_norm": 0.9022467136383057, "learning_rate": 0.0002, "epoch": 4.656373429084381, "step": 64840}, {"loss": 0.6395, "grad_norm": 1.075003981590271, "learning_rate": 0.0002, "epoch": 4.657091561938959, "step": 64850}, {"loss": 0.6191, "grad_norm": 0.8705733418464661, "learning_rate": 0.0002, "epoch": 4.657809694793537, "step": 64860}, {"loss": 0.5543, "grad_norm": 1.0826832056045532, "learning_rate": 0.0002, "epoch": 4.658527827648115, "step": 64870}, {"loss": 0.6363, "grad_norm": 1.1056268215179443, "learning_rate": 0.0002, "epoch": 4.659245960502693, "step": 64880}, {"loss": 0.6252, "grad_norm": 0.8664149641990662, "learning_rate": 0.0002, "epoch": 4.659964093357271, "step": 64890}, {"loss": 0.6126, "grad_norm": 0.9487230181694031, "learning_rate": 0.0002, "epoch": 4.660682226211849, "step": 64900}, {"loss": 0.5968, "grad_norm": 1.0357837677001953, "learning_rate": 0.0002, "epoch": 4.661400359066427, "step": 64910}, {"loss": 0.603, "grad_norm": 0.8620632290840149, "learning_rate": 0.0002, "epoch": 4.662118491921006, "step": 64920}, {"loss": 0.6113, "grad_norm": 1.108986735343933, "learning_rate": 0.0002, "epoch": 4.662836624775584, "step": 64930}, {"loss": 0.6115, "grad_norm": 0.8017674684524536, "learning_rate": 0.0002, "epoch": 4.663554757630162, "step": 64940}, {"loss": 0.6268, "grad_norm": 0.882347583770752, "learning_rate": 0.0002, "epoch": 4.66427289048474, "step": 64950}, {"loss": 0.657, "grad_norm": 0.9466867446899414, "learning_rate": 0.0002, "epoch": 4.664991023339318, "step": 64960}, {"loss": 0.645, "grad_norm": 1.1823636293411255, "learning_rate": 0.0002, "epoch": 4.665709156193896, "step": 64970}, {"loss": 0.5889, "grad_norm": 0.9535016417503357, "learning_rate": 0.0002, "epoch": 4.666427289048474, "step": 64980}, {"loss": 0.5986, "grad_norm": 0.9456726312637329, "learning_rate": 0.0002, "epoch": 4.667145421903052, "step": 64990}, {"loss": 0.6334, "grad_norm": 0.7761920690536499, "learning_rate": 0.0002, "epoch": 4.667863554757631, "step": 65000}, {"loss": 0.6645, "grad_norm": 1.060357689857483, "learning_rate": 0.0002, "epoch": 4.668581687612209, "step": 65010}, {"loss": 0.6369, "grad_norm": 0.9083862900733948, "learning_rate": 0.0002, "epoch": 4.669299820466787, "step": 65020}, {"loss": 0.5839, "grad_norm": 0.8745762705802917, "learning_rate": 0.0002, "epoch": 4.670017953321365, "step": 65030}, {"loss": 0.6517, "grad_norm": 0.8715422749519348, "learning_rate": 0.0002, "epoch": 4.670736086175943, "step": 65040}, {"loss": 0.6061, "grad_norm": 0.9407707452774048, "learning_rate": 0.0002, "epoch": 4.671454219030521, "step": 65050}, {"loss": 0.5928, "grad_norm": 0.8998945355415344, "learning_rate": 0.0002, "epoch": 4.672172351885099, "step": 65060}, {"loss": 0.6107, "grad_norm": 0.9147891998291016, "learning_rate": 0.0002, "epoch": 4.672890484739677, "step": 65070}, {"loss": 0.6215, "grad_norm": 1.116614580154419, "learning_rate": 0.0002, "epoch": 4.673608617594255, "step": 65080}, {"loss": 0.641, "grad_norm": 1.0764213800430298, "learning_rate": 0.0002, "epoch": 4.674326750448833, "step": 65090}, {"loss": 0.6353, "grad_norm": 0.9115945100784302, "learning_rate": 0.0002, "epoch": 4.6750448833034115, "step": 65100}, {"loss": 0.6506, "grad_norm": 1.001251459121704, "learning_rate": 0.0002, "epoch": 4.6757630161579895, "step": 65110}, {"loss": 0.6414, "grad_norm": 1.0330020189285278, "learning_rate": 0.0002, "epoch": 4.6764811490125675, "step": 65120}, {"loss": 0.6421, "grad_norm": 0.9083197116851807, "learning_rate": 0.0002, "epoch": 4.6771992818671455, "step": 65130}, {"loss": 0.5905, "grad_norm": 0.9298770427703857, "learning_rate": 0.0002, "epoch": 4.6779174147217235, "step": 65140}, {"loss": 0.633, "grad_norm": 1.0009549856185913, "learning_rate": 0.0002, "epoch": 4.6786355475763015, "step": 65150}, {"loss": 0.661, "grad_norm": 0.951389729976654, "learning_rate": 0.0002, "epoch": 4.6793536804308795, "step": 65160}, {"loss": 0.6282, "grad_norm": 1.151870608329773, "learning_rate": 0.0002, "epoch": 4.6800718132854575, "step": 65170}, {"loss": 0.5944, "grad_norm": 1.0074727535247803, "learning_rate": 0.0002, "epoch": 4.680789946140036, "step": 65180}, {"loss": 0.6539, "grad_norm": 1.0490152835845947, "learning_rate": 0.0002, "epoch": 4.681508078994614, "step": 65190}, {"loss": 0.6604, "grad_norm": 0.8967363834381104, "learning_rate": 0.0002, "epoch": 4.682226211849192, "step": 65200}, {"loss": 0.6582, "grad_norm": 1.2314889430999756, "learning_rate": 0.0002, "epoch": 4.68294434470377, "step": 65210}, {"loss": 0.6104, "grad_norm": 0.7764074802398682, "learning_rate": 0.0002, "epoch": 4.683662477558348, "step": 65220}, {"loss": 0.6401, "grad_norm": 1.0587822198867798, "learning_rate": 0.0002, "epoch": 4.684380610412926, "step": 65230}, {"loss": 0.556, "grad_norm": 0.916114091873169, "learning_rate": 0.0002, "epoch": 4.685098743267504, "step": 65240}, {"loss": 0.5912, "grad_norm": 0.9117472767829895, "learning_rate": 0.0002, "epoch": 4.685816876122082, "step": 65250}, {"loss": 0.6127, "grad_norm": 0.8369293212890625, "learning_rate": 0.0002, "epoch": 4.68653500897666, "step": 65260}, {"loss": 0.5715, "grad_norm": 0.9700121879577637, "learning_rate": 0.0002, "epoch": 4.687253141831238, "step": 65270}, {"loss": 0.6364, "grad_norm": 1.0008411407470703, "learning_rate": 0.0002, "epoch": 4.687971274685816, "step": 65280}, {"loss": 0.5816, "grad_norm": 0.9339549541473389, "learning_rate": 0.0002, "epoch": 4.688689407540395, "step": 65290}, {"loss": 0.6382, "grad_norm": 0.956701934337616, "learning_rate": 0.0002, "epoch": 4.689407540394973, "step": 65300}, {"loss": 0.6368, "grad_norm": 1.2042720317840576, "learning_rate": 0.0002, "epoch": 4.690125673249551, "step": 65310}, {"loss": 0.6138, "grad_norm": 0.8679144382476807, "learning_rate": 0.0002, "epoch": 4.690843806104129, "step": 65320}, {"loss": 0.6619, "grad_norm": 1.2320687770843506, "learning_rate": 0.0002, "epoch": 4.691561938958707, "step": 65330}, {"loss": 0.6212, "grad_norm": 0.8397238850593567, "learning_rate": 0.0002, "epoch": 4.692280071813285, "step": 65340}, {"loss": 0.578, "grad_norm": 0.7850362658500671, "learning_rate": 0.0002, "epoch": 4.692998204667863, "step": 65350}, {"loss": 0.632, "grad_norm": 0.9281290173530579, "learning_rate": 0.0002, "epoch": 4.693716337522441, "step": 65360}, {"loss": 0.6492, "grad_norm": 1.1506335735321045, "learning_rate": 0.0002, "epoch": 4.69443447037702, "step": 65370}, {"loss": 0.6503, "grad_norm": 1.0910584926605225, "learning_rate": 0.0002, "epoch": 4.695152603231598, "step": 65380}, {"loss": 0.66, "grad_norm": 0.8937386274337769, "learning_rate": 0.0002, "epoch": 4.695870736086176, "step": 65390}, {"loss": 0.6425, "grad_norm": 1.0163888931274414, "learning_rate": 0.0002, "epoch": 4.696588868940754, "step": 65400}, {"loss": 0.647, "grad_norm": 1.0290007591247559, "learning_rate": 0.0002, "epoch": 4.697307001795332, "step": 65410}, {"loss": 0.614, "grad_norm": 0.9046576023101807, "learning_rate": 0.0002, "epoch": 4.69802513464991, "step": 65420}, {"loss": 0.5844, "grad_norm": 1.0030237436294556, "learning_rate": 0.0002, "epoch": 4.698743267504488, "step": 65430}, {"loss": 0.6273, "grad_norm": 0.8196740746498108, "learning_rate": 0.0002, "epoch": 4.699461400359066, "step": 65440}, {"loss": 0.6273, "grad_norm": 0.9036651849746704, "learning_rate": 0.0002, "epoch": 4.700179533213644, "step": 65450}, {"loss": 0.6024, "grad_norm": 1.2080141305923462, "learning_rate": 0.0002, "epoch": 4.700897666068222, "step": 65460}, {"loss": 0.6461, "grad_norm": 0.8743635416030884, "learning_rate": 0.0002, "epoch": 4.7016157989228, "step": 65470}, {"loss": 0.6129, "grad_norm": 0.9566192030906677, "learning_rate": 0.0002, "epoch": 4.702333931777379, "step": 65480}, {"loss": 0.6721, "grad_norm": 1.0505144596099854, "learning_rate": 0.0002, "epoch": 4.703052064631957, "step": 65490}, {"loss": 0.6287, "grad_norm": 0.8797298073768616, "learning_rate": 0.0002, "epoch": 4.703770197486535, "step": 65500}, {"loss": 0.6515, "grad_norm": 0.9970770478248596, "learning_rate": 0.0002, "epoch": 4.704488330341113, "step": 65510}, {"loss": 0.6096, "grad_norm": 1.1743851900100708, "learning_rate": 0.0002, "epoch": 4.705206463195691, "step": 65520}, {"loss": 0.5755, "grad_norm": 0.9534381031990051, "learning_rate": 0.0002, "epoch": 4.705924596050269, "step": 65530}, {"loss": 0.6039, "grad_norm": 0.9735581278800964, "learning_rate": 0.0002, "epoch": 4.706642728904847, "step": 65540}, {"loss": 0.6217, "grad_norm": 1.185352087020874, "learning_rate": 0.0002, "epoch": 4.707360861759425, "step": 65550}, {"loss": 0.6398, "grad_norm": 0.9383901357650757, "learning_rate": 0.0002, "epoch": 4.708078994614004, "step": 65560}, {"loss": 0.6654, "grad_norm": 1.0194662809371948, "learning_rate": 0.0002, "epoch": 4.708797127468582, "step": 65570}, {"loss": 0.6008, "grad_norm": 0.8448300361633301, "learning_rate": 0.0002, "epoch": 4.70951526032316, "step": 65580}, {"loss": 0.6608, "grad_norm": 1.1930629014968872, "learning_rate": 0.0002, "epoch": 4.710233393177738, "step": 65590}, {"loss": 0.6082, "grad_norm": 1.0038636922836304, "learning_rate": 0.0002, "epoch": 4.710951526032316, "step": 65600}, {"loss": 0.6613, "grad_norm": 0.8206564784049988, "learning_rate": 0.0002, "epoch": 4.711669658886894, "step": 65610}, {"loss": 0.6142, "grad_norm": 1.0984861850738525, "learning_rate": 0.0002, "epoch": 4.712387791741472, "step": 65620}, {"loss": 0.6368, "grad_norm": 1.2891547679901123, "learning_rate": 0.0002, "epoch": 4.71310592459605, "step": 65630}, {"loss": 0.5857, "grad_norm": 0.927062451839447, "learning_rate": 0.0002, "epoch": 4.713824057450628, "step": 65640}, {"loss": 0.6187, "grad_norm": 0.8647334575653076, "learning_rate": 0.0002, "epoch": 4.714542190305206, "step": 65650}, {"loss": 0.6327, "grad_norm": 1.1017670631408691, "learning_rate": 0.0002, "epoch": 4.715260323159785, "step": 65660}, {"loss": 0.6398, "grad_norm": 0.9589072465896606, "learning_rate": 0.0002, "epoch": 4.715978456014363, "step": 65670}, {"loss": 0.6179, "grad_norm": 0.9496776461601257, "learning_rate": 0.0002, "epoch": 4.716696588868941, "step": 65680}, {"loss": 0.625, "grad_norm": 0.9266180396080017, "learning_rate": 0.0002, "epoch": 4.717414721723519, "step": 65690}, {"loss": 0.637, "grad_norm": 0.8699696063995361, "learning_rate": 0.0002, "epoch": 4.718132854578097, "step": 65700}, {"loss": 0.6402, "grad_norm": 1.0444015264511108, "learning_rate": 0.0002, "epoch": 4.718850987432675, "step": 65710}, {"loss": 0.6526, "grad_norm": 1.0100741386413574, "learning_rate": 0.0002, "epoch": 4.719569120287253, "step": 65720}, {"loss": 0.617, "grad_norm": 1.1442630290985107, "learning_rate": 0.0002, "epoch": 4.720287253141831, "step": 65730}, {"loss": 0.6214, "grad_norm": 0.8937877416610718, "learning_rate": 0.0002, "epoch": 4.721005385996409, "step": 65740}, {"loss": 0.625, "grad_norm": 1.0718764066696167, "learning_rate": 0.0002, "epoch": 4.721723518850988, "step": 65750}, {"loss": 0.6182, "grad_norm": 0.8838587999343872, "learning_rate": 0.0002, "epoch": 4.722441651705566, "step": 65760}, {"loss": 0.6254, "grad_norm": 1.1247940063476562, "learning_rate": 0.0002, "epoch": 4.723159784560144, "step": 65770}, {"loss": 0.5917, "grad_norm": 0.9491105675697327, "learning_rate": 0.0002, "epoch": 4.723877917414722, "step": 65780}, {"loss": 0.6178, "grad_norm": 1.0896921157836914, "learning_rate": 0.0002, "epoch": 4.7245960502693, "step": 65790}, {"loss": 0.5975, "grad_norm": 1.0097380876541138, "learning_rate": 0.0002, "epoch": 4.725314183123878, "step": 65800}, {"loss": 0.592, "grad_norm": 0.911763608455658, "learning_rate": 0.0002, "epoch": 4.726032315978456, "step": 65810}, {"loss": 0.6274, "grad_norm": 1.1295124292373657, "learning_rate": 0.0002, "epoch": 4.726750448833034, "step": 65820}, {"loss": 0.6004, "grad_norm": 0.7637538313865662, "learning_rate": 0.0002, "epoch": 4.727468581687612, "step": 65830}, {"loss": 0.6136, "grad_norm": 0.9255306720733643, "learning_rate": 0.0002, "epoch": 4.72818671454219, "step": 65840}, {"loss": 0.6013, "grad_norm": 0.9847530126571655, "learning_rate": 0.0002, "epoch": 4.728904847396769, "step": 65850}, {"loss": 0.6283, "grad_norm": 0.9036182761192322, "learning_rate": 0.0002, "epoch": 4.729622980251347, "step": 65860}, {"loss": 0.6374, "grad_norm": 0.8284199833869934, "learning_rate": 0.0002, "epoch": 4.730341113105925, "step": 65870}, {"loss": 0.6228, "grad_norm": 1.0142838954925537, "learning_rate": 0.0002, "epoch": 4.731059245960503, "step": 65880}, {"loss": 0.624, "grad_norm": 0.9389033913612366, "learning_rate": 0.0002, "epoch": 4.731777378815081, "step": 65890}, {"loss": 0.6414, "grad_norm": 0.8870056867599487, "learning_rate": 0.0002, "epoch": 4.732495511669659, "step": 65900}, {"loss": 0.6261, "grad_norm": 1.1211678981781006, "learning_rate": 0.0002, "epoch": 4.733213644524237, "step": 65910}, {"loss": 0.6065, "grad_norm": 0.7796614170074463, "learning_rate": 0.0002, "epoch": 4.733931777378815, "step": 65920}, {"loss": 0.6701, "grad_norm": 1.0360451936721802, "learning_rate": 0.0002, "epoch": 4.734649910233394, "step": 65930}, {"loss": 0.68, "grad_norm": 0.8383482098579407, "learning_rate": 0.0002, "epoch": 4.735368043087972, "step": 65940}, {"loss": 0.6014, "grad_norm": 0.7985122799873352, "learning_rate": 0.0002, "epoch": 4.73608617594255, "step": 65950}, {"loss": 0.6431, "grad_norm": 1.0314199924468994, "learning_rate": 0.0002, "epoch": 4.736804308797128, "step": 65960}, {"loss": 0.5894, "grad_norm": 0.9279016852378845, "learning_rate": 0.0002, "epoch": 4.737522441651706, "step": 65970}, {"loss": 0.6327, "grad_norm": 1.1046063899993896, "learning_rate": 0.0002, "epoch": 4.738240574506284, "step": 65980}, {"loss": 0.5778, "grad_norm": 0.9075793623924255, "learning_rate": 0.0002, "epoch": 4.738958707360862, "step": 65990}, {"loss": 0.5832, "grad_norm": 1.0945355892181396, "learning_rate": 0.0002, "epoch": 4.73967684021544, "step": 66000}, {"loss": 0.6256, "grad_norm": 0.8885519504547119, "learning_rate": 0.0002, "epoch": 4.740394973070018, "step": 66010}, {"loss": 0.6283, "grad_norm": 0.9312083125114441, "learning_rate": 0.0002, "epoch": 4.741113105924596, "step": 66020}, {"loss": 0.6328, "grad_norm": 1.1574538946151733, "learning_rate": 0.0002, "epoch": 4.741831238779174, "step": 66030}, {"loss": 0.6693, "grad_norm": 0.9346209168434143, "learning_rate": 0.0002, "epoch": 4.742549371633753, "step": 66040}, {"loss": 0.6252, "grad_norm": 0.8935149312019348, "learning_rate": 0.0002, "epoch": 4.743267504488331, "step": 66050}, {"loss": 0.6137, "grad_norm": 0.8958369493484497, "learning_rate": 0.0002, "epoch": 4.743985637342909, "step": 66060}, {"loss": 0.6088, "grad_norm": 0.9383506774902344, "learning_rate": 0.0002, "epoch": 4.744703770197487, "step": 66070}, {"loss": 0.6323, "grad_norm": 0.9868947863578796, "learning_rate": 0.0002, "epoch": 4.745421903052065, "step": 66080}, {"loss": 0.6426, "grad_norm": 1.3417645692825317, "learning_rate": 0.0002, "epoch": 4.746140035906643, "step": 66090}, {"loss": 0.5417, "grad_norm": 1.070693850517273, "learning_rate": 0.0002, "epoch": 4.746858168761221, "step": 66100}, {"loss": 0.6326, "grad_norm": 0.8841570019721985, "learning_rate": 0.0002, "epoch": 4.747576301615799, "step": 66110}, {"loss": 0.655, "grad_norm": 0.7963120341300964, "learning_rate": 0.0002, "epoch": 4.7482944344703775, "step": 66120}, {"loss": 0.6145, "grad_norm": 0.8145691156387329, "learning_rate": 0.0002, "epoch": 4.7490125673249555, "step": 66130}, {"loss": 0.6081, "grad_norm": 0.9074729681015015, "learning_rate": 0.0002, "epoch": 4.7497307001795335, "step": 66140}, {"loss": 0.5651, "grad_norm": 0.9129886627197266, "learning_rate": 0.0002, "epoch": 4.7504488330341115, "step": 66150}, {"loss": 0.6111, "grad_norm": 0.91527259349823, "learning_rate": 0.0002, "epoch": 4.7511669658886895, "step": 66160}, {"loss": 0.672, "grad_norm": 0.9569419622421265, "learning_rate": 0.0002, "epoch": 4.7518850987432675, "step": 66170}, {"loss": 0.597, "grad_norm": 0.8777104616165161, "learning_rate": 0.0002, "epoch": 4.7526032315978455, "step": 66180}, {"loss": 0.6433, "grad_norm": 0.9673085808753967, "learning_rate": 0.0002, "epoch": 4.7533213644524235, "step": 66190}, {"loss": 0.5783, "grad_norm": 1.0683966875076294, "learning_rate": 0.0002, "epoch": 4.7540394973070015, "step": 66200}, {"loss": 0.6356, "grad_norm": 1.1591907739639282, "learning_rate": 0.0002, "epoch": 4.7547576301615795, "step": 66210}, {"loss": 0.6482, "grad_norm": 1.1973309516906738, "learning_rate": 0.0002, "epoch": 4.755475763016158, "step": 66220}, {"loss": 0.5998, "grad_norm": 0.8472012281417847, "learning_rate": 0.0002, "epoch": 4.756193895870736, "step": 66230}, {"loss": 0.717, "grad_norm": 0.9896261692047119, "learning_rate": 0.0002, "epoch": 4.756912028725314, "step": 66240}, {"loss": 0.6368, "grad_norm": 0.8498432040214539, "learning_rate": 0.0002, "epoch": 4.757630161579892, "step": 66250}, {"loss": 0.5931, "grad_norm": 0.9624166488647461, "learning_rate": 0.0002, "epoch": 4.75834829443447, "step": 66260}, {"loss": 0.645, "grad_norm": 1.0951786041259766, "learning_rate": 0.0002, "epoch": 4.759066427289048, "step": 66270}, {"loss": 0.6092, "grad_norm": 0.9863157868385315, "learning_rate": 0.0002, "epoch": 4.759784560143626, "step": 66280}, {"loss": 0.6682, "grad_norm": 1.0062068700790405, "learning_rate": 0.0002, "epoch": 4.760502692998204, "step": 66290}, {"loss": 0.5704, "grad_norm": 0.8075495958328247, "learning_rate": 0.0002, "epoch": 4.761220825852782, "step": 66300}, {"loss": 0.6297, "grad_norm": 0.9617878198623657, "learning_rate": 0.0002, "epoch": 4.761938958707361, "step": 66310}, {"loss": 0.6141, "grad_norm": 1.097091555595398, "learning_rate": 0.0002, "epoch": 4.762657091561939, "step": 66320}, {"loss": 0.6152, "grad_norm": 1.2713453769683838, "learning_rate": 0.0002, "epoch": 4.763375224416517, "step": 66330}, {"loss": 0.6726, "grad_norm": 0.9473448991775513, "learning_rate": 0.0002, "epoch": 4.764093357271095, "step": 66340}, {"loss": 0.6032, "grad_norm": 1.0176854133605957, "learning_rate": 0.0002, "epoch": 4.764811490125673, "step": 66350}, {"loss": 0.6429, "grad_norm": 1.0486242771148682, "learning_rate": 0.0002, "epoch": 4.765529622980251, "step": 66360}, {"loss": 0.6875, "grad_norm": 1.249985694885254, "learning_rate": 0.0002, "epoch": 4.766247755834829, "step": 66370}, {"loss": 0.6086, "grad_norm": 1.283875584602356, "learning_rate": 0.0002, "epoch": 4.766965888689407, "step": 66380}, {"loss": 0.5997, "grad_norm": 1.0009022951126099, "learning_rate": 0.0002, "epoch": 4.767684021543985, "step": 66390}, {"loss": 0.5782, "grad_norm": 0.9718021750450134, "learning_rate": 0.0002, "epoch": 4.768402154398563, "step": 66400}, {"loss": 0.6292, "grad_norm": 1.0865732431411743, "learning_rate": 0.0002, "epoch": 4.769120287253142, "step": 66410}, {"loss": 0.6038, "grad_norm": 0.9273189306259155, "learning_rate": 0.0002, "epoch": 4.76983842010772, "step": 66420}, {"loss": 0.6244, "grad_norm": 1.067535638809204, "learning_rate": 0.0002, "epoch": 4.770556552962298, "step": 66430}, {"loss": 0.6434, "grad_norm": 1.0551011562347412, "learning_rate": 0.0002, "epoch": 4.771274685816876, "step": 66440}, {"loss": 0.6151, "grad_norm": 1.0336146354675293, "learning_rate": 0.0002, "epoch": 4.771992818671454, "step": 66450}, {"loss": 0.5955, "grad_norm": 0.8738380670547485, "learning_rate": 0.0002, "epoch": 4.772710951526032, "step": 66460}, {"loss": 0.6386, "grad_norm": 1.1048321723937988, "learning_rate": 0.0002, "epoch": 4.77342908438061, "step": 66470}, {"loss": 0.592, "grad_norm": 0.8471167683601379, "learning_rate": 0.0002, "epoch": 4.774147217235188, "step": 66480}, {"loss": 0.6139, "grad_norm": 1.2527031898498535, "learning_rate": 0.0002, "epoch": 4.774865350089767, "step": 66490}, {"loss": 0.579, "grad_norm": 1.0056052207946777, "learning_rate": 0.0002, "epoch": 4.775583482944345, "step": 66500}, {"loss": 0.6448, "grad_norm": 1.142456293106079, "learning_rate": 0.0002, "epoch": 4.776301615798923, "step": 66510}, {"loss": 0.6399, "grad_norm": 1.1813132762908936, "learning_rate": 0.0002, "epoch": 4.777019748653501, "step": 66520}, {"loss": 0.6575, "grad_norm": 0.8683654069900513, "learning_rate": 0.0002, "epoch": 4.777737881508079, "step": 66530}, {"loss": 0.6059, "grad_norm": 1.0577980279922485, "learning_rate": 0.0002, "epoch": 4.778456014362657, "step": 66540}, {"loss": 0.5923, "grad_norm": 1.077438473701477, "learning_rate": 0.0002, "epoch": 4.779174147217235, "step": 66550}, {"loss": 0.5744, "grad_norm": 1.0107938051223755, "learning_rate": 0.0002, "epoch": 4.779892280071813, "step": 66560}, {"loss": 0.6155, "grad_norm": 0.8071168065071106, "learning_rate": 0.0002, "epoch": 4.780610412926391, "step": 66570}, {"loss": 0.6126, "grad_norm": 0.8887564539909363, "learning_rate": 0.0002, "epoch": 4.781328545780969, "step": 66580}, {"loss": 0.6417, "grad_norm": 0.9823092222213745, "learning_rate": 0.0002, "epoch": 4.782046678635547, "step": 66590}, {"loss": 0.6108, "grad_norm": 0.9026784300804138, "learning_rate": 0.0002, "epoch": 4.782764811490126, "step": 66600}, {"loss": 0.6252, "grad_norm": 0.8912792205810547, "learning_rate": 0.0002, "epoch": 4.783482944344704, "step": 66610}, {"loss": 0.6285, "grad_norm": 1.0955979824066162, "learning_rate": 0.0002, "epoch": 4.784201077199282, "step": 66620}, {"loss": 0.6161, "grad_norm": 0.8614793419837952, "learning_rate": 0.0002, "epoch": 4.78491921005386, "step": 66630}, {"loss": 0.6343, "grad_norm": 0.7247269153594971, "learning_rate": 0.0002, "epoch": 4.785637342908438, "step": 66640}, {"loss": 0.5634, "grad_norm": 0.9685400724411011, "learning_rate": 0.0002, "epoch": 4.786355475763016, "step": 66650}, {"loss": 0.6419, "grad_norm": 0.9219905734062195, "learning_rate": 0.0002, "epoch": 4.787073608617594, "step": 66660}, {"loss": 0.6509, "grad_norm": 0.9217489361763, "learning_rate": 0.0002, "epoch": 4.787791741472172, "step": 66670}, {"loss": 0.6151, "grad_norm": 1.13791823387146, "learning_rate": 0.0002, "epoch": 4.788509874326751, "step": 66680}, {"loss": 0.6114, "grad_norm": 0.857542872428894, "learning_rate": 0.0002, "epoch": 4.789228007181329, "step": 66690}, {"loss": 0.6317, "grad_norm": 0.9886694550514221, "learning_rate": 0.0002, "epoch": 4.789946140035907, "step": 66700}, {"loss": 0.6436, "grad_norm": 0.987952470779419, "learning_rate": 0.0002, "epoch": 4.790664272890485, "step": 66710}, {"loss": 0.6284, "grad_norm": 1.051612377166748, "learning_rate": 0.0002, "epoch": 4.791382405745063, "step": 66720}, {"loss": 0.6207, "grad_norm": 0.9816454648971558, "learning_rate": 0.0002, "epoch": 4.792100538599641, "step": 66730}, {"loss": 0.6618, "grad_norm": 1.0953829288482666, "learning_rate": 0.0002, "epoch": 4.792818671454219, "step": 66740}, {"loss": 0.652, "grad_norm": 0.8720369935035706, "learning_rate": 0.0002, "epoch": 4.793536804308797, "step": 66750}, {"loss": 0.569, "grad_norm": 0.8910234570503235, "learning_rate": 0.0002, "epoch": 4.794254937163375, "step": 66760}, {"loss": 0.5814, "grad_norm": 0.8300510048866272, "learning_rate": 0.0002, "epoch": 4.794973070017953, "step": 66770}, {"loss": 0.591, "grad_norm": 0.9380533695220947, "learning_rate": 0.0002, "epoch": 4.795691202872531, "step": 66780}, {"loss": 0.6201, "grad_norm": 0.8361864686012268, "learning_rate": 0.0002, "epoch": 4.79640933572711, "step": 66790}, {"loss": 0.6192, "grad_norm": 1.051262617111206, "learning_rate": 0.0002, "epoch": 4.797127468581688, "step": 66800}, {"loss": 0.6408, "grad_norm": 1.1324400901794434, "learning_rate": 0.0002, "epoch": 4.797845601436266, "step": 66810}, {"loss": 0.6156, "grad_norm": 0.853903591632843, "learning_rate": 0.0002, "epoch": 4.798563734290844, "step": 66820}, {"loss": 0.5923, "grad_norm": 0.9949867725372314, "learning_rate": 0.0002, "epoch": 4.799281867145422, "step": 66830}, {"loss": 0.6453, "grad_norm": 0.9204033017158508, "learning_rate": 0.0002, "epoch": 4.8, "step": 66840}, {"loss": 0.6221, "grad_norm": 0.7461584806442261, "learning_rate": 0.0002, "epoch": 4.800718132854578, "step": 66850}, {"loss": 0.6019, "grad_norm": 1.1019874811172485, "learning_rate": 0.0002, "epoch": 4.801436265709156, "step": 66860}, {"loss": 0.6514, "grad_norm": 1.1695797443389893, "learning_rate": 0.0002, "epoch": 4.802154398563735, "step": 66870}, {"loss": 0.6105, "grad_norm": 1.0902758836746216, "learning_rate": 0.0002, "epoch": 4.802872531418313, "step": 66880}, {"loss": 0.6297, "grad_norm": 0.8778618574142456, "learning_rate": 0.0002, "epoch": 4.803590664272891, "step": 66890}, {"loss": 0.6608, "grad_norm": 0.905505359172821, "learning_rate": 0.0002, "epoch": 4.804308797127469, "step": 66900}, {"loss": 0.6386, "grad_norm": 1.0802056789398193, "learning_rate": 0.0002, "epoch": 4.805026929982047, "step": 66910}, {"loss": 0.5866, "grad_norm": 0.7899449467658997, "learning_rate": 0.0002, "epoch": 4.805745062836625, "step": 66920}, {"loss": 0.6169, "grad_norm": 1.1938519477844238, "learning_rate": 0.0002, "epoch": 4.806463195691203, "step": 66930}, {"loss": 0.5979, "grad_norm": 1.0213780403137207, "learning_rate": 0.0002, "epoch": 4.807181328545781, "step": 66940}, {"loss": 0.6518, "grad_norm": 0.9925506711006165, "learning_rate": 0.0002, "epoch": 4.807899461400359, "step": 66950}, {"loss": 0.6229, "grad_norm": 1.0174424648284912, "learning_rate": 0.0002, "epoch": 4.808617594254937, "step": 66960}, {"loss": 0.5932, "grad_norm": 1.0515072345733643, "learning_rate": 0.0002, "epoch": 4.809335727109516, "step": 66970}, {"loss": 0.6169, "grad_norm": 1.0161492824554443, "learning_rate": 0.0002, "epoch": 4.810053859964094, "step": 66980}, {"loss": 0.5804, "grad_norm": 0.8421840071678162, "learning_rate": 0.0002, "epoch": 4.810771992818672, "step": 66990}, {"loss": 0.6792, "grad_norm": 1.0493539571762085, "learning_rate": 0.0002, "epoch": 4.81149012567325, "step": 67000}, {"loss": 0.5906, "grad_norm": 1.1133309602737427, "learning_rate": 0.0002, "epoch": 4.812208258527828, "step": 67010}, {"loss": 0.5771, "grad_norm": 0.924017071723938, "learning_rate": 0.0002, "epoch": 4.812926391382406, "step": 67020}, {"loss": 0.625, "grad_norm": 1.0568689107894897, "learning_rate": 0.0002, "epoch": 4.813644524236984, "step": 67030}, {"loss": 0.6654, "grad_norm": 0.989414632320404, "learning_rate": 0.0002, "epoch": 4.814362657091562, "step": 67040}, {"loss": 0.6186, "grad_norm": 0.9256827235221863, "learning_rate": 0.0002, "epoch": 4.8150807899461405, "step": 67050}, {"loss": 0.637, "grad_norm": 0.9538901448249817, "learning_rate": 0.0002, "epoch": 4.8157989228007185, "step": 67060}, {"loss": 0.632, "grad_norm": 1.0373849868774414, "learning_rate": 0.0002, "epoch": 4.8165170556552965, "step": 67070}, {"loss": 0.5956, "grad_norm": 1.0019729137420654, "learning_rate": 0.0002, "epoch": 4.8172351885098745, "step": 67080}, {"loss": 0.636, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.8179533213644525, "step": 67090}, {"loss": 0.6106, "grad_norm": 1.0008453130722046, "learning_rate": 0.0002, "epoch": 4.8186714542190305, "step": 67100}, {"loss": 0.5841, "grad_norm": 1.0153851509094238, "learning_rate": 0.0002, "epoch": 4.8193895870736085, "step": 67110}, {"loss": 0.6012, "grad_norm": 1.0193161964416504, "learning_rate": 0.0002, "epoch": 4.8201077199281865, "step": 67120}, {"loss": 0.6602, "grad_norm": 1.0204501152038574, "learning_rate": 0.0002, "epoch": 4.8208258527827645, "step": 67130}, {"loss": 0.6235, "grad_norm": 0.9097670316696167, "learning_rate": 0.0002, "epoch": 4.8215439856373425, "step": 67140}, {"loss": 0.5836, "grad_norm": 0.9288716912269592, "learning_rate": 0.0002, "epoch": 4.8222621184919205, "step": 67150}, {"loss": 0.604, "grad_norm": 0.9975850582122803, "learning_rate": 0.0002, "epoch": 4.822980251346499, "step": 67160}, {"loss": 0.6877, "grad_norm": 0.8502511382102966, "learning_rate": 0.0002, "epoch": 4.823698384201077, "step": 67170}, {"loss": 0.6194, "grad_norm": 1.0129257440567017, "learning_rate": 0.0002, "epoch": 4.824416517055655, "step": 67180}, {"loss": 0.6294, "grad_norm": 1.0009492635726929, "learning_rate": 0.0002, "epoch": 4.825134649910233, "step": 67190}, {"loss": 0.5757, "grad_norm": 0.9273321032524109, "learning_rate": 0.0002, "epoch": 4.825852782764811, "step": 67200}, {"loss": 0.5749, "grad_norm": 1.0438604354858398, "learning_rate": 0.0002, "epoch": 4.8265709156193894, "step": 67210}, {"loss": 0.6273, "grad_norm": 1.119573712348938, "learning_rate": 0.0002, "epoch": 4.8272890484739674, "step": 67220}, {"loss": 0.6284, "grad_norm": 0.9607422351837158, "learning_rate": 0.0002, "epoch": 4.8280071813285454, "step": 67230}, {"loss": 0.6259, "grad_norm": 0.9614062905311584, "learning_rate": 0.0002, "epoch": 4.828725314183124, "step": 67240}, {"loss": 0.5709, "grad_norm": 1.1017652750015259, "learning_rate": 0.0002, "epoch": 4.829443447037702, "step": 67250}, {"loss": 0.6203, "grad_norm": 1.0521706342697144, "learning_rate": 0.0002, "epoch": 4.83016157989228, "step": 67260}, {"loss": 0.6266, "grad_norm": 0.7685959339141846, "learning_rate": 0.0002, "epoch": 4.830879712746858, "step": 67270}, {"loss": 0.5809, "grad_norm": 0.7894896268844604, "learning_rate": 0.0002, "epoch": 4.831597845601436, "step": 67280}, {"loss": 0.6349, "grad_norm": 1.0882996320724487, "learning_rate": 0.0002, "epoch": 4.832315978456014, "step": 67290}, {"loss": 0.6129, "grad_norm": 0.9215409755706787, "learning_rate": 0.0002, "epoch": 4.833034111310592, "step": 67300}, {"loss": 0.6142, "grad_norm": 0.8660635352134705, "learning_rate": 0.0002, "epoch": 4.83375224416517, "step": 67310}, {"loss": 0.6378, "grad_norm": 0.980879008769989, "learning_rate": 0.0002, "epoch": 4.834470377019748, "step": 67320}, {"loss": 0.6291, "grad_norm": 1.0356814861297607, "learning_rate": 0.0002, "epoch": 4.835188509874326, "step": 67330}, {"loss": 0.6271, "grad_norm": 1.0265507698059082, "learning_rate": 0.0002, "epoch": 4.835906642728904, "step": 67340}, {"loss": 0.6009, "grad_norm": 1.0659137964248657, "learning_rate": 0.0002, "epoch": 4.836624775583483, "step": 67350}, {"loss": 0.5946, "grad_norm": 0.9485231637954712, "learning_rate": 0.0002, "epoch": 4.837342908438061, "step": 67360}, {"loss": 0.6338, "grad_norm": 1.0950140953063965, "learning_rate": 0.0002, "epoch": 4.838061041292639, "step": 67370}, {"loss": 0.6314, "grad_norm": 0.8907382488250732, "learning_rate": 0.0002, "epoch": 4.838779174147217, "step": 67380}, {"loss": 0.6066, "grad_norm": 0.9777120351791382, "learning_rate": 0.0002, "epoch": 4.839497307001795, "step": 67390}, {"loss": 0.6258, "grad_norm": 0.8482252955436707, "learning_rate": 0.0002, "epoch": 4.840215439856373, "step": 67400}, {"loss": 0.603, "grad_norm": 0.8505899906158447, "learning_rate": 0.0002, "epoch": 4.840933572710951, "step": 67410}, {"loss": 0.609, "grad_norm": 0.8574482798576355, "learning_rate": 0.0002, "epoch": 4.841651705565529, "step": 67420}, {"loss": 0.6188, "grad_norm": 1.092310905456543, "learning_rate": 0.0002, "epoch": 4.842369838420108, "step": 67430}, {"loss": 0.619, "grad_norm": 0.9418560266494751, "learning_rate": 0.0002, "epoch": 4.843087971274686, "step": 67440}, {"loss": 0.6367, "grad_norm": 1.1310782432556152, "learning_rate": 0.0002, "epoch": 4.843806104129264, "step": 67450}, {"loss": 0.664, "grad_norm": 0.9993671774864197, "learning_rate": 0.0002, "epoch": 4.844524236983842, "step": 67460}, {"loss": 0.6247, "grad_norm": 0.8322528600692749, "learning_rate": 0.0002, "epoch": 4.84524236983842, "step": 67470}, {"loss": 0.5828, "grad_norm": 0.8488435745239258, "learning_rate": 0.0002, "epoch": 4.845960502692998, "step": 67480}, {"loss": 0.6023, "grad_norm": 0.8070611357688904, "learning_rate": 0.0002, "epoch": 4.846678635547576, "step": 67490}, {"loss": 0.6362, "grad_norm": 0.8200163245201111, "learning_rate": 0.0002, "epoch": 4.847396768402154, "step": 67500}, {"loss": 0.612, "grad_norm": 0.91901034116745, "learning_rate": 0.0002, "epoch": 4.848114901256732, "step": 67510}, {"loss": 0.6191, "grad_norm": 1.0938435792922974, "learning_rate": 0.0002, "epoch": 4.84883303411131, "step": 67520}, {"loss": 0.6736, "grad_norm": 0.7926174402236938, "learning_rate": 0.0002, "epoch": 4.849551166965889, "step": 67530}, {"loss": 0.6252, "grad_norm": 0.9914385676383972, "learning_rate": 0.0002, "epoch": 4.850269299820467, "step": 67540}, {"loss": 0.6278, "grad_norm": 1.033065915107727, "learning_rate": 0.0002, "epoch": 4.850987432675045, "step": 67550}, {"loss": 0.6334, "grad_norm": 0.9700239300727844, "learning_rate": 0.0002, "epoch": 4.851705565529623, "step": 67560}, {"loss": 0.6308, "grad_norm": 0.8550103902816772, "learning_rate": 0.0002, "epoch": 4.852423698384201, "step": 67570}, {"loss": 0.6194, "grad_norm": 1.0009654760360718, "learning_rate": 0.0002, "epoch": 4.853141831238779, "step": 67580}, {"loss": 0.5825, "grad_norm": 1.0766186714172363, "learning_rate": 0.0002, "epoch": 4.853859964093357, "step": 67590}, {"loss": 0.6216, "grad_norm": 0.9512220621109009, "learning_rate": 0.0002, "epoch": 4.854578096947935, "step": 67600}, {"loss": 0.6301, "grad_norm": 0.8434456586837769, "learning_rate": 0.0002, "epoch": 4.855296229802514, "step": 67610}, {"loss": 0.6416, "grad_norm": 1.0276665687561035, "learning_rate": 0.0002, "epoch": 4.856014362657092, "step": 67620}, {"loss": 0.6063, "grad_norm": 0.9758516550064087, "learning_rate": 0.0002, "epoch": 4.85673249551167, "step": 67630}, {"loss": 0.622, "grad_norm": 0.8988076448440552, "learning_rate": 0.0002, "epoch": 4.857450628366248, "step": 67640}, {"loss": 0.6516, "grad_norm": 1.0038257837295532, "learning_rate": 0.0002, "epoch": 4.858168761220826, "step": 67650}, {"loss": 0.6322, "grad_norm": 0.9973093867301941, "learning_rate": 0.0002, "epoch": 4.858886894075404, "step": 67660}, {"loss": 0.6065, "grad_norm": 0.9754974246025085, "learning_rate": 0.0002, "epoch": 4.859605026929982, "step": 67670}, {"loss": 0.6191, "grad_norm": 1.1829560995101929, "learning_rate": 0.0002, "epoch": 4.86032315978456, "step": 67680}, {"loss": 0.6267, "grad_norm": 1.1077659130096436, "learning_rate": 0.0002, "epoch": 4.861041292639138, "step": 67690}, {"loss": 0.6312, "grad_norm": 0.9862872958183289, "learning_rate": 0.0002, "epoch": 4.861759425493716, "step": 67700}, {"loss": 0.6281, "grad_norm": 0.9826052188873291, "learning_rate": 0.0002, "epoch": 4.862477558348294, "step": 67710}, {"loss": 0.6227, "grad_norm": 0.940082848072052, "learning_rate": 0.0002, "epoch": 4.863195691202873, "step": 67720}, {"loss": 0.6232, "grad_norm": 0.895434558391571, "learning_rate": 0.0002, "epoch": 4.863913824057451, "step": 67730}, {"loss": 0.6674, "grad_norm": 1.1194682121276855, "learning_rate": 0.0002, "epoch": 4.864631956912029, "step": 67740}, {"loss": 0.5981, "grad_norm": 0.9984544515609741, "learning_rate": 0.0002, "epoch": 4.865350089766607, "step": 67750}, {"loss": 0.6583, "grad_norm": 1.049224615097046, "learning_rate": 0.0002, "epoch": 4.866068222621185, "step": 67760}, {"loss": 0.583, "grad_norm": 1.009515643119812, "learning_rate": 0.0002, "epoch": 4.866786355475763, "step": 67770}, {"loss": 0.6466, "grad_norm": 1.0336902141571045, "learning_rate": 0.0002, "epoch": 4.867504488330341, "step": 67780}, {"loss": 0.6909, "grad_norm": 0.9310635924339294, "learning_rate": 0.0002, "epoch": 4.868222621184919, "step": 67790}, {"loss": 0.7267, "grad_norm": 0.934882640838623, "learning_rate": 0.0002, "epoch": 4.868940754039498, "step": 67800}, {"loss": 0.648, "grad_norm": 0.8663495779037476, "learning_rate": 0.0002, "epoch": 4.869658886894076, "step": 67810}, {"loss": 0.6275, "grad_norm": 1.0085018873214722, "learning_rate": 0.0002, "epoch": 4.870377019748654, "step": 67820}, {"loss": 0.6571, "grad_norm": 0.896507978439331, "learning_rate": 0.0002, "epoch": 4.871095152603232, "step": 67830}, {"loss": 0.6711, "grad_norm": 0.925809919834137, "learning_rate": 0.0002, "epoch": 4.87181328545781, "step": 67840}, {"loss": 0.5917, "grad_norm": 0.8044029474258423, "learning_rate": 0.0002, "epoch": 4.872531418312388, "step": 67850}, {"loss": 0.6671, "grad_norm": 1.0026800632476807, "learning_rate": 0.0002, "epoch": 4.873249551166966, "step": 67860}, {"loss": 0.6175, "grad_norm": 0.9577589631080627, "learning_rate": 0.0002, "epoch": 4.873967684021544, "step": 67870}, {"loss": 0.591, "grad_norm": 0.8225193619728088, "learning_rate": 0.0002, "epoch": 4.874685816876122, "step": 67880}, {"loss": 0.6, "grad_norm": 1.0019139051437378, "learning_rate": 0.0002, "epoch": 4.8754039497307, "step": 67890}, {"loss": 0.6521, "grad_norm": 0.9282827377319336, "learning_rate": 0.0002, "epoch": 4.876122082585278, "step": 67900}, {"loss": 0.6251, "grad_norm": 0.8204836249351501, "learning_rate": 0.0002, "epoch": 4.876840215439857, "step": 67910}, {"loss": 0.6345, "grad_norm": 0.907356321811676, "learning_rate": 0.0002, "epoch": 4.877558348294435, "step": 67920}, {"loss": 0.6438, "grad_norm": 1.12422776222229, "learning_rate": 0.0002, "epoch": 4.878276481149013, "step": 67930}, {"loss": 0.6727, "grad_norm": 0.8230205178260803, "learning_rate": 0.0002, "epoch": 4.878994614003591, "step": 67940}, {"loss": 0.6361, "grad_norm": 1.1588479280471802, "learning_rate": 0.0002, "epoch": 4.879712746858169, "step": 67950}, {"loss": 0.6489, "grad_norm": 1.1064553260803223, "learning_rate": 0.0002, "epoch": 4.880430879712747, "step": 67960}, {"loss": 0.5851, "grad_norm": 0.9311534762382507, "learning_rate": 0.0002, "epoch": 4.881149012567325, "step": 67970}, {"loss": 0.6238, "grad_norm": 0.7575639486312866, "learning_rate": 0.0002, "epoch": 4.881867145421903, "step": 67980}, {"loss": 0.5933, "grad_norm": 0.9201191067695618, "learning_rate": 0.0002, "epoch": 4.882585278276482, "step": 67990}, {"loss": 0.5806, "grad_norm": 0.8487658500671387, "learning_rate": 0.0002, "epoch": 4.88330341113106, "step": 68000}, {"loss": 0.598, "grad_norm": 0.9645208716392517, "learning_rate": 0.0002, "epoch": 4.884021543985638, "step": 68010}, {"loss": 0.6112, "grad_norm": 0.8594469428062439, "learning_rate": 0.0002, "epoch": 4.884739676840216, "step": 68020}, {"loss": 0.6115, "grad_norm": 0.9518412947654724, "learning_rate": 0.0002, "epoch": 4.885457809694794, "step": 68030}, {"loss": 0.6071, "grad_norm": 1.0934258699417114, "learning_rate": 0.0002, "epoch": 4.886175942549372, "step": 68040}, {"loss": 0.6265, "grad_norm": 0.988761842250824, "learning_rate": 0.0002, "epoch": 4.88689407540395, "step": 68050}, {"loss": 0.5981, "grad_norm": 0.7572013735771179, "learning_rate": 0.0002, "epoch": 4.887612208258528, "step": 68060}, {"loss": 0.6286, "grad_norm": 0.8801929950714111, "learning_rate": 0.0002, "epoch": 4.888330341113106, "step": 68070}, {"loss": 0.6503, "grad_norm": 1.0080658197402954, "learning_rate": 0.0002, "epoch": 4.889048473967684, "step": 68080}, {"loss": 0.6064, "grad_norm": 0.9588785171508789, "learning_rate": 0.0002, "epoch": 4.8897666068222625, "step": 68090}, {"loss": 0.6159, "grad_norm": 1.0994032621383667, "learning_rate": 0.0002, "epoch": 4.8904847396768405, "step": 68100}, {"loss": 0.6357, "grad_norm": 0.9851962924003601, "learning_rate": 0.0002, "epoch": 4.8912028725314185, "step": 68110}, {"loss": 0.5999, "grad_norm": 0.9566116333007812, "learning_rate": 0.0002, "epoch": 4.8919210053859965, "step": 68120}, {"loss": 0.6742, "grad_norm": 0.8708083033561707, "learning_rate": 0.0002, "epoch": 4.8926391382405745, "step": 68130}, {"loss": 0.6489, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 4.8933572710951525, "step": 68140}, {"loss": 0.6442, "grad_norm": 1.047988772392273, "learning_rate": 0.0002, "epoch": 4.8940754039497305, "step": 68150}, {"loss": 0.6176, "grad_norm": 0.8665831685066223, "learning_rate": 0.0002, "epoch": 4.8947935368043085, "step": 68160}, {"loss": 0.5721, "grad_norm": 0.9313908219337463, "learning_rate": 0.0002, "epoch": 4.8955116696588865, "step": 68170}, {"loss": 0.6073, "grad_norm": 0.9568582773208618, "learning_rate": 0.0002, "epoch": 4.896229802513465, "step": 68180}, {"loss": 0.6308, "grad_norm": 1.0427594184875488, "learning_rate": 0.0002, "epoch": 4.896947935368043, "step": 68190}, {"loss": 0.6357, "grad_norm": 0.9132021069526672, "learning_rate": 0.0002, "epoch": 4.897666068222621, "step": 68200}, {"loss": 0.6264, "grad_norm": 0.9597318768501282, "learning_rate": 0.0002, "epoch": 4.898384201077199, "step": 68210}, {"loss": 0.6025, "grad_norm": 1.0736947059631348, "learning_rate": 0.0002, "epoch": 4.899102333931777, "step": 68220}, {"loss": 0.5942, "grad_norm": 0.9318404793739319, "learning_rate": 0.0002, "epoch": 4.899820466786355, "step": 68230}, {"loss": 0.5991, "grad_norm": 0.8594326972961426, "learning_rate": 0.0002, "epoch": 4.900538599640933, "step": 68240}, {"loss": 0.6145, "grad_norm": 1.1437443494796753, "learning_rate": 0.0002, "epoch": 4.901256732495511, "step": 68250}, {"loss": 0.6414, "grad_norm": 1.1599408388137817, "learning_rate": 0.0002, "epoch": 4.901974865350089, "step": 68260}, {"loss": 0.6148, "grad_norm": 1.160628080368042, "learning_rate": 0.0002, "epoch": 4.902692998204667, "step": 68270}, {"loss": 0.613, "grad_norm": 1.0147801637649536, "learning_rate": 0.0002, "epoch": 4.903411131059246, "step": 68280}, {"loss": 0.6502, "grad_norm": 0.8622691631317139, "learning_rate": 0.0002, "epoch": 4.904129263913824, "step": 68290}, {"loss": 0.618, "grad_norm": 0.7179980874061584, "learning_rate": 0.0002, "epoch": 4.904847396768402, "step": 68300}, {"loss": 0.6388, "grad_norm": 1.1705092191696167, "learning_rate": 0.0002, "epoch": 4.90556552962298, "step": 68310}, {"loss": 0.6164, "grad_norm": 1.1687676906585693, "learning_rate": 0.0002, "epoch": 4.906283662477558, "step": 68320}, {"loss": 0.6791, "grad_norm": 1.1621531248092651, "learning_rate": 0.0002, "epoch": 4.907001795332136, "step": 68330}, {"loss": 0.6474, "grad_norm": 1.0241422653198242, "learning_rate": 0.0002, "epoch": 4.907719928186714, "step": 68340}, {"loss": 0.6225, "grad_norm": 0.943354070186615, "learning_rate": 0.0002, "epoch": 4.908438061041292, "step": 68350}, {"loss": 0.6596, "grad_norm": 0.8091703653335571, "learning_rate": 0.0002, "epoch": 4.909156193895871, "step": 68360}, {"loss": 0.6196, "grad_norm": 0.8871228694915771, "learning_rate": 0.0002, "epoch": 4.909874326750449, "step": 68370}, {"loss": 0.5714, "grad_norm": 1.0951069593429565, "learning_rate": 0.0002, "epoch": 4.910592459605027, "step": 68380}, {"loss": 0.6407, "grad_norm": 1.1355193853378296, "learning_rate": 0.0002, "epoch": 4.911310592459605, "step": 68390}, {"loss": 0.6369, "grad_norm": 1.0741122961044312, "learning_rate": 0.0002, "epoch": 4.912028725314183, "step": 68400}, {"loss": 0.6176, "grad_norm": 0.9285269975662231, "learning_rate": 0.0002, "epoch": 4.912746858168761, "step": 68410}, {"loss": 0.6433, "grad_norm": 1.080695390701294, "learning_rate": 0.0002, "epoch": 4.913464991023339, "step": 68420}, {"loss": 0.6505, "grad_norm": 0.921331524848938, "learning_rate": 0.0002, "epoch": 4.914183123877917, "step": 68430}, {"loss": 0.701, "grad_norm": 0.9763174057006836, "learning_rate": 0.0002, "epoch": 4.914901256732495, "step": 68440}, {"loss": 0.6429, "grad_norm": 1.1133354902267456, "learning_rate": 0.0002, "epoch": 4.915619389587073, "step": 68450}, {"loss": 0.6117, "grad_norm": 0.8373502492904663, "learning_rate": 0.0002, "epoch": 4.916337522441651, "step": 68460}, {"loss": 0.5993, "grad_norm": 0.9192346334457397, "learning_rate": 0.0002, "epoch": 4.91705565529623, "step": 68470}, {"loss": 0.626, "grad_norm": 1.0724657773971558, "learning_rate": 0.0002, "epoch": 4.917773788150808, "step": 68480}, {"loss": 0.6339, "grad_norm": 0.9209843873977661, "learning_rate": 0.0002, "epoch": 4.918491921005386, "step": 68490}, {"loss": 0.6427, "grad_norm": 0.9201577305793762, "learning_rate": 0.0002, "epoch": 4.919210053859964, "step": 68500}, {"loss": 0.6686, "grad_norm": 0.8086138963699341, "learning_rate": 0.0002, "epoch": 4.919928186714542, "step": 68510}, {"loss": 0.564, "grad_norm": 1.0917785167694092, "learning_rate": 0.0002, "epoch": 4.92064631956912, "step": 68520}, {"loss": 0.6177, "grad_norm": 0.9287897944450378, "learning_rate": 0.0002, "epoch": 4.921364452423698, "step": 68530}, {"loss": 0.6344, "grad_norm": 0.9830158948898315, "learning_rate": 0.0002, "epoch": 4.922082585278276, "step": 68540}, {"loss": 0.6583, "grad_norm": 0.8674678802490234, "learning_rate": 0.0002, "epoch": 4.922800718132855, "step": 68550}, {"loss": 0.6284, "grad_norm": 0.7996176481246948, "learning_rate": 0.0002, "epoch": 4.923518850987433, "step": 68560}, {"loss": 0.6089, "grad_norm": 1.1284033060073853, "learning_rate": 0.0002, "epoch": 4.924236983842011, "step": 68570}, {"loss": 0.6454, "grad_norm": 0.894339919090271, "learning_rate": 0.0002, "epoch": 4.924955116696589, "step": 68580}, {"loss": 0.6231, "grad_norm": 1.1140280961990356, "learning_rate": 0.0002, "epoch": 4.925673249551167, "step": 68590}, {"loss": 0.6318, "grad_norm": 0.9048344492912292, "learning_rate": 0.0002, "epoch": 4.926391382405745, "step": 68600}, {"loss": 0.5963, "grad_norm": 0.9380471706390381, "learning_rate": 0.0002, "epoch": 4.927109515260323, "step": 68610}, {"loss": 0.6384, "grad_norm": 0.8598429560661316, "learning_rate": 0.0002, "epoch": 4.927827648114901, "step": 68620}, {"loss": 0.6486, "grad_norm": 1.0813355445861816, "learning_rate": 0.0002, "epoch": 4.928545780969479, "step": 68630}, {"loss": 0.6367, "grad_norm": 0.979053795337677, "learning_rate": 0.0002, "epoch": 4.929263913824057, "step": 68640}, {"loss": 0.6084, "grad_norm": 0.8194574117660522, "learning_rate": 0.0002, "epoch": 4.929982046678636, "step": 68650}, {"loss": 0.6469, "grad_norm": 0.8593540787696838, "learning_rate": 0.0002, "epoch": 4.930700179533214, "step": 68660}, {"loss": 0.6465, "grad_norm": 1.0134016275405884, "learning_rate": 0.0002, "epoch": 4.931418312387792, "step": 68670}, {"loss": 0.6221, "grad_norm": 1.060586929321289, "learning_rate": 0.0002, "epoch": 4.93213644524237, "step": 68680}, {"loss": 0.5861, "grad_norm": 0.84132319688797, "learning_rate": 0.0002, "epoch": 4.932854578096948, "step": 68690}, {"loss": 0.6206, "grad_norm": 1.0767526626586914, "learning_rate": 0.0002, "epoch": 4.933572710951526, "step": 68700}, {"loss": 0.6294, "grad_norm": 0.8858519792556763, "learning_rate": 0.0002, "epoch": 4.934290843806104, "step": 68710}, {"loss": 0.6727, "grad_norm": 1.194031000137329, "learning_rate": 0.0002, "epoch": 4.935008976660682, "step": 68720}, {"loss": 0.6231, "grad_norm": 0.8270226120948792, "learning_rate": 0.0002, "epoch": 4.93572710951526, "step": 68730}, {"loss": 0.6538, "grad_norm": 1.0385973453521729, "learning_rate": 0.0002, "epoch": 4.936445242369839, "step": 68740}, {"loss": 0.623, "grad_norm": 0.9062243700027466, "learning_rate": 0.0002, "epoch": 4.937163375224417, "step": 68750}, {"loss": 0.6578, "grad_norm": 1.0526955127716064, "learning_rate": 0.0002, "epoch": 4.937881508078995, "step": 68760}, {"loss": 0.6425, "grad_norm": 0.930604100227356, "learning_rate": 0.0002, "epoch": 4.938599640933573, "step": 68770}, {"loss": 0.6228, "grad_norm": 0.9635265469551086, "learning_rate": 0.0002, "epoch": 4.939317773788151, "step": 68780}, {"loss": 0.6269, "grad_norm": 0.9825171232223511, "learning_rate": 0.0002, "epoch": 4.940035906642729, "step": 68790}, {"loss": 0.6063, "grad_norm": 0.9621182680130005, "learning_rate": 0.0002, "epoch": 4.940754039497307, "step": 68800}, {"loss": 0.6558, "grad_norm": 0.9655307531356812, "learning_rate": 0.0002, "epoch": 4.941472172351885, "step": 68810}, {"loss": 0.6441, "grad_norm": 1.2948180437088013, "learning_rate": 0.0002, "epoch": 4.942190305206463, "step": 68820}, {"loss": 0.6757, "grad_norm": 0.9206728339195251, "learning_rate": 0.0002, "epoch": 4.942908438061041, "step": 68830}, {"loss": 0.6554, "grad_norm": 1.0235631465911865, "learning_rate": 0.0002, "epoch": 4.94362657091562, "step": 68840}, {"loss": 0.6386, "grad_norm": 1.0542538166046143, "learning_rate": 0.0002, "epoch": 4.944344703770198, "step": 68850}, {"loss": 0.6359, "grad_norm": 0.9787087440490723, "learning_rate": 0.0002, "epoch": 4.945062836624776, "step": 68860}, {"loss": 0.659, "grad_norm": 0.9527219533920288, "learning_rate": 0.0002, "epoch": 4.945780969479354, "step": 68870}, {"loss": 0.6504, "grad_norm": 1.1525826454162598, "learning_rate": 0.0002, "epoch": 4.946499102333932, "step": 68880}, {"loss": 0.6345, "grad_norm": 0.8610072731971741, "learning_rate": 0.0002, "epoch": 4.94721723518851, "step": 68890}, {"loss": 0.6029, "grad_norm": 1.1403616666793823, "learning_rate": 0.0002, "epoch": 4.947935368043088, "step": 68900}, {"loss": 0.6476, "grad_norm": 1.10334312915802, "learning_rate": 0.0002, "epoch": 4.948653500897666, "step": 68910}, {"loss": 0.6123, "grad_norm": 0.8633760809898376, "learning_rate": 0.0002, "epoch": 4.949371633752245, "step": 68920}, {"loss": 0.6619, "grad_norm": 1.1291080713272095, "learning_rate": 0.0002, "epoch": 4.950089766606823, "step": 68930}, {"loss": 0.6003, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 4.950807899461401, "step": 68940}, {"loss": 0.6126, "grad_norm": 0.9207960963249207, "learning_rate": 0.0002, "epoch": 4.951526032315979, "step": 68950}, {"loss": 0.6031, "grad_norm": 0.9815934300422668, "learning_rate": 0.0002, "epoch": 4.952244165170557, "step": 68960}, {"loss": 0.6201, "grad_norm": 0.9725701808929443, "learning_rate": 0.0002, "epoch": 4.952962298025135, "step": 68970}, {"loss": 0.6251, "grad_norm": 0.844926655292511, "learning_rate": 0.0002, "epoch": 4.953680430879713, "step": 68980}, {"loss": 0.6446, "grad_norm": 0.9898511171340942, "learning_rate": 0.0002, "epoch": 4.954398563734291, "step": 68990}, {"loss": 0.629, "grad_norm": 1.1311410665512085, "learning_rate": 0.0002, "epoch": 4.955116696588869, "step": 69000}, {"loss": 0.6525, "grad_norm": 1.218610405921936, "learning_rate": 0.0002, "epoch": 4.955834829443447, "step": 69010}, {"loss": 0.6639, "grad_norm": 1.1536420583724976, "learning_rate": 0.0002, "epoch": 4.956552962298025, "step": 69020}, {"loss": 0.6375, "grad_norm": 1.1857786178588867, "learning_rate": 0.0002, "epoch": 4.957271095152604, "step": 69030}, {"loss": 0.6618, "grad_norm": 0.9969246983528137, "learning_rate": 0.0002, "epoch": 4.957989228007182, "step": 69040}, {"loss": 0.633, "grad_norm": 1.138635277748108, "learning_rate": 0.0002, "epoch": 4.95870736086176, "step": 69050}, {"loss": 0.6344, "grad_norm": 1.110474705696106, "learning_rate": 0.0002, "epoch": 4.959425493716338, "step": 69060}, {"loss": 0.687, "grad_norm": 1.0366318225860596, "learning_rate": 0.0002, "epoch": 4.960143626570916, "step": 69070}, {"loss": 0.6384, "grad_norm": 0.6927996277809143, "learning_rate": 0.0002, "epoch": 4.960861759425494, "step": 69080}, {"loss": 0.6337, "grad_norm": 1.0368026494979858, "learning_rate": 0.0002, "epoch": 4.961579892280072, "step": 69090}, {"loss": 0.6077, "grad_norm": 1.0638312101364136, "learning_rate": 0.0002, "epoch": 4.96229802513465, "step": 69100}, {"loss": 0.6403, "grad_norm": 1.0372415781021118, "learning_rate": 0.0002, "epoch": 4.9630161579892285, "step": 69110}, {"loss": 0.6347, "grad_norm": 0.8257387280464172, "learning_rate": 0.0002, "epoch": 4.9637342908438065, "step": 69120}, {"loss": 0.6405, "grad_norm": 1.0046974420547485, "learning_rate": 0.0002, "epoch": 4.9644524236983845, "step": 69130}, {"loss": 0.623, "grad_norm": 1.0139652490615845, "learning_rate": 0.0002, "epoch": 4.9651705565529625, "step": 69140}, {"loss": 0.5857, "grad_norm": 1.0214691162109375, "learning_rate": 0.0002, "epoch": 4.9658886894075405, "step": 69150}, {"loss": 0.624, "grad_norm": 1.1042424440383911, "learning_rate": 0.0002, "epoch": 4.9666068222621185, "step": 69160}, {"loss": 0.6475, "grad_norm": 0.8749067783355713, "learning_rate": 0.0002, "epoch": 4.9673249551166965, "step": 69170}, {"loss": 0.6734, "grad_norm": 0.9894024133682251, "learning_rate": 0.0002, "epoch": 4.9680430879712745, "step": 69180}, {"loss": 0.5894, "grad_norm": 1.0218034982681274, "learning_rate": 0.0002, "epoch": 4.9687612208258525, "step": 69190}, {"loss": 0.6423, "grad_norm": 0.9782929420471191, "learning_rate": 0.0002, "epoch": 4.9694793536804305, "step": 69200}, {"loss": 0.6455, "grad_norm": 0.9373409748077393, "learning_rate": 0.0002, "epoch": 4.9701974865350085, "step": 69210}, {"loss": 0.6105, "grad_norm": 1.0329546928405762, "learning_rate": 0.0002, "epoch": 4.970915619389587, "step": 69220}, {"loss": 0.6877, "grad_norm": 0.9746108055114746, "learning_rate": 0.0002, "epoch": 4.971633752244165, "step": 69230}, {"loss": 0.6342, "grad_norm": 0.9202073216438293, "learning_rate": 0.0002, "epoch": 4.972351885098743, "step": 69240}, {"loss": 0.6102, "grad_norm": 1.078032374382019, "learning_rate": 0.0002, "epoch": 4.973070017953321, "step": 69250}, {"loss": 0.6349, "grad_norm": 0.8860024809837341, "learning_rate": 0.0002, "epoch": 4.973788150807899, "step": 69260}, {"loss": 0.5971, "grad_norm": 0.915212094783783, "learning_rate": 0.0002, "epoch": 4.974506283662477, "step": 69270}, {"loss": 0.623, "grad_norm": 1.1192166805267334, "learning_rate": 0.0002, "epoch": 4.975224416517055, "step": 69280}, {"loss": 0.6347, "grad_norm": 0.8387445211410522, "learning_rate": 0.0002, "epoch": 4.975942549371633, "step": 69290}, {"loss": 0.6392, "grad_norm": 1.1210044622421265, "learning_rate": 0.0002, "epoch": 4.976660682226212, "step": 69300}, {"loss": 0.6565, "grad_norm": 1.0051207542419434, "learning_rate": 0.0002, "epoch": 4.97737881508079, "step": 69310}, {"loss": 0.5961, "grad_norm": 0.9248682856559753, "learning_rate": 0.0002, "epoch": 4.978096947935368, "step": 69320}, {"loss": 0.6067, "grad_norm": 0.8265128135681152, "learning_rate": 0.0002, "epoch": 4.978815080789946, "step": 69330}, {"loss": 0.6068, "grad_norm": 0.9432681798934937, "learning_rate": 0.0002, "epoch": 4.979533213644524, "step": 69340}, {"loss": 0.627, "grad_norm": 1.0135977268218994, "learning_rate": 0.0002, "epoch": 4.980251346499102, "step": 69350}, {"loss": 0.5882, "grad_norm": 0.9857245683670044, "learning_rate": 0.0002, "epoch": 4.98096947935368, "step": 69360}, {"loss": 0.6396, "grad_norm": 0.9215952157974243, "learning_rate": 0.0002, "epoch": 4.981687612208258, "step": 69370}, {"loss": 0.565, "grad_norm": 1.1518077850341797, "learning_rate": 0.0002, "epoch": 4.982405745062836, "step": 69380}, {"loss": 0.6022, "grad_norm": 0.8836095929145813, "learning_rate": 0.0002, "epoch": 4.983123877917414, "step": 69390}, {"loss": 0.6442, "grad_norm": 0.8082528710365295, "learning_rate": 0.0002, "epoch": 4.983842010771993, "step": 69400}, {"loss": 0.597, "grad_norm": 0.9295604825019836, "learning_rate": 0.0002, "epoch": 4.984560143626571, "step": 69410}, {"loss": 0.5811, "grad_norm": 1.002057433128357, "learning_rate": 0.0002, "epoch": 4.985278276481149, "step": 69420}, {"loss": 0.6275, "grad_norm": 0.8127216100692749, "learning_rate": 0.0002, "epoch": 4.985996409335727, "step": 69430}, {"loss": 0.6223, "grad_norm": 1.058138370513916, "learning_rate": 0.0002, "epoch": 4.986714542190305, "step": 69440}, {"loss": 0.6317, "grad_norm": 0.8451166749000549, "learning_rate": 0.0002, "epoch": 4.987432675044883, "step": 69450}, {"loss": 0.6135, "grad_norm": 0.9687268137931824, "learning_rate": 0.0002, "epoch": 4.988150807899461, "step": 69460}, {"loss": 0.5926, "grad_norm": 1.0342036485671997, "learning_rate": 0.0002, "epoch": 4.988868940754039, "step": 69470}, {"loss": 0.636, "grad_norm": 0.9042398929595947, "learning_rate": 0.0002, "epoch": 4.989587073608618, "step": 69480}, {"loss": 0.6193, "grad_norm": 1.0575438737869263, "learning_rate": 0.0002, "epoch": 4.990305206463196, "step": 69490}, {"loss": 0.5887, "grad_norm": 0.9364935159683228, "learning_rate": 0.0002, "epoch": 4.991023339317774, "step": 69500}, {"loss": 0.6532, "grad_norm": 1.0327378511428833, "learning_rate": 0.0002, "epoch": 4.991741472172352, "step": 69510}, {"loss": 0.6397, "grad_norm": 0.815592885017395, "learning_rate": 0.0002, "epoch": 4.99245960502693, "step": 69520}, {"loss": 0.6776, "grad_norm": 1.0813369750976562, "learning_rate": 0.0002, "epoch": 4.993177737881508, "step": 69530}, {"loss": 0.6964, "grad_norm": 1.0277023315429688, "learning_rate": 0.0002, "epoch": 4.993895870736086, "step": 69540}, {"loss": 0.6369, "grad_norm": 1.0291162729263306, "learning_rate": 0.0002, "epoch": 4.994614003590664, "step": 69550}, {"loss": 0.5842, "grad_norm": 0.8435685634613037, "learning_rate": 0.0002, "epoch": 4.995332136445242, "step": 69560}, {"loss": 0.6146, "grad_norm": 1.1972291469573975, "learning_rate": 0.0002, "epoch": 4.99605026929982, "step": 69570}, {"loss": 0.5977, "grad_norm": 0.8114907741546631, "learning_rate": 0.0002, "epoch": 4.996768402154398, "step": 69580}, {"loss": 0.6137, "grad_norm": 0.8296133875846863, "learning_rate": 0.0002, "epoch": 4.997486535008977, "step": 69590}, {"loss": 0.6273, "grad_norm": 1.1728706359863281, "learning_rate": 0.0002, "epoch": 4.998204667863555, "step": 69600}, {"loss": 0.6579, "grad_norm": 0.9586578607559204, "learning_rate": 0.0002, "epoch": 4.998922800718133, "step": 69610}, {"loss": 0.612, "grad_norm": 0.9725151062011719, "learning_rate": 0.0002, "epoch": 4.999640933572711, "step": 69620}, {"eval_loss": 1.133581519126892, "eval_runtime": 55.2151, "eval_samples_per_second": 13.275, "eval_steps_per_second": 1.666, "epoch": 5.0, "step": 69625}, {"loss": 0.5741, "grad_norm": 0.9312055706977844, "learning_rate": 0.0002, "epoch": 5.000359066427289, "step": 69630}, {"loss": 0.5625, "grad_norm": 1.0534896850585938, "learning_rate": 0.0002, "epoch": 5.001077199281867, "step": 69640}, {"loss": 0.581, "grad_norm": 0.8891698718070984, "learning_rate": 0.0002, "epoch": 5.001795332136445, "step": 69650}, {"loss": 0.554, "grad_norm": 0.7791097164154053, "learning_rate": 0.0002, "epoch": 5.002513464991023, "step": 69660}, {"loss": 0.5146, "grad_norm": 1.2891173362731934, "learning_rate": 0.0002, "epoch": 5.003231597845601, "step": 69670}, {"loss": 0.551, "grad_norm": 0.7909513711929321, "learning_rate": 0.0002, "epoch": 5.00394973070018, "step": 69680}, {"loss": 0.5671, "grad_norm": 0.988648533821106, "learning_rate": 0.0002, "epoch": 5.004667863554758, "step": 69690}, {"loss": 0.5113, "grad_norm": 0.9669296741485596, "learning_rate": 0.0002, "epoch": 5.005385996409336, "step": 69700}, {"loss": 0.5974, "grad_norm": 1.2393349409103394, "learning_rate": 0.0002, "epoch": 5.006104129263914, "step": 69710}, {"loss": 0.5481, "grad_norm": 1.2420750856399536, "learning_rate": 0.0002, "epoch": 5.006822262118492, "step": 69720}, {"loss": 0.5725, "grad_norm": 1.1698096990585327, "learning_rate": 0.0002, "epoch": 5.00754039497307, "step": 69730}, {"loss": 0.5646, "grad_norm": 1.2228301763534546, "learning_rate": 0.0002, "epoch": 5.008258527827648, "step": 69740}, {"loss": 0.6048, "grad_norm": 0.9350621104240417, "learning_rate": 0.0002, "epoch": 5.008976660682226, "step": 69750}, {"loss": 0.5278, "grad_norm": 0.9828507304191589, "learning_rate": 0.0002, "epoch": 5.009694793536804, "step": 69760}, {"loss": 0.5188, "grad_norm": 0.9372149109840393, "learning_rate": 0.0002, "epoch": 5.010412926391383, "step": 69770}, {"loss": 0.5408, "grad_norm": 0.8098477125167847, "learning_rate": 0.0002, "epoch": 5.011131059245961, "step": 69780}, {"loss": 0.533, "grad_norm": 1.0418338775634766, "learning_rate": 0.0002, "epoch": 5.011849192100539, "step": 69790}, {"loss": 0.5423, "grad_norm": 1.0175801515579224, "learning_rate": 0.0002, "epoch": 5.012567324955117, "step": 69800}, {"loss": 0.5389, "grad_norm": 1.2128081321716309, "learning_rate": 0.0002, "epoch": 5.013285457809695, "step": 69810}, {"loss": 0.5307, "grad_norm": 1.001805067062378, "learning_rate": 0.0002, "epoch": 5.014003590664273, "step": 69820}, {"loss": 0.533, "grad_norm": 0.8957470059394836, "learning_rate": 0.0002, "epoch": 5.014721723518851, "step": 69830}, {"loss": 0.6017, "grad_norm": 0.9344548583030701, "learning_rate": 0.0002, "epoch": 5.015439856373429, "step": 69840}, {"loss": 0.6182, "grad_norm": 0.8545927405357361, "learning_rate": 0.0002, "epoch": 5.016157989228007, "step": 69850}, {"loss": 0.5543, "grad_norm": 1.3907777070999146, "learning_rate": 0.0002, "epoch": 5.016876122082586, "step": 69860}, {"loss": 0.5028, "grad_norm": 0.8112093806266785, "learning_rate": 0.0002, "epoch": 5.017594254937164, "step": 69870}, {"loss": 0.5, "grad_norm": 1.0151532888412476, "learning_rate": 0.0002, "epoch": 5.018312387791742, "step": 69880}, {"loss": 0.5622, "grad_norm": 1.249021053314209, "learning_rate": 0.0002, "epoch": 5.01903052064632, "step": 69890}, {"loss": 0.5419, "grad_norm": 0.9310314059257507, "learning_rate": 0.0002, "epoch": 5.019748653500898, "step": 69900}, {"loss": 0.5628, "grad_norm": 0.9444572925567627, "learning_rate": 0.0002, "epoch": 5.020466786355476, "step": 69910}, {"loss": 0.5436, "grad_norm": 1.0952081680297852, "learning_rate": 0.0002, "epoch": 5.021184919210054, "step": 69920}, {"loss": 0.5532, "grad_norm": 1.2106375694274902, "learning_rate": 0.0002, "epoch": 5.021903052064632, "step": 69930}, {"loss": 0.5307, "grad_norm": 1.0179580450057983, "learning_rate": 0.0002, "epoch": 5.02262118491921, "step": 69940}, {"loss": 0.5537, "grad_norm": 1.0865367650985718, "learning_rate": 0.0002, "epoch": 5.023339317773788, "step": 69950}, {"loss": 0.6011, "grad_norm": 1.0965075492858887, "learning_rate": 0.0002, "epoch": 5.024057450628367, "step": 69960}, {"loss": 0.5255, "grad_norm": 0.8879445791244507, "learning_rate": 0.0002, "epoch": 5.024775583482945, "step": 69970}, {"loss": 0.5681, "grad_norm": 1.2588363885879517, "learning_rate": 0.0002, "epoch": 5.025493716337523, "step": 69980}, {"loss": 0.5288, "grad_norm": 0.935705304145813, "learning_rate": 0.0002, "epoch": 5.026211849192101, "step": 69990}, {"loss": 0.4922, "grad_norm": 1.072012186050415, "learning_rate": 0.0002, "epoch": 5.026929982046679, "step": 70000}, {"loss": 0.5729, "grad_norm": 1.286438226699829, "learning_rate": 0.0002, "epoch": 5.027648114901257, "step": 70010}, {"loss": 0.5569, "grad_norm": 1.1165392398834229, "learning_rate": 0.0002, "epoch": 5.028366247755835, "step": 70020}, {"loss": 0.5348, "grad_norm": 0.7998424172401428, "learning_rate": 0.0002, "epoch": 5.029084380610413, "step": 70030}, {"loss": 0.5436, "grad_norm": 1.5669852495193481, "learning_rate": 0.0002, "epoch": 5.029802513464991, "step": 70040}, {"loss": 0.5595, "grad_norm": 0.9780290722846985, "learning_rate": 0.0002, "epoch": 5.0305206463195695, "step": 70050}, {"loss": 0.5612, "grad_norm": 0.9837628602981567, "learning_rate": 0.0002, "epoch": 5.0312387791741475, "step": 70060}, {"loss": 0.5369, "grad_norm": 0.9558916091918945, "learning_rate": 0.0002, "epoch": 5.0319569120287255, "step": 70070}, {"loss": 0.552, "grad_norm": 0.8893155455589294, "learning_rate": 0.0002, "epoch": 5.0326750448833035, "step": 70080}, {"loss": 0.5684, "grad_norm": 1.1403675079345703, "learning_rate": 0.0002, "epoch": 5.0333931777378815, "step": 70090}, {"loss": 0.5352, "grad_norm": 1.0453649759292603, "learning_rate": 0.0002, "epoch": 5.0341113105924595, "step": 70100}, {"loss": 0.5691, "grad_norm": 0.8127498030662537, "learning_rate": 0.0002, "epoch": 5.0348294434470375, "step": 70110}, {"loss": 0.5254, "grad_norm": 0.9344680309295654, "learning_rate": 0.0002, "epoch": 5.0355475763016155, "step": 70120}, {"loss": 0.5385, "grad_norm": 1.0302079916000366, "learning_rate": 0.0002, "epoch": 5.0362657091561935, "step": 70130}, {"loss": 0.5949, "grad_norm": 1.0549713373184204, "learning_rate": 0.0002, "epoch": 5.036983842010772, "step": 70140}, {"loss": 0.4886, "grad_norm": 0.8916767835617065, "learning_rate": 0.0002, "epoch": 5.03770197486535, "step": 70150}, {"loss": 0.5761, "grad_norm": 0.9799798130989075, "learning_rate": 0.0002, "epoch": 5.038420107719928, "step": 70160}, {"loss": 0.5138, "grad_norm": 1.15560781955719, "learning_rate": 0.0002, "epoch": 5.039138240574506, "step": 70170}, {"loss": 0.6075, "grad_norm": 1.0577017068862915, "learning_rate": 0.0002, "epoch": 5.039856373429084, "step": 70180}, {"loss": 0.5316, "grad_norm": 1.027990698814392, "learning_rate": 0.0002, "epoch": 5.040574506283662, "step": 70190}, {"loss": 0.567, "grad_norm": 1.0818232297897339, "learning_rate": 0.0002, "epoch": 5.04129263913824, "step": 70200}, {"loss": 0.5699, "grad_norm": 1.0287196636199951, "learning_rate": 0.0002, "epoch": 5.042010771992818, "step": 70210}, {"loss": 0.5129, "grad_norm": 1.1569273471832275, "learning_rate": 0.0002, "epoch": 5.042728904847396, "step": 70220}, {"loss": 0.5407, "grad_norm": 1.0485484600067139, "learning_rate": 0.0002, "epoch": 5.0434470377019744, "step": 70230}, {"loss": 0.5203, "grad_norm": 0.9244540333747864, "learning_rate": 0.0002, "epoch": 5.044165170556553, "step": 70240}, {"loss": 0.5277, "grad_norm": 0.9576422572135925, "learning_rate": 0.0002, "epoch": 5.044883303411131, "step": 70250}, {"loss": 0.539, "grad_norm": 0.8719421625137329, "learning_rate": 0.0002, "epoch": 5.045601436265709, "step": 70260}, {"loss": 0.5725, "grad_norm": 0.8685409426689148, "learning_rate": 0.0002, "epoch": 5.046319569120287, "step": 70270}, {"loss": 0.5111, "grad_norm": 1.2735247611999512, "learning_rate": 0.0002, "epoch": 5.047037701974865, "step": 70280}, {"loss": 0.5768, "grad_norm": 0.9082128405570984, "learning_rate": 0.0002, "epoch": 5.047755834829443, "step": 70290}, {"loss": 0.5649, "grad_norm": 1.0626471042633057, "learning_rate": 0.0002, "epoch": 5.048473967684021, "step": 70300}, {"loss": 0.5694, "grad_norm": 1.1463991403579712, "learning_rate": 0.0002, "epoch": 5.049192100538599, "step": 70310}, {"loss": 0.5912, "grad_norm": 0.8825355172157288, "learning_rate": 0.0002, "epoch": 5.049910233393177, "step": 70320}, {"loss": 0.5814, "grad_norm": 1.0549408197402954, "learning_rate": 0.0002, "epoch": 5.050628366247756, "step": 70330}, {"loss": 0.5658, "grad_norm": 1.3740944862365723, "learning_rate": 0.0002, "epoch": 5.051346499102334, "step": 70340}, {"loss": 0.5665, "grad_norm": 1.4197895526885986, "learning_rate": 0.0002, "epoch": 5.052064631956912, "step": 70350}, {"loss": 0.5852, "grad_norm": 1.1764925718307495, "learning_rate": 0.0002, "epoch": 5.05278276481149, "step": 70360}, {"loss": 0.5551, "grad_norm": 1.0443403720855713, "learning_rate": 0.0002, "epoch": 5.053500897666068, "step": 70370}, {"loss": 0.5647, "grad_norm": 1.1807527542114258, "learning_rate": 0.0002, "epoch": 5.054219030520646, "step": 70380}, {"loss": 0.5712, "grad_norm": 1.4032433032989502, "learning_rate": 0.0002, "epoch": 5.054937163375224, "step": 70390}, {"loss": 0.5656, "grad_norm": 0.9815662503242493, "learning_rate": 0.0002, "epoch": 5.055655296229802, "step": 70400}, {"loss": 0.5878, "grad_norm": 0.9368446469306946, "learning_rate": 0.0002, "epoch": 5.05637342908438, "step": 70410}, {"loss": 0.5639, "grad_norm": 1.1156736612319946, "learning_rate": 0.0002, "epoch": 5.057091561938959, "step": 70420}, {"loss": 0.5564, "grad_norm": 1.01651132106781, "learning_rate": 0.0002, "epoch": 5.057809694793537, "step": 70430}, {"loss": 0.5276, "grad_norm": 0.9906342029571533, "learning_rate": 0.0002, "epoch": 5.058527827648115, "step": 70440}, {"loss": 0.5533, "grad_norm": 0.8666667938232422, "learning_rate": 0.0002, "epoch": 5.059245960502693, "step": 70450}, {"loss": 0.5253, "grad_norm": 1.0508924722671509, "learning_rate": 0.0002, "epoch": 5.059964093357271, "step": 70460}, {"loss": 0.5456, "grad_norm": 1.2472858428955078, "learning_rate": 0.0002, "epoch": 5.060682226211849, "step": 70470}, {"loss": 0.5836, "grad_norm": 1.019073724746704, "learning_rate": 0.0002, "epoch": 5.061400359066427, "step": 70480}, {"loss": 0.5206, "grad_norm": 0.9745403528213501, "learning_rate": 0.0002, "epoch": 5.062118491921005, "step": 70490}, {"loss": 0.5543, "grad_norm": 1.121208906173706, "learning_rate": 0.0002, "epoch": 5.062836624775583, "step": 70500}, {"loss": 0.54, "grad_norm": 1.0535147190093994, "learning_rate": 0.0002, "epoch": 5.063554757630161, "step": 70510}, {"loss": 0.5601, "grad_norm": 1.0368950366973877, "learning_rate": 0.0002, "epoch": 5.06427289048474, "step": 70520}, {"loss": 0.5495, "grad_norm": 0.948964536190033, "learning_rate": 0.0002, "epoch": 5.064991023339318, "step": 70530}, {"loss": 0.5254, "grad_norm": 1.0289826393127441, "learning_rate": 0.0002, "epoch": 5.065709156193896, "step": 70540}, {"loss": 0.591, "grad_norm": 1.118374228477478, "learning_rate": 0.0002, "epoch": 5.066427289048474, "step": 70550}, {"loss": 0.5874, "grad_norm": 0.8712816834449768, "learning_rate": 0.0002, "epoch": 5.067145421903052, "step": 70560}, {"loss": 0.557, "grad_norm": 0.9057969450950623, "learning_rate": 0.0002, "epoch": 5.06786355475763, "step": 70570}, {"loss": 0.5606, "grad_norm": 0.9292685985565186, "learning_rate": 0.0002, "epoch": 5.068581687612208, "step": 70580}, {"loss": 0.5468, "grad_norm": 0.9159911274909973, "learning_rate": 0.0002, "epoch": 5.069299820466786, "step": 70590}, {"loss": 0.5608, "grad_norm": 0.973848819732666, "learning_rate": 0.0002, "epoch": 5.070017953321364, "step": 70600}, {"loss": 0.5199, "grad_norm": 0.7892279028892517, "learning_rate": 0.0002, "epoch": 5.070736086175943, "step": 70610}, {"loss": 0.6009, "grad_norm": 0.9943311214447021, "learning_rate": 0.0002, "epoch": 5.071454219030521, "step": 70620}, {"loss": 0.5224, "grad_norm": 1.1457926034927368, "learning_rate": 0.0002, "epoch": 5.072172351885099, "step": 70630}, {"loss": 0.5821, "grad_norm": 0.9307738542556763, "learning_rate": 0.0002, "epoch": 5.072890484739677, "step": 70640}, {"loss": 0.5375, "grad_norm": 1.0899816751480103, "learning_rate": 0.0002, "epoch": 5.073608617594255, "step": 70650}, {"loss": 0.5407, "grad_norm": 0.8357672691345215, "learning_rate": 0.0002, "epoch": 5.074326750448833, "step": 70660}, {"loss": 0.5745, "grad_norm": 0.8889468312263489, "learning_rate": 0.0002, "epoch": 5.075044883303411, "step": 70670}, {"loss": 0.5595, "grad_norm": 0.9152118563652039, "learning_rate": 0.0002, "epoch": 5.075763016157989, "step": 70680}, {"loss": 0.5706, "grad_norm": 1.106160044670105, "learning_rate": 0.0002, "epoch": 5.076481149012567, "step": 70690}, {"loss": 0.5659, "grad_norm": 0.8519207835197449, "learning_rate": 0.0002, "epoch": 5.077199281867145, "step": 70700}, {"loss": 0.5312, "grad_norm": 0.9754986763000488, "learning_rate": 0.0002, "epoch": 5.077917414721724, "step": 70710}, {"loss": 0.5602, "grad_norm": 1.167883276939392, "learning_rate": 0.0002, "epoch": 5.078635547576302, "step": 70720}, {"loss": 0.5427, "grad_norm": 0.987622082233429, "learning_rate": 0.0002, "epoch": 5.07935368043088, "step": 70730}, {"loss": 0.5346, "grad_norm": 1.0008184909820557, "learning_rate": 0.0002, "epoch": 5.080071813285458, "step": 70740}, {"loss": 0.5219, "grad_norm": 0.6318819522857666, "learning_rate": 0.0002, "epoch": 5.080789946140036, "step": 70750}, {"loss": 0.5838, "grad_norm": 0.984886884689331, "learning_rate": 0.0002, "epoch": 5.081508078994614, "step": 70760}, {"loss": 0.5775, "grad_norm": 1.0583622455596924, "learning_rate": 0.0002, "epoch": 5.082226211849192, "step": 70770}, {"loss": 0.579, "grad_norm": 0.9730119705200195, "learning_rate": 0.0002, "epoch": 5.08294434470377, "step": 70780}, {"loss": 0.5806, "grad_norm": 1.0201330184936523, "learning_rate": 0.0002, "epoch": 5.083662477558348, "step": 70790}, {"loss": 0.5568, "grad_norm": 1.0479248762130737, "learning_rate": 0.0002, "epoch": 5.084380610412927, "step": 70800}, {"loss": 0.5619, "grad_norm": 0.9185113906860352, "learning_rate": 0.0002, "epoch": 5.085098743267505, "step": 70810}, {"loss": 0.5468, "grad_norm": 0.9326799511909485, "learning_rate": 0.0002, "epoch": 5.085816876122083, "step": 70820}, {"loss": 0.5424, "grad_norm": 0.958739697933197, "learning_rate": 0.0002, "epoch": 5.086535008976661, "step": 70830}, {"loss": 0.6098, "grad_norm": 0.9643770456314087, "learning_rate": 0.0002, "epoch": 5.087253141831239, "step": 70840}, {"loss": 0.5427, "grad_norm": 0.8650234341621399, "learning_rate": 0.0002, "epoch": 5.087971274685817, "step": 70850}, {"loss": 0.5452, "grad_norm": 0.9354105591773987, "learning_rate": 0.0002, "epoch": 5.088689407540395, "step": 70860}, {"loss": 0.5467, "grad_norm": 0.8736345171928406, "learning_rate": 0.0002, "epoch": 5.089407540394973, "step": 70870}, {"loss": 0.5607, "grad_norm": 0.9172632098197937, "learning_rate": 0.0002, "epoch": 5.090125673249551, "step": 70880}, {"loss": 0.5136, "grad_norm": 0.9495565295219421, "learning_rate": 0.0002, "epoch": 5.09084380610413, "step": 70890}, {"loss": 0.5633, "grad_norm": 1.0328829288482666, "learning_rate": 0.0002, "epoch": 5.091561938958708, "step": 70900}, {"loss": 0.566, "grad_norm": 0.9335703253746033, "learning_rate": 0.0002, "epoch": 5.092280071813286, "step": 70910}, {"loss": 0.5393, "grad_norm": 1.0919437408447266, "learning_rate": 0.0002, "epoch": 5.092998204667864, "step": 70920}, {"loss": 0.5931, "grad_norm": 1.03340744972229, "learning_rate": 0.0002, "epoch": 5.093716337522442, "step": 70930}, {"loss": 0.5228, "grad_norm": 1.0501604080200195, "learning_rate": 0.0002, "epoch": 5.09443447037702, "step": 70940}, {"loss": 0.5518, "grad_norm": 0.9442012310028076, "learning_rate": 0.0002, "epoch": 5.095152603231598, "step": 70950}, {"loss": 0.5185, "grad_norm": 1.2592464685440063, "learning_rate": 0.0002, "epoch": 5.095870736086176, "step": 70960}, {"loss": 0.5524, "grad_norm": 1.0961427688598633, "learning_rate": 0.0002, "epoch": 5.096588868940754, "step": 70970}, {"loss": 0.5702, "grad_norm": 1.0472424030303955, "learning_rate": 0.0002, "epoch": 5.097307001795333, "step": 70980}, {"loss": 0.5697, "grad_norm": 0.9489352107048035, "learning_rate": 0.0002, "epoch": 5.098025134649911, "step": 70990}, {"loss": 0.5559, "grad_norm": 1.0499446392059326, "learning_rate": 0.0002, "epoch": 5.098743267504489, "step": 71000}, {"loss": 0.5815, "grad_norm": 1.013005018234253, "learning_rate": 0.0002, "epoch": 5.099461400359067, "step": 71010}, {"loss": 0.5524, "grad_norm": 0.9594261050224304, "learning_rate": 0.0002, "epoch": 5.100179533213645, "step": 71020}, {"loss": 0.5746, "grad_norm": 1.2016123533248901, "learning_rate": 0.0002, "epoch": 5.100897666068223, "step": 71030}, {"loss": 0.5605, "grad_norm": 1.0389765501022339, "learning_rate": 0.0002, "epoch": 5.101615798922801, "step": 71040}, {"loss": 0.5036, "grad_norm": 1.053534746170044, "learning_rate": 0.0002, "epoch": 5.102333931777379, "step": 71050}, {"loss": 0.5764, "grad_norm": 1.1379448175430298, "learning_rate": 0.0002, "epoch": 5.103052064631957, "step": 71060}, {"loss": 0.5487, "grad_norm": 0.8796491622924805, "learning_rate": 0.0002, "epoch": 5.103770197486535, "step": 71070}, {"loss": 0.59, "grad_norm": 1.0591254234313965, "learning_rate": 0.0002, "epoch": 5.1044883303411135, "step": 71080}, {"loss": 0.5591, "grad_norm": 0.9622171521186829, "learning_rate": 0.0002, "epoch": 5.1052064631956915, "step": 71090}, {"loss": 0.5737, "grad_norm": 0.9173060059547424, "learning_rate": 0.0002, "epoch": 5.1059245960502695, "step": 71100}, {"loss": 0.5794, "grad_norm": 0.8363444805145264, "learning_rate": 0.0002, "epoch": 5.1066427289048475, "step": 71110}, {"loss": 0.5689, "grad_norm": 1.1006172895431519, "learning_rate": 0.0002, "epoch": 5.1073608617594255, "step": 71120}, {"loss": 0.5753, "grad_norm": 1.0720574855804443, "learning_rate": 0.0002, "epoch": 5.1080789946140035, "step": 71130}, {"loss": 0.5585, "grad_norm": 1.0560680627822876, "learning_rate": 0.0002, "epoch": 5.1087971274685815, "step": 71140}, {"loss": 0.5535, "grad_norm": 0.8485415577888489, "learning_rate": 0.0002, "epoch": 5.1095152603231595, "step": 71150}, {"loss": 0.545, "grad_norm": 1.109383225440979, "learning_rate": 0.0002, "epoch": 5.1102333931777375, "step": 71160}, {"loss": 0.568, "grad_norm": 0.9296035766601562, "learning_rate": 0.0002, "epoch": 5.110951526032316, "step": 71170}, {"loss": 0.5151, "grad_norm": 1.2855182886123657, "learning_rate": 0.0002, "epoch": 5.111669658886894, "step": 71180}, {"loss": 0.5578, "grad_norm": 1.0313524007797241, "learning_rate": 0.0002, "epoch": 5.112387791741472, "step": 71190}, {"loss": 0.5486, "grad_norm": 1.0436697006225586, "learning_rate": 0.0002, "epoch": 5.11310592459605, "step": 71200}, {"loss": 0.5592, "grad_norm": 0.901333212852478, "learning_rate": 0.0002, "epoch": 5.113824057450628, "step": 71210}, {"loss": 0.5644, "grad_norm": 1.2170051336288452, "learning_rate": 0.0002, "epoch": 5.114542190305206, "step": 71220}, {"loss": 0.5508, "grad_norm": 0.8850961327552795, "learning_rate": 0.0002, "epoch": 5.115260323159784, "step": 71230}, {"loss": 0.5814, "grad_norm": 1.0147113800048828, "learning_rate": 0.0002, "epoch": 5.115978456014362, "step": 71240}, {"loss": 0.5824, "grad_norm": 1.0043506622314453, "learning_rate": 0.0002, "epoch": 5.11669658886894, "step": 71250}, {"loss": 0.5363, "grad_norm": 0.9887113571166992, "learning_rate": 0.0002, "epoch": 5.117414721723518, "step": 71260}, {"loss": 0.5956, "grad_norm": 1.1013392210006714, "learning_rate": 0.0002, "epoch": 5.118132854578097, "step": 71270}, {"loss": 0.5596, "grad_norm": 0.9213799238204956, "learning_rate": 0.0002, "epoch": 5.118850987432675, "step": 71280}, {"loss": 0.5473, "grad_norm": 1.047400712966919, "learning_rate": 0.0002, "epoch": 5.119569120287253, "step": 71290}, {"loss": 0.5866, "grad_norm": 1.030534029006958, "learning_rate": 0.0002, "epoch": 5.120287253141831, "step": 71300}, {"loss": 0.5713, "grad_norm": 0.9464976191520691, "learning_rate": 0.0002, "epoch": 5.121005385996409, "step": 71310}, {"loss": 0.5707, "grad_norm": 0.8610315918922424, "learning_rate": 0.0002, "epoch": 5.121723518850987, "step": 71320}, {"loss": 0.5498, "grad_norm": 1.0824426412582397, "learning_rate": 0.0002, "epoch": 5.122441651705565, "step": 71330}, {"loss": 0.5802, "grad_norm": 0.9382733106613159, "learning_rate": 0.0002, "epoch": 5.123159784560143, "step": 71340}, {"loss": 0.5899, "grad_norm": 0.9364684224128723, "learning_rate": 0.0002, "epoch": 5.123877917414721, "step": 71350}, {"loss": 0.5839, "grad_norm": 0.9583013653755188, "learning_rate": 0.0002, "epoch": 5.1245960502693, "step": 71360}, {"loss": 0.5446, "grad_norm": 1.287533164024353, "learning_rate": 0.0002, "epoch": 5.125314183123878, "step": 71370}, {"loss": 0.5602, "grad_norm": 1.5031169652938843, "learning_rate": 0.0002, "epoch": 5.126032315978456, "step": 71380}, {"loss": 0.5143, "grad_norm": 0.9891406297683716, "learning_rate": 0.0002, "epoch": 5.126750448833034, "step": 71390}, {"loss": 0.5408, "grad_norm": 1.1851537227630615, "learning_rate": 0.0002, "epoch": 5.127468581687612, "step": 71400}, {"loss": 0.586, "grad_norm": 0.9869971871376038, "learning_rate": 0.0002, "epoch": 5.12818671454219, "step": 71410}, {"loss": 0.575, "grad_norm": 0.961662769317627, "learning_rate": 0.0002, "epoch": 5.128904847396768, "step": 71420}, {"loss": 0.5686, "grad_norm": 1.1036419868469238, "learning_rate": 0.0002, "epoch": 5.129622980251346, "step": 71430}, {"loss": 0.5642, "grad_norm": 1.175361156463623, "learning_rate": 0.0002, "epoch": 5.130341113105924, "step": 71440}, {"loss": 0.5294, "grad_norm": 0.9801875948905945, "learning_rate": 0.0002, "epoch": 5.131059245960503, "step": 71450}, {"loss": 0.5123, "grad_norm": 0.9424611330032349, "learning_rate": 0.0002, "epoch": 5.131777378815081, "step": 71460}, {"loss": 0.651, "grad_norm": 1.11662757396698, "learning_rate": 0.0002, "epoch": 5.132495511669659, "step": 71470}, {"loss": 0.5498, "grad_norm": 0.9969366192817688, "learning_rate": 0.0002, "epoch": 5.133213644524237, "step": 71480}, {"loss": 0.5315, "grad_norm": 1.278640866279602, "learning_rate": 0.0002, "epoch": 5.133931777378815, "step": 71490}, {"loss": 0.5525, "grad_norm": 1.1090457439422607, "learning_rate": 0.0002, "epoch": 5.134649910233393, "step": 71500}, {"loss": 0.5307, "grad_norm": 1.01808500289917, "learning_rate": 0.0002, "epoch": 5.135368043087971, "step": 71510}, {"loss": 0.5465, "grad_norm": 1.029135823249817, "learning_rate": 0.0002, "epoch": 5.136086175942549, "step": 71520}, {"loss": 0.588, "grad_norm": 1.1207175254821777, "learning_rate": 0.0002, "epoch": 5.136804308797127, "step": 71530}, {"loss": 0.5451, "grad_norm": 1.0327218770980835, "learning_rate": 0.0002, "epoch": 5.137522441651706, "step": 71540}, {"loss": 0.5944, "grad_norm": 1.042490839958191, "learning_rate": 0.0002, "epoch": 5.138240574506284, "step": 71550}, {"loss": 0.5777, "grad_norm": 1.1800413131713867, "learning_rate": 0.0002, "epoch": 5.138958707360862, "step": 71560}, {"loss": 0.6002, "grad_norm": 1.0748766660690308, "learning_rate": 0.0002, "epoch": 5.13967684021544, "step": 71570}, {"loss": 0.5418, "grad_norm": 0.9983090758323669, "learning_rate": 0.0002, "epoch": 5.140394973070018, "step": 71580}, {"loss": 0.5423, "grad_norm": 1.30636727809906, "learning_rate": 0.0002, "epoch": 5.141113105924596, "step": 71590}, {"loss": 0.5742, "grad_norm": 0.9960222840309143, "learning_rate": 0.0002, "epoch": 5.141831238779174, "step": 71600}, {"loss": 0.5496, "grad_norm": 1.237027645111084, "learning_rate": 0.0002, "epoch": 5.142549371633752, "step": 71610}, {"loss": 0.564, "grad_norm": 1.0913307666778564, "learning_rate": 0.0002, "epoch": 5.14326750448833, "step": 71620}, {"loss": 0.5458, "grad_norm": 0.940657913684845, "learning_rate": 0.0002, "epoch": 5.143985637342908, "step": 71630}, {"loss": 0.5918, "grad_norm": 1.093796730041504, "learning_rate": 0.0002, "epoch": 5.144703770197487, "step": 71640}, {"loss": 0.5519, "grad_norm": 0.9703856110572815, "learning_rate": 0.0002, "epoch": 5.145421903052065, "step": 71650}, {"loss": 0.5859, "grad_norm": 0.9874776005744934, "learning_rate": 0.0002, "epoch": 5.146140035906643, "step": 71660}, {"loss": 0.555, "grad_norm": 0.9723859429359436, "learning_rate": 0.0002, "epoch": 5.146858168761221, "step": 71670}, {"loss": 0.5866, "grad_norm": 0.997107207775116, "learning_rate": 0.0002, "epoch": 5.147576301615799, "step": 71680}, {"loss": 0.5399, "grad_norm": 1.0261175632476807, "learning_rate": 0.0002, "epoch": 5.148294434470377, "step": 71690}, {"loss": 0.5427, "grad_norm": 0.9093905687332153, "learning_rate": 0.0002, "epoch": 5.149012567324955, "step": 71700}, {"loss": 0.557, "grad_norm": 0.9909888505935669, "learning_rate": 0.0002, "epoch": 5.149730700179533, "step": 71710}, {"loss": 0.5343, "grad_norm": 0.9111971259117126, "learning_rate": 0.0002, "epoch": 5.150448833034111, "step": 71720}, {"loss": 0.5717, "grad_norm": 0.9319643974304199, "learning_rate": 0.0002, "epoch": 5.15116696588869, "step": 71730}, {"loss": 0.5676, "grad_norm": 1.0744104385375977, "learning_rate": 0.0002, "epoch": 5.151885098743268, "step": 71740}, {"loss": 0.5914, "grad_norm": 1.1555477380752563, "learning_rate": 0.0002, "epoch": 5.152603231597846, "step": 71750}, {"loss": 0.5859, "grad_norm": 0.9809171557426453, "learning_rate": 0.0002, "epoch": 5.153321364452424, "step": 71760}, {"loss": 0.5663, "grad_norm": 0.7937686443328857, "learning_rate": 0.0002, "epoch": 5.154039497307002, "step": 71770}, {"loss": 0.5637, "grad_norm": 1.1925430297851562, "learning_rate": 0.0002, "epoch": 5.15475763016158, "step": 71780}, {"loss": 0.5759, "grad_norm": 1.077412486076355, "learning_rate": 0.0002, "epoch": 5.155475763016158, "step": 71790}, {"loss": 0.5653, "grad_norm": 0.7992808222770691, "learning_rate": 0.0002, "epoch": 5.156193895870736, "step": 71800}, {"loss": 0.5596, "grad_norm": 1.0938535928726196, "learning_rate": 0.0002, "epoch": 5.156912028725314, "step": 71810}, {"loss": 0.5562, "grad_norm": 0.9458112120628357, "learning_rate": 0.0002, "epoch": 5.157630161579892, "step": 71820}, {"loss": 0.5514, "grad_norm": 0.984940230846405, "learning_rate": 0.0002, "epoch": 5.158348294434471, "step": 71830}, {"loss": 0.5262, "grad_norm": 0.9242565035820007, "learning_rate": 0.0002, "epoch": 5.159066427289049, "step": 71840}, {"loss": 0.5591, "grad_norm": 0.8386720418930054, "learning_rate": 0.0002, "epoch": 5.159784560143627, "step": 71850}, {"loss": 0.5871, "grad_norm": 0.9627357721328735, "learning_rate": 0.0002, "epoch": 5.160502692998205, "step": 71860}, {"loss": 0.6063, "grad_norm": 1.0118762254714966, "learning_rate": 0.0002, "epoch": 5.161220825852783, "step": 71870}, {"loss": 0.5558, "grad_norm": 1.1552608013153076, "learning_rate": 0.0002, "epoch": 5.161938958707361, "step": 71880}, {"loss": 0.5789, "grad_norm": 1.0910389423370361, "learning_rate": 0.0002, "epoch": 5.162657091561939, "step": 71890}, {"loss": 0.5568, "grad_norm": 1.046639084815979, "learning_rate": 0.0002, "epoch": 5.163375224416517, "step": 71900}, {"loss": 0.5646, "grad_norm": 1.0087649822235107, "learning_rate": 0.0002, "epoch": 5.164093357271095, "step": 71910}, {"loss": 0.5663, "grad_norm": 0.9418644309043884, "learning_rate": 0.0002, "epoch": 5.164811490125674, "step": 71920}, {"loss": 0.5668, "grad_norm": 1.1213915348052979, "learning_rate": 0.0002, "epoch": 5.165529622980252, "step": 71930}, {"loss": 0.5979, "grad_norm": 1.043786644935608, "learning_rate": 0.0002, "epoch": 5.16624775583483, "step": 71940}, {"loss": 0.5714, "grad_norm": 1.2150449752807617, "learning_rate": 0.0002, "epoch": 5.166965888689408, "step": 71950}, {"loss": 0.5766, "grad_norm": 1.1214520931243896, "learning_rate": 0.0002, "epoch": 5.167684021543986, "step": 71960}, {"loss": 0.5851, "grad_norm": 0.9235218167304993, "learning_rate": 0.0002, "epoch": 5.168402154398564, "step": 71970}, {"loss": 0.5917, "grad_norm": 0.8736480474472046, "learning_rate": 0.0002, "epoch": 5.169120287253142, "step": 71980}, {"loss": 0.5508, "grad_norm": 0.8723195195198059, "learning_rate": 0.0002, "epoch": 5.16983842010772, "step": 71990}, {"loss": 0.5927, "grad_norm": 1.0873022079467773, "learning_rate": 0.0002, "epoch": 5.170556552962298, "step": 72000}, {"loss": 0.5507, "grad_norm": 0.9196295142173767, "learning_rate": 0.0002, "epoch": 5.1712746858168765, "step": 72010}, {"loss": 0.5416, "grad_norm": 0.9244471192359924, "learning_rate": 0.0002, "epoch": 5.1719928186714546, "step": 72020}, {"loss": 0.5626, "grad_norm": 1.0555505752563477, "learning_rate": 0.0002, "epoch": 5.1727109515260326, "step": 72030}, {"loss": 0.6181, "grad_norm": 1.1527929306030273, "learning_rate": 0.0002, "epoch": 5.1734290843806106, "step": 72040}, {"loss": 0.6129, "grad_norm": 0.9069058895111084, "learning_rate": 0.0002, "epoch": 5.174147217235189, "step": 72050}, {"loss": 0.5597, "grad_norm": 1.1047141551971436, "learning_rate": 0.0002, "epoch": 5.174865350089767, "step": 72060}, {"loss": 0.5307, "grad_norm": 0.9805511832237244, "learning_rate": 0.0002, "epoch": 5.175583482944345, "step": 72070}, {"loss": 0.5672, "grad_norm": 1.1636970043182373, "learning_rate": 0.0002, "epoch": 5.176301615798923, "step": 72080}, {"loss": 0.6424, "grad_norm": 1.0193538665771484, "learning_rate": 0.0002, "epoch": 5.177019748653501, "step": 72090}, {"loss": 0.5722, "grad_norm": 0.8850618600845337, "learning_rate": 0.0002, "epoch": 5.177737881508079, "step": 72100}, {"loss": 0.5938, "grad_norm": 1.042271614074707, "learning_rate": 0.0002, "epoch": 5.1784560143626575, "step": 72110}, {"loss": 0.569, "grad_norm": 1.1405227184295654, "learning_rate": 0.0002, "epoch": 5.1791741472172355, "step": 72120}, {"loss": 0.5762, "grad_norm": 1.0013195276260376, "learning_rate": 0.0002, "epoch": 5.1798922800718135, "step": 72130}, {"loss": 0.5948, "grad_norm": 1.0474903583526611, "learning_rate": 0.0002, "epoch": 5.1806104129263915, "step": 72140}, {"loss": 0.5692, "grad_norm": 1.0384612083435059, "learning_rate": 0.0002, "epoch": 5.1813285457809695, "step": 72150}, {"loss": 0.5588, "grad_norm": 1.145086646080017, "learning_rate": 0.0002, "epoch": 5.1820466786355475, "step": 72160}, {"loss": 0.5294, "grad_norm": 1.0845173597335815, "learning_rate": 0.0002, "epoch": 5.1827648114901255, "step": 72170}, {"loss": 0.5796, "grad_norm": 0.9870346188545227, "learning_rate": 0.0002, "epoch": 5.1834829443447035, "step": 72180}, {"loss": 0.5844, "grad_norm": 1.1098768711090088, "learning_rate": 0.0002, "epoch": 5.1842010771992815, "step": 72190}, {"loss": 0.5536, "grad_norm": 0.9397785067558289, "learning_rate": 0.0002, "epoch": 5.18491921005386, "step": 72200}, {"loss": 0.5847, "grad_norm": 1.0817532539367676, "learning_rate": 0.0002, "epoch": 5.185637342908438, "step": 72210}, {"loss": 0.5492, "grad_norm": 1.0027309656143188, "learning_rate": 0.0002, "epoch": 5.186355475763016, "step": 72220}, {"loss": 0.5685, "grad_norm": 0.8262016773223877, "learning_rate": 0.0002, "epoch": 5.187073608617594, "step": 72230}, {"loss": 0.53, "grad_norm": 0.9968137741088867, "learning_rate": 0.0002, "epoch": 5.187791741472172, "step": 72240}, {"loss": 0.5663, "grad_norm": 0.9072695970535278, "learning_rate": 0.0002, "epoch": 5.18850987432675, "step": 72250}, {"loss": 0.5799, "grad_norm": 1.0388357639312744, "learning_rate": 0.0002, "epoch": 5.189228007181328, "step": 72260}, {"loss": 0.5805, "grad_norm": 0.8883537650108337, "learning_rate": 0.0002, "epoch": 5.189946140035906, "step": 72270}, {"loss": 0.5723, "grad_norm": 1.0161921977996826, "learning_rate": 0.0002, "epoch": 5.190664272890484, "step": 72280}, {"loss": 0.5805, "grad_norm": 0.964936375617981, "learning_rate": 0.0002, "epoch": 5.191382405745063, "step": 72290}, {"loss": 0.5145, "grad_norm": 0.9728496670722961, "learning_rate": 0.0002, "epoch": 5.192100538599641, "step": 72300}, {"loss": 0.552, "grad_norm": 1.2411649227142334, "learning_rate": 0.0002, "epoch": 5.192818671454219, "step": 72310}, {"loss": 0.5482, "grad_norm": 0.9430946111679077, "learning_rate": 0.0002, "epoch": 5.193536804308797, "step": 72320}, {"loss": 0.5007, "grad_norm": 1.1522886753082275, "learning_rate": 0.0002, "epoch": 5.194254937163375, "step": 72330}, {"loss": 0.5013, "grad_norm": 1.0727189779281616, "learning_rate": 0.0002, "epoch": 5.194973070017953, "step": 72340}, {"loss": 0.5157, "grad_norm": 1.2506077289581299, "learning_rate": 0.0002, "epoch": 5.195691202872531, "step": 72350}, {"loss": 0.592, "grad_norm": 1.0949938297271729, "learning_rate": 0.0002, "epoch": 5.196409335727109, "step": 72360}, {"loss": 0.5642, "grad_norm": 1.191125750541687, "learning_rate": 0.0002, "epoch": 5.197127468581687, "step": 72370}, {"loss": 0.5756, "grad_norm": 1.1154223680496216, "learning_rate": 0.0002, "epoch": 5.197845601436265, "step": 72380}, {"loss": 0.5996, "grad_norm": 0.9623886942863464, "learning_rate": 0.0002, "epoch": 5.198563734290844, "step": 72390}, {"loss": 0.5579, "grad_norm": 0.9432680010795593, "learning_rate": 0.0002, "epoch": 5.199281867145422, "step": 72400}, {"loss": 0.6055, "grad_norm": 1.035905122756958, "learning_rate": 0.0002, "epoch": 5.2, "step": 72410}, {"loss": 0.5515, "grad_norm": 0.9044913053512573, "learning_rate": 0.0002, "epoch": 5.200718132854578, "step": 72420}, {"loss": 0.5845, "grad_norm": 1.082187533378601, "learning_rate": 0.0002, "epoch": 5.201436265709156, "step": 72430}, {"loss": 0.6215, "grad_norm": 0.9368400573730469, "learning_rate": 0.0002, "epoch": 5.202154398563734, "step": 72440}, {"loss": 0.5903, "grad_norm": 1.1515194177627563, "learning_rate": 0.0002, "epoch": 5.202872531418312, "step": 72450}, {"loss": 0.5698, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 5.20359066427289, "step": 72460}, {"loss": 0.5534, "grad_norm": 1.0885688066482544, "learning_rate": 0.0002, "epoch": 5.204308797127468, "step": 72470}, {"loss": 0.5459, "grad_norm": 0.8189428448677063, "learning_rate": 0.0002, "epoch": 5.205026929982047, "step": 72480}, {"loss": 0.5981, "grad_norm": 1.0145429372787476, "learning_rate": 0.0002, "epoch": 5.205745062836625, "step": 72490}, {"loss": 0.5451, "grad_norm": 1.132490634918213, "learning_rate": 0.0002, "epoch": 5.206463195691203, "step": 72500}, {"loss": 0.5566, "grad_norm": 0.8866808414459229, "learning_rate": 0.0002, "epoch": 5.207181328545781, "step": 72510}, {"loss": 0.5469, "grad_norm": 0.9681518077850342, "learning_rate": 0.0002, "epoch": 5.207899461400359, "step": 72520}, {"loss": 0.5716, "grad_norm": 0.9992330074310303, "learning_rate": 0.0002, "epoch": 5.208617594254937, "step": 72530}, {"loss": 0.5894, "grad_norm": 1.0767436027526855, "learning_rate": 0.0002, "epoch": 5.209335727109515, "step": 72540}, {"loss": 0.5828, "grad_norm": 1.1362388134002686, "learning_rate": 0.0002, "epoch": 5.210053859964093, "step": 72550}, {"loss": 0.6156, "grad_norm": 0.9741758704185486, "learning_rate": 0.0002, "epoch": 5.210771992818671, "step": 72560}, {"loss": 0.6119, "grad_norm": 0.8216298818588257, "learning_rate": 0.0002, "epoch": 5.211490125673249, "step": 72570}, {"loss": 0.5813, "grad_norm": 0.7500724792480469, "learning_rate": 0.0002, "epoch": 5.212208258527828, "step": 72580}, {"loss": 0.5427, "grad_norm": 0.9152594804763794, "learning_rate": 0.0002, "epoch": 5.212926391382406, "step": 72590}, {"loss": 0.5792, "grad_norm": 1.014940857887268, "learning_rate": 0.0002, "epoch": 5.213644524236984, "step": 72600}, {"loss": 0.5487, "grad_norm": 0.9333099722862244, "learning_rate": 0.0002, "epoch": 5.214362657091562, "step": 72610}, {"loss": 0.5647, "grad_norm": 0.7940610647201538, "learning_rate": 0.0002, "epoch": 5.21508078994614, "step": 72620}, {"loss": 0.5474, "grad_norm": 1.0365521907806396, "learning_rate": 0.0002, "epoch": 5.215798922800718, "step": 72630}, {"loss": 0.6009, "grad_norm": 1.37727952003479, "learning_rate": 0.0002, "epoch": 5.216517055655296, "step": 72640}, {"loss": 0.5389, "grad_norm": 1.2019168138504028, "learning_rate": 0.0002, "epoch": 5.217235188509874, "step": 72650}, {"loss": 0.5593, "grad_norm": 1.1696226596832275, "learning_rate": 0.0002, "epoch": 5.217953321364452, "step": 72660}, {"loss": 0.5507, "grad_norm": 0.9608798623085022, "learning_rate": 0.0002, "epoch": 5.218671454219031, "step": 72670}, {"loss": 0.5502, "grad_norm": 0.9139777421951294, "learning_rate": 0.0002, "epoch": 5.219389587073609, "step": 72680}, {"loss": 0.5955, "grad_norm": 0.9937016367912292, "learning_rate": 0.0002, "epoch": 5.220107719928187, "step": 72690}, {"loss": 0.6031, "grad_norm": 1.2787059545516968, "learning_rate": 0.0002, "epoch": 5.220825852782765, "step": 72700}, {"loss": 0.5601, "grad_norm": 1.0757197141647339, "learning_rate": 0.0002, "epoch": 5.221543985637343, "step": 72710}, {"loss": 0.5556, "grad_norm": 0.8053579926490784, "learning_rate": 0.0002, "epoch": 5.222262118491921, "step": 72720}, {"loss": 0.5655, "grad_norm": 1.0239759683609009, "learning_rate": 0.0002, "epoch": 5.222980251346499, "step": 72730}, {"loss": 0.6153, "grad_norm": 0.9972975850105286, "learning_rate": 0.0002, "epoch": 5.223698384201077, "step": 72740}, {"loss": 0.569, "grad_norm": 1.0504519939422607, "learning_rate": 0.0002, "epoch": 5.224416517055655, "step": 72750}, {"loss": 0.5345, "grad_norm": 1.1793010234832764, "learning_rate": 0.0002, "epoch": 5.225134649910234, "step": 72760}, {"loss": 0.5674, "grad_norm": 1.1098815202713013, "learning_rate": 0.0002, "epoch": 5.225852782764812, "step": 72770}, {"loss": 0.5689, "grad_norm": 1.1078516244888306, "learning_rate": 0.0002, "epoch": 5.22657091561939, "step": 72780}, {"loss": 0.5614, "grad_norm": 0.8684433698654175, "learning_rate": 0.0002, "epoch": 5.227289048473968, "step": 72790}, {"loss": 0.5545, "grad_norm": 1.159390926361084, "learning_rate": 0.0002, "epoch": 5.228007181328546, "step": 72800}, {"loss": 0.5726, "grad_norm": 1.0468506813049316, "learning_rate": 0.0002, "epoch": 5.228725314183124, "step": 72810}, {"loss": 0.5662, "grad_norm": 0.8684625029563904, "learning_rate": 0.0002, "epoch": 5.229443447037702, "step": 72820}, {"loss": 0.6074, "grad_norm": 1.0117321014404297, "learning_rate": 0.0002, "epoch": 5.23016157989228, "step": 72830}, {"loss": 0.5956, "grad_norm": 1.0513219833374023, "learning_rate": 0.0002, "epoch": 5.230879712746858, "step": 72840}, {"loss": 0.5796, "grad_norm": 1.0659555196762085, "learning_rate": 0.0002, "epoch": 5.231597845601437, "step": 72850}, {"loss": 0.5916, "grad_norm": 0.7726831436157227, "learning_rate": 0.0002, "epoch": 5.232315978456015, "step": 72860}, {"loss": 0.557, "grad_norm": 1.0346935987472534, "learning_rate": 0.0002, "epoch": 5.233034111310593, "step": 72870}, {"loss": 0.567, "grad_norm": 0.9112410545349121, "learning_rate": 0.0002, "epoch": 5.233752244165171, "step": 72880}, {"loss": 0.575, "grad_norm": 1.2933332920074463, "learning_rate": 0.0002, "epoch": 5.234470377019749, "step": 72890}, {"loss": 0.5733, "grad_norm": 0.9740806221961975, "learning_rate": 0.0002, "epoch": 5.235188509874327, "step": 72900}, {"loss": 0.5661, "grad_norm": 0.8041712641716003, "learning_rate": 0.0002, "epoch": 5.235906642728905, "step": 72910}, {"loss": 0.5936, "grad_norm": 0.9510180950164795, "learning_rate": 0.0002, "epoch": 5.236624775583483, "step": 72920}, {"loss": 0.6312, "grad_norm": 0.9103419780731201, "learning_rate": 0.0002, "epoch": 5.237342908438061, "step": 72930}, {"loss": 0.5298, "grad_norm": 0.8317763805389404, "learning_rate": 0.0002, "epoch": 5.238061041292639, "step": 72940}, {"loss": 0.5887, "grad_norm": 1.0269867181777954, "learning_rate": 0.0002, "epoch": 5.238779174147218, "step": 72950}, {"loss": 0.6141, "grad_norm": 1.0599713325500488, "learning_rate": 0.0002, "epoch": 5.239497307001796, "step": 72960}, {"loss": 0.5785, "grad_norm": 0.9341228008270264, "learning_rate": 0.0002, "epoch": 5.240215439856374, "step": 72970}, {"loss": 0.5256, "grad_norm": 1.1216323375701904, "learning_rate": 0.0002, "epoch": 5.240933572710952, "step": 72980}, {"loss": 0.5995, "grad_norm": 0.9396152496337891, "learning_rate": 0.0002, "epoch": 5.24165170556553, "step": 72990}, {"loss": 0.6281, "grad_norm": 1.1474549770355225, "learning_rate": 0.0002, "epoch": 5.242369838420108, "step": 73000}, {"loss": 0.5693, "grad_norm": 1.2160102128982544, "learning_rate": 0.0002, "epoch": 5.243087971274686, "step": 73010}, {"loss": 0.5914, "grad_norm": 1.0755409002304077, "learning_rate": 0.0002, "epoch": 5.243806104129264, "step": 73020}, {"loss": 0.5697, "grad_norm": 1.0645225048065186, "learning_rate": 0.0002, "epoch": 5.244524236983842, "step": 73030}, {"loss": 0.5669, "grad_norm": 1.1155469417572021, "learning_rate": 0.0002, "epoch": 5.2452423698384205, "step": 73040}, {"loss": 0.5448, "grad_norm": 1.1631708145141602, "learning_rate": 0.0002, "epoch": 5.2459605026929985, "step": 73050}, {"loss": 0.6034, "grad_norm": 0.8747480511665344, "learning_rate": 0.0002, "epoch": 5.2466786355475765, "step": 73060}, {"loss": 0.5647, "grad_norm": 0.9174497723579407, "learning_rate": 0.0002, "epoch": 5.2473967684021545, "step": 73070}, {"loss": 0.5804, "grad_norm": 1.334018349647522, "learning_rate": 0.0002, "epoch": 5.2481149012567325, "step": 73080}, {"loss": 0.5491, "grad_norm": 1.0842393636703491, "learning_rate": 0.0002, "epoch": 5.2488330341113105, "step": 73090}, {"loss": 0.6078, "grad_norm": 1.0531692504882812, "learning_rate": 0.0002, "epoch": 5.2495511669658885, "step": 73100}, {"loss": 0.5912, "grad_norm": 0.9069980978965759, "learning_rate": 0.0002, "epoch": 5.2502692998204665, "step": 73110}, {"loss": 0.5845, "grad_norm": 1.1319832801818848, "learning_rate": 0.0002, "epoch": 5.2509874326750445, "step": 73120}, {"loss": 0.5921, "grad_norm": 1.0468456745147705, "learning_rate": 0.0002, "epoch": 5.2517055655296225, "step": 73130}, {"loss": 0.5688, "grad_norm": 1.1752768754959106, "learning_rate": 0.0002, "epoch": 5.252423698384201, "step": 73140}, {"loss": 0.5709, "grad_norm": 1.0697909593582153, "learning_rate": 0.0002, "epoch": 5.253141831238779, "step": 73150}, {"loss": 0.6187, "grad_norm": 1.1179429292678833, "learning_rate": 0.0002, "epoch": 5.253859964093357, "step": 73160}, {"loss": 0.6127, "grad_norm": 0.9088113903999329, "learning_rate": 0.0002, "epoch": 5.254578096947935, "step": 73170}, {"loss": 0.629, "grad_norm": 0.8814208507537842, "learning_rate": 0.0002, "epoch": 5.255296229802513, "step": 73180}, {"loss": 0.5881, "grad_norm": 1.026688814163208, "learning_rate": 0.0002, "epoch": 5.256014362657091, "step": 73190}, {"loss": 0.5883, "grad_norm": 0.9974902868270874, "learning_rate": 0.0002, "epoch": 5.256732495511669, "step": 73200}, {"loss": 0.5219, "grad_norm": 0.948743999004364, "learning_rate": 0.0002, "epoch": 5.257450628366247, "step": 73210}, {"loss": 0.5489, "grad_norm": 0.9069591164588928, "learning_rate": 0.0002, "epoch": 5.258168761220825, "step": 73220}, {"loss": 0.5667, "grad_norm": 1.0574030876159668, "learning_rate": 0.0002, "epoch": 5.258886894075404, "step": 73230}, {"loss": 0.5903, "grad_norm": 0.9299649596214294, "learning_rate": 0.0002, "epoch": 5.259605026929982, "step": 73240}, {"loss": 0.5678, "grad_norm": 0.9888820648193359, "learning_rate": 0.0002, "epoch": 5.26032315978456, "step": 73250}, {"loss": 0.5993, "grad_norm": 1.0164920091629028, "learning_rate": 0.0002, "epoch": 5.261041292639138, "step": 73260}, {"loss": 0.5585, "grad_norm": 0.933210551738739, "learning_rate": 0.0002, "epoch": 5.261759425493716, "step": 73270}, {"loss": 0.6061, "grad_norm": 1.1754034757614136, "learning_rate": 0.0002, "epoch": 5.262477558348294, "step": 73280}, {"loss": 0.5727, "grad_norm": 1.1599570512771606, "learning_rate": 0.0002, "epoch": 5.263195691202872, "step": 73290}, {"loss": 0.6252, "grad_norm": 1.0497905015945435, "learning_rate": 0.0002, "epoch": 5.26391382405745, "step": 73300}, {"loss": 0.5861, "grad_norm": 1.3603366613388062, "learning_rate": 0.0002, "epoch": 5.264631956912028, "step": 73310}, {"loss": 0.5713, "grad_norm": 1.0283215045928955, "learning_rate": 0.0002, "epoch": 5.265350089766607, "step": 73320}, {"loss": 0.6048, "grad_norm": 1.1043906211853027, "learning_rate": 0.0002, "epoch": 5.266068222621185, "step": 73330}, {"loss": 0.5383, "grad_norm": 0.9386111497879028, "learning_rate": 0.0002, "epoch": 5.266786355475763, "step": 73340}, {"loss": 0.5826, "grad_norm": 1.3586112260818481, "learning_rate": 0.0002, "epoch": 5.267504488330341, "step": 73350}, {"loss": 0.6213, "grad_norm": 1.034179449081421, "learning_rate": 0.0002, "epoch": 5.268222621184919, "step": 73360}, {"loss": 0.5809, "grad_norm": 0.9645284414291382, "learning_rate": 0.0002, "epoch": 5.268940754039497, "step": 73370}, {"loss": 0.5595, "grad_norm": 1.1078046560287476, "learning_rate": 0.0002, "epoch": 5.269658886894075, "step": 73380}, {"loss": 0.5518, "grad_norm": 0.9737151265144348, "learning_rate": 0.0002, "epoch": 5.270377019748653, "step": 73390}, {"loss": 0.5984, "grad_norm": 1.1911388635635376, "learning_rate": 0.0002, "epoch": 5.271095152603231, "step": 73400}, {"loss": 0.5867, "grad_norm": 0.9089180827140808, "learning_rate": 0.0002, "epoch": 5.27181328545781, "step": 73410}, {"loss": 0.6021, "grad_norm": 1.094515085220337, "learning_rate": 0.0002, "epoch": 5.272531418312388, "step": 73420}, {"loss": 0.652, "grad_norm": 1.2531700134277344, "learning_rate": 0.0002, "epoch": 5.273249551166966, "step": 73430}, {"loss": 0.5616, "grad_norm": 0.9279667139053345, "learning_rate": 0.0002, "epoch": 5.273967684021544, "step": 73440}, {"loss": 0.5378, "grad_norm": 0.9872317314147949, "learning_rate": 0.0002, "epoch": 5.274685816876122, "step": 73450}, {"loss": 0.5732, "grad_norm": 1.0645262002944946, "learning_rate": 0.0002, "epoch": 5.2754039497307, "step": 73460}, {"loss": 0.5331, "grad_norm": 0.9505489468574524, "learning_rate": 0.0002, "epoch": 5.276122082585278, "step": 73470}, {"loss": 0.5826, "grad_norm": 1.0444035530090332, "learning_rate": 0.0002, "epoch": 5.276840215439856, "step": 73480}, {"loss": 0.6267, "grad_norm": 1.1813455820083618, "learning_rate": 0.0002, "epoch": 5.277558348294434, "step": 73490}, {"loss": 0.5645, "grad_norm": 0.782117486000061, "learning_rate": 0.0002, "epoch": 5.278276481149012, "step": 73500}, {"loss": 0.5829, "grad_norm": 0.8837172389030457, "learning_rate": 0.0002, "epoch": 5.278994614003591, "step": 73510}, {"loss": 0.5894, "grad_norm": 0.8320443630218506, "learning_rate": 0.0002, "epoch": 5.279712746858169, "step": 73520}, {"loss": 0.5793, "grad_norm": 1.111466407775879, "learning_rate": 0.0002, "epoch": 5.280430879712747, "step": 73530}, {"loss": 0.5796, "grad_norm": 1.0448017120361328, "learning_rate": 0.0002, "epoch": 5.281149012567325, "step": 73540}, {"loss": 0.5642, "grad_norm": 1.2046639919281006, "learning_rate": 0.0002, "epoch": 5.281867145421903, "step": 73550}, {"loss": 0.5859, "grad_norm": 1.084886074066162, "learning_rate": 0.0002, "epoch": 5.282585278276481, "step": 73560}, {"loss": 0.6055, "grad_norm": 0.8321937918663025, "learning_rate": 0.0002, "epoch": 5.283303411131059, "step": 73570}, {"loss": 0.5735, "grad_norm": 1.172440767288208, "learning_rate": 0.0002, "epoch": 5.284021543985637, "step": 73580}, {"loss": 0.5491, "grad_norm": 0.937133252620697, "learning_rate": 0.0002, "epoch": 5.284739676840215, "step": 73590}, {"loss": 0.5575, "grad_norm": 1.0996583700180054, "learning_rate": 0.0002, "epoch": 5.285457809694794, "step": 73600}, {"loss": 0.5813, "grad_norm": 1.2459958791732788, "learning_rate": 0.0002, "epoch": 5.286175942549372, "step": 73610}, {"loss": 0.6146, "grad_norm": 0.8362332582473755, "learning_rate": 0.0002, "epoch": 5.28689407540395, "step": 73620}, {"loss": 0.5333, "grad_norm": 0.9784061312675476, "learning_rate": 0.0002, "epoch": 5.287612208258528, "step": 73630}, {"loss": 0.6146, "grad_norm": 1.087041974067688, "learning_rate": 0.0002, "epoch": 5.288330341113106, "step": 73640}, {"loss": 0.5775, "grad_norm": 0.8641281723976135, "learning_rate": 0.0002, "epoch": 5.289048473967684, "step": 73650}, {"loss": 0.5592, "grad_norm": 1.030386209487915, "learning_rate": 0.0002, "epoch": 5.289766606822262, "step": 73660}, {"loss": 0.5899, "grad_norm": 1.0551509857177734, "learning_rate": 0.0002, "epoch": 5.29048473967684, "step": 73670}, {"loss": 0.5805, "grad_norm": 0.9969013333320618, "learning_rate": 0.0002, "epoch": 5.291202872531418, "step": 73680}, {"loss": 0.5841, "grad_norm": 0.9566490054130554, "learning_rate": 0.0002, "epoch": 5.291921005385996, "step": 73690}, {"loss": 0.5756, "grad_norm": 1.1376742124557495, "learning_rate": 0.0002, "epoch": 5.292639138240575, "step": 73700}, {"loss": 0.5697, "grad_norm": 1.0127843618392944, "learning_rate": 0.0002, "epoch": 5.293357271095153, "step": 73710}, {"loss": 0.5673, "grad_norm": 0.9500759243965149, "learning_rate": 0.0002, "epoch": 5.294075403949731, "step": 73720}, {"loss": 0.6251, "grad_norm": 0.9597342610359192, "learning_rate": 0.0002, "epoch": 5.294793536804309, "step": 73730}, {"loss": 0.5887, "grad_norm": 1.0982595682144165, "learning_rate": 0.0002, "epoch": 5.295511669658887, "step": 73740}, {"loss": 0.5623, "grad_norm": 0.9007689952850342, "learning_rate": 0.0002, "epoch": 5.296229802513465, "step": 73750}, {"loss": 0.5854, "grad_norm": 0.9329614639282227, "learning_rate": 0.0002, "epoch": 5.296947935368043, "step": 73760}, {"loss": 0.5867, "grad_norm": 1.235142469406128, "learning_rate": 0.0002, "epoch": 5.297666068222621, "step": 73770}, {"loss": 0.6009, "grad_norm": 1.0875943899154663, "learning_rate": 0.0002, "epoch": 5.298384201077199, "step": 73780}, {"loss": 0.6009, "grad_norm": 1.0499054193496704, "learning_rate": 0.0002, "epoch": 5.299102333931778, "step": 73790}, {"loss": 0.625, "grad_norm": 1.117954969406128, "learning_rate": 0.0002, "epoch": 5.299820466786356, "step": 73800}, {"loss": 0.5502, "grad_norm": 0.800291121006012, "learning_rate": 0.0002, "epoch": 5.300538599640934, "step": 73810}, {"loss": 0.5815, "grad_norm": 1.1461842060089111, "learning_rate": 0.0002, "epoch": 5.301256732495512, "step": 73820}, {"loss": 0.6091, "grad_norm": 1.0084760189056396, "learning_rate": 0.0002, "epoch": 5.30197486535009, "step": 73830}, {"loss": 0.5802, "grad_norm": 1.1249386072158813, "learning_rate": 0.0002, "epoch": 5.302692998204668, "step": 73840}, {"loss": 0.55, "grad_norm": 1.0846004486083984, "learning_rate": 0.0002, "epoch": 5.303411131059246, "step": 73850}, {"loss": 0.5923, "grad_norm": 1.1557925939559937, "learning_rate": 0.0002, "epoch": 5.304129263913824, "step": 73860}, {"loss": 0.5904, "grad_norm": 1.2287988662719727, "learning_rate": 0.0002, "epoch": 5.304847396768402, "step": 73870}, {"loss": 0.554, "grad_norm": 0.9618542194366455, "learning_rate": 0.0002, "epoch": 5.30556552962298, "step": 73880}, {"loss": 0.5787, "grad_norm": 0.9429472088813782, "learning_rate": 0.0002, "epoch": 5.306283662477559, "step": 73890}, {"loss": 0.5937, "grad_norm": 0.9032631516456604, "learning_rate": 0.0002, "epoch": 5.307001795332137, "step": 73900}, {"loss": 0.577, "grad_norm": 1.0008580684661865, "learning_rate": 0.0002, "epoch": 5.307719928186715, "step": 73910}, {"loss": 0.5462, "grad_norm": 0.9795624017715454, "learning_rate": 0.0002, "epoch": 5.308438061041293, "step": 73920}, {"loss": 0.582, "grad_norm": 1.1194090843200684, "learning_rate": 0.0002, "epoch": 5.309156193895871, "step": 73930}, {"loss": 0.5859, "grad_norm": 1.1057528257369995, "learning_rate": 0.0002, "epoch": 5.309874326750449, "step": 73940}, {"loss": 0.5503, "grad_norm": 0.7807615995407104, "learning_rate": 0.0002, "epoch": 5.310592459605027, "step": 73950}, {"loss": 0.6128, "grad_norm": 0.9465593099594116, "learning_rate": 0.0002, "epoch": 5.311310592459605, "step": 73960}, {"loss": 0.5831, "grad_norm": 1.104210615158081, "learning_rate": 0.0002, "epoch": 5.312028725314184, "step": 73970}, {"loss": 0.5478, "grad_norm": 1.0452964305877686, "learning_rate": 0.0002, "epoch": 5.312746858168762, "step": 73980}, {"loss": 0.5856, "grad_norm": 1.0314992666244507, "learning_rate": 0.0002, "epoch": 5.31346499102334, "step": 73990}, {"loss": 0.6222, "grad_norm": 0.9187130928039551, "learning_rate": 0.0002, "epoch": 5.314183123877918, "step": 74000}, {"loss": 0.5739, "grad_norm": 0.8660678267478943, "learning_rate": 0.0002, "epoch": 5.314901256732496, "step": 74010}, {"loss": 0.5296, "grad_norm": 0.9470953345298767, "learning_rate": 0.0002, "epoch": 5.315619389587074, "step": 74020}, {"loss": 0.5772, "grad_norm": 1.0028631687164307, "learning_rate": 0.0002, "epoch": 5.316337522441652, "step": 74030}, {"loss": 0.6159, "grad_norm": 1.0237356424331665, "learning_rate": 0.0002, "epoch": 5.31705565529623, "step": 74040}, {"loss": 0.6277, "grad_norm": 1.0299798250198364, "learning_rate": 0.0002, "epoch": 5.317773788150808, "step": 74050}, {"loss": 0.568, "grad_norm": 1.0326799154281616, "learning_rate": 0.0002, "epoch": 5.318491921005386, "step": 74060}, {"loss": 0.5766, "grad_norm": 1.156346082687378, "learning_rate": 0.0002, "epoch": 5.3192100538599645, "step": 74070}, {"loss": 0.598, "grad_norm": 1.1542664766311646, "learning_rate": 0.0002, "epoch": 5.3199281867145425, "step": 74080}, {"loss": 0.5736, "grad_norm": 1.0503013134002686, "learning_rate": 0.0002, "epoch": 5.3206463195691205, "step": 74090}, {"loss": 0.6172, "grad_norm": 1.1088979244232178, "learning_rate": 0.0002, "epoch": 5.3213644524236985, "step": 74100}, {"loss": 0.5536, "grad_norm": 0.9314014911651611, "learning_rate": 0.0002, "epoch": 5.3220825852782765, "step": 74110}, {"loss": 0.6205, "grad_norm": 1.0813525915145874, "learning_rate": 0.0002, "epoch": 5.3228007181328545, "step": 74120}, {"loss": 0.6019, "grad_norm": 0.7824062705039978, "learning_rate": 0.0002, "epoch": 5.3235188509874325, "step": 74130}, {"loss": 0.6183, "grad_norm": 1.0552699565887451, "learning_rate": 0.0002, "epoch": 5.3242369838420105, "step": 74140}, {"loss": 0.5714, "grad_norm": 1.0916554927825928, "learning_rate": 0.0002, "epoch": 5.3249551166965885, "step": 74150}, {"loss": 0.6128, "grad_norm": 1.205618143081665, "learning_rate": 0.0002, "epoch": 5.325673249551167, "step": 74160}, {"loss": 0.616, "grad_norm": 1.2551230192184448, "learning_rate": 0.0002, "epoch": 5.326391382405745, "step": 74170}, {"loss": 0.5467, "grad_norm": 0.7715005278587341, "learning_rate": 0.0002, "epoch": 5.327109515260323, "step": 74180}, {"loss": 0.5793, "grad_norm": 1.1059352159500122, "learning_rate": 0.0002, "epoch": 5.327827648114901, "step": 74190}, {"loss": 0.5768, "grad_norm": 0.9441812634468079, "learning_rate": 0.0002, "epoch": 5.328545780969479, "step": 74200}, {"loss": 0.5708, "grad_norm": 1.0012084245681763, "learning_rate": 0.0002, "epoch": 5.329263913824057, "step": 74210}, {"loss": 0.5289, "grad_norm": 0.8594073057174683, "learning_rate": 0.0002, "epoch": 5.329982046678635, "step": 74220}, {"loss": 0.5933, "grad_norm": 0.8931775093078613, "learning_rate": 0.0002, "epoch": 5.330700179533213, "step": 74230}, {"loss": 0.5722, "grad_norm": 0.967250406742096, "learning_rate": 0.0002, "epoch": 5.331418312387791, "step": 74240}, {"loss": 0.5483, "grad_norm": 0.9776269793510437, "learning_rate": 0.0002, "epoch": 5.332136445242369, "step": 74250}, {"loss": 0.5655, "grad_norm": 0.9393186569213867, "learning_rate": 0.0002, "epoch": 5.332854578096948, "step": 74260}, {"loss": 0.5704, "grad_norm": 1.0081093311309814, "learning_rate": 0.0002, "epoch": 5.333572710951526, "step": 74270}, {"loss": 0.5588, "grad_norm": 0.9002147316932678, "learning_rate": 0.0002, "epoch": 5.334290843806104, "step": 74280}, {"loss": 0.5851, "grad_norm": 0.9237701296806335, "learning_rate": 0.0002, "epoch": 5.335008976660682, "step": 74290}, {"loss": 0.5958, "grad_norm": 1.070694923400879, "learning_rate": 0.0002, "epoch": 5.33572710951526, "step": 74300}, {"loss": 0.5877, "grad_norm": 1.0134668350219727, "learning_rate": 0.0002, "epoch": 5.336445242369838, "step": 74310}, {"loss": 0.5828, "grad_norm": 1.0903294086456299, "learning_rate": 0.0002, "epoch": 5.337163375224416, "step": 74320}, {"loss": 0.5146, "grad_norm": 0.9000239372253418, "learning_rate": 0.0002, "epoch": 5.337881508078994, "step": 74330}, {"loss": 0.5357, "grad_norm": 1.0584321022033691, "learning_rate": 0.0002, "epoch": 5.338599640933572, "step": 74340}, {"loss": 0.5844, "grad_norm": 1.046420931816101, "learning_rate": 0.0002, "epoch": 5.339317773788151, "step": 74350}, {"loss": 0.5489, "grad_norm": 0.8862320184707642, "learning_rate": 0.0002, "epoch": 5.340035906642729, "step": 74360}, {"loss": 0.5923, "grad_norm": 0.8197309970855713, "learning_rate": 0.0002, "epoch": 5.340754039497307, "step": 74370}, {"loss": 0.5408, "grad_norm": 0.9539661407470703, "learning_rate": 0.0002, "epoch": 5.341472172351885, "step": 74380}, {"loss": 0.5943, "grad_norm": 1.481026530265808, "learning_rate": 0.0002, "epoch": 5.342190305206463, "step": 74390}, {"loss": 0.6242, "grad_norm": 1.0685169696807861, "learning_rate": 0.0002, "epoch": 5.342908438061041, "step": 74400}, {"loss": 0.5917, "grad_norm": 1.1468359231948853, "learning_rate": 0.0002, "epoch": 5.343626570915619, "step": 74410}, {"loss": 0.556, "grad_norm": 0.9982373714447021, "learning_rate": 0.0002, "epoch": 5.344344703770197, "step": 74420}, {"loss": 0.6003, "grad_norm": 0.9273471236228943, "learning_rate": 0.0002, "epoch": 5.345062836624775, "step": 74430}, {"loss": 0.5239, "grad_norm": 1.058828592300415, "learning_rate": 0.0002, "epoch": 5.345780969479353, "step": 74440}, {"loss": 0.5434, "grad_norm": 1.0442006587982178, "learning_rate": 0.0002, "epoch": 5.346499102333932, "step": 74450}, {"loss": 0.5614, "grad_norm": 1.0955053567886353, "learning_rate": 0.0002, "epoch": 5.34721723518851, "step": 74460}, {"loss": 0.5992, "grad_norm": 0.9326002597808838, "learning_rate": 0.0002, "epoch": 5.347935368043088, "step": 74470}, {"loss": 0.6173, "grad_norm": 0.9496979117393494, "learning_rate": 0.0002, "epoch": 5.348653500897666, "step": 74480}, {"loss": 0.5483, "grad_norm": 1.1995937824249268, "learning_rate": 0.0002, "epoch": 5.349371633752244, "step": 74490}, {"loss": 0.5759, "grad_norm": 0.8761899471282959, "learning_rate": 0.0002, "epoch": 5.350089766606822, "step": 74500}, {"loss": 0.5866, "grad_norm": 1.2390170097351074, "learning_rate": 0.0002, "epoch": 5.3508078994614, "step": 74510}, {"loss": 0.6065, "grad_norm": 0.9101138114929199, "learning_rate": 0.0002, "epoch": 5.351526032315978, "step": 74520}, {"loss": 0.5908, "grad_norm": 0.925466001033783, "learning_rate": 0.0002, "epoch": 5.352244165170557, "step": 74530}, {"loss": 0.5992, "grad_norm": 0.9483969807624817, "learning_rate": 0.0002, "epoch": 5.352962298025135, "step": 74540}, {"loss": 0.5881, "grad_norm": 1.0530859231948853, "learning_rate": 0.0002, "epoch": 5.353680430879713, "step": 74550}, {"loss": 0.5607, "grad_norm": 1.209647536277771, "learning_rate": 0.0002, "epoch": 5.354398563734291, "step": 74560}, {"loss": 0.5782, "grad_norm": 0.9849331378936768, "learning_rate": 0.0002, "epoch": 5.355116696588869, "step": 74570}, {"loss": 0.6448, "grad_norm": 1.0822848081588745, "learning_rate": 0.0002, "epoch": 5.355834829443447, "step": 74580}, {"loss": 0.631, "grad_norm": 1.1460528373718262, "learning_rate": 0.0002, "epoch": 5.356552962298025, "step": 74590}, {"loss": 0.5634, "grad_norm": 0.9509134292602539, "learning_rate": 0.0002, "epoch": 5.357271095152603, "step": 74600}, {"loss": 0.5492, "grad_norm": 0.9884999394416809, "learning_rate": 0.0002, "epoch": 5.357989228007181, "step": 74610}, {"loss": 0.6096, "grad_norm": 0.9619579911231995, "learning_rate": 0.0002, "epoch": 5.358707360861759, "step": 74620}, {"loss": 0.5686, "grad_norm": 0.8596125245094299, "learning_rate": 0.0002, "epoch": 5.359425493716338, "step": 74630}, {"loss": 0.6112, "grad_norm": 1.16913640499115, "learning_rate": 0.0002, "epoch": 5.360143626570916, "step": 74640}, {"loss": 0.5779, "grad_norm": 0.99276202917099, "learning_rate": 0.0002, "epoch": 5.360861759425494, "step": 74650}, {"loss": 0.5699, "grad_norm": 1.1293696165084839, "learning_rate": 0.0002, "epoch": 5.361579892280072, "step": 74660}, {"loss": 0.5727, "grad_norm": 1.187947154045105, "learning_rate": 0.0002, "epoch": 5.36229802513465, "step": 74670}, {"loss": 0.5574, "grad_norm": 0.8637247681617737, "learning_rate": 0.0002, "epoch": 5.363016157989228, "step": 74680}, {"loss": 0.5738, "grad_norm": 1.1049476861953735, "learning_rate": 0.0002, "epoch": 5.363734290843806, "step": 74690}, {"loss": 0.6082, "grad_norm": 1.1736515760421753, "learning_rate": 0.0002, "epoch": 5.364452423698384, "step": 74700}, {"loss": 0.6238, "grad_norm": 1.0203301906585693, "learning_rate": 0.0002, "epoch": 5.365170556552962, "step": 74710}, {"loss": 0.5612, "grad_norm": 1.15559720993042, "learning_rate": 0.0002, "epoch": 5.365888689407541, "step": 74720}, {"loss": 0.5699, "grad_norm": 1.2008144855499268, "learning_rate": 0.0002, "epoch": 5.366606822262119, "step": 74730}, {"loss": 0.5749, "grad_norm": 1.0385756492614746, "learning_rate": 0.0002, "epoch": 5.367324955116697, "step": 74740}, {"loss": 0.5745, "grad_norm": 0.8964240550994873, "learning_rate": 0.0002, "epoch": 5.368043087971275, "step": 74750}, {"loss": 0.5799, "grad_norm": 0.9824761748313904, "learning_rate": 0.0002, "epoch": 5.368761220825853, "step": 74760}, {"loss": 0.5714, "grad_norm": 0.8815994262695312, "learning_rate": 0.0002, "epoch": 5.369479353680431, "step": 74770}, {"loss": 0.584, "grad_norm": 0.9729493856430054, "learning_rate": 0.0002, "epoch": 5.370197486535009, "step": 74780}, {"loss": 0.5884, "grad_norm": 1.1032123565673828, "learning_rate": 0.0002, "epoch": 5.370915619389587, "step": 74790}, {"loss": 0.5804, "grad_norm": 1.039591908454895, "learning_rate": 0.0002, "epoch": 5.371633752244165, "step": 74800}, {"loss": 0.5693, "grad_norm": 0.9741610884666443, "learning_rate": 0.0002, "epoch": 5.372351885098743, "step": 74810}, {"loss": 0.6225, "grad_norm": 0.9789814949035645, "learning_rate": 0.0002, "epoch": 5.373070017953322, "step": 74820}, {"loss": 0.5765, "grad_norm": 1.0777033567428589, "learning_rate": 0.0002, "epoch": 5.3737881508079, "step": 74830}, {"loss": 0.5553, "grad_norm": 0.9058641195297241, "learning_rate": 0.0002, "epoch": 5.374506283662478, "step": 74840}, {"loss": 0.5733, "grad_norm": 1.2161815166473389, "learning_rate": 0.0002, "epoch": 5.375224416517056, "step": 74850}, {"loss": 0.5679, "grad_norm": 1.1079481840133667, "learning_rate": 0.0002, "epoch": 5.375942549371634, "step": 74860}, {"loss": 0.605, "grad_norm": 0.9494470357894897, "learning_rate": 0.0002, "epoch": 5.376660682226212, "step": 74870}, {"loss": 0.6155, "grad_norm": 1.0116358995437622, "learning_rate": 0.0002, "epoch": 5.37737881508079, "step": 74880}, {"loss": 0.5595, "grad_norm": 0.9382423162460327, "learning_rate": 0.0002, "epoch": 5.378096947935368, "step": 74890}, {"loss": 0.5441, "grad_norm": 1.036151647567749, "learning_rate": 0.0002, "epoch": 5.378815080789946, "step": 74900}, {"loss": 0.5441, "grad_norm": 0.9436623454093933, "learning_rate": 0.0002, "epoch": 5.379533213644525, "step": 74910}, {"loss": 0.5327, "grad_norm": 1.0149152278900146, "learning_rate": 0.0002, "epoch": 5.380251346499103, "step": 74920}, {"loss": 0.5554, "grad_norm": 1.1645641326904297, "learning_rate": 0.0002, "epoch": 5.380969479353681, "step": 74930}, {"loss": 0.5662, "grad_norm": 1.002287745475769, "learning_rate": 0.0002, "epoch": 5.381687612208259, "step": 74940}, {"loss": 0.5602, "grad_norm": 1.1176437139511108, "learning_rate": 0.0002, "epoch": 5.382405745062837, "step": 74950}, {"loss": 0.582, "grad_norm": 0.9210802912712097, "learning_rate": 0.0002, "epoch": 5.383123877917415, "step": 74960}, {"loss": 0.5996, "grad_norm": 1.1873447895050049, "learning_rate": 0.0002, "epoch": 5.383842010771993, "step": 74970}, {"loss": 0.5391, "grad_norm": 0.8372976779937744, "learning_rate": 0.0002, "epoch": 5.384560143626571, "step": 74980}, {"loss": 0.5808, "grad_norm": 0.9220532178878784, "learning_rate": 0.0002, "epoch": 5.385278276481149, "step": 74990}, {"loss": 0.5897, "grad_norm": 0.9196901917457581, "learning_rate": 0.0002, "epoch": 5.385996409335727, "step": 75000}, {"loss": 0.5838, "grad_norm": 0.9325235486030579, "learning_rate": 0.0002, "epoch": 5.3867145421903055, "step": 75010}, {"loss": 0.5652, "grad_norm": 1.0902531147003174, "learning_rate": 0.0002, "epoch": 5.3874326750448835, "step": 75020}, {"loss": 0.581, "grad_norm": 1.049468755722046, "learning_rate": 0.0002, "epoch": 5.3881508078994615, "step": 75030}, {"loss": 0.6184, "grad_norm": 0.9372574687004089, "learning_rate": 0.0002, "epoch": 5.3888689407540395, "step": 75040}, {"loss": 0.6158, "grad_norm": 0.9013437628746033, "learning_rate": 0.0002, "epoch": 5.3895870736086176, "step": 75050}, {"loss": 0.5656, "grad_norm": 1.2111071348190308, "learning_rate": 0.0002, "epoch": 5.3903052064631956, "step": 75060}, {"loss": 0.5983, "grad_norm": 1.0006011724472046, "learning_rate": 0.0002, "epoch": 5.3910233393177736, "step": 75070}, {"loss": 0.5807, "grad_norm": 0.9180546402931213, "learning_rate": 0.0002, "epoch": 5.391741472172352, "step": 75080}, {"loss": 0.5878, "grad_norm": 1.096113920211792, "learning_rate": 0.0002, "epoch": 5.3924596050269304, "step": 75090}, {"loss": 0.5416, "grad_norm": 0.9041603207588196, "learning_rate": 0.0002, "epoch": 5.3931777378815084, "step": 75100}, {"loss": 0.5933, "grad_norm": 0.9675783514976501, "learning_rate": 0.0002, "epoch": 5.3938958707360865, "step": 75110}, {"loss": 0.5813, "grad_norm": 1.0952513217926025, "learning_rate": 0.0002, "epoch": 5.3946140035906645, "step": 75120}, {"loss": 0.5961, "grad_norm": 1.0166294574737549, "learning_rate": 0.0002, "epoch": 5.3953321364452425, "step": 75130}, {"loss": 0.6119, "grad_norm": 1.0892874002456665, "learning_rate": 0.0002, "epoch": 5.3960502692998205, "step": 75140}, {"loss": 0.6036, "grad_norm": 0.9894046187400818, "learning_rate": 0.0002, "epoch": 5.3967684021543985, "step": 75150}, {"loss": 0.5844, "grad_norm": 0.9991754293441772, "learning_rate": 0.0002, "epoch": 5.3974865350089765, "step": 75160}, {"loss": 0.5746, "grad_norm": 1.1027519702911377, "learning_rate": 0.0002, "epoch": 5.3982046678635545, "step": 75170}, {"loss": 0.5464, "grad_norm": 1.0579880475997925, "learning_rate": 0.0002, "epoch": 5.3989228007181325, "step": 75180}, {"loss": 0.5705, "grad_norm": 1.1149101257324219, "learning_rate": 0.0002, "epoch": 5.399640933572711, "step": 75190}, {"loss": 0.579, "grad_norm": 0.8802945017814636, "learning_rate": 0.0002, "epoch": 5.400359066427289, "step": 75200}, {"loss": 0.6117, "grad_norm": 0.9168137907981873, "learning_rate": 0.0002, "epoch": 5.401077199281867, "step": 75210}, {"loss": 0.543, "grad_norm": 1.232630968093872, "learning_rate": 0.0002, "epoch": 5.401795332136445, "step": 75220}, {"loss": 0.5739, "grad_norm": 1.1038591861724854, "learning_rate": 0.0002, "epoch": 5.402513464991023, "step": 75230}, {"loss": 0.5754, "grad_norm": 0.8985993266105652, "learning_rate": 0.0002, "epoch": 5.403231597845601, "step": 75240}, {"loss": 0.5517, "grad_norm": 1.1096316576004028, "learning_rate": 0.0002, "epoch": 5.403949730700179, "step": 75250}, {"loss": 0.5834, "grad_norm": 0.8516051173210144, "learning_rate": 0.0002, "epoch": 5.404667863554757, "step": 75260}, {"loss": 0.5779, "grad_norm": 0.9967356324195862, "learning_rate": 0.0002, "epoch": 5.405385996409335, "step": 75270}, {"loss": 0.6065, "grad_norm": 1.0092874765396118, "learning_rate": 0.0002, "epoch": 5.406104129263914, "step": 75280}, {"loss": 0.59, "grad_norm": 1.049838662147522, "learning_rate": 0.0002, "epoch": 5.406822262118492, "step": 75290}, {"loss": 0.6077, "grad_norm": 1.1491070985794067, "learning_rate": 0.0002, "epoch": 5.40754039497307, "step": 75300}, {"loss": 0.6423, "grad_norm": 0.9348118901252747, "learning_rate": 0.0002, "epoch": 5.408258527827648, "step": 75310}, {"loss": 0.5505, "grad_norm": 1.1226147413253784, "learning_rate": 0.0002, "epoch": 5.408976660682226, "step": 75320}, {"loss": 0.5906, "grad_norm": 0.9042587876319885, "learning_rate": 0.0002, "epoch": 5.409694793536804, "step": 75330}, {"loss": 0.5885, "grad_norm": 1.1212877035140991, "learning_rate": 0.0002, "epoch": 5.410412926391382, "step": 75340}, {"loss": 0.6056, "grad_norm": 0.9805570840835571, "learning_rate": 0.0002, "epoch": 5.41113105924596, "step": 75350}, {"loss": 0.5891, "grad_norm": 0.9803917407989502, "learning_rate": 0.0002, "epoch": 5.411849192100538, "step": 75360}, {"loss": 0.6338, "grad_norm": 1.2139064073562622, "learning_rate": 0.0002, "epoch": 5.412567324955116, "step": 75370}, {"loss": 0.5694, "grad_norm": 0.9510865211486816, "learning_rate": 0.0002, "epoch": 5.413285457809695, "step": 75380}, {"loss": 0.6072, "grad_norm": 1.0752202272415161, "learning_rate": 0.0002, "epoch": 5.414003590664273, "step": 75390}, {"loss": 0.5998, "grad_norm": 1.1144053936004639, "learning_rate": 0.0002, "epoch": 5.414721723518851, "step": 75400}, {"loss": 0.5783, "grad_norm": 1.128998875617981, "learning_rate": 0.0002, "epoch": 5.415439856373429, "step": 75410}, {"loss": 0.6092, "grad_norm": 1.2901849746704102, "learning_rate": 0.0002, "epoch": 5.416157989228007, "step": 75420}, {"loss": 0.5799, "grad_norm": 1.2822786569595337, "learning_rate": 0.0002, "epoch": 5.416876122082585, "step": 75430}, {"loss": 0.5744, "grad_norm": 0.8724783658981323, "learning_rate": 0.0002, "epoch": 5.417594254937163, "step": 75440}, {"loss": 0.5821, "grad_norm": 1.1321152448654175, "learning_rate": 0.0002, "epoch": 5.418312387791741, "step": 75450}, {"loss": 0.6394, "grad_norm": 1.1211779117584229, "learning_rate": 0.0002, "epoch": 5.419030520646319, "step": 75460}, {"loss": 0.584, "grad_norm": 1.0542290210723877, "learning_rate": 0.0002, "epoch": 5.419748653500898, "step": 75470}, {"loss": 0.5472, "grad_norm": 0.9432206153869629, "learning_rate": 0.0002, "epoch": 5.420466786355476, "step": 75480}, {"loss": 0.6053, "grad_norm": 1.2051608562469482, "learning_rate": 0.0002, "epoch": 5.421184919210054, "step": 75490}, {"loss": 0.5698, "grad_norm": 1.188256859779358, "learning_rate": 0.0002, "epoch": 5.421903052064632, "step": 75500}, {"loss": 0.5762, "grad_norm": 1.2768784761428833, "learning_rate": 0.0002, "epoch": 5.42262118491921, "step": 75510}, {"loss": 0.5961, "grad_norm": 0.8228567242622375, "learning_rate": 0.0002, "epoch": 5.423339317773788, "step": 75520}, {"loss": 0.602, "grad_norm": 1.235684871673584, "learning_rate": 0.0002, "epoch": 5.424057450628366, "step": 75530}, {"loss": 0.5923, "grad_norm": 0.8361109495162964, "learning_rate": 0.0002, "epoch": 5.424775583482944, "step": 75540}, {"loss": 0.578, "grad_norm": 1.0450727939605713, "learning_rate": 0.0002, "epoch": 5.425493716337522, "step": 75550}, {"loss": 0.6383, "grad_norm": 0.9942979216575623, "learning_rate": 0.0002, "epoch": 5.4262118491921, "step": 75560}, {"loss": 0.6406, "grad_norm": 0.8162592053413391, "learning_rate": 0.0002, "epoch": 5.426929982046679, "step": 75570}, {"loss": 0.5684, "grad_norm": 0.9193033576011658, "learning_rate": 0.0002, "epoch": 5.427648114901257, "step": 75580}, {"loss": 0.5773, "grad_norm": 1.095130443572998, "learning_rate": 0.0002, "epoch": 5.428366247755835, "step": 75590}, {"loss": 0.6036, "grad_norm": 1.1752824783325195, "learning_rate": 0.0002, "epoch": 5.429084380610413, "step": 75600}, {"loss": 0.5773, "grad_norm": 1.2007960081100464, "learning_rate": 0.0002, "epoch": 5.429802513464991, "step": 75610}, {"loss": 0.5928, "grad_norm": 0.997347354888916, "learning_rate": 0.0002, "epoch": 5.430520646319569, "step": 75620}, {"loss": 0.5798, "grad_norm": 1.3878827095031738, "learning_rate": 0.0002, "epoch": 5.431238779174147, "step": 75630}, {"loss": 0.5954, "grad_norm": 1.1839812994003296, "learning_rate": 0.0002, "epoch": 5.431956912028725, "step": 75640}, {"loss": 0.5789, "grad_norm": 0.9912546873092651, "learning_rate": 0.0002, "epoch": 5.432675044883303, "step": 75650}, {"loss": 0.5916, "grad_norm": 0.9305517673492432, "learning_rate": 0.0002, "epoch": 5.433393177737882, "step": 75660}, {"loss": 0.5869, "grad_norm": 1.0036604404449463, "learning_rate": 0.0002, "epoch": 5.43411131059246, "step": 75670}, {"loss": 0.5797, "grad_norm": 1.2500226497650146, "learning_rate": 0.0002, "epoch": 5.434829443447038, "step": 75680}, {"loss": 0.5923, "grad_norm": 0.9476167559623718, "learning_rate": 0.0002, "epoch": 5.435547576301616, "step": 75690}, {"loss": 0.5426, "grad_norm": 0.9769760370254517, "learning_rate": 0.0002, "epoch": 5.436265709156194, "step": 75700}, {"loss": 0.5397, "grad_norm": 1.1001025438308716, "learning_rate": 0.0002, "epoch": 5.436983842010772, "step": 75710}, {"loss": 0.5832, "grad_norm": 1.1783069372177124, "learning_rate": 0.0002, "epoch": 5.43770197486535, "step": 75720}, {"loss": 0.5961, "grad_norm": 0.887438952922821, "learning_rate": 0.0002, "epoch": 5.438420107719928, "step": 75730}, {"loss": 0.5904, "grad_norm": 0.9631154537200928, "learning_rate": 0.0002, "epoch": 5.439138240574506, "step": 75740}, {"loss": 0.5827, "grad_norm": 1.0824158191680908, "learning_rate": 0.0002, "epoch": 5.439856373429085, "step": 75750}, {"loss": 0.5824, "grad_norm": 1.0108296871185303, "learning_rate": 0.0002, "epoch": 5.440574506283663, "step": 75760}, {"loss": 0.6338, "grad_norm": 1.1728253364562988, "learning_rate": 0.0002, "epoch": 5.441292639138241, "step": 75770}, {"loss": 0.5661, "grad_norm": 1.0904773473739624, "learning_rate": 0.0002, "epoch": 5.442010771992819, "step": 75780}, {"loss": 0.638, "grad_norm": 0.8982957601547241, "learning_rate": 0.0002, "epoch": 5.442728904847397, "step": 75790}, {"loss": 0.583, "grad_norm": 1.0233404636383057, "learning_rate": 0.0002, "epoch": 5.443447037701975, "step": 75800}, {"loss": 0.6279, "grad_norm": 1.0092064142227173, "learning_rate": 0.0002, "epoch": 5.444165170556553, "step": 75810}, {"loss": 0.5673, "grad_norm": 1.2747842073440552, "learning_rate": 0.0002, "epoch": 5.444883303411131, "step": 75820}, {"loss": 0.5604, "grad_norm": 1.0365403890609741, "learning_rate": 0.0002, "epoch": 5.445601436265709, "step": 75830}, {"loss": 0.591, "grad_norm": 1.0413976907730103, "learning_rate": 0.0002, "epoch": 5.446319569120288, "step": 75840}, {"loss": 0.5995, "grad_norm": 0.8858456015586853, "learning_rate": 0.0002, "epoch": 5.447037701974866, "step": 75850}, {"loss": 0.5628, "grad_norm": 0.9823445677757263, "learning_rate": 0.0002, "epoch": 5.447755834829444, "step": 75860}, {"loss": 0.5691, "grad_norm": 0.8515284061431885, "learning_rate": 0.0002, "epoch": 5.448473967684022, "step": 75870}, {"loss": 0.5702, "grad_norm": 1.130850911140442, "learning_rate": 0.0002, "epoch": 5.4491921005386, "step": 75880}, {"loss": 0.5669, "grad_norm": 0.984725832939148, "learning_rate": 0.0002, "epoch": 5.449910233393178, "step": 75890}, {"loss": 0.5658, "grad_norm": 1.1701595783233643, "learning_rate": 0.0002, "epoch": 5.450628366247756, "step": 75900}, {"loss": 0.5555, "grad_norm": 0.8988107442855835, "learning_rate": 0.0002, "epoch": 5.451346499102334, "step": 75910}, {"loss": 0.6669, "grad_norm": 0.9909947514533997, "learning_rate": 0.0002, "epoch": 5.452064631956912, "step": 75920}, {"loss": 0.5528, "grad_norm": 0.8861672282218933, "learning_rate": 0.0002, "epoch": 5.45278276481149, "step": 75930}, {"loss": 0.5826, "grad_norm": 0.9513981938362122, "learning_rate": 0.0002, "epoch": 5.453500897666069, "step": 75940}, {"loss": 0.5827, "grad_norm": 1.0320760011672974, "learning_rate": 0.0002, "epoch": 5.454219030520647, "step": 75950}, {"loss": 0.5816, "grad_norm": 0.9830206632614136, "learning_rate": 0.0002, "epoch": 5.454937163375225, "step": 75960}, {"loss": 0.5228, "grad_norm": 0.9816349148750305, "learning_rate": 0.0002, "epoch": 5.455655296229803, "step": 75970}, {"loss": 0.594, "grad_norm": 0.9741218090057373, "learning_rate": 0.0002, "epoch": 5.456373429084381, "step": 75980}, {"loss": 0.634, "grad_norm": 1.1291148662567139, "learning_rate": 0.0002, "epoch": 5.457091561938959, "step": 75990}, {"loss": 0.5986, "grad_norm": 0.9770109057426453, "learning_rate": 0.0002, "epoch": 5.457809694793537, "step": 76000}, {"loss": 0.5783, "grad_norm": 1.0204377174377441, "learning_rate": 0.0002, "epoch": 5.458527827648115, "step": 76010}, {"loss": 0.5881, "grad_norm": 1.0453336238861084, "learning_rate": 0.0002, "epoch": 5.459245960502693, "step": 76020}, {"loss": 0.5798, "grad_norm": 1.1595505475997925, "learning_rate": 0.0002, "epoch": 5.4599640933572715, "step": 76030}, {"loss": 0.5787, "grad_norm": 1.1686701774597168, "learning_rate": 0.0002, "epoch": 5.4606822262118495, "step": 76040}, {"loss": 0.5746, "grad_norm": 1.14364755153656, "learning_rate": 0.0002, "epoch": 5.4614003590664275, "step": 76050}, {"loss": 0.5925, "grad_norm": 0.9742125868797302, "learning_rate": 0.0002, "epoch": 5.4621184919210055, "step": 76060}, {"loss": 0.6067, "grad_norm": 0.8235608339309692, "learning_rate": 0.0002, "epoch": 5.4628366247755835, "step": 76070}, {"loss": 0.5908, "grad_norm": 0.9801425337791443, "learning_rate": 0.0002, "epoch": 5.4635547576301615, "step": 76080}, {"loss": 0.6126, "grad_norm": 0.9001221060752869, "learning_rate": 0.0002, "epoch": 5.4642728904847395, "step": 76090}, {"loss": 0.6682, "grad_norm": 0.9292157888412476, "learning_rate": 0.0002, "epoch": 5.4649910233393175, "step": 76100}, {"loss": 0.6412, "grad_norm": 1.0024322271347046, "learning_rate": 0.0002, "epoch": 5.4657091561938955, "step": 76110}, {"loss": 0.5398, "grad_norm": 0.8057159781455994, "learning_rate": 0.0002, "epoch": 5.4664272890484735, "step": 76120}, {"loss": 0.5881, "grad_norm": 1.0617927312850952, "learning_rate": 0.0002, "epoch": 5.467145421903052, "step": 76130}, {"loss": 0.598, "grad_norm": 1.003967046737671, "learning_rate": 0.0002, "epoch": 5.46786355475763, "step": 76140}, {"loss": 0.5427, "grad_norm": 0.903408944606781, "learning_rate": 0.0002, "epoch": 5.468581687612208, "step": 76150}, {"loss": 0.5884, "grad_norm": 0.8173895478248596, "learning_rate": 0.0002, "epoch": 5.469299820466786, "step": 76160}, {"loss": 0.5526, "grad_norm": 1.0187482833862305, "learning_rate": 0.0002, "epoch": 5.470017953321364, "step": 76170}, {"loss": 0.5392, "grad_norm": 1.0418041944503784, "learning_rate": 0.0002, "epoch": 5.470736086175942, "step": 76180}, {"loss": 0.5761, "grad_norm": 0.9768357872962952, "learning_rate": 0.0002, "epoch": 5.47145421903052, "step": 76190}, {"loss": 0.5595, "grad_norm": 1.0834382772445679, "learning_rate": 0.0002, "epoch": 5.472172351885098, "step": 76200}, {"loss": 0.5783, "grad_norm": 0.8447439670562744, "learning_rate": 0.0002, "epoch": 5.472890484739676, "step": 76210}, {"loss": 0.5695, "grad_norm": 0.9379050135612488, "learning_rate": 0.0002, "epoch": 5.473608617594255, "step": 76220}, {"loss": 0.6053, "grad_norm": 1.0395485162734985, "learning_rate": 0.0002, "epoch": 5.474326750448833, "step": 76230}, {"loss": 0.5587, "grad_norm": 1.2082624435424805, "learning_rate": 0.0002, "epoch": 5.475044883303411, "step": 76240}, {"loss": 0.5891, "grad_norm": 1.0714443922042847, "learning_rate": 0.0002, "epoch": 5.475763016157989, "step": 76250}, {"loss": 0.5819, "grad_norm": 0.945319414138794, "learning_rate": 0.0002, "epoch": 5.476481149012567, "step": 76260}, {"loss": 0.5791, "grad_norm": 1.1415241956710815, "learning_rate": 0.0002, "epoch": 5.477199281867145, "step": 76270}, {"loss": 0.5586, "grad_norm": 0.9221673011779785, "learning_rate": 0.0002, "epoch": 5.477917414721723, "step": 76280}, {"loss": 0.5999, "grad_norm": 1.0118398666381836, "learning_rate": 0.0002, "epoch": 5.478635547576301, "step": 76290}, {"loss": 0.621, "grad_norm": 1.396807312965393, "learning_rate": 0.0002, "epoch": 5.479353680430879, "step": 76300}, {"loss": 0.5808, "grad_norm": 1.0437991619110107, "learning_rate": 0.0002, "epoch": 5.480071813285457, "step": 76310}, {"loss": 0.5846, "grad_norm": 1.5910401344299316, "learning_rate": 0.0002, "epoch": 5.480789946140036, "step": 76320}, {"loss": 0.6047, "grad_norm": 0.9262010455131531, "learning_rate": 0.0002, "epoch": 5.481508078994614, "step": 76330}, {"loss": 0.6079, "grad_norm": 1.2534247636795044, "learning_rate": 0.0002, "epoch": 5.482226211849192, "step": 76340}, {"loss": 0.5918, "grad_norm": 1.186294674873352, "learning_rate": 0.0002, "epoch": 5.48294434470377, "step": 76350}, {"loss": 0.5957, "grad_norm": 0.9822857975959778, "learning_rate": 0.0002, "epoch": 5.483662477558348, "step": 76360}, {"loss": 0.5427, "grad_norm": 1.0006381273269653, "learning_rate": 0.0002, "epoch": 5.484380610412926, "step": 76370}, {"loss": 0.5893, "grad_norm": 0.8960304260253906, "learning_rate": 0.0002, "epoch": 5.485098743267504, "step": 76380}, {"loss": 0.5515, "grad_norm": 0.7309539914131165, "learning_rate": 0.0002, "epoch": 5.485816876122082, "step": 76390}, {"loss": 0.5796, "grad_norm": 0.9747139811515808, "learning_rate": 0.0002, "epoch": 5.486535008976661, "step": 76400}, {"loss": 0.5898, "grad_norm": 0.9586864113807678, "learning_rate": 0.0002, "epoch": 5.487253141831239, "step": 76410}, {"loss": 0.6236, "grad_norm": 1.0815327167510986, "learning_rate": 0.0002, "epoch": 5.487971274685817, "step": 76420}, {"loss": 0.5923, "grad_norm": 1.1324117183685303, "learning_rate": 0.0002, "epoch": 5.488689407540395, "step": 76430}, {"loss": 0.5904, "grad_norm": 0.8575648069381714, "learning_rate": 0.0002, "epoch": 5.489407540394973, "step": 76440}, {"loss": 0.5477, "grad_norm": 0.9821682572364807, "learning_rate": 0.0002, "epoch": 5.490125673249551, "step": 76450}, {"loss": 0.5821, "grad_norm": 1.1611464023590088, "learning_rate": 0.0002, "epoch": 5.490843806104129, "step": 76460}, {"loss": 0.5227, "grad_norm": 1.0340297222137451, "learning_rate": 0.0002, "epoch": 5.491561938958707, "step": 76470}, {"loss": 0.6143, "grad_norm": 1.0116628408432007, "learning_rate": 0.0002, "epoch": 5.492280071813285, "step": 76480}, {"loss": 0.5968, "grad_norm": 0.9619752764701843, "learning_rate": 0.0002, "epoch": 5.492998204667863, "step": 76490}, {"loss": 0.5898, "grad_norm": 0.9924456477165222, "learning_rate": 0.0002, "epoch": 5.493716337522442, "step": 76500}, {"loss": 0.6041, "grad_norm": 0.9449224472045898, "learning_rate": 0.0002, "epoch": 5.49443447037702, "step": 76510}, {"loss": 0.5902, "grad_norm": 0.9075009822845459, "learning_rate": 0.0002, "epoch": 5.495152603231598, "step": 76520}, {"loss": 0.5602, "grad_norm": 1.3078763484954834, "learning_rate": 0.0002, "epoch": 5.495870736086176, "step": 76530}, {"loss": 0.5474, "grad_norm": 1.3162729740142822, "learning_rate": 0.0002, "epoch": 5.496588868940754, "step": 76540}, {"loss": 0.5938, "grad_norm": 1.144333839416504, "learning_rate": 0.0002, "epoch": 5.497307001795332, "step": 76550}, {"loss": 0.6105, "grad_norm": 0.9332208633422852, "learning_rate": 0.0002, "epoch": 5.49802513464991, "step": 76560}, {"loss": 0.5795, "grad_norm": 0.9660165309906006, "learning_rate": 0.0002, "epoch": 5.498743267504488, "step": 76570}, {"loss": 0.6023, "grad_norm": 1.0954749584197998, "learning_rate": 0.0002, "epoch": 5.499461400359066, "step": 76580}, {"loss": 0.5583, "grad_norm": 1.0537810325622559, "learning_rate": 0.0002, "epoch": 5.500179533213645, "step": 76590}, {"loss": 0.5976, "grad_norm": 0.9944321513175964, "learning_rate": 0.0002, "epoch": 5.500897666068223, "step": 76600}, {"loss": 0.5622, "grad_norm": 1.094462513923645, "learning_rate": 0.0002, "epoch": 5.501615798922801, "step": 76610}, {"loss": 0.6031, "grad_norm": 1.0246481895446777, "learning_rate": 0.0002, "epoch": 5.502333931777379, "step": 76620}, {"loss": 0.6211, "grad_norm": 0.9705453515052795, "learning_rate": 0.0002, "epoch": 5.503052064631957, "step": 76630}, {"loss": 0.6118, "grad_norm": 1.5252249240875244, "learning_rate": 0.0002, "epoch": 5.503770197486535, "step": 76640}, {"loss": 0.6351, "grad_norm": 0.8469606637954712, "learning_rate": 0.0002, "epoch": 5.504488330341113, "step": 76650}, {"loss": 0.6125, "grad_norm": 1.1882504224777222, "learning_rate": 0.0002, "epoch": 5.505206463195691, "step": 76660}, {"loss": 0.612, "grad_norm": 0.8447994589805603, "learning_rate": 0.0002, "epoch": 5.505924596050269, "step": 76670}, {"loss": 0.6233, "grad_norm": 0.9340696930885315, "learning_rate": 0.0002, "epoch": 5.506642728904847, "step": 76680}, {"loss": 0.5655, "grad_norm": 0.9622383713722229, "learning_rate": 0.0002, "epoch": 5.507360861759426, "step": 76690}, {"loss": 0.6346, "grad_norm": 1.1516523361206055, "learning_rate": 0.0002, "epoch": 5.508078994614004, "step": 76700}, {"loss": 0.5675, "grad_norm": 1.207190990447998, "learning_rate": 0.0002, "epoch": 5.508797127468582, "step": 76710}, {"loss": 0.5614, "grad_norm": 1.1244179010391235, "learning_rate": 0.0002, "epoch": 5.50951526032316, "step": 76720}, {"loss": 0.531, "grad_norm": 1.052288293838501, "learning_rate": 0.0002, "epoch": 5.510233393177738, "step": 76730}, {"loss": 0.5977, "grad_norm": 0.9571291208267212, "learning_rate": 0.0002, "epoch": 5.510951526032316, "step": 76740}, {"loss": 0.5974, "grad_norm": 0.9449458122253418, "learning_rate": 0.0002, "epoch": 5.511669658886894, "step": 76750}, {"loss": 0.59, "grad_norm": 1.0140511989593506, "learning_rate": 0.0002, "epoch": 5.512387791741472, "step": 76760}, {"loss": 0.5992, "grad_norm": 1.057715654373169, "learning_rate": 0.0002, "epoch": 5.513105924596051, "step": 76770}, {"loss": 0.5643, "grad_norm": 0.930642306804657, "learning_rate": 0.0002, "epoch": 5.513824057450629, "step": 76780}, {"loss": 0.5695, "grad_norm": 1.1213828325271606, "learning_rate": 0.0002, "epoch": 5.514542190305207, "step": 76790}, {"loss": 0.584, "grad_norm": 0.9147387742996216, "learning_rate": 0.0002, "epoch": 5.515260323159785, "step": 76800}, {"loss": 0.5759, "grad_norm": 1.1786983013153076, "learning_rate": 0.0002, "epoch": 5.515978456014363, "step": 76810}, {"loss": 0.5762, "grad_norm": 1.1022626161575317, "learning_rate": 0.0002, "epoch": 5.516696588868941, "step": 76820}, {"loss": 0.5795, "grad_norm": 1.0389000177383423, "learning_rate": 0.0002, "epoch": 5.517414721723519, "step": 76830}, {"loss": 0.5932, "grad_norm": 1.0750621557235718, "learning_rate": 0.0002, "epoch": 5.518132854578097, "step": 76840}, {"loss": 0.6177, "grad_norm": 1.0372626781463623, "learning_rate": 0.0002, "epoch": 5.518850987432675, "step": 76850}, {"loss": 0.5659, "grad_norm": 1.0989108085632324, "learning_rate": 0.0002, "epoch": 5.519569120287253, "step": 76860}, {"loss": 0.5525, "grad_norm": 1.030346155166626, "learning_rate": 0.0002, "epoch": 5.520287253141831, "step": 76870}, {"loss": 0.6669, "grad_norm": 1.1362419128417969, "learning_rate": 0.0002, "epoch": 5.52100538599641, "step": 76880}, {"loss": 0.5951, "grad_norm": 0.9110873937606812, "learning_rate": 0.0002, "epoch": 5.521723518850988, "step": 76890}, {"loss": 0.6161, "grad_norm": 1.0214358568191528, "learning_rate": 0.0002, "epoch": 5.522441651705566, "step": 76900}, {"loss": 0.6055, "grad_norm": 1.3764830827713013, "learning_rate": 0.0002, "epoch": 5.523159784560144, "step": 76910}, {"loss": 0.5822, "grad_norm": 1.0396335124969482, "learning_rate": 0.0002, "epoch": 5.523877917414722, "step": 76920}, {"loss": 0.6262, "grad_norm": 1.1942898035049438, "learning_rate": 0.0002, "epoch": 5.5245960502693, "step": 76930}, {"loss": 0.5927, "grad_norm": 0.8795760869979858, "learning_rate": 0.0002, "epoch": 5.525314183123878, "step": 76940}, {"loss": 0.5788, "grad_norm": 1.1081048250198364, "learning_rate": 0.0002, "epoch": 5.526032315978456, "step": 76950}, {"loss": 0.6101, "grad_norm": 0.9652274250984192, "learning_rate": 0.0002, "epoch": 5.526750448833035, "step": 76960}, {"loss": 0.6382, "grad_norm": 0.96559739112854, "learning_rate": 0.0002, "epoch": 5.527468581687613, "step": 76970}, {"loss": 0.6412, "grad_norm": 1.0416076183319092, "learning_rate": 0.0002, "epoch": 5.528186714542191, "step": 76980}, {"loss": 0.6027, "grad_norm": 0.9854229092597961, "learning_rate": 0.0002, "epoch": 5.528904847396769, "step": 76990}, {"loss": 0.6306, "grad_norm": 1.0515462160110474, "learning_rate": 0.0002, "epoch": 5.529622980251347, "step": 77000}, {"loss": 0.5783, "grad_norm": 1.0287327766418457, "learning_rate": 0.0002, "epoch": 5.530341113105925, "step": 77010}, {"loss": 0.6038, "grad_norm": 0.9579883217811584, "learning_rate": 0.0002, "epoch": 5.531059245960503, "step": 77020}, {"loss": 0.5908, "grad_norm": 1.0365805625915527, "learning_rate": 0.0002, "epoch": 5.531777378815081, "step": 77030}, {"loss": 0.5564, "grad_norm": 1.1600725650787354, "learning_rate": 0.0002, "epoch": 5.532495511669659, "step": 77040}, {"loss": 0.6147, "grad_norm": 0.8598031401634216, "learning_rate": 0.0002, "epoch": 5.533213644524237, "step": 77050}, {"loss": 0.5648, "grad_norm": 0.8884791731834412, "learning_rate": 0.0002, "epoch": 5.533931777378815, "step": 77060}, {"loss": 0.5559, "grad_norm": 0.900223433971405, "learning_rate": 0.0002, "epoch": 5.5346499102333935, "step": 77070}, {"loss": 0.5725, "grad_norm": 1.0212652683258057, "learning_rate": 0.0002, "epoch": 5.5353680430879715, "step": 77080}, {"loss": 0.6645, "grad_norm": 1.0924701690673828, "learning_rate": 0.0002, "epoch": 5.5360861759425495, "step": 77090}, {"loss": 0.5957, "grad_norm": 1.1955485343933105, "learning_rate": 0.0002, "epoch": 5.5368043087971275, "step": 77100}, {"loss": 0.5855, "grad_norm": 1.2157706022262573, "learning_rate": 0.0002, "epoch": 5.5375224416517055, "step": 77110}, {"loss": 0.6067, "grad_norm": 1.1118255853652954, "learning_rate": 0.0002, "epoch": 5.5382405745062835, "step": 77120}, {"loss": 0.5813, "grad_norm": 1.0146820545196533, "learning_rate": 0.0002, "epoch": 5.5389587073608615, "step": 77130}, {"loss": 0.6004, "grad_norm": 1.0876632928848267, "learning_rate": 0.0002, "epoch": 5.5396768402154395, "step": 77140}, {"loss": 0.5934, "grad_norm": 0.7914495468139648, "learning_rate": 0.0002, "epoch": 5.540394973070018, "step": 77150}, {"loss": 0.5666, "grad_norm": 1.0584027767181396, "learning_rate": 0.0002, "epoch": 5.541113105924596, "step": 77160}, {"loss": 0.523, "grad_norm": 0.9816845059394836, "learning_rate": 0.0002, "epoch": 5.541831238779174, "step": 77170}, {"loss": 0.5487, "grad_norm": 1.219076156616211, "learning_rate": 0.0002, "epoch": 5.542549371633752, "step": 77180}, {"loss": 0.639, "grad_norm": 0.9526635408401489, "learning_rate": 0.0002, "epoch": 5.54326750448833, "step": 77190}, {"loss": 0.5849, "grad_norm": 0.8437230587005615, "learning_rate": 0.0002, "epoch": 5.543985637342908, "step": 77200}, {"loss": 0.5858, "grad_norm": 0.9670451283454895, "learning_rate": 0.0002, "epoch": 5.544703770197486, "step": 77210}, {"loss": 0.559, "grad_norm": 1.015687346458435, "learning_rate": 0.0002, "epoch": 5.545421903052064, "step": 77220}, {"loss": 0.6065, "grad_norm": 0.8280553817749023, "learning_rate": 0.0002, "epoch": 5.546140035906642, "step": 77230}, {"loss": 0.5999, "grad_norm": 1.1320816278457642, "learning_rate": 0.0002, "epoch": 5.54685816876122, "step": 77240}, {"loss": 0.5894, "grad_norm": 1.3338711261749268, "learning_rate": 0.0002, "epoch": 5.547576301615799, "step": 77250}, {"loss": 0.591, "grad_norm": 0.9553194642066956, "learning_rate": 0.0002, "epoch": 5.548294434470377, "step": 77260}, {"loss": 0.6286, "grad_norm": 1.0604912042617798, "learning_rate": 0.0002, "epoch": 5.549012567324955, "step": 77270}, {"loss": 0.6362, "grad_norm": 1.1037590503692627, "learning_rate": 0.0002, "epoch": 5.549730700179533, "step": 77280}, {"loss": 0.6021, "grad_norm": 1.166212558746338, "learning_rate": 0.0002, "epoch": 5.550448833034111, "step": 77290}, {"loss": 0.5624, "grad_norm": 1.0189802646636963, "learning_rate": 0.0002, "epoch": 5.551166965888689, "step": 77300}, {"loss": 0.5998, "grad_norm": 0.9592387080192566, "learning_rate": 0.0002, "epoch": 5.551885098743267, "step": 77310}, {"loss": 0.609, "grad_norm": 0.9533785581588745, "learning_rate": 0.0002, "epoch": 5.552603231597845, "step": 77320}, {"loss": 0.5879, "grad_norm": 0.9666807055473328, "learning_rate": 0.0002, "epoch": 5.553321364452424, "step": 77330}, {"loss": 0.6049, "grad_norm": 0.8827478289604187, "learning_rate": 0.0002, "epoch": 5.554039497307002, "step": 77340}, {"loss": 0.5644, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 5.55475763016158, "step": 77350}, {"loss": 0.6083, "grad_norm": 1.14597487449646, "learning_rate": 0.0002, "epoch": 5.555475763016158, "step": 77360}, {"loss": 0.6025, "grad_norm": 1.009392499923706, "learning_rate": 0.0002, "epoch": 5.556193895870736, "step": 77370}, {"loss": 0.6141, "grad_norm": 1.115757942199707, "learning_rate": 0.0002, "epoch": 5.556912028725314, "step": 77380}, {"loss": 0.5538, "grad_norm": 0.9907452464103699, "learning_rate": 0.0002, "epoch": 5.557630161579892, "step": 77390}, {"loss": 0.6142, "grad_norm": 1.0667012929916382, "learning_rate": 0.0002, "epoch": 5.55834829443447, "step": 77400}, {"loss": 0.5728, "grad_norm": 0.9301251173019409, "learning_rate": 0.0002, "epoch": 5.559066427289048, "step": 77410}, {"loss": 0.6174, "grad_norm": 1.090384602546692, "learning_rate": 0.0002, "epoch": 5.559784560143626, "step": 77420}, {"loss": 0.5802, "grad_norm": 0.8073469996452332, "learning_rate": 0.0002, "epoch": 5.560502692998204, "step": 77430}, {"loss": 0.5757, "grad_norm": 1.1003652811050415, "learning_rate": 0.0002, "epoch": 5.561220825852783, "step": 77440}, {"loss": 0.5899, "grad_norm": 0.9493791460990906, "learning_rate": 0.0002, "epoch": 5.561938958707361, "step": 77450}, {"loss": 0.6029, "grad_norm": 0.925388514995575, "learning_rate": 0.0002, "epoch": 5.562657091561939, "step": 77460}, {"loss": 0.5893, "grad_norm": 1.0946427583694458, "learning_rate": 0.0002, "epoch": 5.563375224416517, "step": 77470}, {"loss": 0.58, "grad_norm": 0.9791404008865356, "learning_rate": 0.0002, "epoch": 5.564093357271095, "step": 77480}, {"loss": 0.5887, "grad_norm": 1.0534733533859253, "learning_rate": 0.0002, "epoch": 5.564811490125673, "step": 77490}, {"loss": 0.564, "grad_norm": 0.9351776242256165, "learning_rate": 0.0002, "epoch": 5.565529622980251, "step": 77500}, {"loss": 0.5489, "grad_norm": 1.004448413848877, "learning_rate": 0.0002, "epoch": 5.566247755834829, "step": 77510}, {"loss": 0.5717, "grad_norm": 1.0199403762817383, "learning_rate": 0.0002, "epoch": 5.566965888689408, "step": 77520}, {"loss": 0.6358, "grad_norm": 1.0693204402923584, "learning_rate": 0.0002, "epoch": 5.567684021543986, "step": 77530}, {"loss": 0.5896, "grad_norm": 1.0635178089141846, "learning_rate": 0.0002, "epoch": 5.568402154398564, "step": 77540}, {"loss": 0.6399, "grad_norm": 1.1154648065567017, "learning_rate": 0.0002, "epoch": 5.569120287253142, "step": 77550}, {"loss": 0.5748, "grad_norm": 0.999116837978363, "learning_rate": 0.0002, "epoch": 5.56983842010772, "step": 77560}, {"loss": 0.6159, "grad_norm": 0.9967397451400757, "learning_rate": 0.0002, "epoch": 5.570556552962298, "step": 77570}, {"loss": 0.6041, "grad_norm": 0.9684699773788452, "learning_rate": 0.0002, "epoch": 5.571274685816876, "step": 77580}, {"loss": 0.5876, "grad_norm": 1.027213454246521, "learning_rate": 0.0002, "epoch": 5.571992818671454, "step": 77590}, {"loss": 0.6631, "grad_norm": 1.0571194887161255, "learning_rate": 0.0002, "epoch": 5.572710951526032, "step": 77600}, {"loss": 0.5927, "grad_norm": 1.2010499238967896, "learning_rate": 0.0002, "epoch": 5.57342908438061, "step": 77610}, {"loss": 0.5962, "grad_norm": 1.1033680438995361, "learning_rate": 0.0002, "epoch": 5.574147217235188, "step": 77620}, {"loss": 0.5668, "grad_norm": 0.9394578337669373, "learning_rate": 0.0002, "epoch": 5.574865350089767, "step": 77630}, {"loss": 0.6018, "grad_norm": 1.379382610321045, "learning_rate": 0.0002, "epoch": 5.575583482944345, "step": 77640}, {"loss": 0.5921, "grad_norm": 0.9787197709083557, "learning_rate": 0.0002, "epoch": 5.576301615798923, "step": 77650}, {"loss": 0.569, "grad_norm": 0.9680284261703491, "learning_rate": 0.0002, "epoch": 5.577019748653501, "step": 77660}, {"loss": 0.5761, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.577737881508079, "step": 77670}, {"loss": 0.5835, "grad_norm": 1.1243085861206055, "learning_rate": 0.0002, "epoch": 5.578456014362657, "step": 77680}, {"loss": 0.5873, "grad_norm": 0.9228966236114502, "learning_rate": 0.0002, "epoch": 5.579174147217235, "step": 77690}, {"loss": 0.5888, "grad_norm": 1.1349890232086182, "learning_rate": 0.0002, "epoch": 5.579892280071813, "step": 77700}, {"loss": 0.6272, "grad_norm": 1.2248499393463135, "learning_rate": 0.0002, "epoch": 5.580610412926392, "step": 77710}, {"loss": 0.5734, "grad_norm": 1.0066324472427368, "learning_rate": 0.0002, "epoch": 5.58132854578097, "step": 77720}, {"loss": 0.6047, "grad_norm": 1.2642878293991089, "learning_rate": 0.0002, "epoch": 5.582046678635548, "step": 77730}, {"loss": 0.5946, "grad_norm": 1.031591534614563, "learning_rate": 0.0002, "epoch": 5.582764811490126, "step": 77740}, {"loss": 0.5743, "grad_norm": 1.0925929546356201, "learning_rate": 0.0002, "epoch": 5.583482944344704, "step": 77750}, {"loss": 0.6113, "grad_norm": 1.0567110776901245, "learning_rate": 0.0002, "epoch": 5.584201077199282, "step": 77760}, {"loss": 0.5523, "grad_norm": 1.246246099472046, "learning_rate": 0.0002, "epoch": 5.58491921005386, "step": 77770}, {"loss": 0.5934, "grad_norm": 1.2467739582061768, "learning_rate": 0.0002, "epoch": 5.585637342908438, "step": 77780}, {"loss": 0.6211, "grad_norm": 1.2695211172103882, "learning_rate": 0.0002, "epoch": 5.586355475763016, "step": 77790}, {"loss": 0.5824, "grad_norm": 1.0498571395874023, "learning_rate": 0.0002, "epoch": 5.587073608617594, "step": 77800}, {"loss": 0.5545, "grad_norm": 1.0078339576721191, "learning_rate": 0.0002, "epoch": 5.587791741472173, "step": 77810}, {"loss": 0.5995, "grad_norm": 1.108199954032898, "learning_rate": 0.0002, "epoch": 5.588509874326751, "step": 77820}, {"loss": 0.5716, "grad_norm": 1.0577641725540161, "learning_rate": 0.0002, "epoch": 5.589228007181329, "step": 77830}, {"loss": 0.6106, "grad_norm": 1.2169439792633057, "learning_rate": 0.0002, "epoch": 5.589946140035907, "step": 77840}, {"loss": 0.563, "grad_norm": 0.8310868740081787, "learning_rate": 0.0002, "epoch": 5.590664272890485, "step": 77850}, {"loss": 0.5749, "grad_norm": 0.9794082045555115, "learning_rate": 0.0002, "epoch": 5.591382405745063, "step": 77860}, {"loss": 0.6025, "grad_norm": 0.8867404460906982, "learning_rate": 0.0002, "epoch": 5.592100538599641, "step": 77870}, {"loss": 0.5581, "grad_norm": 0.9204208254814148, "learning_rate": 0.0002, "epoch": 5.592818671454219, "step": 77880}, {"loss": 0.5646, "grad_norm": 0.9801714420318604, "learning_rate": 0.0002, "epoch": 5.593536804308797, "step": 77890}, {"loss": 0.6036, "grad_norm": 0.9383925199508667, "learning_rate": 0.0002, "epoch": 5.594254937163376, "step": 77900}, {"loss": 0.6417, "grad_norm": 0.9124664068222046, "learning_rate": 0.0002, "epoch": 5.594973070017954, "step": 77910}, {"loss": 0.559, "grad_norm": 0.9618783593177795, "learning_rate": 0.0002, "epoch": 5.595691202872532, "step": 77920}, {"loss": 0.604, "grad_norm": 0.9575216770172119, "learning_rate": 0.0002, "epoch": 5.59640933572711, "step": 77930}, {"loss": 0.5987, "grad_norm": 1.1223464012145996, "learning_rate": 0.0002, "epoch": 5.597127468581688, "step": 77940}, {"loss": 0.615, "grad_norm": 0.9947475790977478, "learning_rate": 0.0002, "epoch": 5.597845601436266, "step": 77950}, {"loss": 0.5618, "grad_norm": 1.141959309577942, "learning_rate": 0.0002, "epoch": 5.598563734290844, "step": 77960}, {"loss": 0.5966, "grad_norm": 1.095525860786438, "learning_rate": 0.0002, "epoch": 5.599281867145422, "step": 77970}, {"loss": 0.5619, "grad_norm": 0.9396624565124512, "learning_rate": 0.0002, "epoch": 5.6, "step": 77980}, {"loss": 0.5549, "grad_norm": 0.8162274956703186, "learning_rate": 0.0002, "epoch": 5.600718132854578, "step": 77990}, {"loss": 0.5815, "grad_norm": 1.0130535364151, "learning_rate": 0.0002, "epoch": 5.6014362657091565, "step": 78000}, {"loss": 0.5891, "grad_norm": 1.0016634464263916, "learning_rate": 0.0002, "epoch": 5.6021543985637345, "step": 78010}, {"loss": 0.6029, "grad_norm": 0.8936169743537903, "learning_rate": 0.0002, "epoch": 5.6028725314183125, "step": 78020}, {"loss": 0.6284, "grad_norm": 1.169625163078308, "learning_rate": 0.0002, "epoch": 5.6035906642728905, "step": 78030}, {"loss": 0.6038, "grad_norm": 0.8896323442459106, "learning_rate": 0.0002, "epoch": 5.6043087971274685, "step": 78040}, {"loss": 0.6219, "grad_norm": 1.0939475297927856, "learning_rate": 0.0002, "epoch": 5.6050269299820465, "step": 78050}, {"loss": 0.6009, "grad_norm": 1.0880711078643799, "learning_rate": 0.0002, "epoch": 5.6057450628366245, "step": 78060}, {"loss": 0.6416, "grad_norm": 1.1426655054092407, "learning_rate": 0.0002, "epoch": 5.6064631956912026, "step": 78070}, {"loss": 0.6124, "grad_norm": 1.118586540222168, "learning_rate": 0.0002, "epoch": 5.607181328545781, "step": 78080}, {"loss": 0.5791, "grad_norm": 0.8784464597702026, "learning_rate": 0.0002, "epoch": 5.607899461400359, "step": 78090}, {"loss": 0.6385, "grad_norm": 1.137229561805725, "learning_rate": 0.0002, "epoch": 5.608617594254937, "step": 78100}, {"loss": 0.5998, "grad_norm": 1.1041932106018066, "learning_rate": 0.0002, "epoch": 5.6093357271095154, "step": 78110}, {"loss": 0.5985, "grad_norm": 1.0170503854751587, "learning_rate": 0.0002, "epoch": 5.6100538599640934, "step": 78120}, {"loss": 0.6376, "grad_norm": 1.298754334449768, "learning_rate": 0.0002, "epoch": 5.6107719928186714, "step": 78130}, {"loss": 0.6284, "grad_norm": 0.9344905018806458, "learning_rate": 0.0002, "epoch": 5.6114901256732495, "step": 78140}, {"loss": 0.5835, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.6122082585278275, "step": 78150}, {"loss": 0.5661, "grad_norm": 1.0617443323135376, "learning_rate": 0.0002, "epoch": 5.6129263913824055, "step": 78160}, {"loss": 0.5908, "grad_norm": 0.9017760753631592, "learning_rate": 0.0002, "epoch": 5.6136445242369835, "step": 78170}, {"loss": 0.5701, "grad_norm": 1.152601957321167, "learning_rate": 0.0002, "epoch": 5.6143626570915615, "step": 78180}, {"loss": 0.6319, "grad_norm": 0.9889463186264038, "learning_rate": 0.0002, "epoch": 5.61508078994614, "step": 78190}, {"loss": 0.5733, "grad_norm": 1.0367393493652344, "learning_rate": 0.0002, "epoch": 5.615798922800718, "step": 78200}, {"loss": 0.5785, "grad_norm": 0.8466457724571228, "learning_rate": 0.0002, "epoch": 5.616517055655296, "step": 78210}, {"loss": 0.563, "grad_norm": 0.936083197593689, "learning_rate": 0.0002, "epoch": 5.617235188509874, "step": 78220}, {"loss": 0.6077, "grad_norm": 1.018784999847412, "learning_rate": 0.0002, "epoch": 5.617953321364452, "step": 78230}, {"loss": 0.5676, "grad_norm": 0.8527804017066956, "learning_rate": 0.0002, "epoch": 5.61867145421903, "step": 78240}, {"loss": 0.5721, "grad_norm": 1.1873106956481934, "learning_rate": 0.0002, "epoch": 5.619389587073608, "step": 78250}, {"loss": 0.5905, "grad_norm": 0.9401728510856628, "learning_rate": 0.0002, "epoch": 5.620107719928186, "step": 78260}, {"loss": 0.5986, "grad_norm": 1.0801159143447876, "learning_rate": 0.0002, "epoch": 5.620825852782765, "step": 78270}, {"loss": 0.5769, "grad_norm": 1.0053739547729492, "learning_rate": 0.0002, "epoch": 5.621543985637343, "step": 78280}, {"loss": 0.5907, "grad_norm": 0.8599331378936768, "learning_rate": 0.0002, "epoch": 5.622262118491921, "step": 78290}, {"loss": 0.5689, "grad_norm": 2.3157296180725098, "learning_rate": 0.0002, "epoch": 5.622980251346499, "step": 78300}, {"loss": 0.5749, "grad_norm": 1.0027490854263306, "learning_rate": 0.0002, "epoch": 5.623698384201077, "step": 78310}, {"loss": 0.5452, "grad_norm": 0.996688961982727, "learning_rate": 0.0002, "epoch": 5.624416517055655, "step": 78320}, {"loss": 0.5979, "grad_norm": 1.0462113618850708, "learning_rate": 0.0002, "epoch": 5.625134649910233, "step": 78330}, {"loss": 0.5547, "grad_norm": 0.8750988245010376, "learning_rate": 0.0002, "epoch": 5.625852782764811, "step": 78340}, {"loss": 0.6076, "grad_norm": 0.8078145384788513, "learning_rate": 0.0002, "epoch": 5.626570915619389, "step": 78350}, {"loss": 0.6431, "grad_norm": 0.9047532081604004, "learning_rate": 0.0002, "epoch": 5.627289048473967, "step": 78360}, {"loss": 0.6027, "grad_norm": 0.9784479737281799, "learning_rate": 0.0002, "epoch": 5.628007181328546, "step": 78370}, {"loss": 0.6005, "grad_norm": 0.9529541730880737, "learning_rate": 0.0002, "epoch": 5.628725314183124, "step": 78380}, {"loss": 0.6057, "grad_norm": 0.8264740109443665, "learning_rate": 0.0002, "epoch": 5.629443447037702, "step": 78390}, {"loss": 0.5991, "grad_norm": 1.049724817276001, "learning_rate": 0.0002, "epoch": 5.63016157989228, "step": 78400}, {"loss": 0.5637, "grad_norm": 0.9866746068000793, "learning_rate": 0.0002, "epoch": 5.630879712746858, "step": 78410}, {"loss": 0.5622, "grad_norm": 0.897155225276947, "learning_rate": 0.0002, "epoch": 5.631597845601436, "step": 78420}, {"loss": 0.5838, "grad_norm": 1.225464940071106, "learning_rate": 0.0002, "epoch": 5.632315978456014, "step": 78430}, {"loss": 0.5928, "grad_norm": 0.8793753981590271, "learning_rate": 0.0002, "epoch": 5.633034111310592, "step": 78440}, {"loss": 0.6009, "grad_norm": 1.082482099533081, "learning_rate": 0.0002, "epoch": 5.63375224416517, "step": 78450}, {"loss": 0.6546, "grad_norm": 1.054064393043518, "learning_rate": 0.0002, "epoch": 5.634470377019749, "step": 78460}, {"loss": 0.5795, "grad_norm": 1.0032247304916382, "learning_rate": 0.0002, "epoch": 5.635188509874327, "step": 78470}, {"loss": 0.5697, "grad_norm": 0.8544651865959167, "learning_rate": 0.0002, "epoch": 5.635906642728905, "step": 78480}, {"loss": 0.6196, "grad_norm": 0.9475075602531433, "learning_rate": 0.0002, "epoch": 5.636624775583483, "step": 78490}, {"loss": 0.5975, "grad_norm": 1.0814138650894165, "learning_rate": 0.0002, "epoch": 5.637342908438061, "step": 78500}, {"loss": 0.5853, "grad_norm": 1.0813153982162476, "learning_rate": 0.0002, "epoch": 5.638061041292639, "step": 78510}, {"loss": 0.5806, "grad_norm": 1.0225616693496704, "learning_rate": 0.0002, "epoch": 5.638779174147217, "step": 78520}, {"loss": 0.5913, "grad_norm": 1.0777465105056763, "learning_rate": 0.0002, "epoch": 5.639497307001795, "step": 78530}, {"loss": 0.6207, "grad_norm": 1.156148910522461, "learning_rate": 0.0002, "epoch": 5.640215439856373, "step": 78540}, {"loss": 0.5843, "grad_norm": 1.0147465467453003, "learning_rate": 0.0002, "epoch": 5.640933572710951, "step": 78550}, {"loss": 0.6045, "grad_norm": 0.9606683850288391, "learning_rate": 0.0002, "epoch": 5.64165170556553, "step": 78560}, {"loss": 0.6457, "grad_norm": 0.9478723406791687, "learning_rate": 0.0002, "epoch": 5.642369838420108, "step": 78570}, {"loss": 0.5502, "grad_norm": 1.0653880834579468, "learning_rate": 0.0002, "epoch": 5.643087971274686, "step": 78580}, {"loss": 0.5938, "grad_norm": 1.7519923448562622, "learning_rate": 0.0002, "epoch": 5.643806104129264, "step": 78590}, {"loss": 0.6015, "grad_norm": 1.0567299127578735, "learning_rate": 0.0002, "epoch": 5.644524236983842, "step": 78600}, {"loss": 0.6329, "grad_norm": 0.8980287909507751, "learning_rate": 0.0002, "epoch": 5.64524236983842, "step": 78610}, {"loss": 0.6319, "grad_norm": 0.8792264461517334, "learning_rate": 0.0002, "epoch": 5.645960502692998, "step": 78620}, {"loss": 0.6234, "grad_norm": 1.2306275367736816, "learning_rate": 0.0002, "epoch": 5.646678635547576, "step": 78630}, {"loss": 0.5567, "grad_norm": 0.8259932398796082, "learning_rate": 0.0002, "epoch": 5.647396768402155, "step": 78640}, {"loss": 0.5484, "grad_norm": 0.9605076313018799, "learning_rate": 0.0002, "epoch": 5.648114901256733, "step": 78650}, {"loss": 0.5934, "grad_norm": 0.9967419505119324, "learning_rate": 0.0002, "epoch": 5.648833034111311, "step": 78660}, {"loss": 0.5755, "grad_norm": 0.9774024486541748, "learning_rate": 0.0002, "epoch": 5.649551166965889, "step": 78670}, {"loss": 0.6079, "grad_norm": 0.9838066697120667, "learning_rate": 0.0002, "epoch": 5.650269299820467, "step": 78680}, {"loss": 0.5674, "grad_norm": 1.1617798805236816, "learning_rate": 0.0002, "epoch": 5.650987432675045, "step": 78690}, {"loss": 0.6252, "grad_norm": 1.075006365776062, "learning_rate": 0.0002, "epoch": 5.651705565529623, "step": 78700}, {"loss": 0.5404, "grad_norm": 0.8859893679618835, "learning_rate": 0.0002, "epoch": 5.652423698384201, "step": 78710}, {"loss": 0.5657, "grad_norm": 1.0774717330932617, "learning_rate": 0.0002, "epoch": 5.653141831238779, "step": 78720}, {"loss": 0.625, "grad_norm": 1.147273302078247, "learning_rate": 0.0002, "epoch": 5.653859964093357, "step": 78730}, {"loss": 0.5819, "grad_norm": 1.1403213739395142, "learning_rate": 0.0002, "epoch": 5.654578096947935, "step": 78740}, {"loss": 0.5721, "grad_norm": 0.9115353226661682, "learning_rate": 0.0002, "epoch": 5.655296229802514, "step": 78750}, {"loss": 0.5521, "grad_norm": 0.9303002953529358, "learning_rate": 0.0002, "epoch": 5.656014362657092, "step": 78760}, {"loss": 0.6078, "grad_norm": 0.9324957728385925, "learning_rate": 0.0002, "epoch": 5.65673249551167, "step": 78770}, {"loss": 0.589, "grad_norm": 0.9688063859939575, "learning_rate": 0.0002, "epoch": 5.657450628366248, "step": 78780}, {"loss": 0.614, "grad_norm": 0.9019638299942017, "learning_rate": 0.0002, "epoch": 5.658168761220826, "step": 78790}, {"loss": 0.5594, "grad_norm": 0.8236798048019409, "learning_rate": 0.0002, "epoch": 5.658886894075404, "step": 78800}, {"loss": 0.6074, "grad_norm": 1.2702386379241943, "learning_rate": 0.0002, "epoch": 5.659605026929982, "step": 78810}, {"loss": 0.5738, "grad_norm": 1.041077971458435, "learning_rate": 0.0002, "epoch": 5.66032315978456, "step": 78820}, {"loss": 0.5773, "grad_norm": 0.9028838276863098, "learning_rate": 0.0002, "epoch": 5.661041292639139, "step": 78830}, {"loss": 0.5871, "grad_norm": 0.9874144196510315, "learning_rate": 0.0002, "epoch": 5.661759425493717, "step": 78840}, {"loss": 0.6039, "grad_norm": 0.9633761048316956, "learning_rate": 0.0002, "epoch": 5.662477558348295, "step": 78850}, {"loss": 0.5794, "grad_norm": 0.9069564342498779, "learning_rate": 0.0002, "epoch": 5.663195691202873, "step": 78860}, {"loss": 0.5836, "grad_norm": 0.9560621976852417, "learning_rate": 0.0002, "epoch": 5.663913824057451, "step": 78870}, {"loss": 0.579, "grad_norm": 0.9941161870956421, "learning_rate": 0.0002, "epoch": 5.664631956912029, "step": 78880}, {"loss": 0.6184, "grad_norm": 0.920407235622406, "learning_rate": 0.0002, "epoch": 5.665350089766607, "step": 78890}, {"loss": 0.6223, "grad_norm": 0.9909250140190125, "learning_rate": 0.0002, "epoch": 5.666068222621185, "step": 78900}, {"loss": 0.6154, "grad_norm": 0.9528568983078003, "learning_rate": 0.0002, "epoch": 5.666786355475763, "step": 78910}, {"loss": 0.6153, "grad_norm": 1.041440725326538, "learning_rate": 0.0002, "epoch": 5.667504488330341, "step": 78920}, {"loss": 0.609, "grad_norm": 1.0072191953659058, "learning_rate": 0.0002, "epoch": 5.66822262118492, "step": 78930}, {"loss": 0.6136, "grad_norm": 1.0740574598312378, "learning_rate": 0.0002, "epoch": 5.668940754039498, "step": 78940}, {"loss": 0.583, "grad_norm": 0.9168822169303894, "learning_rate": 0.0002, "epoch": 5.669658886894076, "step": 78950}, {"loss": 0.5808, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 5.670377019748654, "step": 78960}, {"loss": 0.6584, "grad_norm": 1.1925201416015625, "learning_rate": 0.0002, "epoch": 5.671095152603232, "step": 78970}, {"loss": 0.6074, "grad_norm": 0.879940390586853, "learning_rate": 0.0002, "epoch": 5.67181328545781, "step": 78980}, {"loss": 0.5863, "grad_norm": 1.0998331308364868, "learning_rate": 0.0002, "epoch": 5.672531418312388, "step": 78990}, {"loss": 0.5688, "grad_norm": 1.076637625694275, "learning_rate": 0.0002, "epoch": 5.673249551166966, "step": 79000}, {"loss": 0.6183, "grad_norm": 1.076864242553711, "learning_rate": 0.0002, "epoch": 5.673967684021544, "step": 79010}, {"loss": 0.6031, "grad_norm": 1.0206586122512817, "learning_rate": 0.0002, "epoch": 5.6746858168761225, "step": 79020}, {"loss": 0.5658, "grad_norm": 0.8242515325546265, "learning_rate": 0.0002, "epoch": 5.6754039497307005, "step": 79030}, {"loss": 0.5782, "grad_norm": 1.1180634498596191, "learning_rate": 0.0002, "epoch": 5.6761220825852785, "step": 79040}, {"loss": 0.6039, "grad_norm": 1.0155152082443237, "learning_rate": 0.0002, "epoch": 5.6768402154398565, "step": 79050}, {"loss": 0.5877, "grad_norm": 1.0445241928100586, "learning_rate": 0.0002, "epoch": 5.6775583482944345, "step": 79060}, {"loss": 0.5809, "grad_norm": 0.9851725697517395, "learning_rate": 0.0002, "epoch": 5.6782764811490125, "step": 79070}, {"loss": 0.5807, "grad_norm": 0.9979640245437622, "learning_rate": 0.0002, "epoch": 5.6789946140035905, "step": 79080}, {"loss": 0.6049, "grad_norm": 1.0398952960968018, "learning_rate": 0.0002, "epoch": 5.6797127468581685, "step": 79090}, {"loss": 0.6279, "grad_norm": 1.094164252281189, "learning_rate": 0.0002, "epoch": 5.6804308797127465, "step": 79100}, {"loss": 0.6325, "grad_norm": 0.9546816945075989, "learning_rate": 0.0002, "epoch": 5.6811490125673245, "step": 79110}, {"loss": 0.5658, "grad_norm": 1.1635938882827759, "learning_rate": 0.0002, "epoch": 5.681867145421903, "step": 79120}, {"loss": 0.5849, "grad_norm": 1.0260306596755981, "learning_rate": 0.0002, "epoch": 5.682585278276481, "step": 79130}, {"loss": 0.5653, "grad_norm": 0.9900122284889221, "learning_rate": 0.0002, "epoch": 5.683303411131059, "step": 79140}, {"loss": 0.6107, "grad_norm": 1.049688458442688, "learning_rate": 0.0002, "epoch": 5.684021543985637, "step": 79150}, {"loss": 0.5887, "grad_norm": 1.124272108078003, "learning_rate": 0.0002, "epoch": 5.684739676840215, "step": 79160}, {"loss": 0.5695, "grad_norm": 1.1109849214553833, "learning_rate": 0.0002, "epoch": 5.685457809694793, "step": 79170}, {"loss": 0.6014, "grad_norm": 0.739007830619812, "learning_rate": 0.0002, "epoch": 5.686175942549371, "step": 79180}, {"loss": 0.5995, "grad_norm": 1.2063007354736328, "learning_rate": 0.0002, "epoch": 5.686894075403949, "step": 79190}, {"loss": 0.5563, "grad_norm": 1.223317265510559, "learning_rate": 0.0002, "epoch": 5.687612208258528, "step": 79200}, {"loss": 0.6017, "grad_norm": 0.8042855858802795, "learning_rate": 0.0002, "epoch": 5.688330341113106, "step": 79210}, {"loss": 0.5909, "grad_norm": 0.9294175505638123, "learning_rate": 0.0002, "epoch": 5.689048473967684, "step": 79220}, {"loss": 0.6091, "grad_norm": 0.978084146976471, "learning_rate": 0.0002, "epoch": 5.689766606822262, "step": 79230}, {"loss": 0.6094, "grad_norm": 0.9271620512008667, "learning_rate": 0.0002, "epoch": 5.69048473967684, "step": 79240}, {"loss": 0.6454, "grad_norm": 1.158677339553833, "learning_rate": 0.0002, "epoch": 5.691202872531418, "step": 79250}, {"loss": 0.6054, "grad_norm": 0.9468576312065125, "learning_rate": 0.0002, "epoch": 5.691921005385996, "step": 79260}, {"loss": 0.6094, "grad_norm": 1.2025824785232544, "learning_rate": 0.0002, "epoch": 5.692639138240574, "step": 79270}, {"loss": 0.5995, "grad_norm": 1.0167860984802246, "learning_rate": 0.0002, "epoch": 5.693357271095152, "step": 79280}, {"loss": 0.5596, "grad_norm": 0.971199631690979, "learning_rate": 0.0002, "epoch": 5.69407540394973, "step": 79290}, {"loss": 0.6051, "grad_norm": 1.1757864952087402, "learning_rate": 0.0002, "epoch": 5.694793536804308, "step": 79300}, {"loss": 0.5915, "grad_norm": 1.0199662446975708, "learning_rate": 0.0002, "epoch": 5.695511669658887, "step": 79310}, {"loss": 0.5654, "grad_norm": 0.9662485122680664, "learning_rate": 0.0002, "epoch": 5.696229802513465, "step": 79320}, {"loss": 0.5602, "grad_norm": 0.9324414134025574, "learning_rate": 0.0002, "epoch": 5.696947935368043, "step": 79330}, {"loss": 0.5939, "grad_norm": 0.855752170085907, "learning_rate": 0.0002, "epoch": 5.697666068222621, "step": 79340}, {"loss": 0.6202, "grad_norm": 1.2723703384399414, "learning_rate": 0.0002, "epoch": 5.698384201077199, "step": 79350}, {"loss": 0.6028, "grad_norm": 1.0254011154174805, "learning_rate": 0.0002, "epoch": 5.699102333931777, "step": 79360}, {"loss": 0.5853, "grad_norm": 1.0958263874053955, "learning_rate": 0.0002, "epoch": 5.699820466786355, "step": 79370}, {"loss": 0.6292, "grad_norm": 1.0214145183563232, "learning_rate": 0.0002, "epoch": 5.700538599640933, "step": 79380}, {"loss": 0.6576, "grad_norm": 1.1087455749511719, "learning_rate": 0.0002, "epoch": 5.701256732495512, "step": 79390}, {"loss": 0.576, "grad_norm": 0.8885074853897095, "learning_rate": 0.0002, "epoch": 5.70197486535009, "step": 79400}, {"loss": 0.5452, "grad_norm": 0.9854450821876526, "learning_rate": 0.0002, "epoch": 5.702692998204668, "step": 79410}, {"loss": 0.5903, "grad_norm": 0.858744204044342, "learning_rate": 0.0002, "epoch": 5.703411131059246, "step": 79420}, {"loss": 0.5975, "grad_norm": 0.9434788823127747, "learning_rate": 0.0002, "epoch": 5.704129263913824, "step": 79430}, {"loss": 0.648, "grad_norm": 1.1388801336288452, "learning_rate": 0.0002, "epoch": 5.704847396768402, "step": 79440}, {"loss": 0.5895, "grad_norm": 1.0701899528503418, "learning_rate": 0.0002, "epoch": 5.70556552962298, "step": 79450}, {"loss": 0.5697, "grad_norm": 0.9147594571113586, "learning_rate": 0.0002, "epoch": 5.706283662477558, "step": 79460}, {"loss": 0.6043, "grad_norm": 1.055008053779602, "learning_rate": 0.0002, "epoch": 5.707001795332136, "step": 79470}, {"loss": 0.5625, "grad_norm": 0.7841609716415405, "learning_rate": 0.0002, "epoch": 5.707719928186714, "step": 79480}, {"loss": 0.6048, "grad_norm": 1.0334571599960327, "learning_rate": 0.0002, "epoch": 5.708438061041292, "step": 79490}, {"loss": 0.5924, "grad_norm": 1.2841367721557617, "learning_rate": 0.0002, "epoch": 5.709156193895871, "step": 79500}, {"loss": 0.5957, "grad_norm": 1.0296638011932373, "learning_rate": 0.0002, "epoch": 5.709874326750449, "step": 79510}, {"loss": 0.6015, "grad_norm": 0.9161922931671143, "learning_rate": 0.0002, "epoch": 5.710592459605027, "step": 79520}, {"loss": 0.6056, "grad_norm": 1.056856632232666, "learning_rate": 0.0002, "epoch": 5.711310592459605, "step": 79530}, {"loss": 0.5762, "grad_norm": 0.9919893145561218, "learning_rate": 0.0002, "epoch": 5.712028725314183, "step": 79540}, {"loss": 0.5987, "grad_norm": 1.1128891706466675, "learning_rate": 0.0002, "epoch": 5.712746858168761, "step": 79550}, {"loss": 0.5835, "grad_norm": 1.1171997785568237, "learning_rate": 0.0002, "epoch": 5.713464991023339, "step": 79560}, {"loss": 0.6037, "grad_norm": 0.9389346837997437, "learning_rate": 0.0002, "epoch": 5.714183123877917, "step": 79570}, {"loss": 0.5805, "grad_norm": 0.9869245886802673, "learning_rate": 0.0002, "epoch": 5.714901256732496, "step": 79580}, {"loss": 0.5776, "grad_norm": 0.9019966721534729, "learning_rate": 0.0002, "epoch": 5.715619389587074, "step": 79590}, {"loss": 0.567, "grad_norm": 0.9791252017021179, "learning_rate": 0.0002, "epoch": 5.716337522441652, "step": 79600}, {"loss": 0.5817, "grad_norm": 1.0269849300384521, "learning_rate": 0.0002, "epoch": 5.71705565529623, "step": 79610}, {"loss": 0.602, "grad_norm": 1.0340129137039185, "learning_rate": 0.0002, "epoch": 5.717773788150808, "step": 79620}, {"loss": 0.5969, "grad_norm": 0.9742604494094849, "learning_rate": 0.0002, "epoch": 5.718491921005386, "step": 79630}, {"loss": 0.5945, "grad_norm": 1.126868724822998, "learning_rate": 0.0002, "epoch": 5.719210053859964, "step": 79640}, {"loss": 0.601, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 5.719928186714542, "step": 79650}, {"loss": 0.6071, "grad_norm": 0.8300277590751648, "learning_rate": 0.0002, "epoch": 5.72064631956912, "step": 79660}, {"loss": 0.6121, "grad_norm": 0.8482570052146912, "learning_rate": 0.0002, "epoch": 5.721364452423698, "step": 79670}, {"loss": 0.5937, "grad_norm": 1.0777807235717773, "learning_rate": 0.0002, "epoch": 5.722082585278277, "step": 79680}, {"loss": 0.5739, "grad_norm": 1.2682723999023438, "learning_rate": 0.0002, "epoch": 5.722800718132855, "step": 79690}, {"loss": 0.5759, "grad_norm": 0.8742772340774536, "learning_rate": 0.0002, "epoch": 5.723518850987433, "step": 79700}, {"loss": 0.5839, "grad_norm": 0.9218387603759766, "learning_rate": 0.0002, "epoch": 5.724236983842011, "step": 79710}, {"loss": 0.5968, "grad_norm": 0.8977975845336914, "learning_rate": 0.0002, "epoch": 5.724955116696589, "step": 79720}, {"loss": 0.5743, "grad_norm": 1.0873085260391235, "learning_rate": 0.0002, "epoch": 5.725673249551167, "step": 79730}, {"loss": 0.5986, "grad_norm": 0.9811807870864868, "learning_rate": 0.0002, "epoch": 5.726391382405745, "step": 79740}, {"loss": 0.5881, "grad_norm": 0.926764965057373, "learning_rate": 0.0002, "epoch": 5.727109515260323, "step": 79750}, {"loss": 0.5738, "grad_norm": 1.0103713274002075, "learning_rate": 0.0002, "epoch": 5.727827648114902, "step": 79760}, {"loss": 0.5807, "grad_norm": 1.1389189958572388, "learning_rate": 0.0002, "epoch": 5.72854578096948, "step": 79770}, {"loss": 0.636, "grad_norm": 1.1654961109161377, "learning_rate": 0.0002, "epoch": 5.729263913824058, "step": 79780}, {"loss": 0.5863, "grad_norm": 0.7925996780395508, "learning_rate": 0.0002, "epoch": 5.729982046678636, "step": 79790}, {"loss": 0.6005, "grad_norm": 1.3329131603240967, "learning_rate": 0.0002, "epoch": 5.730700179533214, "step": 79800}, {"loss": 0.6295, "grad_norm": 1.158328890800476, "learning_rate": 0.0002, "epoch": 5.731418312387792, "step": 79810}, {"loss": 0.5832, "grad_norm": 0.9904412031173706, "learning_rate": 0.0002, "epoch": 5.73213644524237, "step": 79820}, {"loss": 0.582, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.732854578096948, "step": 79830}, {"loss": 0.6135, "grad_norm": 1.0224473476409912, "learning_rate": 0.0002, "epoch": 5.733572710951526, "step": 79840}, {"loss": 0.6063, "grad_norm": 1.0482215881347656, "learning_rate": 0.0002, "epoch": 5.734290843806104, "step": 79850}, {"loss": 0.5792, "grad_norm": 0.9790018200874329, "learning_rate": 0.0002, "epoch": 5.735008976660682, "step": 79860}, {"loss": 0.6089, "grad_norm": 1.034548044204712, "learning_rate": 0.0002, "epoch": 5.735727109515261, "step": 79870}, {"loss": 0.5676, "grad_norm": 0.799286961555481, "learning_rate": 0.0002, "epoch": 5.736445242369839, "step": 79880}, {"loss": 0.5344, "grad_norm": 1.0119048357009888, "learning_rate": 0.0002, "epoch": 5.737163375224417, "step": 79890}, {"loss": 0.5859, "grad_norm": 0.9742264151573181, "learning_rate": 0.0002, "epoch": 5.737881508078995, "step": 79900}, {"loss": 0.5992, "grad_norm": 1.0408239364624023, "learning_rate": 0.0002, "epoch": 5.738599640933573, "step": 79910}, {"loss": 0.6009, "grad_norm": 0.9165748953819275, "learning_rate": 0.0002, "epoch": 5.739317773788151, "step": 79920}, {"loss": 0.5864, "grad_norm": 1.1859451532363892, "learning_rate": 0.0002, "epoch": 5.740035906642729, "step": 79930}, {"loss": 0.5948, "grad_norm": 0.8772084712982178, "learning_rate": 0.0002, "epoch": 5.740754039497307, "step": 79940}, {"loss": 0.5614, "grad_norm": 1.0123273134231567, "learning_rate": 0.0002, "epoch": 5.741472172351886, "step": 79950}, {"loss": 0.6405, "grad_norm": 1.1873936653137207, "learning_rate": 0.0002, "epoch": 5.742190305206464, "step": 79960}, {"loss": 0.5818, "grad_norm": 0.9065699577331543, "learning_rate": 0.0002, "epoch": 5.742908438061042, "step": 79970}, {"loss": 0.6068, "grad_norm": 1.1626464128494263, "learning_rate": 0.0002, "epoch": 5.74362657091562, "step": 79980}, {"loss": 0.5814, "grad_norm": 1.0311716794967651, "learning_rate": 0.0002, "epoch": 5.744344703770198, "step": 79990}, {"loss": 0.5752, "grad_norm": 1.0865558385849, "learning_rate": 0.0002, "epoch": 5.745062836624776, "step": 80000}, {"loss": 0.6477, "grad_norm": 1.0257176160812378, "learning_rate": 0.0002, "epoch": 5.745780969479354, "step": 80010}, {"loss": 0.6172, "grad_norm": 0.9805439710617065, "learning_rate": 0.0002, "epoch": 5.746499102333932, "step": 80020}, {"loss": 0.5949, "grad_norm": 0.9744977355003357, "learning_rate": 0.0002, "epoch": 5.74721723518851, "step": 80030}, {"loss": 0.5893, "grad_norm": 1.302816390991211, "learning_rate": 0.0002, "epoch": 5.747935368043088, "step": 80040}, {"loss": 0.5653, "grad_norm": 0.8866990208625793, "learning_rate": 0.0002, "epoch": 5.748653500897666, "step": 80050}, {"loss": 0.5648, "grad_norm": 1.0133726596832275, "learning_rate": 0.0002, "epoch": 5.7493716337522445, "step": 80060}, {"loss": 0.6016, "grad_norm": 1.0043569803237915, "learning_rate": 0.0002, "epoch": 5.7500897666068225, "step": 80070}, {"loss": 0.6493, "grad_norm": 0.9100040197372437, "learning_rate": 0.0002, "epoch": 5.7508078994614005, "step": 80080}, {"loss": 0.5469, "grad_norm": 0.7994180917739868, "learning_rate": 0.0002, "epoch": 5.7515260323159785, "step": 80090}, {"loss": 0.6521, "grad_norm": 1.120188593864441, "learning_rate": 0.0002, "epoch": 5.7522441651705565, "step": 80100}, {"loss": 0.5737, "grad_norm": 0.9555420279502869, "learning_rate": 0.0002, "epoch": 5.7529622980251345, "step": 80110}, {"loss": 0.5897, "grad_norm": 1.0305951833724976, "learning_rate": 0.0002, "epoch": 5.7536804308797125, "step": 80120}, {"loss": 0.5821, "grad_norm": 0.9632731676101685, "learning_rate": 0.0002, "epoch": 5.7543985637342905, "step": 80130}, {"loss": 0.5618, "grad_norm": 1.2654297351837158, "learning_rate": 0.0002, "epoch": 5.755116696588869, "step": 80140}, {"loss": 0.6044, "grad_norm": 1.027190089225769, "learning_rate": 0.0002, "epoch": 5.755834829443447, "step": 80150}, {"loss": 0.6131, "grad_norm": 0.9829175472259521, "learning_rate": 0.0002, "epoch": 5.756552962298025, "step": 80160}, {"loss": 0.609, "grad_norm": 1.083803653717041, "learning_rate": 0.0002, "epoch": 5.757271095152603, "step": 80170}, {"loss": 0.6134, "grad_norm": 0.9353913068771362, "learning_rate": 0.0002, "epoch": 5.757989228007181, "step": 80180}, {"loss": 0.6515, "grad_norm": 1.1824370622634888, "learning_rate": 0.0002, "epoch": 5.758707360861759, "step": 80190}, {"loss": 0.6012, "grad_norm": 1.0901048183441162, "learning_rate": 0.0002, "epoch": 5.759425493716337, "step": 80200}, {"loss": 0.5639, "grad_norm": 1.0389254093170166, "learning_rate": 0.0002, "epoch": 5.760143626570915, "step": 80210}, {"loss": 0.6085, "grad_norm": 0.9746400117874146, "learning_rate": 0.0002, "epoch": 5.760861759425493, "step": 80220}, {"loss": 0.5874, "grad_norm": 0.9319248795509338, "learning_rate": 0.0002, "epoch": 5.761579892280071, "step": 80230}, {"loss": 0.5726, "grad_norm": 1.152784824371338, "learning_rate": 0.0002, "epoch": 5.76229802513465, "step": 80240}, {"loss": 0.5998, "grad_norm": 0.9462733864784241, "learning_rate": 0.0002, "epoch": 5.763016157989228, "step": 80250}, {"loss": 0.5755, "grad_norm": 0.8884182572364807, "learning_rate": 0.0002, "epoch": 5.763734290843806, "step": 80260}, {"loss": 0.5864, "grad_norm": 0.8755964636802673, "learning_rate": 0.0002, "epoch": 5.764452423698384, "step": 80270}, {"loss": 0.5659, "grad_norm": 0.8983452320098877, "learning_rate": 0.0002, "epoch": 5.765170556552962, "step": 80280}, {"loss": 0.5799, "grad_norm": 0.8565991520881653, "learning_rate": 0.0002, "epoch": 5.76588868940754, "step": 80290}, {"loss": 0.598, "grad_norm": 1.0557159185409546, "learning_rate": 0.0002, "epoch": 5.766606822262118, "step": 80300}, {"loss": 0.6441, "grad_norm": 1.057214379310608, "learning_rate": 0.0002, "epoch": 5.767324955116696, "step": 80310}, {"loss": 0.6038, "grad_norm": 0.9852516055107117, "learning_rate": 0.0002, "epoch": 5.768043087971275, "step": 80320}, {"loss": 0.5676, "grad_norm": 1.0339698791503906, "learning_rate": 0.0002, "epoch": 5.768761220825853, "step": 80330}, {"loss": 0.5963, "grad_norm": 1.0056889057159424, "learning_rate": 0.0002, "epoch": 5.769479353680431, "step": 80340}, {"loss": 0.5588, "grad_norm": 1.0941663980484009, "learning_rate": 0.0002, "epoch": 5.770197486535009, "step": 80350}, {"loss": 0.5729, "grad_norm": 1.2145589590072632, "learning_rate": 0.0002, "epoch": 5.770915619389587, "step": 80360}, {"loss": 0.5819, "grad_norm": 0.9609606862068176, "learning_rate": 0.0002, "epoch": 5.771633752244165, "step": 80370}, {"loss": 0.6313, "grad_norm": 0.8815773129463196, "learning_rate": 0.0002, "epoch": 5.772351885098743, "step": 80380}, {"loss": 0.6046, "grad_norm": 1.2630987167358398, "learning_rate": 0.0002, "epoch": 5.773070017953321, "step": 80390}, {"loss": 0.5918, "grad_norm": 1.0605450868606567, "learning_rate": 0.0002, "epoch": 5.773788150807899, "step": 80400}, {"loss": 0.6074, "grad_norm": 1.165069341659546, "learning_rate": 0.0002, "epoch": 5.774506283662477, "step": 80410}, {"loss": 0.5683, "grad_norm": 0.9038028717041016, "learning_rate": 0.0002, "epoch": 5.775224416517055, "step": 80420}, {"loss": 0.6024, "grad_norm": 1.0571858882904053, "learning_rate": 0.0002, "epoch": 5.775942549371634, "step": 80430}, {"loss": 0.624, "grad_norm": 1.0388168096542358, "learning_rate": 0.0002, "epoch": 5.776660682226212, "step": 80440}, {"loss": 0.6139, "grad_norm": 1.0552119016647339, "learning_rate": 0.0002, "epoch": 5.77737881508079, "step": 80450}, {"loss": 0.5988, "grad_norm": 1.0610109567642212, "learning_rate": 0.0002, "epoch": 5.778096947935368, "step": 80460}, {"loss": 0.6264, "grad_norm": 0.9906430244445801, "learning_rate": 0.0002, "epoch": 5.778815080789946, "step": 80470}, {"loss": 0.5807, "grad_norm": 1.1511857509613037, "learning_rate": 0.0002, "epoch": 5.779533213644524, "step": 80480}, {"loss": 0.6202, "grad_norm": 1.2738412618637085, "learning_rate": 0.0002, "epoch": 5.780251346499102, "step": 80490}, {"loss": 0.5957, "grad_norm": 0.8945937752723694, "learning_rate": 0.0002, "epoch": 5.78096947935368, "step": 80500}, {"loss": 0.6049, "grad_norm": 1.1105149984359741, "learning_rate": 0.0002, "epoch": 5.781687612208259, "step": 80510}, {"loss": 0.5989, "grad_norm": 0.8432297110557556, "learning_rate": 0.0002, "epoch": 5.782405745062837, "step": 80520}, {"loss": 0.6321, "grad_norm": 0.9257984757423401, "learning_rate": 0.0002, "epoch": 5.783123877917415, "step": 80530}, {"loss": 0.6191, "grad_norm": 1.1708799600601196, "learning_rate": 0.0002, "epoch": 5.783842010771993, "step": 80540}, {"loss": 0.5465, "grad_norm": 0.9969521164894104, "learning_rate": 0.0002, "epoch": 5.784560143626571, "step": 80550}, {"loss": 0.6569, "grad_norm": 1.0361413955688477, "learning_rate": 0.0002, "epoch": 5.785278276481149, "step": 80560}, {"loss": 0.6131, "grad_norm": 0.9876393675804138, "learning_rate": 0.0002, "epoch": 5.785996409335727, "step": 80570}, {"loss": 0.5586, "grad_norm": 1.0356241464614868, "learning_rate": 0.0002, "epoch": 5.786714542190305, "step": 80580}, {"loss": 0.5647, "grad_norm": 1.178865671157837, "learning_rate": 0.0002, "epoch": 5.787432675044883, "step": 80590}, {"loss": 0.578, "grad_norm": 0.8614338636398315, "learning_rate": 0.0002, "epoch": 5.788150807899461, "step": 80600}, {"loss": 0.5916, "grad_norm": 1.020734429359436, "learning_rate": 0.0002, "epoch": 5.788868940754039, "step": 80610}, {"loss": 0.6015, "grad_norm": 1.035951852798462, "learning_rate": 0.0002, "epoch": 5.789587073608618, "step": 80620}, {"loss": 0.5838, "grad_norm": 0.898637592792511, "learning_rate": 0.0002, "epoch": 5.790305206463196, "step": 80630}, {"loss": 0.5894, "grad_norm": 0.9803016781806946, "learning_rate": 0.0002, "epoch": 5.791023339317774, "step": 80640}, {"loss": 0.5806, "grad_norm": 1.2902555465698242, "learning_rate": 0.0002, "epoch": 5.791741472172352, "step": 80650}, {"loss": 0.6136, "grad_norm": 1.3364112377166748, "learning_rate": 0.0002, "epoch": 5.79245960502693, "step": 80660}, {"loss": 0.6071, "grad_norm": 0.8553985953330994, "learning_rate": 0.0002, "epoch": 5.793177737881508, "step": 80670}, {"loss": 0.5853, "grad_norm": 0.8211889863014221, "learning_rate": 0.0002, "epoch": 5.793895870736086, "step": 80680}, {"loss": 0.5732, "grad_norm": 0.9288306832313538, "learning_rate": 0.0002, "epoch": 5.794614003590664, "step": 80690}, {"loss": 0.6241, "grad_norm": 1.0716029405593872, "learning_rate": 0.0002, "epoch": 5.795332136445243, "step": 80700}, {"loss": 0.643, "grad_norm": 0.9957329034805298, "learning_rate": 0.0002, "epoch": 5.796050269299821, "step": 80710}, {"loss": 0.5762, "grad_norm": 0.9691376090049744, "learning_rate": 0.0002, "epoch": 5.796768402154399, "step": 80720}, {"loss": 0.6227, "grad_norm": 1.0590804815292358, "learning_rate": 0.0002, "epoch": 5.797486535008977, "step": 80730}, {"loss": 0.59, "grad_norm": 1.0408968925476074, "learning_rate": 0.0002, "epoch": 5.798204667863555, "step": 80740}, {"loss": 0.5656, "grad_norm": 1.0249526500701904, "learning_rate": 0.0002, "epoch": 5.798922800718133, "step": 80750}, {"loss": 0.5991, "grad_norm": 1.3658806085586548, "learning_rate": 0.0002, "epoch": 5.799640933572711, "step": 80760}, {"loss": 0.5671, "grad_norm": 0.9562603831291199, "learning_rate": 0.0002, "epoch": 5.800359066427289, "step": 80770}, {"loss": 0.5929, "grad_norm": 0.8790915012359619, "learning_rate": 0.0002, "epoch": 5.801077199281867, "step": 80780}, {"loss": 0.5864, "grad_norm": 0.8351004123687744, "learning_rate": 0.0002, "epoch": 5.801795332136445, "step": 80790}, {"loss": 0.5544, "grad_norm": 0.964562714099884, "learning_rate": 0.0002, "epoch": 5.802513464991024, "step": 80800}, {"loss": 0.6388, "grad_norm": 1.0873116254806519, "learning_rate": 0.0002, "epoch": 5.803231597845602, "step": 80810}, {"loss": 0.5891, "grad_norm": 0.9821216464042664, "learning_rate": 0.0002, "epoch": 5.80394973070018, "step": 80820}, {"loss": 0.631, "grad_norm": 1.1158807277679443, "learning_rate": 0.0002, "epoch": 5.804667863554758, "step": 80830}, {"loss": 0.6068, "grad_norm": 1.0098856687545776, "learning_rate": 0.0002, "epoch": 5.805385996409336, "step": 80840}, {"loss": 0.6112, "grad_norm": 0.9628035426139832, "learning_rate": 0.0002, "epoch": 5.806104129263914, "step": 80850}, {"loss": 0.6003, "grad_norm": 1.133800983428955, "learning_rate": 0.0002, "epoch": 5.806822262118492, "step": 80860}, {"loss": 0.5802, "grad_norm": 0.9423992037773132, "learning_rate": 0.0002, "epoch": 5.80754039497307, "step": 80870}, {"loss": 0.5729, "grad_norm": 1.0758612155914307, "learning_rate": 0.0002, "epoch": 5.808258527827648, "step": 80880}, {"loss": 0.586, "grad_norm": 1.232029914855957, "learning_rate": 0.0002, "epoch": 5.808976660682227, "step": 80890}, {"loss": 0.5932, "grad_norm": 1.1063108444213867, "learning_rate": 0.0002, "epoch": 5.809694793536805, "step": 80900}, {"loss": 0.5627, "grad_norm": 0.9759877920150757, "learning_rate": 0.0002, "epoch": 5.810412926391383, "step": 80910}, {"loss": 0.6169, "grad_norm": 0.9180193543434143, "learning_rate": 0.0002, "epoch": 5.811131059245961, "step": 80920}, {"loss": 0.6198, "grad_norm": 1.0818052291870117, "learning_rate": 0.0002, "epoch": 5.811849192100539, "step": 80930}, {"loss": 0.5997, "grad_norm": 0.998986542224884, "learning_rate": 0.0002, "epoch": 5.812567324955117, "step": 80940}, {"loss": 0.6183, "grad_norm": 1.1549060344696045, "learning_rate": 0.0002, "epoch": 5.813285457809695, "step": 80950}, {"loss": 0.5858, "grad_norm": 1.1900213956832886, "learning_rate": 0.0002, "epoch": 5.814003590664273, "step": 80960}, {"loss": 0.6249, "grad_norm": 0.8114368915557861, "learning_rate": 0.0002, "epoch": 5.814721723518851, "step": 80970}, {"loss": 0.6199, "grad_norm": 1.0296406745910645, "learning_rate": 0.0002, "epoch": 5.815439856373429, "step": 80980}, {"loss": 0.6226, "grad_norm": 1.0466746091842651, "learning_rate": 0.0002, "epoch": 5.8161579892280075, "step": 80990}, {"loss": 0.6303, "grad_norm": 1.0524508953094482, "learning_rate": 0.0002, "epoch": 5.8168761220825855, "step": 81000}, {"loss": 0.5708, "grad_norm": 1.1588358879089355, "learning_rate": 0.0002, "epoch": 5.8175942549371635, "step": 81010}, {"loss": 0.5818, "grad_norm": 0.9378601908683777, "learning_rate": 0.0002, "epoch": 5.8183123877917415, "step": 81020}, {"loss": 0.6404, "grad_norm": 0.9486441612243652, "learning_rate": 0.0002, "epoch": 5.8190305206463195, "step": 81030}, {"loss": 0.566, "grad_norm": 0.9805227518081665, "learning_rate": 0.0002, "epoch": 5.8197486535008975, "step": 81040}, {"loss": 0.6025, "grad_norm": 1.1627717018127441, "learning_rate": 0.0002, "epoch": 5.8204667863554755, "step": 81050}, {"loss": 0.5954, "grad_norm": 1.0716841220855713, "learning_rate": 0.0002, "epoch": 5.8211849192100535, "step": 81060}, {"loss": 0.6045, "grad_norm": 1.2398899793624878, "learning_rate": 0.0002, "epoch": 5.821903052064632, "step": 81070}, {"loss": 0.5813, "grad_norm": 1.0934730768203735, "learning_rate": 0.0002, "epoch": 5.82262118491921, "step": 81080}, {"loss": 0.5601, "grad_norm": 0.9701796174049377, "learning_rate": 0.0002, "epoch": 5.823339317773788, "step": 81090}, {"loss": 0.6493, "grad_norm": 1.0218969583511353, "learning_rate": 0.0002, "epoch": 5.824057450628366, "step": 81100}, {"loss": 0.6121, "grad_norm": 1.3066465854644775, "learning_rate": 0.0002, "epoch": 5.824775583482944, "step": 81110}, {"loss": 0.6145, "grad_norm": 1.1067441701889038, "learning_rate": 0.0002, "epoch": 5.825493716337522, "step": 81120}, {"loss": 0.5959, "grad_norm": 0.9750344753265381, "learning_rate": 0.0002, "epoch": 5.8262118491921004, "step": 81130}, {"loss": 0.6192, "grad_norm": 1.129191279411316, "learning_rate": 0.0002, "epoch": 5.8269299820466784, "step": 81140}, {"loss": 0.6191, "grad_norm": 1.05964195728302, "learning_rate": 0.0002, "epoch": 5.8276481149012564, "step": 81150}, {"loss": 0.6353, "grad_norm": 1.1094872951507568, "learning_rate": 0.0002, "epoch": 5.8283662477558345, "step": 81160}, {"loss": 0.5835, "grad_norm": 0.9163196086883545, "learning_rate": 0.0002, "epoch": 5.8290843806104125, "step": 81170}, {"loss": 0.6513, "grad_norm": 1.0035687685012817, "learning_rate": 0.0002, "epoch": 5.829802513464991, "step": 81180}, {"loss": 0.5948, "grad_norm": 1.0353461503982544, "learning_rate": 0.0002, "epoch": 5.830520646319569, "step": 81190}, {"loss": 0.602, "grad_norm": 1.0566555261611938, "learning_rate": 0.0002, "epoch": 5.831238779174147, "step": 81200}, {"loss": 0.6086, "grad_norm": 1.2373290061950684, "learning_rate": 0.0002, "epoch": 5.831956912028725, "step": 81210}, {"loss": 0.6054, "grad_norm": 0.8818837404251099, "learning_rate": 0.0002, "epoch": 5.832675044883303, "step": 81220}, {"loss": 0.604, "grad_norm": 1.1024713516235352, "learning_rate": 0.0002, "epoch": 5.833393177737881, "step": 81230}, {"loss": 0.6649, "grad_norm": 1.2478809356689453, "learning_rate": 0.0002, "epoch": 5.834111310592459, "step": 81240}, {"loss": 0.584, "grad_norm": 0.8647364377975464, "learning_rate": 0.0002, "epoch": 5.834829443447037, "step": 81250}, {"loss": 0.6089, "grad_norm": 1.1106358766555786, "learning_rate": 0.0002, "epoch": 5.835547576301616, "step": 81260}, {"loss": 0.5934, "grad_norm": 0.9432938694953918, "learning_rate": 0.0002, "epoch": 5.836265709156194, "step": 81270}, {"loss": 0.6401, "grad_norm": 1.0283797979354858, "learning_rate": 0.0002, "epoch": 5.836983842010772, "step": 81280}, {"loss": 0.6549, "grad_norm": 1.158918857574463, "learning_rate": 0.0002, "epoch": 5.83770197486535, "step": 81290}, {"loss": 0.5974, "grad_norm": 0.9700069427490234, "learning_rate": 0.0002, "epoch": 5.838420107719928, "step": 81300}, {"loss": 0.5841, "grad_norm": 1.08310866355896, "learning_rate": 0.0002, "epoch": 5.839138240574506, "step": 81310}, {"loss": 0.6234, "grad_norm": 1.05460524559021, "learning_rate": 0.0002, "epoch": 5.839856373429084, "step": 81320}, {"loss": 0.5586, "grad_norm": 0.9849268794059753, "learning_rate": 0.0002, "epoch": 5.840574506283662, "step": 81330}, {"loss": 0.5927, "grad_norm": 0.888306736946106, "learning_rate": 0.0002, "epoch": 5.84129263913824, "step": 81340}, {"loss": 0.6106, "grad_norm": 1.0337001085281372, "learning_rate": 0.0002, "epoch": 5.842010771992818, "step": 81350}, {"loss": 0.5957, "grad_norm": 1.0778567790985107, "learning_rate": 0.0002, "epoch": 5.842728904847397, "step": 81360}, {"loss": 0.5801, "grad_norm": 1.1484156847000122, "learning_rate": 0.0002, "epoch": 5.843447037701975, "step": 81370}, {"loss": 0.6348, "grad_norm": 1.0948245525360107, "learning_rate": 0.0002, "epoch": 5.844165170556553, "step": 81380}, {"loss": 0.5561, "grad_norm": 0.9363969564437866, "learning_rate": 0.0002, "epoch": 5.844883303411131, "step": 81390}, {"loss": 0.6336, "grad_norm": 1.0151013135910034, "learning_rate": 0.0002, "epoch": 5.845601436265709, "step": 81400}, {"loss": 0.6063, "grad_norm": 0.9925733804702759, "learning_rate": 0.0002, "epoch": 5.846319569120287, "step": 81410}, {"loss": 0.6512, "grad_norm": 1.0356744527816772, "learning_rate": 0.0002, "epoch": 5.847037701974865, "step": 81420}, {"loss": 0.5947, "grad_norm": 1.0633001327514648, "learning_rate": 0.0002, "epoch": 5.847755834829443, "step": 81430}, {"loss": 0.5851, "grad_norm": 0.9900460839271545, "learning_rate": 0.0002, "epoch": 5.848473967684021, "step": 81440}, {"loss": 0.6216, "grad_norm": 1.2677979469299316, "learning_rate": 0.0002, "epoch": 5.8491921005386, "step": 81450}, {"loss": 0.5633, "grad_norm": 0.8174138069152832, "learning_rate": 0.0002, "epoch": 5.849910233393178, "step": 81460}, {"loss": 0.6283, "grad_norm": 1.1986393928527832, "learning_rate": 0.0002, "epoch": 5.850628366247756, "step": 81470}, {"loss": 0.6056, "grad_norm": 1.1009358167648315, "learning_rate": 0.0002, "epoch": 5.851346499102334, "step": 81480}, {"loss": 0.6244, "grad_norm": 0.966446578502655, "learning_rate": 0.0002, "epoch": 5.852064631956912, "step": 81490}, {"loss": 0.5687, "grad_norm": 0.9657767415046692, "learning_rate": 0.0002, "epoch": 5.85278276481149, "step": 81500}, {"loss": 0.547, "grad_norm": 1.0480058193206787, "learning_rate": 0.0002, "epoch": 5.853500897666068, "step": 81510}, {"loss": 0.5737, "grad_norm": 1.2003830671310425, "learning_rate": 0.0002, "epoch": 5.854219030520646, "step": 81520}, {"loss": 0.602, "grad_norm": 0.8683754205703735, "learning_rate": 0.0002, "epoch": 5.854937163375224, "step": 81530}, {"loss": 0.5923, "grad_norm": 1.0860967636108398, "learning_rate": 0.0002, "epoch": 5.855655296229802, "step": 81540}, {"loss": 0.5959, "grad_norm": 1.0415282249450684, "learning_rate": 0.0002, "epoch": 5.856373429084381, "step": 81550}, {"loss": 0.6017, "grad_norm": 0.9897454380989075, "learning_rate": 0.0002, "epoch": 5.857091561938959, "step": 81560}, {"loss": 0.5588, "grad_norm": 1.173884630203247, "learning_rate": 0.0002, "epoch": 5.857809694793537, "step": 81570}, {"loss": 0.5715, "grad_norm": 1.2426209449768066, "learning_rate": 0.0002, "epoch": 5.858527827648115, "step": 81580}, {"loss": 0.6079, "grad_norm": 0.9390465021133423, "learning_rate": 0.0002, "epoch": 5.859245960502693, "step": 81590}, {"loss": 0.5896, "grad_norm": 1.1387195587158203, "learning_rate": 0.0002, "epoch": 5.859964093357271, "step": 81600}, {"loss": 0.6025, "grad_norm": 0.9902143478393555, "learning_rate": 0.0002, "epoch": 5.860682226211849, "step": 81610}, {"loss": 0.6197, "grad_norm": 0.8328776359558105, "learning_rate": 0.0002, "epoch": 5.861400359066427, "step": 81620}, {"loss": 0.6586, "grad_norm": 0.9837837815284729, "learning_rate": 0.0002, "epoch": 5.862118491921006, "step": 81630}, {"loss": 0.5793, "grad_norm": 1.0013370513916016, "learning_rate": 0.0002, "epoch": 5.862836624775584, "step": 81640}, {"loss": 0.6129, "grad_norm": 0.9408028721809387, "learning_rate": 0.0002, "epoch": 5.863554757630162, "step": 81650}, {"loss": 0.572, "grad_norm": 1.093140959739685, "learning_rate": 0.0002, "epoch": 5.86427289048474, "step": 81660}, {"loss": 0.6037, "grad_norm": 0.9554300904273987, "learning_rate": 0.0002, "epoch": 5.864991023339318, "step": 81670}, {"loss": 0.6136, "grad_norm": 1.1276485919952393, "learning_rate": 0.0002, "epoch": 5.865709156193896, "step": 81680}, {"loss": 0.6072, "grad_norm": 0.9628785252571106, "learning_rate": 0.0002, "epoch": 5.866427289048474, "step": 81690}, {"loss": 0.5962, "grad_norm": 0.9844689965248108, "learning_rate": 0.0002, "epoch": 5.867145421903052, "step": 81700}, {"loss": 0.5883, "grad_norm": 0.9679856896400452, "learning_rate": 0.0002, "epoch": 5.86786355475763, "step": 81710}, {"loss": 0.6244, "grad_norm": 1.0225571393966675, "learning_rate": 0.0002, "epoch": 5.868581687612208, "step": 81720}, {"loss": 0.6132, "grad_norm": 0.9330390691757202, "learning_rate": 0.0002, "epoch": 5.869299820466786, "step": 81730}, {"loss": 0.5895, "grad_norm": 1.0584566593170166, "learning_rate": 0.0002, "epoch": 5.870017953321365, "step": 81740}, {"loss": 0.5618, "grad_norm": 0.781548023223877, "learning_rate": 0.0002, "epoch": 5.870736086175943, "step": 81750}, {"loss": 0.5651, "grad_norm": 0.8906106352806091, "learning_rate": 0.0002, "epoch": 5.871454219030521, "step": 81760}, {"loss": 0.6258, "grad_norm": 1.1402281522750854, "learning_rate": 0.0002, "epoch": 5.872172351885099, "step": 81770}, {"loss": 0.5943, "grad_norm": 0.9991076588630676, "learning_rate": 0.0002, "epoch": 5.872890484739677, "step": 81780}, {"loss": 0.6095, "grad_norm": 1.0120140314102173, "learning_rate": 0.0002, "epoch": 5.873608617594255, "step": 81790}, {"loss": 0.6114, "grad_norm": 0.8857715725898743, "learning_rate": 0.0002, "epoch": 5.874326750448833, "step": 81800}, {"loss": 0.6027, "grad_norm": 0.8531954288482666, "learning_rate": 0.0002, "epoch": 5.875044883303411, "step": 81810}, {"loss": 0.6468, "grad_norm": 1.1601015329360962, "learning_rate": 0.0002, "epoch": 5.87576301615799, "step": 81820}, {"loss": 0.643, "grad_norm": 1.1435350179672241, "learning_rate": 0.0002, "epoch": 5.876481149012568, "step": 81830}, {"loss": 0.6195, "grad_norm": 0.9526153802871704, "learning_rate": 0.0002, "epoch": 5.877199281867146, "step": 81840}, {"loss": 0.648, "grad_norm": 1.06845223903656, "learning_rate": 0.0002, "epoch": 5.877917414721724, "step": 81850}, {"loss": 0.5963, "grad_norm": 0.9239344596862793, "learning_rate": 0.0002, "epoch": 5.878635547576302, "step": 81860}, {"loss": 0.5669, "grad_norm": 0.8632398247718811, "learning_rate": 0.0002, "epoch": 5.87935368043088, "step": 81870}, {"loss": 0.5904, "grad_norm": 0.9148443341255188, "learning_rate": 0.0002, "epoch": 5.880071813285458, "step": 81880}, {"loss": 0.5554, "grad_norm": 0.9910652041435242, "learning_rate": 0.0002, "epoch": 5.880789946140036, "step": 81890}, {"loss": 0.6132, "grad_norm": 0.8335179090499878, "learning_rate": 0.0002, "epoch": 5.881508078994614, "step": 81900}, {"loss": 0.6106, "grad_norm": 0.9921387434005737, "learning_rate": 0.0002, "epoch": 5.882226211849192, "step": 81910}, {"loss": 0.6327, "grad_norm": 1.0532517433166504, "learning_rate": 0.0002, "epoch": 5.88294434470377, "step": 81920}, {"loss": 0.6071, "grad_norm": 1.026400089263916, "learning_rate": 0.0002, "epoch": 5.883662477558349, "step": 81930}, {"loss": 0.6759, "grad_norm": 1.019195318222046, "learning_rate": 0.0002, "epoch": 5.884380610412927, "step": 81940}, {"loss": 0.5922, "grad_norm": 0.987238347530365, "learning_rate": 0.0002, "epoch": 5.885098743267505, "step": 81950}, {"loss": 0.5864, "grad_norm": 1.1714487075805664, "learning_rate": 0.0002, "epoch": 5.885816876122083, "step": 81960}, {"loss": 0.6006, "grad_norm": 1.0854483842849731, "learning_rate": 0.0002, "epoch": 5.886535008976661, "step": 81970}, {"loss": 0.588, "grad_norm": 1.0678396224975586, "learning_rate": 0.0002, "epoch": 5.887253141831239, "step": 81980}, {"loss": 0.6061, "grad_norm": 1.1009471416473389, "learning_rate": 0.0002, "epoch": 5.887971274685817, "step": 81990}, {"loss": 0.6397, "grad_norm": 1.2056844234466553, "learning_rate": 0.0002, "epoch": 5.888689407540395, "step": 82000}, {"loss": 0.6018, "grad_norm": 1.131302833557129, "learning_rate": 0.0002, "epoch": 5.8894075403949735, "step": 82010}, {"loss": 0.5822, "grad_norm": 1.4466036558151245, "learning_rate": 0.0002, "epoch": 5.8901256732495515, "step": 82020}, {"loss": 0.6295, "grad_norm": 1.051228404045105, "learning_rate": 0.0002, "epoch": 5.8908438061041295, "step": 82030}, {"loss": 0.5567, "grad_norm": 1.0010617971420288, "learning_rate": 0.0002, "epoch": 5.8915619389587075, "step": 82040}, {"loss": 0.5674, "grad_norm": 0.9095138311386108, "learning_rate": 0.0002, "epoch": 5.8922800718132855, "step": 82050}, {"loss": 0.5947, "grad_norm": 1.0237005949020386, "learning_rate": 0.0002, "epoch": 5.8929982046678635, "step": 82060}, {"loss": 0.6258, "grad_norm": 1.035122036933899, "learning_rate": 0.0002, "epoch": 5.8937163375224415, "step": 82070}, {"loss": 0.5866, "grad_norm": 1.0271964073181152, "learning_rate": 0.0002, "epoch": 5.8944344703770195, "step": 82080}, {"loss": 0.637, "grad_norm": 1.2044503688812256, "learning_rate": 0.0002, "epoch": 5.8951526032315975, "step": 82090}, {"loss": 0.6356, "grad_norm": 1.0275284051895142, "learning_rate": 0.0002, "epoch": 5.8958707360861755, "step": 82100}, {"loss": 0.6216, "grad_norm": 0.9974840879440308, "learning_rate": 0.0002, "epoch": 5.896588868940754, "step": 82110}, {"loss": 0.572, "grad_norm": 1.009968638420105, "learning_rate": 0.0002, "epoch": 5.897307001795332, "step": 82120}, {"loss": 0.6432, "grad_norm": 0.8396142721176147, "learning_rate": 0.0002, "epoch": 5.89802513464991, "step": 82130}, {"loss": 0.5671, "grad_norm": 1.002354621887207, "learning_rate": 0.0002, "epoch": 5.898743267504488, "step": 82140}, {"loss": 0.565, "grad_norm": 0.9998893737792969, "learning_rate": 0.0002, "epoch": 5.899461400359066, "step": 82150}, {"loss": 0.5836, "grad_norm": 1.1027010679244995, "learning_rate": 0.0002, "epoch": 5.900179533213644, "step": 82160}, {"loss": 0.6069, "grad_norm": 1.2028530836105347, "learning_rate": 0.0002, "epoch": 5.900897666068222, "step": 82170}, {"loss": 0.6184, "grad_norm": 1.0018759965896606, "learning_rate": 0.0002, "epoch": 5.9016157989228, "step": 82180}, {"loss": 0.5866, "grad_norm": 0.8911277055740356, "learning_rate": 0.0002, "epoch": 5.902333931777379, "step": 82190}, {"loss": 0.5638, "grad_norm": 1.0172009468078613, "learning_rate": 0.0002, "epoch": 5.903052064631957, "step": 82200}, {"loss": 0.6181, "grad_norm": 1.1664029359817505, "learning_rate": 0.0002, "epoch": 5.903770197486535, "step": 82210}, {"loss": 0.5863, "grad_norm": 1.0620089769363403, "learning_rate": 0.0002, "epoch": 5.904488330341113, "step": 82220}, {"loss": 0.6175, "grad_norm": 1.0756114721298218, "learning_rate": 0.0002, "epoch": 5.905206463195691, "step": 82230}, {"loss": 0.6223, "grad_norm": 1.1727497577667236, "learning_rate": 0.0002, "epoch": 5.905924596050269, "step": 82240}, {"loss": 0.5777, "grad_norm": 0.9833515882492065, "learning_rate": 0.0002, "epoch": 5.906642728904847, "step": 82250}, {"loss": 0.6344, "grad_norm": 0.9236368536949158, "learning_rate": 0.0002, "epoch": 5.907360861759425, "step": 82260}, {"loss": 0.6301, "grad_norm": 0.9773947596549988, "learning_rate": 0.0002, "epoch": 5.908078994614003, "step": 82270}, {"loss": 0.6255, "grad_norm": 1.1427783966064453, "learning_rate": 0.0002, "epoch": 5.908797127468581, "step": 82280}, {"loss": 0.6359, "grad_norm": 1.0215164422988892, "learning_rate": 0.0002, "epoch": 5.909515260323159, "step": 82290}, {"loss": 0.631, "grad_norm": 1.1157845258712769, "learning_rate": 0.0002, "epoch": 5.910233393177738, "step": 82300}, {"loss": 0.5706, "grad_norm": 1.1490662097930908, "learning_rate": 0.0002, "epoch": 5.910951526032316, "step": 82310}, {"loss": 0.5932, "grad_norm": 0.7233976125717163, "learning_rate": 0.0002, "epoch": 5.911669658886894, "step": 82320}, {"loss": 0.6199, "grad_norm": 1.0053865909576416, "learning_rate": 0.0002, "epoch": 5.912387791741472, "step": 82330}, {"loss": 0.6283, "grad_norm": 0.9764766097068787, "learning_rate": 0.0002, "epoch": 5.91310592459605, "step": 82340}, {"loss": 0.5981, "grad_norm": 0.9492928385734558, "learning_rate": 0.0002, "epoch": 5.913824057450628, "step": 82350}, {"loss": 0.6234, "grad_norm": 0.9538891315460205, "learning_rate": 0.0002, "epoch": 5.914542190305206, "step": 82360}, {"loss": 0.6717, "grad_norm": 1.2620314359664917, "learning_rate": 0.0002, "epoch": 5.915260323159784, "step": 82370}, {"loss": 0.5956, "grad_norm": 0.9913349151611328, "learning_rate": 0.0002, "epoch": 5.915978456014363, "step": 82380}, {"loss": 0.5877, "grad_norm": 0.9712074995040894, "learning_rate": 0.0002, "epoch": 5.916696588868941, "step": 82390}, {"loss": 0.5935, "grad_norm": 1.1554654836654663, "learning_rate": 0.0002, "epoch": 5.917414721723519, "step": 82400}, {"loss": 0.5881, "grad_norm": 1.1418904066085815, "learning_rate": 0.0002, "epoch": 5.918132854578097, "step": 82410}, {"loss": 0.5472, "grad_norm": 0.9405845999717712, "learning_rate": 0.0002, "epoch": 5.918850987432675, "step": 82420}, {"loss": 0.606, "grad_norm": 1.0801819562911987, "learning_rate": 0.0002, "epoch": 5.919569120287253, "step": 82430}, {"loss": 0.5953, "grad_norm": 0.8643896579742432, "learning_rate": 0.0002, "epoch": 5.920287253141831, "step": 82440}, {"loss": 0.6042, "grad_norm": 1.106025218963623, "learning_rate": 0.0002, "epoch": 5.921005385996409, "step": 82450}, {"loss": 0.5879, "grad_norm": 1.0338234901428223, "learning_rate": 0.0002, "epoch": 5.921723518850987, "step": 82460}, {"loss": 0.6733, "grad_norm": 1.0648493766784668, "learning_rate": 0.0002, "epoch": 5.922441651705565, "step": 82470}, {"loss": 0.6233, "grad_norm": 1.1950433254241943, "learning_rate": 0.0002, "epoch": 5.923159784560143, "step": 82480}, {"loss": 0.6148, "grad_norm": 0.8730897903442383, "learning_rate": 0.0002, "epoch": 5.923877917414722, "step": 82490}, {"loss": 0.6138, "grad_norm": 1.2262312173843384, "learning_rate": 0.0002, "epoch": 5.9245960502693, "step": 82500}, {"loss": 0.616, "grad_norm": 0.9526116251945496, "learning_rate": 0.0002, "epoch": 5.925314183123878, "step": 82510}, {"loss": 0.6372, "grad_norm": 1.0540224313735962, "learning_rate": 0.0002, "epoch": 5.926032315978456, "step": 82520}, {"loss": 0.6102, "grad_norm": 1.0537306070327759, "learning_rate": 0.0002, "epoch": 5.926750448833034, "step": 82530}, {"loss": 0.5789, "grad_norm": 1.134207844734192, "learning_rate": 0.0002, "epoch": 5.927468581687612, "step": 82540}, {"loss": 0.622, "grad_norm": 0.9042250514030457, "learning_rate": 0.0002, "epoch": 5.92818671454219, "step": 82550}, {"loss": 0.6207, "grad_norm": 1.0424834489822388, "learning_rate": 0.0002, "epoch": 5.928904847396768, "step": 82560}, {"loss": 0.5334, "grad_norm": 1.1571602821350098, "learning_rate": 0.0002, "epoch": 5.929622980251347, "step": 82570}, {"loss": 0.6549, "grad_norm": 1.1033377647399902, "learning_rate": 0.0002, "epoch": 5.930341113105925, "step": 82580}, {"loss": 0.5819, "grad_norm": 0.9211772680282593, "learning_rate": 0.0002, "epoch": 5.931059245960503, "step": 82590}, {"loss": 0.591, "grad_norm": 1.0566459894180298, "learning_rate": 0.0002, "epoch": 5.931777378815081, "step": 82600}, {"loss": 0.6318, "grad_norm": 1.1773834228515625, "learning_rate": 0.0002, "epoch": 5.932495511669659, "step": 82610}, {"loss": 0.6067, "grad_norm": 1.193396806716919, "learning_rate": 0.0002, "epoch": 5.933213644524237, "step": 82620}, {"loss": 0.6105, "grad_norm": 1.1101785898208618, "learning_rate": 0.0002, "epoch": 5.933931777378815, "step": 82630}, {"loss": 0.5742, "grad_norm": 0.6988118886947632, "learning_rate": 0.0002, "epoch": 5.934649910233393, "step": 82640}, {"loss": 0.626, "grad_norm": 0.9590985774993896, "learning_rate": 0.0002, "epoch": 5.935368043087971, "step": 82650}, {"loss": 0.5909, "grad_norm": 0.8512062430381775, "learning_rate": 0.0002, "epoch": 5.936086175942549, "step": 82660}, {"loss": 0.539, "grad_norm": 1.0381710529327393, "learning_rate": 0.0002, "epoch": 5.936804308797128, "step": 82670}, {"loss": 0.5608, "grad_norm": 1.0816296339035034, "learning_rate": 0.0002, "epoch": 5.937522441651706, "step": 82680}, {"loss": 0.6087, "grad_norm": 1.0592364072799683, "learning_rate": 0.0002, "epoch": 5.938240574506284, "step": 82690}, {"loss": 0.5792, "grad_norm": 0.737452507019043, "learning_rate": 0.0002, "epoch": 5.938958707360862, "step": 82700}, {"loss": 0.6031, "grad_norm": 0.9019039869308472, "learning_rate": 0.0002, "epoch": 5.93967684021544, "step": 82710}, {"loss": 0.6153, "grad_norm": 1.0049666166305542, "learning_rate": 0.0002, "epoch": 5.940394973070018, "step": 82720}, {"loss": 0.619, "grad_norm": 1.0016309022903442, "learning_rate": 0.0002, "epoch": 5.941113105924596, "step": 82730}, {"loss": 0.5796, "grad_norm": 0.7967594861984253, "learning_rate": 0.0002, "epoch": 5.941831238779174, "step": 82740}, {"loss": 0.6418, "grad_norm": 0.8978520631790161, "learning_rate": 0.0002, "epoch": 5.942549371633753, "step": 82750}, {"loss": 0.6234, "grad_norm": 1.0101654529571533, "learning_rate": 0.0002, "epoch": 5.943267504488331, "step": 82760}, {"loss": 0.5813, "grad_norm": 1.1515586376190186, "learning_rate": 0.0002, "epoch": 5.943985637342909, "step": 82770}, {"loss": 0.6031, "grad_norm": 0.8666134476661682, "learning_rate": 0.0002, "epoch": 5.944703770197487, "step": 82780}, {"loss": 0.565, "grad_norm": 1.1365231275558472, "learning_rate": 0.0002, "epoch": 5.945421903052065, "step": 82790}, {"loss": 0.6122, "grad_norm": 1.211229920387268, "learning_rate": 0.0002, "epoch": 5.946140035906643, "step": 82800}, {"loss": 0.5815, "grad_norm": 0.9900869727134705, "learning_rate": 0.0002, "epoch": 5.946858168761221, "step": 82810}, {"loss": 0.5973, "grad_norm": 0.9555928111076355, "learning_rate": 0.0002, "epoch": 5.947576301615799, "step": 82820}, {"loss": 0.5667, "grad_norm": 0.8468470573425293, "learning_rate": 0.0002, "epoch": 5.948294434470377, "step": 82830}, {"loss": 0.5895, "grad_norm": 1.0280319452285767, "learning_rate": 0.0002, "epoch": 5.949012567324955, "step": 82840}, {"loss": 0.5663, "grad_norm": 0.930145800113678, "learning_rate": 0.0002, "epoch": 5.949730700179533, "step": 82850}, {"loss": 0.5482, "grad_norm": 1.0677028894424438, "learning_rate": 0.0002, "epoch": 5.950448833034112, "step": 82860}, {"loss": 0.6009, "grad_norm": 1.2035255432128906, "learning_rate": 0.0002, "epoch": 5.95116696588869, "step": 82870}, {"loss": 0.6207, "grad_norm": 0.897537887096405, "learning_rate": 0.0002, "epoch": 5.951885098743268, "step": 82880}, {"loss": 0.6383, "grad_norm": 1.2858690023422241, "learning_rate": 0.0002, "epoch": 5.952603231597846, "step": 82890}, {"loss": 0.6111, "grad_norm": 1.0300413370132446, "learning_rate": 0.0002, "epoch": 5.953321364452424, "step": 82900}, {"loss": 0.6469, "grad_norm": 0.9873301982879639, "learning_rate": 0.0002, "epoch": 5.954039497307002, "step": 82910}, {"loss": 0.6173, "grad_norm": 1.0315600633621216, "learning_rate": 0.0002, "epoch": 5.95475763016158, "step": 82920}, {"loss": 0.5566, "grad_norm": 1.0631790161132812, "learning_rate": 0.0002, "epoch": 5.955475763016158, "step": 82930}, {"loss": 0.6067, "grad_norm": 1.035544514656067, "learning_rate": 0.0002, "epoch": 5.9561938958707366, "step": 82940}, {"loss": 0.6311, "grad_norm": 1.0162041187286377, "learning_rate": 0.0002, "epoch": 5.956912028725315, "step": 82950}, {"loss": 0.6005, "grad_norm": 0.7858892679214478, "learning_rate": 0.0002, "epoch": 5.957630161579893, "step": 82960}, {"loss": 0.5961, "grad_norm": 1.0359784364700317, "learning_rate": 0.0002, "epoch": 5.958348294434471, "step": 82970}, {"loss": 0.5704, "grad_norm": 1.057173252105713, "learning_rate": 0.0002, "epoch": 5.959066427289049, "step": 82980}, {"loss": 0.6127, "grad_norm": 1.1017464399337769, "learning_rate": 0.0002, "epoch": 5.959784560143627, "step": 82990}, {"loss": 0.5455, "grad_norm": 1.0688945055007935, "learning_rate": 0.0002, "epoch": 5.960502692998205, "step": 83000}, {"loss": 0.5429, "grad_norm": 1.048864483833313, "learning_rate": 0.0002, "epoch": 5.961220825852783, "step": 83010}, {"loss": 0.5559, "grad_norm": 1.057308316230774, "learning_rate": 0.0002, "epoch": 5.961938958707361, "step": 83020}, {"loss": 0.5703, "grad_norm": 0.9014604687690735, "learning_rate": 0.0002, "epoch": 5.962657091561939, "step": 83030}, {"loss": 0.6029, "grad_norm": 0.9899709224700928, "learning_rate": 0.0002, "epoch": 5.963375224416517, "step": 83040}, {"loss": 0.6403, "grad_norm": 1.0675519704818726, "learning_rate": 0.0002, "epoch": 5.9640933572710955, "step": 83050}, {"loss": 0.6016, "grad_norm": 0.9497889876365662, "learning_rate": 0.0002, "epoch": 5.9648114901256735, "step": 83060}, {"loss": 0.5997, "grad_norm": 0.9149549603462219, "learning_rate": 0.0002, "epoch": 5.9655296229802515, "step": 83070}, {"loss": 0.6105, "grad_norm": 1.329373836517334, "learning_rate": 0.0002, "epoch": 5.9662477558348295, "step": 83080}, {"loss": 0.6077, "grad_norm": 1.0731712579727173, "learning_rate": 0.0002, "epoch": 5.9669658886894075, "step": 83090}, {"loss": 0.6269, "grad_norm": 0.9498835802078247, "learning_rate": 0.0002, "epoch": 5.9676840215439855, "step": 83100}, {"loss": 0.6196, "grad_norm": 1.1222829818725586, "learning_rate": 0.0002, "epoch": 5.9684021543985635, "step": 83110}, {"loss": 0.5784, "grad_norm": 0.9923429489135742, "learning_rate": 0.0002, "epoch": 5.9691202872531415, "step": 83120}, {"loss": 0.6223, "grad_norm": 0.9046645164489746, "learning_rate": 0.0002, "epoch": 5.96983842010772, "step": 83130}, {"loss": 0.6252, "grad_norm": 0.9259500503540039, "learning_rate": 0.0002, "epoch": 5.970556552962298, "step": 83140}, {"loss": 0.5849, "grad_norm": 1.0604174137115479, "learning_rate": 0.0002, "epoch": 5.971274685816876, "step": 83150}, {"loss": 0.5789, "grad_norm": 1.0391676425933838, "learning_rate": 0.0002, "epoch": 5.971992818671454, "step": 83160}, {"loss": 0.5861, "grad_norm": 0.8825796246528625, "learning_rate": 0.0002, "epoch": 5.972710951526032, "step": 83170}, {"loss": 0.6164, "grad_norm": 0.9687952399253845, "learning_rate": 0.0002, "epoch": 5.97342908438061, "step": 83180}, {"loss": 0.6127, "grad_norm": 0.9401392340660095, "learning_rate": 0.0002, "epoch": 5.974147217235188, "step": 83190}, {"loss": 0.572, "grad_norm": 1.0526834726333618, "learning_rate": 0.0002, "epoch": 5.974865350089766, "step": 83200}, {"loss": 0.6047, "grad_norm": 1.1882060766220093, "learning_rate": 0.0002, "epoch": 5.975583482944344, "step": 83210}, {"loss": 0.5731, "grad_norm": 0.9182824492454529, "learning_rate": 0.0002, "epoch": 5.976301615798922, "step": 83220}, {"loss": 0.6092, "grad_norm": 1.344875454902649, "learning_rate": 0.0002, "epoch": 5.977019748653501, "step": 83230}, {"loss": 0.6198, "grad_norm": 1.3868434429168701, "learning_rate": 0.0002, "epoch": 5.977737881508079, "step": 83240}, {"loss": 0.6187, "grad_norm": 1.2702280282974243, "learning_rate": 0.0002, "epoch": 5.978456014362657, "step": 83250}, {"loss": 0.6271, "grad_norm": 0.9808234572410583, "learning_rate": 0.0002, "epoch": 5.979174147217235, "step": 83260}, {"loss": 0.6027, "grad_norm": 0.9225142598152161, "learning_rate": 0.0002, "epoch": 5.979892280071813, "step": 83270}, {"loss": 0.626, "grad_norm": 1.1095874309539795, "learning_rate": 0.0002, "epoch": 5.980610412926391, "step": 83280}, {"loss": 0.5994, "grad_norm": 1.2650344371795654, "learning_rate": 0.0002, "epoch": 5.981328545780969, "step": 83290}, {"loss": 0.5808, "grad_norm": 0.8230084180831909, "learning_rate": 0.0002, "epoch": 5.982046678635547, "step": 83300}, {"loss": 0.6399, "grad_norm": 1.171427607536316, "learning_rate": 0.0002, "epoch": 5.982764811490125, "step": 83310}, {"loss": 0.6033, "grad_norm": 0.7458868026733398, "learning_rate": 0.0002, "epoch": 5.983482944344704, "step": 83320}, {"loss": 0.6235, "grad_norm": 0.9238616228103638, "learning_rate": 0.0002, "epoch": 5.984201077199282, "step": 83330}, {"loss": 0.6316, "grad_norm": 1.027495265007019, "learning_rate": 0.0002, "epoch": 5.98491921005386, "step": 83340}, {"loss": 0.6202, "grad_norm": 1.0694037675857544, "learning_rate": 0.0002, "epoch": 5.985637342908438, "step": 83350}, {"loss": 0.5883, "grad_norm": 0.9498767256736755, "learning_rate": 0.0002, "epoch": 5.986355475763016, "step": 83360}, {"loss": 0.6022, "grad_norm": 1.0524284839630127, "learning_rate": 0.0002, "epoch": 5.987073608617594, "step": 83370}, {"loss": 0.5695, "grad_norm": 1.07961905002594, "learning_rate": 0.0002, "epoch": 5.987791741472172, "step": 83380}, {"loss": 0.5835, "grad_norm": 1.1436965465545654, "learning_rate": 0.0002, "epoch": 5.98850987432675, "step": 83390}, {"loss": 0.5835, "grad_norm": 1.2610782384872437, "learning_rate": 0.0002, "epoch": 5.989228007181328, "step": 83400}, {"loss": 0.6018, "grad_norm": 1.1105682849884033, "learning_rate": 0.0002, "epoch": 5.989946140035906, "step": 83410}, {"loss": 0.5989, "grad_norm": 0.9900349378585815, "learning_rate": 0.0002, "epoch": 5.990664272890485, "step": 83420}, {"loss": 0.6492, "grad_norm": 0.8766723275184631, "learning_rate": 0.0002, "epoch": 5.991382405745063, "step": 83430}, {"loss": 0.5944, "grad_norm": 0.9532597661018372, "learning_rate": 0.0002, "epoch": 5.992100538599641, "step": 83440}, {"loss": 0.5903, "grad_norm": 1.016831398010254, "learning_rate": 0.0002, "epoch": 5.992818671454219, "step": 83450}, {"loss": 0.6159, "grad_norm": 0.9884716272354126, "learning_rate": 0.0002, "epoch": 5.993536804308797, "step": 83460}, {"loss": 0.5559, "grad_norm": 0.9415417909622192, "learning_rate": 0.0002, "epoch": 5.994254937163375, "step": 83470}, {"loss": 0.5644, "grad_norm": 0.8629752397537231, "learning_rate": 0.0002, "epoch": 5.994973070017953, "step": 83480}, {"loss": 0.5961, "grad_norm": 1.061378002166748, "learning_rate": 0.0002, "epoch": 5.995691202872531, "step": 83490}, {"loss": 0.6117, "grad_norm": 0.907195508480072, "learning_rate": 0.0002, "epoch": 5.99640933572711, "step": 83500}, {"loss": 0.6584, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 5.997127468581688, "step": 83510}, {"loss": 0.6009, "grad_norm": 0.9893278479576111, "learning_rate": 0.0002, "epoch": 5.997845601436266, "step": 83520}, {"loss": 0.609, "grad_norm": 1.1909127235412598, "learning_rate": 0.0002, "epoch": 5.998563734290844, "step": 83530}, {"loss": 0.5507, "grad_norm": 1.1800892353057861, "learning_rate": 0.0002, "epoch": 5.999281867145422, "step": 83540}, {"loss": 0.605, "grad_norm": 1.0822563171386719, "learning_rate": 0.0002, "epoch": 6.0, "step": 83550}, {"eval_loss": 1.1494214534759521, "eval_runtime": 55.1809, "eval_samples_per_second": 13.284, "eval_steps_per_second": 1.667, "epoch": 6.0, "step": 83550}, {"loss": 0.529, "grad_norm": 0.8760911226272583, "learning_rate": 0.0002, "epoch": 6.000718132854578, "step": 83560}, {"loss": 0.524, "grad_norm": 1.0037305355072021, "learning_rate": 0.0002, "epoch": 6.001436265709156, "step": 83570}, {"loss": 0.5622, "grad_norm": 1.0550320148468018, "learning_rate": 0.0002, "epoch": 6.002154398563734, "step": 83580}, {"loss": 0.5498, "grad_norm": 0.7841113805770874, "learning_rate": 0.0002, "epoch": 6.002872531418312, "step": 83590}, {"loss": 0.5332, "grad_norm": 1.1221094131469727, "learning_rate": 0.0002, "epoch": 6.003590664272891, "step": 83600}, {"loss": 0.5521, "grad_norm": 1.174143671989441, "learning_rate": 0.0002, "epoch": 6.004308797127469, "step": 83610}, {"loss": 0.514, "grad_norm": 1.1316391229629517, "learning_rate": 0.0002, "epoch": 6.005026929982047, "step": 83620}, {"loss": 0.5221, "grad_norm": 0.9318140745162964, "learning_rate": 0.0002, "epoch": 6.005745062836625, "step": 83630}, {"loss": 0.5133, "grad_norm": 1.1589723825454712, "learning_rate": 0.0002, "epoch": 6.006463195691203, "step": 83640}, {"loss": 0.509, "grad_norm": 0.7452214360237122, "learning_rate": 0.0002, "epoch": 6.007181328545781, "step": 83650}, {"loss": 0.5522, "grad_norm": 1.205767035484314, "learning_rate": 0.0002, "epoch": 6.007899461400359, "step": 83660}, {"loss": 0.4888, "grad_norm": 0.8741596341133118, "learning_rate": 0.0002, "epoch": 6.008617594254937, "step": 83670}, {"loss": 0.5653, "grad_norm": 1.152982234954834, "learning_rate": 0.0002, "epoch": 6.009335727109515, "step": 83680}, {"loss": 0.5286, "grad_norm": 1.2438874244689941, "learning_rate": 0.0002, "epoch": 6.010053859964093, "step": 83690}, {"loss": 0.5455, "grad_norm": 1.142795443534851, "learning_rate": 0.0002, "epoch": 6.010771992818672, "step": 83700}, {"loss": 0.5678, "grad_norm": 1.1999919414520264, "learning_rate": 0.0002, "epoch": 6.01149012567325, "step": 83710}, {"loss": 0.5233, "grad_norm": 1.1839698553085327, "learning_rate": 0.0002, "epoch": 6.012208258527828, "step": 83720}, {"loss": 0.5483, "grad_norm": 1.1131623983383179, "learning_rate": 0.0002, "epoch": 6.012926391382406, "step": 83730}, {"loss": 0.5086, "grad_norm": 0.8436203598976135, "learning_rate": 0.0002, "epoch": 6.013644524236984, "step": 83740}, {"loss": 0.4991, "grad_norm": 0.9938826560974121, "learning_rate": 0.0002, "epoch": 6.014362657091562, "step": 83750}, {"loss": 0.5767, "grad_norm": 1.1624900102615356, "learning_rate": 0.0002, "epoch": 6.01508078994614, "step": 83760}, {"loss": 0.5116, "grad_norm": 1.0212476253509521, "learning_rate": 0.0002, "epoch": 6.015798922800718, "step": 83770}, {"loss": 0.5247, "grad_norm": 0.8108501434326172, "learning_rate": 0.0002, "epoch": 6.016517055655296, "step": 83780}, {"loss": 0.5325, "grad_norm": 1.3106935024261475, "learning_rate": 0.0002, "epoch": 6.017235188509875, "step": 83790}, {"loss": 0.5336, "grad_norm": 1.3103147745132446, "learning_rate": 0.0002, "epoch": 6.017953321364453, "step": 83800}, {"loss": 0.5224, "grad_norm": 0.7501855492591858, "learning_rate": 0.0002, "epoch": 6.018671454219031, "step": 83810}, {"loss": 0.5079, "grad_norm": 0.9246482253074646, "learning_rate": 0.0002, "epoch": 6.019389587073609, "step": 83820}, {"loss": 0.5038, "grad_norm": 1.0305052995681763, "learning_rate": 0.0002, "epoch": 6.020107719928187, "step": 83830}, {"loss": 0.5314, "grad_norm": 1.0912569761276245, "learning_rate": 0.0002, "epoch": 6.020825852782765, "step": 83840}, {"loss": 0.5268, "grad_norm": 0.9320057034492493, "learning_rate": 0.0002, "epoch": 6.021543985637343, "step": 83850}, {"loss": 0.4795, "grad_norm": 1.160483479499817, "learning_rate": 0.0002, "epoch": 6.022262118491921, "step": 83860}, {"loss": 0.5014, "grad_norm": 1.0211237668991089, "learning_rate": 0.0002, "epoch": 6.022980251346499, "step": 83870}, {"loss": 0.5515, "grad_norm": 0.8101710081100464, "learning_rate": 0.0002, "epoch": 6.023698384201078, "step": 83880}, {"loss": 0.509, "grad_norm": 1.0671406984329224, "learning_rate": 0.0002, "epoch": 6.024416517055656, "step": 83890}, {"loss": 0.5573, "grad_norm": 1.3084125518798828, "learning_rate": 0.0002, "epoch": 6.025134649910234, "step": 83900}, {"loss": 0.5046, "grad_norm": 1.0144813060760498, "learning_rate": 0.0002, "epoch": 6.025852782764812, "step": 83910}, {"loss": 0.5184, "grad_norm": 1.134848952293396, "learning_rate": 0.0002, "epoch": 6.02657091561939, "step": 83920}, {"loss": 0.5241, "grad_norm": 1.183115005493164, "learning_rate": 0.0002, "epoch": 6.027289048473968, "step": 83930}, {"loss": 0.5097, "grad_norm": 0.961912989616394, "learning_rate": 0.0002, "epoch": 6.028007181328546, "step": 83940}, {"loss": 0.524, "grad_norm": 0.9033881425857544, "learning_rate": 0.0002, "epoch": 6.028725314183124, "step": 83950}, {"loss": 0.4978, "grad_norm": 1.0272901058197021, "learning_rate": 0.0002, "epoch": 6.029443447037702, "step": 83960}, {"loss": 0.5218, "grad_norm": 1.0007939338684082, "learning_rate": 0.0002, "epoch": 6.03016157989228, "step": 83970}, {"loss": 0.5215, "grad_norm": 1.0941389799118042, "learning_rate": 0.0002, "epoch": 6.0308797127468585, "step": 83980}, {"loss": 0.4881, "grad_norm": 0.9068517088890076, "learning_rate": 0.0002, "epoch": 6.0315978456014365, "step": 83990}, {"loss": 0.5352, "grad_norm": 0.8636500835418701, "learning_rate": 0.0002, "epoch": 6.0323159784560145, "step": 84000}, {"loss": 0.5668, "grad_norm": 1.352675437927246, "learning_rate": 0.0002, "epoch": 6.0330341113105925, "step": 84010}, {"loss": 0.5201, "grad_norm": 1.0889637470245361, "learning_rate": 0.0002, "epoch": 6.0337522441651705, "step": 84020}, {"loss": 0.5143, "grad_norm": 0.9063141345977783, "learning_rate": 0.0002, "epoch": 6.0344703770197485, "step": 84030}, {"loss": 0.5089, "grad_norm": 1.317254900932312, "learning_rate": 0.0002, "epoch": 6.0351885098743265, "step": 84040}, {"loss": 0.5198, "grad_norm": 1.1001603603363037, "learning_rate": 0.0002, "epoch": 6.0359066427289045, "step": 84050}, {"loss": 0.5167, "grad_norm": 0.8041839003562927, "learning_rate": 0.0002, "epoch": 6.0366247755834825, "step": 84060}, {"loss": 0.5157, "grad_norm": 1.125082015991211, "learning_rate": 0.0002, "epoch": 6.037342908438061, "step": 84070}, {"loss": 0.5023, "grad_norm": 0.8926277160644531, "learning_rate": 0.0002, "epoch": 6.038061041292639, "step": 84080}, {"loss": 0.4888, "grad_norm": 1.0548304319381714, "learning_rate": 0.0002, "epoch": 6.038779174147217, "step": 84090}, {"loss": 0.5216, "grad_norm": 1.2299435138702393, "learning_rate": 0.0002, "epoch": 6.039497307001795, "step": 84100}, {"loss": 0.5243, "grad_norm": 0.7348281741142273, "learning_rate": 0.0002, "epoch": 6.040215439856373, "step": 84110}, {"loss": 0.5598, "grad_norm": 1.032209873199463, "learning_rate": 0.0002, "epoch": 6.040933572710951, "step": 84120}, {"loss": 0.5448, "grad_norm": 0.925134003162384, "learning_rate": 0.0002, "epoch": 6.041651705565529, "step": 84130}, {"loss": 0.5153, "grad_norm": 1.1078300476074219, "learning_rate": 0.0002, "epoch": 6.042369838420107, "step": 84140}, {"loss": 0.5407, "grad_norm": 0.9045702815055847, "learning_rate": 0.0002, "epoch": 6.043087971274685, "step": 84150}, {"loss": 0.5188, "grad_norm": 0.8836823105812073, "learning_rate": 0.0002, "epoch": 6.043806104129264, "step": 84160}, {"loss": 0.5242, "grad_norm": 0.8083572387695312, "learning_rate": 0.0002, "epoch": 6.044524236983842, "step": 84170}, {"loss": 0.5203, "grad_norm": 0.8744190335273743, "learning_rate": 0.0002, "epoch": 6.04524236983842, "step": 84180}, {"loss": 0.5372, "grad_norm": 1.1944562196731567, "learning_rate": 0.0002, "epoch": 6.045960502692998, "step": 84190}, {"loss": 0.5648, "grad_norm": 1.3782621622085571, "learning_rate": 0.0002, "epoch": 6.046678635547576, "step": 84200}, {"loss": 0.5744, "grad_norm": 1.2800641059875488, "learning_rate": 0.0002, "epoch": 6.047396768402154, "step": 84210}, {"loss": 0.5513, "grad_norm": 1.1035456657409668, "learning_rate": 0.0002, "epoch": 6.048114901256732, "step": 84220}, {"loss": 0.5428, "grad_norm": 1.243274211883545, "learning_rate": 0.0002, "epoch": 6.04883303411131, "step": 84230}, {"loss": 0.55, "grad_norm": 0.8821795582771301, "learning_rate": 0.0002, "epoch": 6.049551166965888, "step": 84240}, {"loss": 0.5563, "grad_norm": 0.8730825185775757, "learning_rate": 0.0002, "epoch": 6.050269299820466, "step": 84250}, {"loss": 0.5755, "grad_norm": 0.9874304533004761, "learning_rate": 0.0002, "epoch": 6.050987432675045, "step": 84260}, {"loss": 0.5261, "grad_norm": 1.3245618343353271, "learning_rate": 0.0002, "epoch": 6.051705565529623, "step": 84270}, {"loss": 0.5172, "grad_norm": 1.04741370677948, "learning_rate": 0.0002, "epoch": 6.052423698384201, "step": 84280}, {"loss": 0.511, "grad_norm": 1.1984949111938477, "learning_rate": 0.0002, "epoch": 6.053141831238779, "step": 84290}, {"loss": 0.5148, "grad_norm": 0.9603039622306824, "learning_rate": 0.0002, "epoch": 6.053859964093357, "step": 84300}, {"loss": 0.54, "grad_norm": 1.178102731704712, "learning_rate": 0.0002, "epoch": 6.054578096947935, "step": 84310}, {"loss": 0.554, "grad_norm": 1.135046124458313, "learning_rate": 0.0002, "epoch": 6.055296229802513, "step": 84320}, {"loss": 0.517, "grad_norm": 0.9682887196540833, "learning_rate": 0.0002, "epoch": 6.056014362657091, "step": 84330}, {"loss": 0.5089, "grad_norm": 0.9676550030708313, "learning_rate": 0.0002, "epoch": 6.056732495511669, "step": 84340}, {"loss": 0.5472, "grad_norm": 1.0987977981567383, "learning_rate": 0.0002, "epoch": 6.057450628366248, "step": 84350}, {"loss": 0.5414, "grad_norm": 0.9808574914932251, "learning_rate": 0.0002, "epoch": 6.058168761220826, "step": 84360}, {"loss": 0.4836, "grad_norm": 1.0585200786590576, "learning_rate": 0.0002, "epoch": 6.058886894075404, "step": 84370}, {"loss": 0.5177, "grad_norm": 0.9592017531394958, "learning_rate": 0.0002, "epoch": 6.059605026929982, "step": 84380}, {"loss": 0.5352, "grad_norm": 0.9652285575866699, "learning_rate": 0.0002, "epoch": 6.06032315978456, "step": 84390}, {"loss": 0.5237, "grad_norm": 1.1223928928375244, "learning_rate": 0.0002, "epoch": 6.061041292639138, "step": 84400}, {"loss": 0.5515, "grad_norm": 1.0554455518722534, "learning_rate": 0.0002, "epoch": 6.061759425493716, "step": 84410}, {"loss": 0.5652, "grad_norm": 1.4566363096237183, "learning_rate": 0.0002, "epoch": 6.062477558348294, "step": 84420}, {"loss": 0.5219, "grad_norm": 1.0793368816375732, "learning_rate": 0.0002, "epoch": 6.063195691202872, "step": 84430}, {"loss": 0.5532, "grad_norm": 1.1032981872558594, "learning_rate": 0.0002, "epoch": 6.063913824057451, "step": 84440}, {"loss": 0.5257, "grad_norm": 1.0701037645339966, "learning_rate": 0.0002, "epoch": 6.064631956912029, "step": 84450}, {"loss": 0.5505, "grad_norm": 0.9359426498413086, "learning_rate": 0.0002, "epoch": 6.065350089766607, "step": 84460}, {"loss": 0.5363, "grad_norm": 1.0277773141860962, "learning_rate": 0.0002, "epoch": 6.066068222621185, "step": 84470}, {"loss": 0.5082, "grad_norm": 1.029319405555725, "learning_rate": 0.0002, "epoch": 6.066786355475763, "step": 84480}, {"loss": 0.4949, "grad_norm": 1.3563756942749023, "learning_rate": 0.0002, "epoch": 6.067504488330341, "step": 84490}, {"loss": 0.55, "grad_norm": 0.9577816128730774, "learning_rate": 0.0002, "epoch": 6.068222621184919, "step": 84500}, {"loss": 0.51, "grad_norm": 0.9856799840927124, "learning_rate": 0.0002, "epoch": 6.068940754039497, "step": 84510}, {"loss": 0.5527, "grad_norm": 1.3285183906555176, "learning_rate": 0.0002, "epoch": 6.069658886894075, "step": 84520}, {"loss": 0.517, "grad_norm": 1.0407335758209229, "learning_rate": 0.0002, "epoch": 6.070377019748653, "step": 84530}, {"loss": 0.5083, "grad_norm": 1.3125360012054443, "learning_rate": 0.0002, "epoch": 6.071095152603232, "step": 84540}, {"loss": 0.4791, "grad_norm": 1.0198888778686523, "learning_rate": 0.0002, "epoch": 6.07181328545781, "step": 84550}, {"loss": 0.5629, "grad_norm": 1.198135256767273, "learning_rate": 0.0002, "epoch": 6.072531418312388, "step": 84560}, {"loss": 0.5213, "grad_norm": 1.1547776460647583, "learning_rate": 0.0002, "epoch": 6.073249551166966, "step": 84570}, {"loss": 0.5503, "grad_norm": 1.1667766571044922, "learning_rate": 0.0002, "epoch": 6.073967684021544, "step": 84580}, {"loss": 0.5465, "grad_norm": 0.945159375667572, "learning_rate": 0.0002, "epoch": 6.074685816876122, "step": 84590}, {"loss": 0.5451, "grad_norm": 1.0362721681594849, "learning_rate": 0.0002, "epoch": 6.0754039497307, "step": 84600}, {"loss": 0.5538, "grad_norm": 1.1442973613739014, "learning_rate": 0.0002, "epoch": 6.076122082585278, "step": 84610}, {"loss": 0.5285, "grad_norm": 1.2077388763427734, "learning_rate": 0.0002, "epoch": 6.076840215439856, "step": 84620}, {"loss": 0.5581, "grad_norm": 1.1404398679733276, "learning_rate": 0.0002, "epoch": 6.077558348294435, "step": 84630}, {"loss": 0.5522, "grad_norm": 1.0291249752044678, "learning_rate": 0.0002, "epoch": 6.078276481149013, "step": 84640}, {"loss": 0.5227, "grad_norm": 1.2045460939407349, "learning_rate": 0.0002, "epoch": 6.078994614003591, "step": 84650}, {"loss": 0.5475, "grad_norm": 0.9492267966270447, "learning_rate": 0.0002, "epoch": 6.079712746858169, "step": 84660}, {"loss": 0.5664, "grad_norm": 0.9108620285987854, "learning_rate": 0.0002, "epoch": 6.080430879712747, "step": 84670}, {"loss": 0.517, "grad_norm": 1.0403251647949219, "learning_rate": 0.0002, "epoch": 6.081149012567325, "step": 84680}, {"loss": 0.5245, "grad_norm": 0.8537648916244507, "learning_rate": 0.0002, "epoch": 6.081867145421903, "step": 84690}, {"loss": 0.5572, "grad_norm": 0.8450568914413452, "learning_rate": 0.0002, "epoch": 6.082585278276481, "step": 84700}, {"loss": 0.5424, "grad_norm": 0.9770439267158508, "learning_rate": 0.0002, "epoch": 6.083303411131059, "step": 84710}, {"loss": 0.5268, "grad_norm": 0.7480165958404541, "learning_rate": 0.0002, "epoch": 6.084021543985638, "step": 84720}, {"loss": 0.5565, "grad_norm": 1.0038665533065796, "learning_rate": 0.0002, "epoch": 6.084739676840216, "step": 84730}, {"loss": 0.5779, "grad_norm": 1.2631266117095947, "learning_rate": 0.0002, "epoch": 6.085457809694794, "step": 84740}, {"loss": 0.5282, "grad_norm": 1.0285290479660034, "learning_rate": 0.0002, "epoch": 6.086175942549372, "step": 84750}, {"loss": 0.5393, "grad_norm": 0.8775458335876465, "learning_rate": 0.0002, "epoch": 6.08689407540395, "step": 84760}, {"loss": 0.5046, "grad_norm": 1.105391263961792, "learning_rate": 0.0002, "epoch": 6.087612208258528, "step": 84770}, {"loss": 0.5349, "grad_norm": 0.9214589595794678, "learning_rate": 0.0002, "epoch": 6.088330341113106, "step": 84780}, {"loss": 0.5076, "grad_norm": 1.1920515298843384, "learning_rate": 0.0002, "epoch": 6.089048473967684, "step": 84790}, {"loss": 0.5481, "grad_norm": 1.0314369201660156, "learning_rate": 0.0002, "epoch": 6.089766606822262, "step": 84800}, {"loss": 0.5553, "grad_norm": 1.1323022842407227, "learning_rate": 0.0002, "epoch": 6.09048473967684, "step": 84810}, {"loss": 0.554, "grad_norm": 0.9882907271385193, "learning_rate": 0.0002, "epoch": 6.091202872531419, "step": 84820}, {"loss": 0.5038, "grad_norm": 0.9372309446334839, "learning_rate": 0.0002, "epoch": 6.091921005385997, "step": 84830}, {"loss": 0.547, "grad_norm": 0.9904384016990662, "learning_rate": 0.0002, "epoch": 6.092639138240575, "step": 84840}, {"loss": 0.6083, "grad_norm": 1.1983239650726318, "learning_rate": 0.0002, "epoch": 6.093357271095153, "step": 84850}, {"loss": 0.5018, "grad_norm": 1.0157414674758911, "learning_rate": 0.0002, "epoch": 6.094075403949731, "step": 84860}, {"loss": 0.5264, "grad_norm": 1.1213963031768799, "learning_rate": 0.0002, "epoch": 6.094793536804309, "step": 84870}, {"loss": 0.5351, "grad_norm": 0.9863889813423157, "learning_rate": 0.0002, "epoch": 6.095511669658887, "step": 84880}, {"loss": 0.5816, "grad_norm": 1.2265585660934448, "learning_rate": 0.0002, "epoch": 6.096229802513465, "step": 84890}, {"loss": 0.5176, "grad_norm": 0.9000206589698792, "learning_rate": 0.0002, "epoch": 6.096947935368043, "step": 84900}, {"loss": 0.5849, "grad_norm": 0.9284350872039795, "learning_rate": 0.0002, "epoch": 6.097666068222622, "step": 84910}, {"loss": 0.535, "grad_norm": 0.8180069923400879, "learning_rate": 0.0002, "epoch": 6.0983842010772, "step": 84920}, {"loss": 0.5082, "grad_norm": 1.0313721895217896, "learning_rate": 0.0002, "epoch": 6.099102333931778, "step": 84930}, {"loss": 0.5233, "grad_norm": 0.9959180355072021, "learning_rate": 0.0002, "epoch": 6.099820466786356, "step": 84940}, {"loss": 0.554, "grad_norm": 1.1720712184906006, "learning_rate": 0.0002, "epoch": 6.100538599640934, "step": 84950}, {"loss": 0.5286, "grad_norm": 1.1033729314804077, "learning_rate": 0.0002, "epoch": 6.101256732495512, "step": 84960}, {"loss": 0.5303, "grad_norm": 1.2325657606124878, "learning_rate": 0.0002, "epoch": 6.10197486535009, "step": 84970}, {"loss": 0.5135, "grad_norm": 1.204935073852539, "learning_rate": 0.0002, "epoch": 6.102692998204668, "step": 84980}, {"loss": 0.4999, "grad_norm": 0.9543479084968567, "learning_rate": 0.0002, "epoch": 6.103411131059246, "step": 84990}, {"loss": 0.5488, "grad_norm": 1.0036866664886475, "learning_rate": 0.0002, "epoch": 6.1041292639138245, "step": 85000}, {"loss": 0.5224, "grad_norm": 1.0862882137298584, "learning_rate": 0.0002, "epoch": 6.1048473967684025, "step": 85010}, {"loss": 0.5399, "grad_norm": 1.052764892578125, "learning_rate": 0.0002, "epoch": 6.1055655296229805, "step": 85020}, {"loss": 0.5517, "grad_norm": 1.1948769092559814, "learning_rate": 0.0002, "epoch": 6.1062836624775585, "step": 85030}, {"loss": 0.5384, "grad_norm": 1.0291588306427002, "learning_rate": 0.0002, "epoch": 6.1070017953321365, "step": 85040}, {"loss": 0.5456, "grad_norm": 1.2162322998046875, "learning_rate": 0.0002, "epoch": 6.1077199281867145, "step": 85050}, {"loss": 0.5143, "grad_norm": 1.2867375612258911, "learning_rate": 0.0002, "epoch": 6.1084380610412925, "step": 85060}, {"loss": 0.5903, "grad_norm": 0.9639427661895752, "learning_rate": 0.0002, "epoch": 6.1091561938958705, "step": 85070}, {"loss": 0.5671, "grad_norm": 1.0775039196014404, "learning_rate": 0.0002, "epoch": 6.1098743267504485, "step": 85080}, {"loss": 0.5223, "grad_norm": 1.0423188209533691, "learning_rate": 0.0002, "epoch": 6.1105924596050265, "step": 85090}, {"loss": 0.5737, "grad_norm": 0.9388473033905029, "learning_rate": 0.0002, "epoch": 6.111310592459605, "step": 85100}, {"loss": 0.5676, "grad_norm": 1.0761773586273193, "learning_rate": 0.0002, "epoch": 6.112028725314183, "step": 85110}, {"loss": 0.5144, "grad_norm": 1.0886104106903076, "learning_rate": 0.0002, "epoch": 6.112746858168761, "step": 85120}, {"loss": 0.4909, "grad_norm": 0.8716141581535339, "learning_rate": 0.0002, "epoch": 6.113464991023339, "step": 85130}, {"loss": 0.5598, "grad_norm": 1.5060595273971558, "learning_rate": 0.0002, "epoch": 6.114183123877917, "step": 85140}, {"loss": 0.5431, "grad_norm": 1.2417129278182983, "learning_rate": 0.0002, "epoch": 6.114901256732495, "step": 85150}, {"loss": 0.5405, "grad_norm": 1.063604712486267, "learning_rate": 0.0002, "epoch": 6.115619389587073, "step": 85160}, {"loss": 0.5832, "grad_norm": 1.1341352462768555, "learning_rate": 0.0002, "epoch": 6.116337522441651, "step": 85170}, {"loss": 0.5708, "grad_norm": 1.011865258216858, "learning_rate": 0.0002, "epoch": 6.117055655296229, "step": 85180}, {"loss": 0.5472, "grad_norm": 1.0746972560882568, "learning_rate": 0.0002, "epoch": 6.117773788150808, "step": 85190}, {"loss": 0.5301, "grad_norm": 0.9522349238395691, "learning_rate": 0.0002, "epoch": 6.118491921005386, "step": 85200}, {"loss": 0.5952, "grad_norm": 1.091785192489624, "learning_rate": 0.0002, "epoch": 6.119210053859964, "step": 85210}, {"loss": 0.5474, "grad_norm": 1.1013420820236206, "learning_rate": 0.0002, "epoch": 6.119928186714542, "step": 85220}, {"loss": 0.5498, "grad_norm": 0.9477053880691528, "learning_rate": 0.0002, "epoch": 6.12064631956912, "step": 85230}, {"loss": 0.5594, "grad_norm": 1.1278045177459717, "learning_rate": 0.0002, "epoch": 6.121364452423698, "step": 85240}, {"loss": 0.5266, "grad_norm": 1.0343154668807983, "learning_rate": 0.0002, "epoch": 6.122082585278276, "step": 85250}, {"loss": 0.5581, "grad_norm": 0.9023236036300659, "learning_rate": 0.0002, "epoch": 6.122800718132854, "step": 85260}, {"loss": 0.5282, "grad_norm": 1.1085705757141113, "learning_rate": 0.0002, "epoch": 6.123518850987432, "step": 85270}, {"loss": 0.5482, "grad_norm": 1.2945729494094849, "learning_rate": 0.0002, "epoch": 6.124236983842011, "step": 85280}, {"loss": 0.5331, "grad_norm": 1.0367915630340576, "learning_rate": 0.0002, "epoch": 6.124955116696589, "step": 85290}, {"loss": 0.5546, "grad_norm": 0.9990636706352234, "learning_rate": 0.0002, "epoch": 6.125673249551167, "step": 85300}, {"loss": 0.5182, "grad_norm": 0.9737518429756165, "learning_rate": 0.0002, "epoch": 6.126391382405745, "step": 85310}, {"loss": 0.5826, "grad_norm": 1.0211181640625, "learning_rate": 0.0002, "epoch": 6.127109515260323, "step": 85320}, {"loss": 0.5153, "grad_norm": 0.9609670042991638, "learning_rate": 0.0002, "epoch": 6.127827648114901, "step": 85330}, {"loss": 0.582, "grad_norm": 1.124629259109497, "learning_rate": 0.0002, "epoch": 6.128545780969479, "step": 85340}, {"loss": 0.56, "grad_norm": 0.9436500072479248, "learning_rate": 0.0002, "epoch": 6.129263913824057, "step": 85350}, {"loss": 0.5568, "grad_norm": 1.3075382709503174, "learning_rate": 0.0002, "epoch": 6.129982046678635, "step": 85360}, {"loss": 0.543, "grad_norm": 0.9185589551925659, "learning_rate": 0.0002, "epoch": 6.130700179533213, "step": 85370}, {"loss": 0.5418, "grad_norm": 1.1051443815231323, "learning_rate": 0.0002, "epoch": 6.131418312387792, "step": 85380}, {"loss": 0.5727, "grad_norm": 1.185263752937317, "learning_rate": 0.0002, "epoch": 6.13213644524237, "step": 85390}, {"loss": 0.5448, "grad_norm": 1.0959895849227905, "learning_rate": 0.0002, "epoch": 6.132854578096948, "step": 85400}, {"loss": 0.4946, "grad_norm": 0.9279834032058716, "learning_rate": 0.0002, "epoch": 6.133572710951526, "step": 85410}, {"loss": 0.5524, "grad_norm": 1.36788010597229, "learning_rate": 0.0002, "epoch": 6.134290843806104, "step": 85420}, {"loss": 0.5122, "grad_norm": 1.0156842470169067, "learning_rate": 0.0002, "epoch": 6.135008976660682, "step": 85430}, {"loss": 0.5287, "grad_norm": 0.9998385906219482, "learning_rate": 0.0002, "epoch": 6.13572710951526, "step": 85440}, {"loss": 0.5205, "grad_norm": 1.21120285987854, "learning_rate": 0.0002, "epoch": 6.136445242369838, "step": 85450}, {"loss": 0.561, "grad_norm": 1.1198976039886475, "learning_rate": 0.0002, "epoch": 6.137163375224416, "step": 85460}, {"loss": 0.5527, "grad_norm": 0.8551197648048401, "learning_rate": 0.0002, "epoch": 6.137881508078995, "step": 85470}, {"loss": 0.5501, "grad_norm": 1.378423810005188, "learning_rate": 0.0002, "epoch": 6.138599640933573, "step": 85480}, {"loss": 0.5584, "grad_norm": 1.0602139234542847, "learning_rate": 0.0002, "epoch": 6.139317773788151, "step": 85490}, {"loss": 0.5656, "grad_norm": 0.9416277408599854, "learning_rate": 0.0002, "epoch": 6.140035906642729, "step": 85500}, {"loss": 0.5461, "grad_norm": 0.9356902241706848, "learning_rate": 0.0002, "epoch": 6.140754039497307, "step": 85510}, {"loss": 0.5405, "grad_norm": 1.1635851860046387, "learning_rate": 0.0002, "epoch": 6.141472172351885, "step": 85520}, {"loss": 0.5026, "grad_norm": 0.7880265712738037, "learning_rate": 0.0002, "epoch": 6.142190305206463, "step": 85530}, {"loss": 0.6164, "grad_norm": 1.0618375539779663, "learning_rate": 0.0002, "epoch": 6.142908438061041, "step": 85540}, {"loss": 0.5202, "grad_norm": 0.8438394665718079, "learning_rate": 0.0002, "epoch": 6.143626570915619, "step": 85550}, {"loss": 0.5651, "grad_norm": 1.0630128383636475, "learning_rate": 0.0002, "epoch": 6.144344703770198, "step": 85560}, {"loss": 0.5128, "grad_norm": 1.027308464050293, "learning_rate": 0.0002, "epoch": 6.145062836624776, "step": 85570}, {"loss": 0.5519, "grad_norm": 1.0832568407058716, "learning_rate": 0.0002, "epoch": 6.145780969479354, "step": 85580}, {"loss": 0.5484, "grad_norm": 0.9134858250617981, "learning_rate": 0.0002, "epoch": 6.146499102333932, "step": 85590}, {"loss": 0.5539, "grad_norm": 1.2738041877746582, "learning_rate": 0.0002, "epoch": 6.14721723518851, "step": 85600}, {"loss": 0.5141, "grad_norm": 0.9961518049240112, "learning_rate": 0.0002, "epoch": 6.147935368043088, "step": 85610}, {"loss": 0.5173, "grad_norm": 0.8851816654205322, "learning_rate": 0.0002, "epoch": 6.148653500897666, "step": 85620}, {"loss": 0.5478, "grad_norm": 0.96479731798172, "learning_rate": 0.0002, "epoch": 6.149371633752244, "step": 85630}, {"loss": 0.536, "grad_norm": 0.903256893157959, "learning_rate": 0.0002, "epoch": 6.150089766606822, "step": 85640}, {"loss": 0.5263, "grad_norm": 1.065151333808899, "learning_rate": 0.0002, "epoch": 6.1508078994614, "step": 85650}, {"loss": 0.5495, "grad_norm": 0.9824285507202148, "learning_rate": 0.0002, "epoch": 6.151526032315979, "step": 85660}, {"loss": 0.5724, "grad_norm": 1.1620386838912964, "learning_rate": 0.0002, "epoch": 6.152244165170557, "step": 85670}, {"loss": 0.5706, "grad_norm": 1.134757161140442, "learning_rate": 0.0002, "epoch": 6.152962298025135, "step": 85680}, {"loss": 0.5532, "grad_norm": 1.165537714958191, "learning_rate": 0.0002, "epoch": 6.153680430879713, "step": 85690}, {"loss": 0.5293, "grad_norm": 0.9486454129219055, "learning_rate": 0.0002, "epoch": 6.154398563734291, "step": 85700}, {"loss": 0.5219, "grad_norm": 0.9379110932350159, "learning_rate": 0.0002, "epoch": 6.155116696588869, "step": 85710}, {"loss": 0.5623, "grad_norm": 1.0051493644714355, "learning_rate": 0.0002, "epoch": 6.155834829443447, "step": 85720}, {"loss": 0.5389, "grad_norm": 0.9311991333961487, "learning_rate": 0.0002, "epoch": 6.156552962298025, "step": 85730}, {"loss": 0.5365, "grad_norm": 1.2071181535720825, "learning_rate": 0.0002, "epoch": 6.157271095152603, "step": 85740}, {"loss": 0.6081, "grad_norm": 1.2609243392944336, "learning_rate": 0.0002, "epoch": 6.157989228007182, "step": 85750}, {"loss": 0.5238, "grad_norm": 1.0485966205596924, "learning_rate": 0.0002, "epoch": 6.15870736086176, "step": 85760}, {"loss": 0.5221, "grad_norm": 0.9949250817298889, "learning_rate": 0.0002, "epoch": 6.159425493716338, "step": 85770}, {"loss": 0.5401, "grad_norm": 0.8191118836402893, "learning_rate": 0.0002, "epoch": 6.160143626570916, "step": 85780}, {"loss": 0.5283, "grad_norm": 0.96427983045578, "learning_rate": 0.0002, "epoch": 6.160861759425494, "step": 85790}, {"loss": 0.5597, "grad_norm": 1.0336496829986572, "learning_rate": 0.0002, "epoch": 6.161579892280072, "step": 85800}, {"loss": 0.5069, "grad_norm": 1.0699222087860107, "learning_rate": 0.0002, "epoch": 6.16229802513465, "step": 85810}, {"loss": 0.5433, "grad_norm": 1.2340054512023926, "learning_rate": 0.0002, "epoch": 6.163016157989228, "step": 85820}, {"loss": 0.5233, "grad_norm": 0.981848955154419, "learning_rate": 0.0002, "epoch": 6.163734290843806, "step": 85830}, {"loss": 0.5393, "grad_norm": 1.2059850692749023, "learning_rate": 0.0002, "epoch": 6.164452423698384, "step": 85840}, {"loss": 0.5358, "grad_norm": 1.0239924192428589, "learning_rate": 0.0002, "epoch": 6.165170556552963, "step": 85850}, {"loss": 0.5715, "grad_norm": 0.8601624369621277, "learning_rate": 0.0002, "epoch": 6.165888689407541, "step": 85860}, {"loss": 0.5442, "grad_norm": 1.1900125741958618, "learning_rate": 0.0002, "epoch": 6.166606822262119, "step": 85870}, {"loss": 0.5193, "grad_norm": 0.9747354388237, "learning_rate": 0.0002, "epoch": 6.167324955116697, "step": 85880}, {"loss": 0.5226, "grad_norm": 1.1277778148651123, "learning_rate": 0.0002, "epoch": 6.168043087971275, "step": 85890}, {"loss": 0.5554, "grad_norm": 1.1270111799240112, "learning_rate": 0.0002, "epoch": 6.168761220825853, "step": 85900}, {"loss": 0.5345, "grad_norm": 1.1610701084136963, "learning_rate": 0.0002, "epoch": 6.169479353680431, "step": 85910}, {"loss": 0.5524, "grad_norm": 0.873607873916626, "learning_rate": 0.0002, "epoch": 6.170197486535009, "step": 85920}, {"loss": 0.5021, "grad_norm": 1.040145993232727, "learning_rate": 0.0002, "epoch": 6.170915619389587, "step": 85930}, {"loss": 0.5072, "grad_norm": 1.0139122009277344, "learning_rate": 0.0002, "epoch": 6.1716337522441655, "step": 85940}, {"loss": 0.5674, "grad_norm": 1.0575451850891113, "learning_rate": 0.0002, "epoch": 6.1723518850987436, "step": 85950}, {"loss": 0.5517, "grad_norm": 1.100884199142456, "learning_rate": 0.0002, "epoch": 6.1730700179533216, "step": 85960}, {"loss": 0.5165, "grad_norm": 1.1741244792938232, "learning_rate": 0.0002, "epoch": 6.1737881508078996, "step": 85970}, {"loss": 0.526, "grad_norm": 0.9446555376052856, "learning_rate": 0.0002, "epoch": 6.174506283662478, "step": 85980}, {"loss": 0.493, "grad_norm": 0.9297952055931091, "learning_rate": 0.0002, "epoch": 6.175224416517056, "step": 85990}, {"loss": 0.5059, "grad_norm": 1.196361780166626, "learning_rate": 0.0002, "epoch": 6.175942549371634, "step": 86000}, {"loss": 0.5541, "grad_norm": 1.0719913244247437, "learning_rate": 0.0002, "epoch": 6.176660682226212, "step": 86010}, {"loss": 0.5613, "grad_norm": 1.0942085981369019, "learning_rate": 0.0002, "epoch": 6.17737881508079, "step": 86020}, {"loss": 0.5632, "grad_norm": 0.8989787697792053, "learning_rate": 0.0002, "epoch": 6.1780969479353685, "step": 86030}, {"loss": 0.5778, "grad_norm": 1.071344017982483, "learning_rate": 0.0002, "epoch": 6.1788150807899465, "step": 86040}, {"loss": 0.4885, "grad_norm": 0.9686782360076904, "learning_rate": 0.0002, "epoch": 6.1795332136445245, "step": 86050}, {"loss": 0.5727, "grad_norm": 1.0769884586334229, "learning_rate": 0.0002, "epoch": 6.1802513464991025, "step": 86060}, {"loss": 0.5356, "grad_norm": 0.9761241674423218, "learning_rate": 0.0002, "epoch": 6.1809694793536805, "step": 86070}, {"loss": 0.5736, "grad_norm": 1.0531808137893677, "learning_rate": 0.0002, "epoch": 6.1816876122082585, "step": 86080}, {"loss": 0.5899, "grad_norm": 1.0523570775985718, "learning_rate": 0.0002, "epoch": 6.1824057450628365, "step": 86090}, {"loss": 0.5941, "grad_norm": 1.2155946493148804, "learning_rate": 0.0002, "epoch": 6.1831238779174145, "step": 86100}, {"loss": 0.5315, "grad_norm": 1.1012920141220093, "learning_rate": 0.0002, "epoch": 6.1838420107719925, "step": 86110}, {"loss": 0.555, "grad_norm": 0.8764983415603638, "learning_rate": 0.0002, "epoch": 6.184560143626571, "step": 86120}, {"loss": 0.5219, "grad_norm": 0.950320303440094, "learning_rate": 0.0002, "epoch": 6.185278276481149, "step": 86130}, {"loss": 0.5275, "grad_norm": 1.1183594465255737, "learning_rate": 0.0002, "epoch": 6.185996409335727, "step": 86140}, {"loss": 0.4953, "grad_norm": 1.1919164657592773, "learning_rate": 0.0002, "epoch": 6.186714542190305, "step": 86150}, {"loss": 0.5121, "grad_norm": 1.1478904485702515, "learning_rate": 0.0002, "epoch": 6.187432675044883, "step": 86160}, {"loss": 0.5482, "grad_norm": 1.0764135122299194, "learning_rate": 0.0002, "epoch": 6.188150807899461, "step": 86170}, {"loss": 0.5448, "grad_norm": 1.195090889930725, "learning_rate": 0.0002, "epoch": 6.188868940754039, "step": 86180}, {"loss": 0.5461, "grad_norm": 1.089442253112793, "learning_rate": 0.0002, "epoch": 6.189587073608617, "step": 86190}, {"loss": 0.5415, "grad_norm": 0.9705546498298645, "learning_rate": 0.0002, "epoch": 6.190305206463195, "step": 86200}, {"loss": 0.5575, "grad_norm": 1.164642333984375, "learning_rate": 0.0002, "epoch": 6.191023339317773, "step": 86210}, {"loss": 0.5354, "grad_norm": 0.9551387429237366, "learning_rate": 0.0002, "epoch": 6.191741472172352, "step": 86220}, {"loss": 0.5237, "grad_norm": 1.0483227968215942, "learning_rate": 0.0002, "epoch": 6.19245960502693, "step": 86230}, {"loss": 0.5519, "grad_norm": 1.0068920850753784, "learning_rate": 0.0002, "epoch": 6.193177737881508, "step": 86240}, {"loss": 0.6136, "grad_norm": 1.142656683921814, "learning_rate": 0.0002, "epoch": 6.193895870736086, "step": 86250}, {"loss": 0.5722, "grad_norm": 1.1186467409133911, "learning_rate": 0.0002, "epoch": 6.194614003590664, "step": 86260}, {"loss": 0.5721, "grad_norm": 1.1664706468582153, "learning_rate": 0.0002, "epoch": 6.195332136445242, "step": 86270}, {"loss": 0.5397, "grad_norm": 1.2658511400222778, "learning_rate": 0.0002, "epoch": 6.19605026929982, "step": 86280}, {"loss": 0.5593, "grad_norm": 1.122759222984314, "learning_rate": 0.0002, "epoch": 6.196768402154398, "step": 86290}, {"loss": 0.5874, "grad_norm": 1.1611319780349731, "learning_rate": 0.0002, "epoch": 6.197486535008976, "step": 86300}, {"loss": 0.531, "grad_norm": 1.0476176738739014, "learning_rate": 0.0002, "epoch": 6.198204667863555, "step": 86310}, {"loss": 0.5455, "grad_norm": 1.2284801006317139, "learning_rate": 0.0002, "epoch": 6.198922800718133, "step": 86320}, {"loss": 0.5052, "grad_norm": 1.1340757608413696, "learning_rate": 0.0002, "epoch": 6.199640933572711, "step": 86330}, {"loss": 0.5651, "grad_norm": 1.045088768005371, "learning_rate": 0.0002, "epoch": 6.200359066427289, "step": 86340}, {"loss": 0.5606, "grad_norm": 1.1200770139694214, "learning_rate": 0.0002, "epoch": 6.201077199281867, "step": 86350}, {"loss": 0.5554, "grad_norm": 1.1879554986953735, "learning_rate": 0.0002, "epoch": 6.201795332136445, "step": 86360}, {"loss": 0.5442, "grad_norm": 1.1146271228790283, "learning_rate": 0.0002, "epoch": 6.202513464991023, "step": 86370}, {"loss": 0.5472, "grad_norm": 0.8934822678565979, "learning_rate": 0.0002, "epoch": 6.203231597845601, "step": 86380}, {"loss": 0.5663, "grad_norm": 1.21973717212677, "learning_rate": 0.0002, "epoch": 6.203949730700179, "step": 86390}, {"loss": 0.5351, "grad_norm": 0.9424970746040344, "learning_rate": 0.0002, "epoch": 6.204667863554757, "step": 86400}, {"loss": 0.5291, "grad_norm": 1.0036219358444214, "learning_rate": 0.0002, "epoch": 6.205385996409336, "step": 86410}, {"loss": 0.5117, "grad_norm": 0.9319575428962708, "learning_rate": 0.0002, "epoch": 6.206104129263914, "step": 86420}, {"loss": 0.5608, "grad_norm": 1.0548789501190186, "learning_rate": 0.0002, "epoch": 6.206822262118492, "step": 86430}, {"loss": 0.5556, "grad_norm": 0.9361019730567932, "learning_rate": 0.0002, "epoch": 6.20754039497307, "step": 86440}, {"loss": 0.5765, "grad_norm": 0.9350554347038269, "learning_rate": 0.0002, "epoch": 6.208258527827648, "step": 86450}, {"loss": 0.5616, "grad_norm": 1.291595458984375, "learning_rate": 0.0002, "epoch": 6.208976660682226, "step": 86460}, {"loss": 0.584, "grad_norm": 1.0414642095565796, "learning_rate": 0.0002, "epoch": 6.209694793536804, "step": 86470}, {"loss": 0.5282, "grad_norm": 1.1983444690704346, "learning_rate": 0.0002, "epoch": 6.210412926391382, "step": 86480}, {"loss": 0.493, "grad_norm": 0.9444540739059448, "learning_rate": 0.0002, "epoch": 6.21113105924596, "step": 86490}, {"loss": 0.5533, "grad_norm": 1.072526216506958, "learning_rate": 0.0002, "epoch": 6.211849192100539, "step": 86500}, {"loss": 0.5509, "grad_norm": 1.0109381675720215, "learning_rate": 0.0002, "epoch": 6.212567324955117, "step": 86510}, {"loss": 0.5244, "grad_norm": 1.1661816835403442, "learning_rate": 0.0002, "epoch": 6.213285457809695, "step": 86520}, {"loss": 0.5192, "grad_norm": 1.0434976816177368, "learning_rate": 0.0002, "epoch": 6.214003590664273, "step": 86530}, {"loss": 0.5732, "grad_norm": 1.1290796995162964, "learning_rate": 0.0002, "epoch": 6.214721723518851, "step": 86540}, {"loss": 0.5276, "grad_norm": 0.746512234210968, "learning_rate": 0.0002, "epoch": 6.215439856373429, "step": 86550}, {"loss": 0.5412, "grad_norm": 1.0346291065216064, "learning_rate": 0.0002, "epoch": 6.216157989228007, "step": 86560}, {"loss": 0.5452, "grad_norm": 1.2428497076034546, "learning_rate": 0.0002, "epoch": 6.216876122082585, "step": 86570}, {"loss": 0.4906, "grad_norm": 1.0040535926818848, "learning_rate": 0.0002, "epoch": 6.217594254937163, "step": 86580}, {"loss": 0.5368, "grad_norm": 0.9300616383552551, "learning_rate": 0.0002, "epoch": 6.218312387791742, "step": 86590}, {"loss": 0.51, "grad_norm": 1.0006635189056396, "learning_rate": 0.0002, "epoch": 6.21903052064632, "step": 86600}, {"loss": 0.573, "grad_norm": 1.1402281522750854, "learning_rate": 0.0002, "epoch": 6.219748653500898, "step": 86610}, {"loss": 0.5324, "grad_norm": 1.1543347835540771, "learning_rate": 0.0002, "epoch": 6.220466786355476, "step": 86620}, {"loss": 0.4904, "grad_norm": 1.1074384450912476, "learning_rate": 0.0002, "epoch": 6.221184919210054, "step": 86630}, {"loss": 0.5291, "grad_norm": 0.9032864570617676, "learning_rate": 0.0002, "epoch": 6.221903052064632, "step": 86640}, {"loss": 0.5651, "grad_norm": 1.094516396522522, "learning_rate": 0.0002, "epoch": 6.22262118491921, "step": 86650}, {"loss": 0.5723, "grad_norm": 1.2248685359954834, "learning_rate": 0.0002, "epoch": 6.223339317773788, "step": 86660}, {"loss": 0.5873, "grad_norm": 1.0211371183395386, "learning_rate": 0.0002, "epoch": 6.224057450628366, "step": 86670}, {"loss": 0.5459, "grad_norm": 1.0956611633300781, "learning_rate": 0.0002, "epoch": 6.224775583482945, "step": 86680}, {"loss": 0.5615, "grad_norm": 1.1494320631027222, "learning_rate": 0.0002, "epoch": 6.225493716337523, "step": 86690}, {"loss": 0.4953, "grad_norm": 0.968108594417572, "learning_rate": 0.0002, "epoch": 6.226211849192101, "step": 86700}, {"loss": 0.5349, "grad_norm": 1.376665711402893, "learning_rate": 0.0002, "epoch": 6.226929982046679, "step": 86710}, {"loss": 0.5285, "grad_norm": 1.2121574878692627, "learning_rate": 0.0002, "epoch": 6.227648114901257, "step": 86720}, {"loss": 0.534, "grad_norm": 1.001272439956665, "learning_rate": 0.0002, "epoch": 6.228366247755835, "step": 86730}, {"loss": 0.5684, "grad_norm": 0.9023162722587585, "learning_rate": 0.0002, "epoch": 6.229084380610413, "step": 86740}, {"loss": 0.5304, "grad_norm": 1.2660632133483887, "learning_rate": 0.0002, "epoch": 6.229802513464991, "step": 86750}, {"loss": 0.52, "grad_norm": 1.0549668073654175, "learning_rate": 0.0002, "epoch": 6.230520646319569, "step": 86760}, {"loss": 0.5268, "grad_norm": 1.0364645719528198, "learning_rate": 0.0002, "epoch": 6.231238779174147, "step": 86770}, {"loss": 0.5543, "grad_norm": 1.2197567224502563, "learning_rate": 0.0002, "epoch": 6.231956912028726, "step": 86780}, {"loss": 0.5675, "grad_norm": 0.8866947889328003, "learning_rate": 0.0002, "epoch": 6.232675044883304, "step": 86790}, {"loss": 0.5666, "grad_norm": 1.1795434951782227, "learning_rate": 0.0002, "epoch": 6.233393177737882, "step": 86800}, {"loss": 0.5309, "grad_norm": 1.0882378816604614, "learning_rate": 0.0002, "epoch": 6.23411131059246, "step": 86810}, {"loss": 0.5903, "grad_norm": 1.181888222694397, "learning_rate": 0.0002, "epoch": 6.234829443447038, "step": 86820}, {"loss": 0.5847, "grad_norm": 1.031209111213684, "learning_rate": 0.0002, "epoch": 6.235547576301616, "step": 86830}, {"loss": 0.5283, "grad_norm": 1.2889492511749268, "learning_rate": 0.0002, "epoch": 6.236265709156194, "step": 86840}, {"loss": 0.5409, "grad_norm": 0.874086856842041, "learning_rate": 0.0002, "epoch": 6.236983842010772, "step": 86850}, {"loss": 0.546, "grad_norm": 1.1912312507629395, "learning_rate": 0.0002, "epoch": 6.23770197486535, "step": 86860}, {"loss": 0.5446, "grad_norm": 1.0963071584701538, "learning_rate": 0.0002, "epoch": 6.238420107719929, "step": 86870}, {"loss": 0.5917, "grad_norm": 1.028746485710144, "learning_rate": 0.0002, "epoch": 6.239138240574507, "step": 86880}, {"loss": 0.5851, "grad_norm": 1.0736430883407593, "learning_rate": 0.0002, "epoch": 6.239856373429085, "step": 86890}, {"loss": 0.5773, "grad_norm": 0.9559927582740784, "learning_rate": 0.0002, "epoch": 6.240574506283663, "step": 86900}, {"loss": 0.5694, "grad_norm": 0.9696667790412903, "learning_rate": 0.0002, "epoch": 6.241292639138241, "step": 86910}, {"loss": 0.564, "grad_norm": 1.0710713863372803, "learning_rate": 0.0002, "epoch": 6.242010771992819, "step": 86920}, {"loss": 0.5557, "grad_norm": 1.0459970235824585, "learning_rate": 0.0002, "epoch": 6.242728904847397, "step": 86930}, {"loss": 0.5845, "grad_norm": 1.212083339691162, "learning_rate": 0.0002, "epoch": 6.243447037701975, "step": 86940}, {"loss": 0.5503, "grad_norm": 1.0369303226470947, "learning_rate": 0.0002, "epoch": 6.244165170556553, "step": 86950}, {"loss": 0.5468, "grad_norm": 1.180519700050354, "learning_rate": 0.0002, "epoch": 6.244883303411131, "step": 86960}, {"loss": 0.5969, "grad_norm": 1.0670114755630493, "learning_rate": 0.0002, "epoch": 6.2456014362657095, "step": 86970}, {"loss": 0.5712, "grad_norm": 1.072209119796753, "learning_rate": 0.0002, "epoch": 6.2463195691202875, "step": 86980}, {"loss": 0.5554, "grad_norm": 0.9642090201377869, "learning_rate": 0.0002, "epoch": 6.2470377019748655, "step": 86990}, {"loss": 0.5351, "grad_norm": 1.077467918395996, "learning_rate": 0.0002, "epoch": 6.2477558348294435, "step": 87000}, {"loss": 0.5434, "grad_norm": 1.1081476211547852, "learning_rate": 0.0002, "epoch": 6.2484739676840215, "step": 87010}, {"loss": 0.5692, "grad_norm": 0.8815084099769592, "learning_rate": 0.0002, "epoch": 6.2491921005385995, "step": 87020}, {"loss": 0.5649, "grad_norm": 0.8562555313110352, "learning_rate": 0.0002, "epoch": 6.2499102333931775, "step": 87030}, {"loss": 0.5305, "grad_norm": 0.8729159235954285, "learning_rate": 0.0002, "epoch": 6.2506283662477555, "step": 87040}, {"loss": 0.5179, "grad_norm": 1.005082368850708, "learning_rate": 0.0002, "epoch": 6.2513464991023335, "step": 87050}, {"loss": 0.5326, "grad_norm": 1.3991386890411377, "learning_rate": 0.0002, "epoch": 6.252064631956912, "step": 87060}, {"loss": 0.563, "grad_norm": 1.090180516242981, "learning_rate": 0.0002, "epoch": 6.25278276481149, "step": 87070}, {"loss": 0.6074, "grad_norm": 1.08149254322052, "learning_rate": 0.0002, "epoch": 6.253500897666068, "step": 87080}, {"loss": 0.5663, "grad_norm": 1.1021103858947754, "learning_rate": 0.0002, "epoch": 6.254219030520646, "step": 87090}, {"loss": 0.5744, "grad_norm": 1.2393771409988403, "learning_rate": 0.0002, "epoch": 6.254937163375224, "step": 87100}, {"loss": 0.5379, "grad_norm": 0.9702037572860718, "learning_rate": 0.0002, "epoch": 6.255655296229802, "step": 87110}, {"loss": 0.546, "grad_norm": 1.203088641166687, "learning_rate": 0.0002, "epoch": 6.25637342908438, "step": 87120}, {"loss": 0.5315, "grad_norm": 0.9722330570220947, "learning_rate": 0.0002, "epoch": 6.257091561938958, "step": 87130}, {"loss": 0.5864, "grad_norm": 0.9802384376525879, "learning_rate": 0.0002, "epoch": 6.257809694793536, "step": 87140}, {"loss": 0.5751, "grad_norm": 0.9991751909255981, "learning_rate": 0.0002, "epoch": 6.258527827648114, "step": 87150}, {"loss": 0.5574, "grad_norm": 1.1102324724197388, "learning_rate": 0.0002, "epoch": 6.259245960502693, "step": 87160}, {"loss": 0.545, "grad_norm": 1.1357909440994263, "learning_rate": 0.0002, "epoch": 6.259964093357271, "step": 87170}, {"loss": 0.5066, "grad_norm": 1.1128548383712769, "learning_rate": 0.0002, "epoch": 6.260682226211849, "step": 87180}, {"loss": 0.6394, "grad_norm": 1.1135061979293823, "learning_rate": 0.0002, "epoch": 6.261400359066427, "step": 87190}, {"loss": 0.4923, "grad_norm": 0.9545563459396362, "learning_rate": 0.0002, "epoch": 6.262118491921005, "step": 87200}, {"loss": 0.555, "grad_norm": 1.3011159896850586, "learning_rate": 0.0002, "epoch": 6.262836624775583, "step": 87210}, {"loss": 0.5517, "grad_norm": 1.217691421508789, "learning_rate": 0.0002, "epoch": 6.263554757630161, "step": 87220}, {"loss": 0.5316, "grad_norm": 0.9615218043327332, "learning_rate": 0.0002, "epoch": 6.264272890484739, "step": 87230}, {"loss": 0.5702, "grad_norm": 0.9935932159423828, "learning_rate": 0.0002, "epoch": 6.264991023339318, "step": 87240}, {"loss": 0.5313, "grad_norm": 1.01247239112854, "learning_rate": 0.0002, "epoch": 6.265709156193896, "step": 87250}, {"loss": 0.5723, "grad_norm": 1.1960358619689941, "learning_rate": 0.0002, "epoch": 6.266427289048474, "step": 87260}, {"loss": 0.5381, "grad_norm": 1.053942322731018, "learning_rate": 0.0002, "epoch": 6.267145421903052, "step": 87270}, {"loss": 0.5679, "grad_norm": 1.2450612783432007, "learning_rate": 0.0002, "epoch": 6.26786355475763, "step": 87280}, {"loss": 0.5149, "grad_norm": 0.7816058397293091, "learning_rate": 0.0002, "epoch": 6.268581687612208, "step": 87290}, {"loss": 0.549, "grad_norm": 1.014817237854004, "learning_rate": 0.0002, "epoch": 6.269299820466786, "step": 87300}, {"loss": 0.5787, "grad_norm": 1.1871070861816406, "learning_rate": 0.0002, "epoch": 6.270017953321364, "step": 87310}, {"loss": 0.5103, "grad_norm": 1.0170562267303467, "learning_rate": 0.0002, "epoch": 6.270736086175942, "step": 87320}, {"loss": 0.555, "grad_norm": 1.216288685798645, "learning_rate": 0.0002, "epoch": 6.27145421903052, "step": 87330}, {"loss": 0.5648, "grad_norm": 0.8846057653427124, "learning_rate": 0.0002, "epoch": 6.272172351885099, "step": 87340}, {"loss": 0.5781, "grad_norm": 1.181233286857605, "learning_rate": 0.0002, "epoch": 6.272890484739677, "step": 87350}, {"loss": 0.5359, "grad_norm": 1.0051873922348022, "learning_rate": 0.0002, "epoch": 6.273608617594255, "step": 87360}, {"loss": 0.5674, "grad_norm": 1.1179516315460205, "learning_rate": 0.0002, "epoch": 6.274326750448833, "step": 87370}, {"loss": 0.5935, "grad_norm": 1.0118002891540527, "learning_rate": 0.0002, "epoch": 6.275044883303411, "step": 87380}, {"loss": 0.5789, "grad_norm": 1.0948026180267334, "learning_rate": 0.0002, "epoch": 6.275763016157989, "step": 87390}, {"loss": 0.5277, "grad_norm": 1.0836515426635742, "learning_rate": 0.0002, "epoch": 6.276481149012567, "step": 87400}, {"loss": 0.5663, "grad_norm": 0.9548853039741516, "learning_rate": 0.0002, "epoch": 6.277199281867145, "step": 87410}, {"loss": 0.58, "grad_norm": 1.2531564235687256, "learning_rate": 0.0002, "epoch": 6.277917414721723, "step": 87420}, {"loss": 0.5651, "grad_norm": 1.010250449180603, "learning_rate": 0.0002, "epoch": 6.278635547576302, "step": 87430}, {"loss": 0.6222, "grad_norm": 1.3306254148483276, "learning_rate": 0.0002, "epoch": 6.27935368043088, "step": 87440}, {"loss": 0.5397, "grad_norm": 0.9485062956809998, "learning_rate": 0.0002, "epoch": 6.280071813285458, "step": 87450}, {"loss": 0.5441, "grad_norm": 0.9938563704490662, "learning_rate": 0.0002, "epoch": 6.280789946140036, "step": 87460}, {"loss": 0.5546, "grad_norm": 1.1747362613677979, "learning_rate": 0.0002, "epoch": 6.281508078994614, "step": 87470}, {"loss": 0.566, "grad_norm": 1.1712254285812378, "learning_rate": 0.0002, "epoch": 6.282226211849192, "step": 87480}, {"loss": 0.6165, "grad_norm": 1.1453865766525269, "learning_rate": 0.0002, "epoch": 6.28294434470377, "step": 87490}, {"loss": 0.535, "grad_norm": 0.974902331829071, "learning_rate": 0.0002, "epoch": 6.283662477558348, "step": 87500}, {"loss": 0.5354, "grad_norm": 1.1181912422180176, "learning_rate": 0.0002, "epoch": 6.284380610412926, "step": 87510}, {"loss": 0.5276, "grad_norm": 1.047453761100769, "learning_rate": 0.0002, "epoch": 6.285098743267504, "step": 87520}, {"loss": 0.5689, "grad_norm": 1.185815453529358, "learning_rate": 0.0002, "epoch": 6.285816876122083, "step": 87530}, {"loss": 0.5531, "grad_norm": 1.1126786470413208, "learning_rate": 0.0002, "epoch": 6.286535008976661, "step": 87540}, {"loss": 0.5619, "grad_norm": 1.0931676626205444, "learning_rate": 0.0002, "epoch": 6.287253141831239, "step": 87550}, {"loss": 0.5625, "grad_norm": 0.9930597543716431, "learning_rate": 0.0002, "epoch": 6.287971274685817, "step": 87560}, {"loss": 0.5637, "grad_norm": 0.9909583926200867, "learning_rate": 0.0002, "epoch": 6.288689407540395, "step": 87570}, {"loss": 0.5462, "grad_norm": 1.3766822814941406, "learning_rate": 0.0002, "epoch": 6.289407540394973, "step": 87580}, {"loss": 0.5544, "grad_norm": 1.0137864351272583, "learning_rate": 0.0002, "epoch": 6.290125673249551, "step": 87590}, {"loss": 0.5678, "grad_norm": 0.8761594295501709, "learning_rate": 0.0002, "epoch": 6.290843806104129, "step": 87600}, {"loss": 0.5393, "grad_norm": 1.155881404876709, "learning_rate": 0.0002, "epoch": 6.291561938958707, "step": 87610}, {"loss": 0.5606, "grad_norm": 0.9972963333129883, "learning_rate": 0.0002, "epoch": 6.292280071813286, "step": 87620}, {"loss": 0.5776, "grad_norm": 1.195021152496338, "learning_rate": 0.0002, "epoch": 6.292998204667864, "step": 87630}, {"loss": 0.5567, "grad_norm": 0.9872829914093018, "learning_rate": 0.0002, "epoch": 6.293716337522442, "step": 87640}, {"loss": 0.588, "grad_norm": 1.3643794059753418, "learning_rate": 0.0002, "epoch": 6.29443447037702, "step": 87650}, {"loss": 0.5181, "grad_norm": 0.9389668703079224, "learning_rate": 0.0002, "epoch": 6.295152603231598, "step": 87660}, {"loss": 0.5284, "grad_norm": 1.379319429397583, "learning_rate": 0.0002, "epoch": 6.295870736086176, "step": 87670}, {"loss": 0.5091, "grad_norm": 1.1253849267959595, "learning_rate": 0.0002, "epoch": 6.296588868940754, "step": 87680}, {"loss": 0.5383, "grad_norm": 1.2402328252792358, "learning_rate": 0.0002, "epoch": 6.297307001795332, "step": 87690}, {"loss": 0.5803, "grad_norm": 1.085004210472107, "learning_rate": 0.0002, "epoch": 6.29802513464991, "step": 87700}, {"loss": 0.5705, "grad_norm": 1.0939021110534668, "learning_rate": 0.0002, "epoch": 6.298743267504488, "step": 87710}, {"loss": 0.5391, "grad_norm": 1.0350301265716553, "learning_rate": 0.0002, "epoch": 6.299461400359067, "step": 87720}, {"loss": 0.5269, "grad_norm": 0.9862944483757019, "learning_rate": 0.0002, "epoch": 6.300179533213645, "step": 87730}, {"loss": 0.5378, "grad_norm": 0.990942656993866, "learning_rate": 0.0002, "epoch": 6.300897666068223, "step": 87740}, {"loss": 0.4843, "grad_norm": 0.9287887215614319, "learning_rate": 0.0002, "epoch": 6.301615798922801, "step": 87750}, {"loss": 0.5602, "grad_norm": 1.225714087486267, "learning_rate": 0.0002, "epoch": 6.302333931777379, "step": 87760}, {"loss": 0.5513, "grad_norm": 1.0181951522827148, "learning_rate": 0.0002, "epoch": 6.303052064631957, "step": 87770}, {"loss": 0.563, "grad_norm": 0.9808282256126404, "learning_rate": 0.0002, "epoch": 6.303770197486535, "step": 87780}, {"loss": 0.5738, "grad_norm": 1.1413379907608032, "learning_rate": 0.0002, "epoch": 6.304488330341113, "step": 87790}, {"loss": 0.5548, "grad_norm": 1.1188091039657593, "learning_rate": 0.0002, "epoch": 6.305206463195692, "step": 87800}, {"loss": 0.497, "grad_norm": 1.297154188156128, "learning_rate": 0.0002, "epoch": 6.30592459605027, "step": 87810}, {"loss": 0.5481, "grad_norm": 1.0723271369934082, "learning_rate": 0.0002, "epoch": 6.306642728904848, "step": 87820}, {"loss": 0.567, "grad_norm": 1.067265510559082, "learning_rate": 0.0002, "epoch": 6.307360861759426, "step": 87830}, {"loss": 0.5893, "grad_norm": 1.01328444480896, "learning_rate": 0.0002, "epoch": 6.308078994614004, "step": 87840}, {"loss": 0.5169, "grad_norm": 1.092671513557434, "learning_rate": 0.0002, "epoch": 6.308797127468582, "step": 87850}, {"loss": 0.6079, "grad_norm": 1.168721079826355, "learning_rate": 0.0002, "epoch": 6.30951526032316, "step": 87860}, {"loss": 0.5355, "grad_norm": 1.165495753288269, "learning_rate": 0.0002, "epoch": 6.310233393177738, "step": 87870}, {"loss": 0.6015, "grad_norm": 1.10816490650177, "learning_rate": 0.0002, "epoch": 6.310951526032316, "step": 87880}, {"loss": 0.5259, "grad_norm": 0.9667611718177795, "learning_rate": 0.0002, "epoch": 6.311669658886894, "step": 87890}, {"loss": 0.589, "grad_norm": 1.22564697265625, "learning_rate": 0.0002, "epoch": 6.312387791741473, "step": 87900}, {"loss": 0.5574, "grad_norm": 1.1156506538391113, "learning_rate": 0.0002, "epoch": 6.313105924596051, "step": 87910}, {"loss": 0.5324, "grad_norm": 1.03804349899292, "learning_rate": 0.0002, "epoch": 6.313824057450629, "step": 87920}, {"loss": 0.5577, "grad_norm": 0.9424136281013489, "learning_rate": 0.0002, "epoch": 6.314542190305207, "step": 87930}, {"loss": 0.5654, "grad_norm": 1.2243257761001587, "learning_rate": 0.0002, "epoch": 6.315260323159785, "step": 87940}, {"loss": 0.5884, "grad_norm": 1.0930471420288086, "learning_rate": 0.0002, "epoch": 6.315978456014363, "step": 87950}, {"loss": 0.5227, "grad_norm": 1.096875548362732, "learning_rate": 0.0002, "epoch": 6.316696588868941, "step": 87960}, {"loss": 0.5514, "grad_norm": 1.0606242418289185, "learning_rate": 0.0002, "epoch": 6.317414721723519, "step": 87970}, {"loss": 0.5409, "grad_norm": 0.8657089471817017, "learning_rate": 0.0002, "epoch": 6.318132854578097, "step": 87980}, {"loss": 0.5496, "grad_norm": 0.9751629829406738, "learning_rate": 0.0002, "epoch": 6.3188509874326755, "step": 87990}, {"loss": 0.5677, "grad_norm": 1.0751961469650269, "learning_rate": 0.0002, "epoch": 6.3195691202872535, "step": 88000}, {"loss": 0.5408, "grad_norm": 1.0679874420166016, "learning_rate": 0.0002, "epoch": 6.3202872531418315, "step": 88010}, {"loss": 0.5695, "grad_norm": 1.4102588891983032, "learning_rate": 0.0002, "epoch": 6.3210053859964095, "step": 88020}, {"loss": 0.5744, "grad_norm": 0.8747799396514893, "learning_rate": 0.0002, "epoch": 6.3217235188509875, "step": 88030}, {"loss": 0.6024, "grad_norm": 1.0866155624389648, "learning_rate": 0.0002, "epoch": 6.3224416517055655, "step": 88040}, {"loss": 0.5964, "grad_norm": 1.2255747318267822, "learning_rate": 0.0002, "epoch": 6.3231597845601435, "step": 88050}, {"loss": 0.5536, "grad_norm": 1.031588077545166, "learning_rate": 0.0002, "epoch": 6.3238779174147215, "step": 88060}, {"loss": 0.5631, "grad_norm": 1.1994154453277588, "learning_rate": 0.0002, "epoch": 6.3245960502692995, "step": 88070}, {"loss": 0.5644, "grad_norm": 0.9172461032867432, "learning_rate": 0.0002, "epoch": 6.3253141831238775, "step": 88080}, {"loss": 0.5739, "grad_norm": 0.8762667775154114, "learning_rate": 0.0002, "epoch": 6.326032315978456, "step": 88090}, {"loss": 0.558, "grad_norm": 1.166225790977478, "learning_rate": 0.0002, "epoch": 6.326750448833034, "step": 88100}, {"loss": 0.5688, "grad_norm": 1.014858365058899, "learning_rate": 0.0002, "epoch": 6.327468581687612, "step": 88110}, {"loss": 0.5783, "grad_norm": 1.1080266237258911, "learning_rate": 0.0002, "epoch": 6.32818671454219, "step": 88120}, {"loss": 0.6146, "grad_norm": 0.9775443077087402, "learning_rate": 0.0002, "epoch": 6.328904847396768, "step": 88130}, {"loss": 0.5658, "grad_norm": 0.9032314419746399, "learning_rate": 0.0002, "epoch": 6.329622980251346, "step": 88140}, {"loss": 0.5139, "grad_norm": 1.0170091390609741, "learning_rate": 0.0002, "epoch": 6.330341113105924, "step": 88150}, {"loss": 0.5155, "grad_norm": 0.9412024617195129, "learning_rate": 0.0002, "epoch": 6.331059245960502, "step": 88160}, {"loss": 0.5454, "grad_norm": 0.9090259671211243, "learning_rate": 0.0002, "epoch": 6.33177737881508, "step": 88170}, {"loss": 0.5564, "grad_norm": 0.8896998167037964, "learning_rate": 0.0002, "epoch": 6.332495511669659, "step": 88180}, {"loss": 0.5536, "grad_norm": 1.1648571491241455, "learning_rate": 0.0002, "epoch": 6.333213644524237, "step": 88190}, {"loss": 0.5439, "grad_norm": 1.13261878490448, "learning_rate": 0.0002, "epoch": 6.333931777378815, "step": 88200}, {"loss": 0.5367, "grad_norm": 0.9561943411827087, "learning_rate": 0.0002, "epoch": 6.334649910233393, "step": 88210}, {"loss": 0.548, "grad_norm": 1.3076379299163818, "learning_rate": 0.0002, "epoch": 6.335368043087971, "step": 88220}, {"loss": 0.5706, "grad_norm": 0.9788665175437927, "learning_rate": 0.0002, "epoch": 6.336086175942549, "step": 88230}, {"loss": 0.5439, "grad_norm": 1.2843645811080933, "learning_rate": 0.0002, "epoch": 6.336804308797127, "step": 88240}, {"loss": 0.5174, "grad_norm": 1.1531981229782104, "learning_rate": 0.0002, "epoch": 6.337522441651705, "step": 88250}, {"loss": 0.5746, "grad_norm": 1.1946183443069458, "learning_rate": 0.0002, "epoch": 6.338240574506283, "step": 88260}, {"loss": 0.5778, "grad_norm": 1.1190218925476074, "learning_rate": 0.0002, "epoch": 6.338958707360861, "step": 88270}, {"loss": 0.5175, "grad_norm": 1.0605140924453735, "learning_rate": 0.0002, "epoch": 6.33967684021544, "step": 88280}, {"loss": 0.5435, "grad_norm": 1.0237314701080322, "learning_rate": 0.0002, "epoch": 6.340394973070018, "step": 88290}, {"loss": 0.5595, "grad_norm": 1.1268457174301147, "learning_rate": 0.0002, "epoch": 6.341113105924596, "step": 88300}, {"loss": 0.5706, "grad_norm": 1.0750062465667725, "learning_rate": 0.0002, "epoch": 6.341831238779174, "step": 88310}, {"loss": 0.5334, "grad_norm": 1.2356536388397217, "learning_rate": 0.0002, "epoch": 6.342549371633752, "step": 88320}, {"loss": 0.5143, "grad_norm": 1.0375114679336548, "learning_rate": 0.0002, "epoch": 6.34326750448833, "step": 88330}, {"loss": 0.5583, "grad_norm": 1.063388705253601, "learning_rate": 0.0002, "epoch": 6.343985637342908, "step": 88340}, {"loss": 0.5301, "grad_norm": 0.9182760715484619, "learning_rate": 0.0002, "epoch": 6.344703770197486, "step": 88350}, {"loss": 0.5896, "grad_norm": 0.9787414073944092, "learning_rate": 0.0002, "epoch": 6.345421903052064, "step": 88360}, {"loss": 0.579, "grad_norm": 1.295432448387146, "learning_rate": 0.0002, "epoch": 6.346140035906643, "step": 88370}, {"loss": 0.5737, "grad_norm": 0.9269146919250488, "learning_rate": 0.0002, "epoch": 6.346858168761221, "step": 88380}, {"loss": 0.5551, "grad_norm": 0.9076777696609497, "learning_rate": 0.0002, "epoch": 6.347576301615799, "step": 88390}, {"loss": 0.5542, "grad_norm": 1.1186468601226807, "learning_rate": 0.0002, "epoch": 6.348294434470377, "step": 88400}, {"loss": 0.5806, "grad_norm": 1.1021504402160645, "learning_rate": 0.0002, "epoch": 6.349012567324955, "step": 88410}, {"loss": 0.5717, "grad_norm": 1.2439358234405518, "learning_rate": 0.0002, "epoch": 6.349730700179533, "step": 88420}, {"loss": 0.5384, "grad_norm": 1.1228888034820557, "learning_rate": 0.0002, "epoch": 6.350448833034111, "step": 88430}, {"loss": 0.5634, "grad_norm": 1.226587176322937, "learning_rate": 0.0002, "epoch": 6.351166965888689, "step": 88440}, {"loss": 0.5676, "grad_norm": 1.2813525199890137, "learning_rate": 0.0002, "epoch": 6.351885098743267, "step": 88450}, {"loss": 0.544, "grad_norm": 1.411405086517334, "learning_rate": 0.0002, "epoch": 6.352603231597846, "step": 88460}, {"loss": 0.5349, "grad_norm": 1.3659696578979492, "learning_rate": 0.0002, "epoch": 6.353321364452424, "step": 88470}, {"loss": 0.5453, "grad_norm": 1.1398485898971558, "learning_rate": 0.0002, "epoch": 6.354039497307002, "step": 88480}, {"loss": 0.5628, "grad_norm": 1.2088590860366821, "learning_rate": 0.0002, "epoch": 6.35475763016158, "step": 88490}, {"loss": 0.4978, "grad_norm": 0.9191108345985413, "learning_rate": 0.0002, "epoch": 6.355475763016158, "step": 88500}, {"loss": 0.5091, "grad_norm": 0.9855144619941711, "learning_rate": 0.0002, "epoch": 6.356193895870736, "step": 88510}, {"loss": 0.5635, "grad_norm": 1.0576577186584473, "learning_rate": 0.0002, "epoch": 6.356912028725314, "step": 88520}, {"loss": 0.5081, "grad_norm": 1.0213230848312378, "learning_rate": 0.0002, "epoch": 6.357630161579892, "step": 88530}, {"loss": 0.6141, "grad_norm": 1.2086849212646484, "learning_rate": 0.0002, "epoch": 6.35834829443447, "step": 88540}, {"loss": 0.5477, "grad_norm": 1.05294930934906, "learning_rate": 0.0002, "epoch": 6.359066427289049, "step": 88550}, {"loss": 0.5991, "grad_norm": 1.1798300743103027, "learning_rate": 0.0002, "epoch": 6.359784560143627, "step": 88560}, {"loss": 0.551, "grad_norm": 1.088749885559082, "learning_rate": 0.0002, "epoch": 6.360502692998205, "step": 88570}, {"loss": 0.5299, "grad_norm": 1.0071386098861694, "learning_rate": 0.0002, "epoch": 6.361220825852783, "step": 88580}, {"loss": 0.5691, "grad_norm": 1.2080132961273193, "learning_rate": 0.0002, "epoch": 6.361938958707361, "step": 88590}, {"loss": 0.5637, "grad_norm": 0.9784366488456726, "learning_rate": 0.0002, "epoch": 6.362657091561939, "step": 88600}, {"loss": 0.5499, "grad_norm": 0.9475322961807251, "learning_rate": 0.0002, "epoch": 6.363375224416517, "step": 88610}, {"loss": 0.5467, "grad_norm": 0.8267584443092346, "learning_rate": 0.0002, "epoch": 6.364093357271095, "step": 88620}, {"loss": 0.591, "grad_norm": 1.05606210231781, "learning_rate": 0.0002, "epoch": 6.364811490125673, "step": 88630}, {"loss": 0.5859, "grad_norm": 1.2059335708618164, "learning_rate": 0.0002, "epoch": 6.365529622980251, "step": 88640}, {"loss": 0.5992, "grad_norm": 1.1900845766067505, "learning_rate": 0.0002, "epoch": 6.36624775583483, "step": 88650}, {"loss": 0.5618, "grad_norm": 1.0271358489990234, "learning_rate": 0.0002, "epoch": 6.366965888689408, "step": 88660}, {"loss": 0.5363, "grad_norm": 1.1839162111282349, "learning_rate": 0.0002, "epoch": 6.367684021543986, "step": 88670}, {"loss": 0.5508, "grad_norm": 0.9042913317680359, "learning_rate": 0.0002, "epoch": 6.368402154398564, "step": 88680}, {"loss": 0.5253, "grad_norm": 1.079893946647644, "learning_rate": 0.0002, "epoch": 6.369120287253142, "step": 88690}, {"loss": 0.5414, "grad_norm": 1.0999629497528076, "learning_rate": 0.0002, "epoch": 6.36983842010772, "step": 88700}, {"loss": 0.57, "grad_norm": 1.0618157386779785, "learning_rate": 0.0002, "epoch": 6.370556552962298, "step": 88710}, {"loss": 0.5559, "grad_norm": 0.9567645788192749, "learning_rate": 0.0002, "epoch": 6.371274685816876, "step": 88720}, {"loss": 0.5547, "grad_norm": 1.0342025756835938, "learning_rate": 0.0002, "epoch": 6.371992818671454, "step": 88730}, {"loss": 0.5302, "grad_norm": 1.0789190530776978, "learning_rate": 0.0002, "epoch": 6.372710951526033, "step": 88740}, {"loss": 0.5394, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 6.373429084380611, "step": 88750}, {"loss": 0.5739, "grad_norm": 0.9103280305862427, "learning_rate": 0.0002, "epoch": 6.374147217235189, "step": 88760}, {"loss": 0.5313, "grad_norm": 0.9856002330780029, "learning_rate": 0.0002, "epoch": 6.374865350089767, "step": 88770}, {"loss": 0.5482, "grad_norm": 1.1801226139068604, "learning_rate": 0.0002, "epoch": 6.375583482944345, "step": 88780}, {"loss": 0.584, "grad_norm": 0.9876776933670044, "learning_rate": 0.0002, "epoch": 6.376301615798923, "step": 88790}, {"loss": 0.5633, "grad_norm": 1.0169886350631714, "learning_rate": 0.0002, "epoch": 6.377019748653501, "step": 88800}, {"loss": 0.5525, "grad_norm": 1.0118076801300049, "learning_rate": 0.0002, "epoch": 6.377737881508079, "step": 88810}, {"loss": 0.5205, "grad_norm": 1.0641456842422485, "learning_rate": 0.0002, "epoch": 6.378456014362657, "step": 88820}, {"loss": 0.5816, "grad_norm": 1.1138534545898438, "learning_rate": 0.0002, "epoch": 6.379174147217235, "step": 88830}, {"loss": 0.5979, "grad_norm": 1.1518962383270264, "learning_rate": 0.0002, "epoch": 6.379892280071814, "step": 88840}, {"loss": 0.5644, "grad_norm": 1.3662128448486328, "learning_rate": 0.0002, "epoch": 6.380610412926392, "step": 88850}, {"loss": 0.5662, "grad_norm": 0.9544311761856079, "learning_rate": 0.0002, "epoch": 6.38132854578097, "step": 88860}, {"loss": 0.5721, "grad_norm": 0.9747556447982788, "learning_rate": 0.0002, "epoch": 6.382046678635548, "step": 88870}, {"loss": 0.5458, "grad_norm": 1.1651948690414429, "learning_rate": 0.0002, "epoch": 6.382764811490126, "step": 88880}, {"loss": 0.5644, "grad_norm": 1.4048396348953247, "learning_rate": 0.0002, "epoch": 6.383482944344704, "step": 88890}, {"loss": 0.5686, "grad_norm": 1.1144068241119385, "learning_rate": 0.0002, "epoch": 6.384201077199282, "step": 88900}, {"loss": 0.5572, "grad_norm": 1.2978034019470215, "learning_rate": 0.0002, "epoch": 6.38491921005386, "step": 88910}, {"loss": 0.5279, "grad_norm": 1.1776132583618164, "learning_rate": 0.0002, "epoch": 6.385637342908438, "step": 88920}, {"loss": 0.5844, "grad_norm": 0.8849034905433655, "learning_rate": 0.0002, "epoch": 6.3863554757630165, "step": 88930}, {"loss": 0.5566, "grad_norm": 1.1207057237625122, "learning_rate": 0.0002, "epoch": 6.3870736086175945, "step": 88940}, {"loss": 0.5889, "grad_norm": 0.9364172220230103, "learning_rate": 0.0002, "epoch": 6.3877917414721725, "step": 88950}, {"loss": 0.5788, "grad_norm": 1.1731317043304443, "learning_rate": 0.0002, "epoch": 6.3885098743267505, "step": 88960}, {"loss": 0.5743, "grad_norm": 1.0411573648452759, "learning_rate": 0.0002, "epoch": 6.3892280071813286, "step": 88970}, {"loss": 0.557, "grad_norm": 1.0817447900772095, "learning_rate": 0.0002, "epoch": 6.3899461400359066, "step": 88980}, {"loss": 0.5715, "grad_norm": 1.0037593841552734, "learning_rate": 0.0002, "epoch": 6.3906642728904846, "step": 88990}, {"loss": 0.562, "grad_norm": 1.1684437990188599, "learning_rate": 0.0002, "epoch": 6.391382405745063, "step": 89000}, {"loss": 0.5544, "grad_norm": 1.0237388610839844, "learning_rate": 0.0002, "epoch": 6.392100538599641, "step": 89010}, {"loss": 0.607, "grad_norm": 1.24791419506073, "learning_rate": 0.0002, "epoch": 6.392818671454219, "step": 89020}, {"loss": 0.5139, "grad_norm": 0.842664897441864, "learning_rate": 0.0002, "epoch": 6.3935368043087974, "step": 89030}, {"loss": 0.5606, "grad_norm": 1.1692326068878174, "learning_rate": 0.0002, "epoch": 6.3942549371633755, "step": 89040}, {"loss": 0.5656, "grad_norm": 1.0786939859390259, "learning_rate": 0.0002, "epoch": 6.3949730700179535, "step": 89050}, {"loss": 0.5901, "grad_norm": 1.1315077543258667, "learning_rate": 0.0002, "epoch": 6.3956912028725315, "step": 89060}, {"loss": 0.5642, "grad_norm": 0.9949214458465576, "learning_rate": 0.0002, "epoch": 6.3964093357271095, "step": 89070}, {"loss": 0.5367, "grad_norm": 1.0302025079727173, "learning_rate": 0.0002, "epoch": 6.3971274685816875, "step": 89080}, {"loss": 0.5453, "grad_norm": 0.9664030075073242, "learning_rate": 0.0002, "epoch": 6.3978456014362655, "step": 89090}, {"loss": 0.5496, "grad_norm": 1.1251037120819092, "learning_rate": 0.0002, "epoch": 6.3985637342908435, "step": 89100}, {"loss": 0.56, "grad_norm": 1.1103272438049316, "learning_rate": 0.0002, "epoch": 6.399281867145422, "step": 89110}, {"loss": 0.5703, "grad_norm": 0.9192888736724854, "learning_rate": 0.0002, "epoch": 6.4, "step": 89120}, {"loss": 0.5436, "grad_norm": 1.027806043624878, "learning_rate": 0.0002, "epoch": 6.400718132854578, "step": 89130}, {"loss": 0.608, "grad_norm": 1.1219452619552612, "learning_rate": 0.0002, "epoch": 6.401436265709156, "step": 89140}, {"loss": 0.5488, "grad_norm": 1.1703979969024658, "learning_rate": 0.0002, "epoch": 6.402154398563734, "step": 89150}, {"loss": 0.5251, "grad_norm": 1.025874376296997, "learning_rate": 0.0002, "epoch": 6.402872531418312, "step": 89160}, {"loss": 0.5476, "grad_norm": 1.070225715637207, "learning_rate": 0.0002, "epoch": 6.40359066427289, "step": 89170}, {"loss": 0.5539, "grad_norm": 1.1915208101272583, "learning_rate": 0.0002, "epoch": 6.404308797127468, "step": 89180}, {"loss": 0.5504, "grad_norm": 1.1954079866409302, "learning_rate": 0.0002, "epoch": 6.405026929982046, "step": 89190}, {"loss": 0.558, "grad_norm": 1.035910964012146, "learning_rate": 0.0002, "epoch": 6.405745062836624, "step": 89200}, {"loss": 0.586, "grad_norm": 1.1363351345062256, "learning_rate": 0.0002, "epoch": 6.406463195691203, "step": 89210}, {"loss": 0.5594, "grad_norm": 1.2086843252182007, "learning_rate": 0.0002, "epoch": 6.407181328545781, "step": 89220}, {"loss": 0.5928, "grad_norm": 1.3492387533187866, "learning_rate": 0.0002, "epoch": 6.407899461400359, "step": 89230}, {"loss": 0.5679, "grad_norm": 0.8746330738067627, "learning_rate": 0.0002, "epoch": 6.408617594254937, "step": 89240}, {"loss": 0.5818, "grad_norm": 1.0165427923202515, "learning_rate": 0.0002, "epoch": 6.409335727109515, "step": 89250}, {"loss": 0.5437, "grad_norm": 1.0314675569534302, "learning_rate": 0.0002, "epoch": 6.410053859964093, "step": 89260}, {"loss": 0.5741, "grad_norm": 1.2128242254257202, "learning_rate": 0.0002, "epoch": 6.410771992818671, "step": 89270}, {"loss": 0.59, "grad_norm": 0.9496060013771057, "learning_rate": 0.0002, "epoch": 6.411490125673249, "step": 89280}, {"loss": 0.5949, "grad_norm": 1.1838264465332031, "learning_rate": 0.0002, "epoch": 6.412208258527827, "step": 89290}, {"loss": 0.543, "grad_norm": 1.1700918674468994, "learning_rate": 0.0002, "epoch": 6.412926391382406, "step": 89300}, {"loss": 0.5185, "grad_norm": 1.2102051973342896, "learning_rate": 0.0002, "epoch": 6.413644524236984, "step": 89310}, {"loss": 0.5516, "grad_norm": 0.9485594630241394, "learning_rate": 0.0002, "epoch": 6.414362657091562, "step": 89320}, {"loss": 0.5516, "grad_norm": 1.041496753692627, "learning_rate": 0.0002, "epoch": 6.41508078994614, "step": 89330}, {"loss": 0.545, "grad_norm": 1.0785019397735596, "learning_rate": 0.0002, "epoch": 6.415798922800718, "step": 89340}, {"loss": 0.5553, "grad_norm": 0.9527593851089478, "learning_rate": 0.0002, "epoch": 6.416517055655296, "step": 89350}, {"loss": 0.5624, "grad_norm": 0.9879035353660583, "learning_rate": 0.0002, "epoch": 6.417235188509874, "step": 89360}, {"loss": 0.5614, "grad_norm": 0.9143751263618469, "learning_rate": 0.0002, "epoch": 6.417953321364452, "step": 89370}, {"loss": 0.6034, "grad_norm": 0.9145408272743225, "learning_rate": 0.0002, "epoch": 6.41867145421903, "step": 89380}, {"loss": 0.5355, "grad_norm": 1.0128624439239502, "learning_rate": 0.0002, "epoch": 6.419389587073608, "step": 89390}, {"loss": 0.5581, "grad_norm": 0.9454543590545654, "learning_rate": 0.0002, "epoch": 6.420107719928187, "step": 89400}, {"loss": 0.6192, "grad_norm": 1.0659215450286865, "learning_rate": 0.0002, "epoch": 6.420825852782765, "step": 89410}, {"loss": 0.5645, "grad_norm": 1.1622642278671265, "learning_rate": 0.0002, "epoch": 6.421543985637343, "step": 89420}, {"loss": 0.5868, "grad_norm": 0.9805575013160706, "learning_rate": 0.0002, "epoch": 6.422262118491921, "step": 89430}, {"loss": 0.5743, "grad_norm": 0.871903121471405, "learning_rate": 0.0002, "epoch": 6.422980251346499, "step": 89440}, {"loss": 0.5537, "grad_norm": 0.992355227470398, "learning_rate": 0.0002, "epoch": 6.423698384201077, "step": 89450}, {"loss": 0.5453, "grad_norm": 1.4055765867233276, "learning_rate": 0.0002, "epoch": 6.424416517055655, "step": 89460}, {"loss": 0.5472, "grad_norm": 1.0447325706481934, "learning_rate": 0.0002, "epoch": 6.425134649910233, "step": 89470}, {"loss": 0.5782, "grad_norm": 1.1162594556808472, "learning_rate": 0.0002, "epoch": 6.425852782764811, "step": 89480}, {"loss": 0.5644, "grad_norm": 1.0767697095870972, "learning_rate": 0.0002, "epoch": 6.42657091561939, "step": 89490}, {"loss": 0.5828, "grad_norm": 1.2253819704055786, "learning_rate": 0.0002, "epoch": 6.427289048473968, "step": 89500}, {"loss": 0.6364, "grad_norm": 1.0623136758804321, "learning_rate": 0.0002, "epoch": 6.428007181328546, "step": 89510}, {"loss": 0.5714, "grad_norm": 1.3238742351531982, "learning_rate": 0.0002, "epoch": 6.428725314183124, "step": 89520}, {"loss": 0.5303, "grad_norm": 1.2376916408538818, "learning_rate": 0.0002, "epoch": 6.429443447037702, "step": 89530}, {"loss": 0.5281, "grad_norm": 1.197453260421753, "learning_rate": 0.0002, "epoch": 6.43016157989228, "step": 89540}, {"loss": 0.5624, "grad_norm": 1.0539700984954834, "learning_rate": 0.0002, "epoch": 6.430879712746858, "step": 89550}, {"loss": 0.5327, "grad_norm": 1.0659761428833008, "learning_rate": 0.0002, "epoch": 6.431597845601436, "step": 89560}, {"loss": 0.5295, "grad_norm": 1.0186322927474976, "learning_rate": 0.0002, "epoch": 6.432315978456014, "step": 89570}, {"loss": 0.5333, "grad_norm": 1.232337474822998, "learning_rate": 0.0002, "epoch": 6.433034111310592, "step": 89580}, {"loss": 0.559, "grad_norm": 1.1512500047683716, "learning_rate": 0.0002, "epoch": 6.433752244165171, "step": 89590}, {"loss": 0.5223, "grad_norm": 1.0068955421447754, "learning_rate": 0.0002, "epoch": 6.434470377019749, "step": 89600}, {"loss": 0.5363, "grad_norm": 1.1359424591064453, "learning_rate": 0.0002, "epoch": 6.435188509874327, "step": 89610}, {"loss": 0.553, "grad_norm": 1.4369128942489624, "learning_rate": 0.0002, "epoch": 6.435906642728905, "step": 89620}, {"loss": 0.5427, "grad_norm": 0.9382445216178894, "learning_rate": 0.0002, "epoch": 6.436624775583483, "step": 89630}, {"loss": 0.5781, "grad_norm": 0.8607977628707886, "learning_rate": 0.0002, "epoch": 6.437342908438061, "step": 89640}, {"loss": 0.5283, "grad_norm": 0.9498276114463806, "learning_rate": 0.0002, "epoch": 6.438061041292639, "step": 89650}, {"loss": 0.554, "grad_norm": 1.4109948873519897, "learning_rate": 0.0002, "epoch": 6.438779174147217, "step": 89660}, {"loss": 0.5723, "grad_norm": 1.106134295463562, "learning_rate": 0.0002, "epoch": 6.439497307001796, "step": 89670}, {"loss": 0.5782, "grad_norm": 1.128963589668274, "learning_rate": 0.0002, "epoch": 6.440215439856374, "step": 89680}, {"loss": 0.5638, "grad_norm": 1.1370604038238525, "learning_rate": 0.0002, "epoch": 6.440933572710952, "step": 89690}, {"loss": 0.5459, "grad_norm": 1.380922794342041, "learning_rate": 0.0002, "epoch": 6.44165170556553, "step": 89700}, {"loss": 0.5775, "grad_norm": 0.9597383737564087, "learning_rate": 0.0002, "epoch": 6.442369838420108, "step": 89710}, {"loss": 0.5504, "grad_norm": 1.1491756439208984, "learning_rate": 0.0002, "epoch": 6.443087971274686, "step": 89720}, {"loss": 0.5584, "grad_norm": 1.1313573122024536, "learning_rate": 0.0002, "epoch": 6.443806104129264, "step": 89730}, {"loss": 0.5743, "grad_norm": 1.1081135272979736, "learning_rate": 0.0002, "epoch": 6.444524236983842, "step": 89740}, {"loss": 0.5648, "grad_norm": 1.0297505855560303, "learning_rate": 0.0002, "epoch": 6.44524236983842, "step": 89750}, {"loss": 0.5743, "grad_norm": 1.0534520149230957, "learning_rate": 0.0002, "epoch": 6.445960502692998, "step": 89760}, {"loss": 0.5503, "grad_norm": 1.218485951423645, "learning_rate": 0.0002, "epoch": 6.446678635547577, "step": 89770}, {"loss": 0.543, "grad_norm": 0.9336987137794495, "learning_rate": 0.0002, "epoch": 6.447396768402155, "step": 89780}, {"loss": 0.5485, "grad_norm": 0.9854478240013123, "learning_rate": 0.0002, "epoch": 6.448114901256733, "step": 89790}, {"loss": 0.5718, "grad_norm": 1.1036708354949951, "learning_rate": 0.0002, "epoch": 6.448833034111311, "step": 89800}, {"loss": 0.5362, "grad_norm": 1.2220509052276611, "learning_rate": 0.0002, "epoch": 6.449551166965889, "step": 89810}, {"loss": 0.577, "grad_norm": 0.9955567121505737, "learning_rate": 0.0002, "epoch": 6.450269299820467, "step": 89820}, {"loss": 0.5458, "grad_norm": 1.0350912809371948, "learning_rate": 0.0002, "epoch": 6.450987432675045, "step": 89830}, {"loss": 0.5957, "grad_norm": 1.156080722808838, "learning_rate": 0.0002, "epoch": 6.451705565529623, "step": 89840}, {"loss": 0.588, "grad_norm": 0.8922389149665833, "learning_rate": 0.0002, "epoch": 6.452423698384201, "step": 89850}, {"loss": 0.5676, "grad_norm": 0.9318913221359253, "learning_rate": 0.0002, "epoch": 6.45314183123878, "step": 89860}, {"loss": 0.5778, "grad_norm": 0.9420756101608276, "learning_rate": 0.0002, "epoch": 6.453859964093358, "step": 89870}, {"loss": 0.5624, "grad_norm": 1.0303646326065063, "learning_rate": 0.0002, "epoch": 6.454578096947936, "step": 89880}, {"loss": 0.5304, "grad_norm": 1.070806860923767, "learning_rate": 0.0002, "epoch": 6.455296229802514, "step": 89890}, {"loss": 0.5682, "grad_norm": 0.9890686869621277, "learning_rate": 0.0002, "epoch": 6.456014362657092, "step": 89900}, {"loss": 0.5533, "grad_norm": 1.1254929304122925, "learning_rate": 0.0002, "epoch": 6.45673249551167, "step": 89910}, {"loss": 0.5717, "grad_norm": 1.0023183822631836, "learning_rate": 0.0002, "epoch": 6.457450628366248, "step": 89920}, {"loss": 0.5624, "grad_norm": 1.118721604347229, "learning_rate": 0.0002, "epoch": 6.458168761220826, "step": 89930}, {"loss": 0.5667, "grad_norm": 1.2170203924179077, "learning_rate": 0.0002, "epoch": 6.458886894075404, "step": 89940}, {"loss": 0.5523, "grad_norm": 1.0662257671356201, "learning_rate": 0.0002, "epoch": 6.459605026929982, "step": 89950}, {"loss": 0.537, "grad_norm": 0.8912546634674072, "learning_rate": 0.0002, "epoch": 6.4603231597845605, "step": 89960}, {"loss": 0.5646, "grad_norm": 1.0346225500106812, "learning_rate": 0.0002, "epoch": 6.4610412926391385, "step": 89970}, {"loss": 0.5827, "grad_norm": 1.239388346672058, "learning_rate": 0.0002, "epoch": 6.4617594254937165, "step": 89980}, {"loss": 0.5728, "grad_norm": 1.0100152492523193, "learning_rate": 0.0002, "epoch": 6.4624775583482945, "step": 89990}, {"loss": 0.5288, "grad_norm": 1.1496137380599976, "learning_rate": 0.0002, "epoch": 6.4631956912028725, "step": 90000}, {"loss": 0.5464, "grad_norm": 0.9652666449546814, "learning_rate": 0.0002, "epoch": 6.4639138240574505, "step": 90010}, {"loss": 0.5714, "grad_norm": 1.459730863571167, "learning_rate": 0.0002, "epoch": 6.4646319569120285, "step": 90020}, {"loss": 0.5684, "grad_norm": 0.9096665978431702, "learning_rate": 0.0002, "epoch": 6.4653500897666065, "step": 90030}, {"loss": 0.5784, "grad_norm": 1.1356233358383179, "learning_rate": 0.0002, "epoch": 6.4660682226211845, "step": 90040}, {"loss": 0.5605, "grad_norm": 1.0192385911941528, "learning_rate": 0.0002, "epoch": 6.466786355475763, "step": 90050}, {"loss": 0.5549, "grad_norm": 0.9494831562042236, "learning_rate": 0.0002, "epoch": 6.467504488330341, "step": 90060}, {"loss": 0.5732, "grad_norm": 0.9784388542175293, "learning_rate": 0.0002, "epoch": 6.468222621184919, "step": 90070}, {"loss": 0.5597, "grad_norm": 1.0754846334457397, "learning_rate": 0.0002, "epoch": 6.468940754039497, "step": 90080}, {"loss": 0.5571, "grad_norm": 0.9019646644592285, "learning_rate": 0.0002, "epoch": 6.469658886894075, "step": 90090}, {"loss": 0.5652, "grad_norm": 1.1848793029785156, "learning_rate": 0.0002, "epoch": 6.470377019748653, "step": 90100}, {"loss": 0.6054, "grad_norm": 1.1312837600708008, "learning_rate": 0.0002, "epoch": 6.471095152603231, "step": 90110}, {"loss": 0.5333, "grad_norm": 0.9868128299713135, "learning_rate": 0.0002, "epoch": 6.471813285457809, "step": 90120}, {"loss": 0.5627, "grad_norm": 0.894279956817627, "learning_rate": 0.0002, "epoch": 6.472531418312387, "step": 90130}, {"loss": 0.5898, "grad_norm": 1.1206544637680054, "learning_rate": 0.0002, "epoch": 6.473249551166965, "step": 90140}, {"loss": 0.6155, "grad_norm": 1.048126220703125, "learning_rate": 0.0002, "epoch": 6.473967684021544, "step": 90150}, {"loss": 0.5501, "grad_norm": 0.9624786972999573, "learning_rate": 0.0002, "epoch": 6.474685816876122, "step": 90160}, {"loss": 0.5311, "grad_norm": 1.3301671743392944, "learning_rate": 0.0002, "epoch": 6.4754039497307, "step": 90170}, {"loss": 0.5668, "grad_norm": 1.1016923189163208, "learning_rate": 0.0002, "epoch": 6.476122082585278, "step": 90180}, {"loss": 0.6371, "grad_norm": 1.084158182144165, "learning_rate": 0.0002, "epoch": 6.476840215439856, "step": 90190}, {"loss": 0.6117, "grad_norm": 1.0704890489578247, "learning_rate": 0.0002, "epoch": 6.477558348294434, "step": 90200}, {"loss": 0.5813, "grad_norm": 1.0849730968475342, "learning_rate": 0.0002, "epoch": 6.478276481149012, "step": 90210}, {"loss": 0.5624, "grad_norm": 1.0671768188476562, "learning_rate": 0.0002, "epoch": 6.47899461400359, "step": 90220}, {"loss": 0.6028, "grad_norm": 1.1208873987197876, "learning_rate": 0.0002, "epoch": 6.479712746858169, "step": 90230}, {"loss": 0.6087, "grad_norm": 1.1958850622177124, "learning_rate": 0.0002, "epoch": 6.480430879712747, "step": 90240}, {"loss": 0.5699, "grad_norm": 1.2102761268615723, "learning_rate": 0.0002, "epoch": 6.481149012567325, "step": 90250}, {"loss": 0.5859, "grad_norm": 1.0813510417938232, "learning_rate": 0.0002, "epoch": 6.481867145421903, "step": 90260}, {"loss": 0.548, "grad_norm": 0.8553891777992249, "learning_rate": 0.0002, "epoch": 6.482585278276481, "step": 90270}, {"loss": 0.6162, "grad_norm": 1.0855463743209839, "learning_rate": 0.0002, "epoch": 6.483303411131059, "step": 90280}, {"loss": 0.5456, "grad_norm": 1.1179498434066772, "learning_rate": 0.0002, "epoch": 6.484021543985637, "step": 90290}, {"loss": 0.62, "grad_norm": 1.1268035173416138, "learning_rate": 0.0002, "epoch": 6.484739676840215, "step": 90300}, {"loss": 0.5721, "grad_norm": 1.0755188465118408, "learning_rate": 0.0002, "epoch": 6.485457809694793, "step": 90310}, {"loss": 0.5267, "grad_norm": 1.0469547510147095, "learning_rate": 0.0002, "epoch": 6.486175942549371, "step": 90320}, {"loss": 0.5674, "grad_norm": 0.8739270567893982, "learning_rate": 0.0002, "epoch": 6.48689407540395, "step": 90330}, {"loss": 0.5725, "grad_norm": 1.2452377080917358, "learning_rate": 0.0002, "epoch": 6.487612208258528, "step": 90340}, {"loss": 0.6005, "grad_norm": 1.1576505899429321, "learning_rate": 0.0002, "epoch": 6.488330341113106, "step": 90350}, {"loss": 0.566, "grad_norm": 1.0247524976730347, "learning_rate": 0.0002, "epoch": 6.489048473967684, "step": 90360}, {"loss": 0.5997, "grad_norm": 1.1306205987930298, "learning_rate": 0.0002, "epoch": 6.489766606822262, "step": 90370}, {"loss": 0.5458, "grad_norm": 1.0545839071273804, "learning_rate": 0.0002, "epoch": 6.49048473967684, "step": 90380}, {"loss": 0.5779, "grad_norm": 1.281407117843628, "learning_rate": 0.0002, "epoch": 6.491202872531418, "step": 90390}, {"loss": 0.5774, "grad_norm": 1.2330801486968994, "learning_rate": 0.0002, "epoch": 6.491921005385996, "step": 90400}, {"loss": 0.5507, "grad_norm": 0.8966873288154602, "learning_rate": 0.0002, "epoch": 6.492639138240574, "step": 90410}, {"loss": 0.6008, "grad_norm": 0.9748067259788513, "learning_rate": 0.0002, "epoch": 6.493357271095153, "step": 90420}, {"loss": 0.5784, "grad_norm": 0.9285972118377686, "learning_rate": 0.0002, "epoch": 6.494075403949731, "step": 90430}, {"loss": 0.5635, "grad_norm": 1.123449444770813, "learning_rate": 0.0002, "epoch": 6.494793536804309, "step": 90440}, {"loss": 0.5686, "grad_norm": 1.4190359115600586, "learning_rate": 0.0002, "epoch": 6.495511669658887, "step": 90450}, {"loss": 0.6073, "grad_norm": 0.9877263307571411, "learning_rate": 0.0002, "epoch": 6.496229802513465, "step": 90460}, {"loss": 0.5527, "grad_norm": 0.9850174188613892, "learning_rate": 0.0002, "epoch": 6.496947935368043, "step": 90470}, {"loss": 0.5777, "grad_norm": 1.3609496355056763, "learning_rate": 0.0002, "epoch": 6.497666068222621, "step": 90480}, {"loss": 0.5405, "grad_norm": 0.8299460411071777, "learning_rate": 0.0002, "epoch": 6.498384201077199, "step": 90490}, {"loss": 0.5881, "grad_norm": 1.3359589576721191, "learning_rate": 0.0002, "epoch": 6.499102333931777, "step": 90500}, {"loss": 0.61, "grad_norm": 1.1211248636245728, "learning_rate": 0.0002, "epoch": 6.499820466786355, "step": 90510}, {"loss": 0.5582, "grad_norm": 1.1070419549942017, "learning_rate": 0.0002, "epoch": 6.500538599640934, "step": 90520}, {"loss": 0.5814, "grad_norm": 1.1590572595596313, "learning_rate": 0.0002, "epoch": 6.501256732495512, "step": 90530}, {"loss": 0.5486, "grad_norm": 0.9865858554840088, "learning_rate": 0.0002, "epoch": 6.50197486535009, "step": 90540}, {"loss": 0.6033, "grad_norm": 0.9752925634384155, "learning_rate": 0.0002, "epoch": 6.502692998204668, "step": 90550}, {"loss": 0.5409, "grad_norm": 1.2411525249481201, "learning_rate": 0.0002, "epoch": 6.503411131059246, "step": 90560}, {"loss": 0.554, "grad_norm": 1.1538971662521362, "learning_rate": 0.0002, "epoch": 6.504129263913824, "step": 90570}, {"loss": 0.584, "grad_norm": 1.2818700075149536, "learning_rate": 0.0002, "epoch": 6.504847396768402, "step": 90580}, {"loss": 0.543, "grad_norm": 1.2787950038909912, "learning_rate": 0.0002, "epoch": 6.50556552962298, "step": 90590}, {"loss": 0.5897, "grad_norm": 1.1357126235961914, "learning_rate": 0.0002, "epoch": 6.506283662477558, "step": 90600}, {"loss": 0.5506, "grad_norm": 1.0781097412109375, "learning_rate": 0.0002, "epoch": 6.507001795332137, "step": 90610}, {"loss": 0.5516, "grad_norm": 0.9754705429077148, "learning_rate": 0.0002, "epoch": 6.507719928186715, "step": 90620}, {"loss": 0.5955, "grad_norm": 1.018410563468933, "learning_rate": 0.0002, "epoch": 6.508438061041293, "step": 90630}, {"loss": 0.562, "grad_norm": 1.0382000207901, "learning_rate": 0.0002, "epoch": 6.509156193895871, "step": 90640}, {"loss": 0.5243, "grad_norm": 0.9059327840805054, "learning_rate": 0.0002, "epoch": 6.509874326750449, "step": 90650}, {"loss": 0.5628, "grad_norm": 1.2049181461334229, "learning_rate": 0.0002, "epoch": 6.510592459605027, "step": 90660}, {"loss": 0.6158, "grad_norm": 1.1005393266677856, "learning_rate": 0.0002, "epoch": 6.511310592459605, "step": 90670}, {"loss": 0.563, "grad_norm": 1.0504072904586792, "learning_rate": 0.0002, "epoch": 6.512028725314183, "step": 90680}, {"loss": 0.5792, "grad_norm": 1.2491340637207031, "learning_rate": 0.0002, "epoch": 6.512746858168761, "step": 90690}, {"loss": 0.5851, "grad_norm": 0.9971826672554016, "learning_rate": 0.0002, "epoch": 6.513464991023339, "step": 90700}, {"loss": 0.5597, "grad_norm": 1.0228981971740723, "learning_rate": 0.0002, "epoch": 6.514183123877918, "step": 90710}, {"loss": 0.5453, "grad_norm": 1.1531293392181396, "learning_rate": 0.0002, "epoch": 6.514901256732496, "step": 90720}, {"loss": 0.5501, "grad_norm": 0.9401963949203491, "learning_rate": 0.0002, "epoch": 6.515619389587074, "step": 90730}, {"loss": 0.5727, "grad_norm": 1.3876653909683228, "learning_rate": 0.0002, "epoch": 6.516337522441652, "step": 90740}, {"loss": 0.5978, "grad_norm": 1.3111445903778076, "learning_rate": 0.0002, "epoch": 6.51705565529623, "step": 90750}, {"loss": 0.6003, "grad_norm": 0.8705055713653564, "learning_rate": 0.0002, "epoch": 6.517773788150808, "step": 90760}, {"loss": 0.5418, "grad_norm": 1.213295340538025, "learning_rate": 0.0002, "epoch": 6.518491921005386, "step": 90770}, {"loss": 0.6073, "grad_norm": 1.2075343132019043, "learning_rate": 0.0002, "epoch": 6.519210053859964, "step": 90780}, {"loss": 0.6203, "grad_norm": 0.9814115166664124, "learning_rate": 0.0002, "epoch": 6.519928186714543, "step": 90790}, {"loss": 0.5708, "grad_norm": 1.0937272310256958, "learning_rate": 0.0002, "epoch": 6.520646319569121, "step": 90800}, {"loss": 0.5635, "grad_norm": 1.0839916467666626, "learning_rate": 0.0002, "epoch": 6.521364452423699, "step": 90810}, {"loss": 0.6166, "grad_norm": 1.1918399333953857, "learning_rate": 0.0002, "epoch": 6.522082585278277, "step": 90820}, {"loss": 0.5531, "grad_norm": 1.1677868366241455, "learning_rate": 0.0002, "epoch": 6.522800718132855, "step": 90830}, {"loss": 0.5826, "grad_norm": 1.0840870141983032, "learning_rate": 0.0002, "epoch": 6.523518850987433, "step": 90840}, {"loss": 0.56, "grad_norm": 1.10408353805542, "learning_rate": 0.0002, "epoch": 6.524236983842011, "step": 90850}, {"loss": 0.5729, "grad_norm": 1.056705355644226, "learning_rate": 0.0002, "epoch": 6.524955116696589, "step": 90860}, {"loss": 0.5819, "grad_norm": 1.0552406311035156, "learning_rate": 0.0002, "epoch": 6.525673249551167, "step": 90870}, {"loss": 0.5631, "grad_norm": 1.000816822052002, "learning_rate": 0.0002, "epoch": 6.526391382405745, "step": 90880}, {"loss": 0.5871, "grad_norm": 1.1465239524841309, "learning_rate": 0.0002, "epoch": 6.527109515260323, "step": 90890}, {"loss": 0.5652, "grad_norm": 0.9380449652671814, "learning_rate": 0.0002, "epoch": 6.527827648114902, "step": 90900}, {"loss": 0.5291, "grad_norm": 0.9572200179100037, "learning_rate": 0.0002, "epoch": 6.52854578096948, "step": 90910}, {"loss": 0.5819, "grad_norm": 1.0058002471923828, "learning_rate": 0.0002, "epoch": 6.529263913824058, "step": 90920}, {"loss": 0.584, "grad_norm": 1.0932626724243164, "learning_rate": 0.0002, "epoch": 6.529982046678636, "step": 90930}, {"loss": 0.5448, "grad_norm": 0.9283126592636108, "learning_rate": 0.0002, "epoch": 6.530700179533214, "step": 90940}, {"loss": 0.5916, "grad_norm": 1.1347819566726685, "learning_rate": 0.0002, "epoch": 6.531418312387792, "step": 90950}, {"loss": 0.5485, "grad_norm": 1.4964616298675537, "learning_rate": 0.0002, "epoch": 6.53213644524237, "step": 90960}, {"loss": 0.5567, "grad_norm": 1.1725877523422241, "learning_rate": 0.0002, "epoch": 6.532854578096948, "step": 90970}, {"loss": 0.6339, "grad_norm": 1.185640811920166, "learning_rate": 0.0002, "epoch": 6.5335727109515265, "step": 90980}, {"loss": 0.6021, "grad_norm": 1.0598312616348267, "learning_rate": 0.0002, "epoch": 6.5342908438061045, "step": 90990}, {"loss": 0.5666, "grad_norm": 1.389320731163025, "learning_rate": 0.0002, "epoch": 6.5350089766606825, "step": 91000}, {"loss": 0.5572, "grad_norm": 1.102960467338562, "learning_rate": 0.0002, "epoch": 6.5357271095152605, "step": 91010}, {"loss": 0.5624, "grad_norm": 1.2482284307479858, "learning_rate": 0.0002, "epoch": 6.5364452423698385, "step": 91020}, {"loss": 0.5927, "grad_norm": 1.213861346244812, "learning_rate": 0.0002, "epoch": 6.5371633752244165, "step": 91030}, {"loss": 0.5876, "grad_norm": 1.1872318983078003, "learning_rate": 0.0002, "epoch": 6.5378815080789945, "step": 91040}, {"loss": 0.5713, "grad_norm": 1.0767916440963745, "learning_rate": 0.0002, "epoch": 6.5385996409335725, "step": 91050}, {"loss": 0.5619, "grad_norm": 1.0610442161560059, "learning_rate": 0.0002, "epoch": 6.5393177737881505, "step": 91060}, {"loss": 0.5661, "grad_norm": 1.0161356925964355, "learning_rate": 0.0002, "epoch": 6.5400359066427285, "step": 91070}, {"loss": 0.5421, "grad_norm": 1.373284101486206, "learning_rate": 0.0002, "epoch": 6.540754039497307, "step": 91080}, {"loss": 0.603, "grad_norm": 1.1611387729644775, "learning_rate": 0.0002, "epoch": 6.541472172351885, "step": 91090}, {"loss": 0.5632, "grad_norm": 1.1980092525482178, "learning_rate": 0.0002, "epoch": 6.542190305206463, "step": 91100}, {"loss": 0.5313, "grad_norm": 1.1174312829971313, "learning_rate": 0.0002, "epoch": 6.542908438061041, "step": 91110}, {"loss": 0.5435, "grad_norm": 1.1376914978027344, "learning_rate": 0.0002, "epoch": 6.543626570915619, "step": 91120}, {"loss": 0.5549, "grad_norm": 1.0551620721817017, "learning_rate": 0.0002, "epoch": 6.544344703770197, "step": 91130}, {"loss": 0.5796, "grad_norm": 1.2839815616607666, "learning_rate": 0.0002, "epoch": 6.545062836624775, "step": 91140}, {"loss": 0.5267, "grad_norm": 0.7656933665275574, "learning_rate": 0.0002, "epoch": 6.545780969479353, "step": 91150}, {"loss": 0.5431, "grad_norm": 1.1079483032226562, "learning_rate": 0.0002, "epoch": 6.546499102333931, "step": 91160}, {"loss": 0.5814, "grad_norm": 1.4870734214782715, "learning_rate": 0.0002, "epoch": 6.54721723518851, "step": 91170}, {"loss": 0.5978, "grad_norm": 1.1784024238586426, "learning_rate": 0.0002, "epoch": 6.547935368043088, "step": 91180}, {"loss": 0.542, "grad_norm": 1.3510793447494507, "learning_rate": 0.0002, "epoch": 6.548653500897666, "step": 91190}, {"loss": 0.5435, "grad_norm": 1.0237789154052734, "learning_rate": 0.0002, "epoch": 6.549371633752244, "step": 91200}, {"loss": 0.5321, "grad_norm": 1.0721405744552612, "learning_rate": 0.0002, "epoch": 6.550089766606822, "step": 91210}, {"loss": 0.5234, "grad_norm": 0.9794955253601074, "learning_rate": 0.0002, "epoch": 6.5508078994614, "step": 91220}, {"loss": 0.5291, "grad_norm": 1.1046847105026245, "learning_rate": 0.0002, "epoch": 6.551526032315978, "step": 91230}, {"loss": 0.5627, "grad_norm": 0.9706982374191284, "learning_rate": 0.0002, "epoch": 6.552244165170556, "step": 91240}, {"loss": 0.5801, "grad_norm": 0.9466179609298706, "learning_rate": 0.0002, "epoch": 6.552962298025134, "step": 91250}, {"loss": 0.589, "grad_norm": 1.126806616783142, "learning_rate": 0.0002, "epoch": 6.553680430879712, "step": 91260}, {"loss": 0.5529, "grad_norm": 0.9713812470436096, "learning_rate": 0.0002, "epoch": 6.554398563734291, "step": 91270}, {"loss": 0.5654, "grad_norm": 0.8955506682395935, "learning_rate": 0.0002, "epoch": 6.555116696588869, "step": 91280}, {"loss": 0.6102, "grad_norm": 1.2066279649734497, "learning_rate": 0.0002, "epoch": 6.555834829443447, "step": 91290}, {"loss": 0.5442, "grad_norm": 0.957999587059021, "learning_rate": 0.0002, "epoch": 6.556552962298025, "step": 91300}, {"loss": 0.554, "grad_norm": 1.253709077835083, "learning_rate": 0.0002, "epoch": 6.557271095152603, "step": 91310}, {"loss": 0.5588, "grad_norm": 1.0075397491455078, "learning_rate": 0.0002, "epoch": 6.557989228007181, "step": 91320}, {"loss": 0.5265, "grad_norm": 0.9356904029846191, "learning_rate": 0.0002, "epoch": 6.558707360861759, "step": 91330}, {"loss": 0.5799, "grad_norm": 1.1555782556533813, "learning_rate": 0.0002, "epoch": 6.559425493716337, "step": 91340}, {"loss": 0.5787, "grad_norm": 0.9786396026611328, "learning_rate": 0.0002, "epoch": 6.560143626570916, "step": 91350}, {"loss": 0.5417, "grad_norm": 1.156374454498291, "learning_rate": 0.0002, "epoch": 6.560861759425494, "step": 91360}, {"loss": 0.5836, "grad_norm": 1.0572668313980103, "learning_rate": 0.0002, "epoch": 6.561579892280072, "step": 91370}, {"loss": 0.5632, "grad_norm": 1.4248497486114502, "learning_rate": 0.0002, "epoch": 6.56229802513465, "step": 91380}, {"loss": 0.5868, "grad_norm": 1.1191383600234985, "learning_rate": 0.0002, "epoch": 6.563016157989228, "step": 91390}, {"loss": 0.5919, "grad_norm": 0.9622306227684021, "learning_rate": 0.0002, "epoch": 6.563734290843806, "step": 91400}, {"loss": 0.557, "grad_norm": 1.3683338165283203, "learning_rate": 0.0002, "epoch": 6.564452423698384, "step": 91410}, {"loss": 0.5844, "grad_norm": 1.0363010168075562, "learning_rate": 0.0002, "epoch": 6.565170556552962, "step": 91420}, {"loss": 0.5718, "grad_norm": 1.2861888408660889, "learning_rate": 0.0002, "epoch": 6.56588868940754, "step": 91430}, {"loss": 0.5844, "grad_norm": 1.0330547094345093, "learning_rate": 0.0002, "epoch": 6.566606822262118, "step": 91440}, {"loss": 0.5748, "grad_norm": 1.044992446899414, "learning_rate": 0.0002, "epoch": 6.567324955116696, "step": 91450}, {"loss": 0.5853, "grad_norm": 1.0722706317901611, "learning_rate": 0.0002, "epoch": 6.568043087971275, "step": 91460}, {"loss": 0.5819, "grad_norm": 1.1327447891235352, "learning_rate": 0.0002, "epoch": 6.568761220825853, "step": 91470}, {"loss": 0.5706, "grad_norm": 1.2709840536117554, "learning_rate": 0.0002, "epoch": 6.569479353680431, "step": 91480}, {"loss": 0.5815, "grad_norm": 1.0964101552963257, "learning_rate": 0.0002, "epoch": 6.570197486535009, "step": 91490}, {"loss": 0.5556, "grad_norm": 0.9897898435592651, "learning_rate": 0.0002, "epoch": 6.570915619389587, "step": 91500}, {"loss": 0.5295, "grad_norm": 1.0143952369689941, "learning_rate": 0.0002, "epoch": 6.571633752244165, "step": 91510}, {"loss": 0.5527, "grad_norm": 0.923865020275116, "learning_rate": 0.0002, "epoch": 6.572351885098743, "step": 91520}, {"loss": 0.5749, "grad_norm": 1.144390344619751, "learning_rate": 0.0002, "epoch": 6.573070017953321, "step": 91530}, {"loss": 0.6356, "grad_norm": 1.0636180639266968, "learning_rate": 0.0002, "epoch": 6.5737881508079, "step": 91540}, {"loss": 0.5174, "grad_norm": 1.0699774026870728, "learning_rate": 0.0002, "epoch": 6.574506283662478, "step": 91550}, {"loss": 0.568, "grad_norm": 1.2139345407485962, "learning_rate": 0.0002, "epoch": 6.575224416517056, "step": 91560}, {"loss": 0.5151, "grad_norm": 1.4551644325256348, "learning_rate": 0.0002, "epoch": 6.575942549371634, "step": 91570}, {"loss": 0.5936, "grad_norm": 1.2388415336608887, "learning_rate": 0.0002, "epoch": 6.576660682226212, "step": 91580}, {"loss": 0.5711, "grad_norm": 0.9303404688835144, "learning_rate": 0.0002, "epoch": 6.57737881508079, "step": 91590}, {"loss": 0.6162, "grad_norm": 0.932905912399292, "learning_rate": 0.0002, "epoch": 6.578096947935368, "step": 91600}, {"loss": 0.5594, "grad_norm": 1.0726542472839355, "learning_rate": 0.0002, "epoch": 6.578815080789946, "step": 91610}, {"loss": 0.5879, "grad_norm": 1.138890266418457, "learning_rate": 0.0002, "epoch": 6.579533213644524, "step": 91620}, {"loss": 0.5669, "grad_norm": 1.087165355682373, "learning_rate": 0.0002, "epoch": 6.580251346499102, "step": 91630}, {"loss": 0.572, "grad_norm": 1.0526753664016724, "learning_rate": 0.0002, "epoch": 6.580969479353681, "step": 91640}, {"loss": 0.5872, "grad_norm": 1.068217158317566, "learning_rate": 0.0002, "epoch": 6.581687612208259, "step": 91650}, {"loss": 0.5817, "grad_norm": 1.09737229347229, "learning_rate": 0.0002, "epoch": 6.582405745062837, "step": 91660}, {"loss": 0.588, "grad_norm": 0.9466586112976074, "learning_rate": 0.0002, "epoch": 6.583123877917415, "step": 91670}, {"loss": 0.6083, "grad_norm": 1.2311620712280273, "learning_rate": 0.0002, "epoch": 6.583842010771993, "step": 91680}, {"loss": 0.5629, "grad_norm": 1.2385680675506592, "learning_rate": 0.0002, "epoch": 6.584560143626571, "step": 91690}, {"loss": 0.6515, "grad_norm": 0.947889506816864, "learning_rate": 0.0002, "epoch": 6.585278276481149, "step": 91700}, {"loss": 0.5928, "grad_norm": 0.9600529670715332, "learning_rate": 0.0002, "epoch": 6.585996409335727, "step": 91710}, {"loss": 0.6032, "grad_norm": 1.3595638275146484, "learning_rate": 0.0002, "epoch": 6.586714542190305, "step": 91720}, {"loss": 0.5658, "grad_norm": 1.0087260007858276, "learning_rate": 0.0002, "epoch": 6.587432675044884, "step": 91730}, {"loss": 0.558, "grad_norm": 1.0008373260498047, "learning_rate": 0.0002, "epoch": 6.588150807899462, "step": 91740}, {"loss": 0.5799, "grad_norm": 1.0367980003356934, "learning_rate": 0.0002, "epoch": 6.58886894075404, "step": 91750}, {"loss": 0.5834, "grad_norm": 1.1934503316879272, "learning_rate": 0.0002, "epoch": 6.589587073608618, "step": 91760}, {"loss": 0.5837, "grad_norm": 1.0295839309692383, "learning_rate": 0.0002, "epoch": 6.590305206463196, "step": 91770}, {"loss": 0.5663, "grad_norm": 0.926913857460022, "learning_rate": 0.0002, "epoch": 6.591023339317774, "step": 91780}, {"loss": 0.6089, "grad_norm": 1.055837631225586, "learning_rate": 0.0002, "epoch": 6.591741472172352, "step": 91790}, {"loss": 0.5597, "grad_norm": 1.006401777267456, "learning_rate": 0.0002, "epoch": 6.59245960502693, "step": 91800}, {"loss": 0.5726, "grad_norm": 1.1368589401245117, "learning_rate": 0.0002, "epoch": 6.593177737881508, "step": 91810}, {"loss": 0.5896, "grad_norm": 0.8494837880134583, "learning_rate": 0.0002, "epoch": 6.593895870736086, "step": 91820}, {"loss": 0.6145, "grad_norm": 1.3219822645187378, "learning_rate": 0.0002, "epoch": 6.594614003590665, "step": 91830}, {"loss": 0.5967, "grad_norm": 1.0583800077438354, "learning_rate": 0.0002, "epoch": 6.595332136445243, "step": 91840}, {"loss": 0.5942, "grad_norm": 1.0579098463058472, "learning_rate": 0.0002, "epoch": 6.596050269299821, "step": 91850}, {"loss": 0.5828, "grad_norm": 1.0618008375167847, "learning_rate": 0.0002, "epoch": 6.596768402154399, "step": 91860}, {"loss": 0.587, "grad_norm": 0.9425104260444641, "learning_rate": 0.0002, "epoch": 6.597486535008977, "step": 91870}, {"loss": 0.5478, "grad_norm": 0.9130632281303406, "learning_rate": 0.0002, "epoch": 6.598204667863555, "step": 91880}, {"loss": 0.5769, "grad_norm": 1.126438856124878, "learning_rate": 0.0002, "epoch": 6.598922800718133, "step": 91890}, {"loss": 0.5621, "grad_norm": 0.9135168194770813, "learning_rate": 0.0002, "epoch": 6.599640933572711, "step": 91900}, {"loss": 0.5544, "grad_norm": 1.1640992164611816, "learning_rate": 0.0002, "epoch": 6.6003590664272895, "step": 91910}, {"loss": 0.5595, "grad_norm": 1.2641936540603638, "learning_rate": 0.0002, "epoch": 6.6010771992818675, "step": 91920}, {"loss": 0.6329, "grad_norm": 1.1252738237380981, "learning_rate": 0.0002, "epoch": 6.6017953321364455, "step": 91930}, {"loss": 0.5466, "grad_norm": 1.0307750701904297, "learning_rate": 0.0002, "epoch": 6.6025134649910235, "step": 91940}, {"loss": 0.581, "grad_norm": 0.978972315788269, "learning_rate": 0.0002, "epoch": 6.6032315978456015, "step": 91950}, {"loss": 0.5485, "grad_norm": 1.1350890398025513, "learning_rate": 0.0002, "epoch": 6.6039497307001795, "step": 91960}, {"loss": 0.6263, "grad_norm": 0.9177488088607788, "learning_rate": 0.0002, "epoch": 6.6046678635547575, "step": 91970}, {"loss": 0.5833, "grad_norm": 1.0381031036376953, "learning_rate": 0.0002, "epoch": 6.6053859964093355, "step": 91980}, {"loss": 0.5793, "grad_norm": 1.1706395149230957, "learning_rate": 0.0002, "epoch": 6.6061041292639135, "step": 91990}, {"loss": 0.5899, "grad_norm": 1.1102650165557861, "learning_rate": 0.0002, "epoch": 6.6068222621184916, "step": 92000}, {"loss": 0.5712, "grad_norm": 0.9234306812286377, "learning_rate": 0.0002, "epoch": 6.6075403949730696, "step": 92010}, {"loss": 0.6152, "grad_norm": 1.2014371156692505, "learning_rate": 0.0002, "epoch": 6.608258527827648, "step": 92020}, {"loss": 0.5284, "grad_norm": 0.9392209053039551, "learning_rate": 0.0002, "epoch": 6.6089766606822264, "step": 92030}, {"loss": 0.5818, "grad_norm": 1.0882072448730469, "learning_rate": 0.0002, "epoch": 6.6096947935368044, "step": 92040}, {"loss": 0.5984, "grad_norm": 1.032155156135559, "learning_rate": 0.0002, "epoch": 6.6104129263913824, "step": 92050}, {"loss": 0.5498, "grad_norm": 0.913979172706604, "learning_rate": 0.0002, "epoch": 6.6111310592459605, "step": 92060}, {"loss": 0.5683, "grad_norm": 1.205101490020752, "learning_rate": 0.0002, "epoch": 6.6118491921005385, "step": 92070}, {"loss": 0.5816, "grad_norm": 1.0713984966278076, "learning_rate": 0.0002, "epoch": 6.6125673249551165, "step": 92080}, {"loss": 0.5729, "grad_norm": 0.9191082715988159, "learning_rate": 0.0002, "epoch": 6.6132854578096945, "step": 92090}, {"loss": 0.6036, "grad_norm": 0.9553678631782532, "learning_rate": 0.0002, "epoch": 6.614003590664273, "step": 92100}, {"loss": 0.6329, "grad_norm": 1.333262324333191, "learning_rate": 0.0002, "epoch": 6.614721723518851, "step": 92110}, {"loss": 0.5624, "grad_norm": 1.030739426612854, "learning_rate": 0.0002, "epoch": 6.615439856373429, "step": 92120}, {"loss": 0.5998, "grad_norm": 0.8777900338172913, "learning_rate": 0.0002, "epoch": 6.616157989228007, "step": 92130}, {"loss": 0.5239, "grad_norm": 1.071578860282898, "learning_rate": 0.0002, "epoch": 6.616876122082585, "step": 92140}, {"loss": 0.517, "grad_norm": 1.1931039094924927, "learning_rate": 0.0002, "epoch": 6.617594254937163, "step": 92150}, {"loss": 0.5849, "grad_norm": 1.2041425704956055, "learning_rate": 0.0002, "epoch": 6.618312387791741, "step": 92160}, {"loss": 0.5544, "grad_norm": 0.8523036241531372, "learning_rate": 0.0002, "epoch": 6.619030520646319, "step": 92170}, {"loss": 0.5857, "grad_norm": 1.1914807558059692, "learning_rate": 0.0002, "epoch": 6.619748653500897, "step": 92180}, {"loss": 0.5795, "grad_norm": 1.1336464881896973, "learning_rate": 0.0002, "epoch": 6.620466786355475, "step": 92190}, {"loss": 0.5566, "grad_norm": 1.2282923460006714, "learning_rate": 0.0002, "epoch": 6.621184919210053, "step": 92200}, {"loss": 0.5627, "grad_norm": 1.1887043714523315, "learning_rate": 0.0002, "epoch": 6.621903052064632, "step": 92210}, {"loss": 0.5739, "grad_norm": 0.9654178619384766, "learning_rate": 0.0002, "epoch": 6.62262118491921, "step": 92220}, {"loss": 0.5307, "grad_norm": 0.7957702875137329, "learning_rate": 0.0002, "epoch": 6.623339317773788, "step": 92230}, {"loss": 0.5668, "grad_norm": 0.8697461485862732, "learning_rate": 0.0002, "epoch": 6.624057450628366, "step": 92240}, {"loss": 0.5391, "grad_norm": 1.0392963886260986, "learning_rate": 0.0002, "epoch": 6.624775583482944, "step": 92250}, {"loss": 0.5867, "grad_norm": 1.1502392292022705, "learning_rate": 0.0002, "epoch": 6.625493716337522, "step": 92260}, {"loss": 0.5577, "grad_norm": 1.2818870544433594, "learning_rate": 0.0002, "epoch": 6.6262118491921, "step": 92270}, {"loss": 0.5864, "grad_norm": 0.8769828081130981, "learning_rate": 0.0002, "epoch": 6.626929982046678, "step": 92280}, {"loss": 0.5892, "grad_norm": 1.2273039817810059, "learning_rate": 0.0002, "epoch": 6.627648114901257, "step": 92290}, {"loss": 0.5568, "grad_norm": 0.8619378805160522, "learning_rate": 0.0002, "epoch": 6.628366247755835, "step": 92300}, {"loss": 0.589, "grad_norm": 0.9501098990440369, "learning_rate": 0.0002, "epoch": 6.629084380610413, "step": 92310}, {"loss": 0.6012, "grad_norm": 1.0698163509368896, "learning_rate": 0.0002, "epoch": 6.629802513464991, "step": 92320}, {"loss": 0.5766, "grad_norm": 1.0689377784729004, "learning_rate": 0.0002, "epoch": 6.630520646319569, "step": 92330}, {"loss": 0.5487, "grad_norm": 1.2086275815963745, "learning_rate": 0.0002, "epoch": 6.631238779174147, "step": 92340}, {"loss": 0.563, "grad_norm": 1.1256859302520752, "learning_rate": 0.0002, "epoch": 6.631956912028725, "step": 92350}, {"loss": 0.5542, "grad_norm": 0.9717738032341003, "learning_rate": 0.0002, "epoch": 6.632675044883303, "step": 92360}, {"loss": 0.6, "grad_norm": 0.9784330725669861, "learning_rate": 0.0002, "epoch": 6.633393177737881, "step": 92370}, {"loss": 0.5571, "grad_norm": 1.2600007057189941, "learning_rate": 0.0002, "epoch": 6.634111310592459, "step": 92380}, {"loss": 0.5852, "grad_norm": 0.889910101890564, "learning_rate": 0.0002, "epoch": 6.634829443447038, "step": 92390}, {"loss": 0.5635, "grad_norm": 1.010524868965149, "learning_rate": 0.0002, "epoch": 6.635547576301616, "step": 92400}, {"loss": 0.5806, "grad_norm": 1.325664758682251, "learning_rate": 0.0002, "epoch": 6.636265709156194, "step": 92410}, {"loss": 0.6149, "grad_norm": 1.3910914659500122, "learning_rate": 0.0002, "epoch": 6.636983842010772, "step": 92420}, {"loss": 0.5964, "grad_norm": 0.8858863115310669, "learning_rate": 0.0002, "epoch": 6.63770197486535, "step": 92430}, {"loss": 0.6007, "grad_norm": 1.1841683387756348, "learning_rate": 0.0002, "epoch": 6.638420107719928, "step": 92440}, {"loss": 0.584, "grad_norm": 1.2783559560775757, "learning_rate": 0.0002, "epoch": 6.639138240574506, "step": 92450}, {"loss": 0.5683, "grad_norm": 0.9154769778251648, "learning_rate": 0.0002, "epoch": 6.639856373429084, "step": 92460}, {"loss": 0.6238, "grad_norm": 1.003371000289917, "learning_rate": 0.0002, "epoch": 6.640574506283663, "step": 92470}, {"loss": 0.5537, "grad_norm": 0.9700522422790527, "learning_rate": 0.0002, "epoch": 6.641292639138241, "step": 92480}, {"loss": 0.5263, "grad_norm": 1.273629069328308, "learning_rate": 0.0002, "epoch": 6.642010771992819, "step": 92490}, {"loss": 0.5773, "grad_norm": 1.2746435403823853, "learning_rate": 0.0002, "epoch": 6.642728904847397, "step": 92500}, {"loss": 0.5778, "grad_norm": 1.0184870958328247, "learning_rate": 0.0002, "epoch": 6.643447037701975, "step": 92510}, {"loss": 0.5438, "grad_norm": 0.9988235831260681, "learning_rate": 0.0002, "epoch": 6.644165170556553, "step": 92520}, {"loss": 0.5275, "grad_norm": 1.075997233390808, "learning_rate": 0.0002, "epoch": 6.644883303411131, "step": 92530}, {"loss": 0.5927, "grad_norm": 1.180784821510315, "learning_rate": 0.0002, "epoch": 6.645601436265709, "step": 92540}, {"loss": 0.5641, "grad_norm": 1.0889579057693481, "learning_rate": 0.0002, "epoch": 6.646319569120287, "step": 92550}, {"loss": 0.5745, "grad_norm": 1.0069187879562378, "learning_rate": 0.0002, "epoch": 6.647037701974865, "step": 92560}, {"loss": 0.5706, "grad_norm": 1.110495686531067, "learning_rate": 0.0002, "epoch": 6.647755834829443, "step": 92570}, {"loss": 0.6124, "grad_norm": 1.0540684461593628, "learning_rate": 0.0002, "epoch": 6.648473967684022, "step": 92580}, {"loss": 0.5718, "grad_norm": 1.0917930603027344, "learning_rate": 0.0002, "epoch": 6.6491921005386, "step": 92590}, {"loss": 0.5556, "grad_norm": 1.225898027420044, "learning_rate": 0.0002, "epoch": 6.649910233393178, "step": 92600}, {"loss": 0.5663, "grad_norm": 0.9372484087944031, "learning_rate": 0.0002, "epoch": 6.650628366247756, "step": 92610}, {"loss": 0.5476, "grad_norm": 0.98685622215271, "learning_rate": 0.0002, "epoch": 6.651346499102334, "step": 92620}, {"loss": 0.6096, "grad_norm": 1.1148556470870972, "learning_rate": 0.0002, "epoch": 6.652064631956912, "step": 92630}, {"loss": 0.5371, "grad_norm": 1.1483707427978516, "learning_rate": 0.0002, "epoch": 6.65278276481149, "step": 92640}, {"loss": 0.5524, "grad_norm": 1.092708706855774, "learning_rate": 0.0002, "epoch": 6.653500897666068, "step": 92650}, {"loss": 0.5959, "grad_norm": 1.0641281604766846, "learning_rate": 0.0002, "epoch": 6.654219030520647, "step": 92660}, {"loss": 0.5478, "grad_norm": 0.9953374862670898, "learning_rate": 0.0002, "epoch": 6.654937163375225, "step": 92670}, {"loss": 0.5787, "grad_norm": 0.9792306423187256, "learning_rate": 0.0002, "epoch": 6.655655296229803, "step": 92680}, {"loss": 0.5945, "grad_norm": 1.1209690570831299, "learning_rate": 0.0002, "epoch": 6.656373429084381, "step": 92690}, {"loss": 0.5531, "grad_norm": 0.8281117677688599, "learning_rate": 0.0002, "epoch": 6.657091561938959, "step": 92700}, {"loss": 0.5315, "grad_norm": 0.9189280867576599, "learning_rate": 0.0002, "epoch": 6.657809694793537, "step": 92710}, {"loss": 0.6032, "grad_norm": 1.1859153509140015, "learning_rate": 0.0002, "epoch": 6.658527827648115, "step": 92720}, {"loss": 0.5201, "grad_norm": 0.9750476479530334, "learning_rate": 0.0002, "epoch": 6.659245960502693, "step": 92730}, {"loss": 0.5516, "grad_norm": 0.9973570704460144, "learning_rate": 0.0002, "epoch": 6.659964093357271, "step": 92740}, {"loss": 0.6042, "grad_norm": 1.0170378684997559, "learning_rate": 0.0002, "epoch": 6.660682226211849, "step": 92750}, {"loss": 0.6065, "grad_norm": 1.352283239364624, "learning_rate": 0.0002, "epoch": 6.661400359066427, "step": 92760}, {"loss": 0.5577, "grad_norm": 1.1020066738128662, "learning_rate": 0.0002, "epoch": 6.662118491921006, "step": 92770}, {"loss": 0.5748, "grad_norm": 1.0750092267990112, "learning_rate": 0.0002, "epoch": 6.662836624775584, "step": 92780}, {"loss": 0.5624, "grad_norm": 1.1006640195846558, "learning_rate": 0.0002, "epoch": 6.663554757630162, "step": 92790}, {"loss": 0.5383, "grad_norm": 1.2372384071350098, "learning_rate": 0.0002, "epoch": 6.66427289048474, "step": 92800}, {"loss": 0.5914, "grad_norm": 1.084846019744873, "learning_rate": 0.0002, "epoch": 6.664991023339318, "step": 92810}, {"loss": 0.5951, "grad_norm": 1.1738693714141846, "learning_rate": 0.0002, "epoch": 6.665709156193896, "step": 92820}, {"loss": 0.5825, "grad_norm": 1.159678339958191, "learning_rate": 0.0002, "epoch": 6.666427289048474, "step": 92830}, {"loss": 0.5483, "grad_norm": 0.9957766532897949, "learning_rate": 0.0002, "epoch": 6.667145421903052, "step": 92840}, {"loss": 0.5585, "grad_norm": 1.1403744220733643, "learning_rate": 0.0002, "epoch": 6.667863554757631, "step": 92850}, {"loss": 0.6091, "grad_norm": 1.0120519399642944, "learning_rate": 0.0002, "epoch": 6.668581687612209, "step": 92860}, {"loss": 0.5857, "grad_norm": 1.0876718759536743, "learning_rate": 0.0002, "epoch": 6.669299820466787, "step": 92870}, {"loss": 0.5876, "grad_norm": 1.175749659538269, "learning_rate": 0.0002, "epoch": 6.670017953321365, "step": 92880}, {"loss": 0.5365, "grad_norm": 0.9808473587036133, "learning_rate": 0.0002, "epoch": 6.670736086175943, "step": 92890}, {"loss": 0.578, "grad_norm": 1.121573805809021, "learning_rate": 0.0002, "epoch": 6.671454219030521, "step": 92900}, {"loss": 0.5745, "grad_norm": 0.9749727249145508, "learning_rate": 0.0002, "epoch": 6.672172351885099, "step": 92910}, {"loss": 0.588, "grad_norm": 1.0969820022583008, "learning_rate": 0.0002, "epoch": 6.672890484739677, "step": 92920}, {"loss": 0.5792, "grad_norm": 1.0777957439422607, "learning_rate": 0.0002, "epoch": 6.673608617594255, "step": 92930}, {"loss": 0.598, "grad_norm": 1.2342437505722046, "learning_rate": 0.0002, "epoch": 6.674326750448833, "step": 92940}, {"loss": 0.6069, "grad_norm": 1.18901789188385, "learning_rate": 0.0002, "epoch": 6.6750448833034115, "step": 92950}, {"loss": 0.6148, "grad_norm": 1.2212412357330322, "learning_rate": 0.0002, "epoch": 6.6757630161579895, "step": 92960}, {"loss": 0.5583, "grad_norm": 1.0007524490356445, "learning_rate": 0.0002, "epoch": 6.6764811490125675, "step": 92970}, {"loss": 0.5821, "grad_norm": 1.1012821197509766, "learning_rate": 0.0002, "epoch": 6.6771992818671455, "step": 92980}, {"loss": 0.5694, "grad_norm": 0.9446989893913269, "learning_rate": 0.0002, "epoch": 6.6779174147217235, "step": 92990}, {"loss": 0.5987, "grad_norm": 1.5307164192199707, "learning_rate": 0.0002, "epoch": 6.6786355475763015, "step": 93000}, {"loss": 0.6015, "grad_norm": 1.4290575981140137, "learning_rate": 0.0002, "epoch": 6.6793536804308795, "step": 93010}, {"loss": 0.5843, "grad_norm": 1.2367054224014282, "learning_rate": 0.0002, "epoch": 6.6800718132854575, "step": 93020}, {"loss": 0.5915, "grad_norm": 0.874568521976471, "learning_rate": 0.0002, "epoch": 6.680789946140036, "step": 93030}, {"loss": 0.5684, "grad_norm": 1.152861475944519, "learning_rate": 0.0002, "epoch": 6.681508078994614, "step": 93040}, {"loss": 0.5995, "grad_norm": 0.9524891972541809, "learning_rate": 0.0002, "epoch": 6.682226211849192, "step": 93050}, {"loss": 0.548, "grad_norm": 0.8084558844566345, "learning_rate": 0.0002, "epoch": 6.68294434470377, "step": 93060}, {"loss": 0.6002, "grad_norm": 1.1458806991577148, "learning_rate": 0.0002, "epoch": 6.683662477558348, "step": 93070}, {"loss": 0.5733, "grad_norm": 1.1427397727966309, "learning_rate": 0.0002, "epoch": 6.684380610412926, "step": 93080}, {"loss": 0.5721, "grad_norm": 1.1136237382888794, "learning_rate": 0.0002, "epoch": 6.685098743267504, "step": 93090}, {"loss": 0.5173, "grad_norm": 1.0270767211914062, "learning_rate": 0.0002, "epoch": 6.685816876122082, "step": 93100}, {"loss": 0.5594, "grad_norm": 0.9473410844802856, "learning_rate": 0.0002, "epoch": 6.68653500897666, "step": 93110}, {"loss": 0.6255, "grad_norm": 1.011011004447937, "learning_rate": 0.0002, "epoch": 6.687253141831238, "step": 93120}, {"loss": 0.5662, "grad_norm": 0.9286965131759644, "learning_rate": 0.0002, "epoch": 6.687971274685816, "step": 93130}, {"loss": 0.5729, "grad_norm": 1.226515293121338, "learning_rate": 0.0002, "epoch": 6.688689407540395, "step": 93140}, {"loss": 0.5821, "grad_norm": 0.9131909608840942, "learning_rate": 0.0002, "epoch": 6.689407540394973, "step": 93150}, {"loss": 0.5328, "grad_norm": 1.2111890316009521, "learning_rate": 0.0002, "epoch": 6.690125673249551, "step": 93160}, {"loss": 0.5939, "grad_norm": 0.9296384453773499, "learning_rate": 0.0002, "epoch": 6.690843806104129, "step": 93170}, {"loss": 0.5661, "grad_norm": 0.9636726975440979, "learning_rate": 0.0002, "epoch": 6.691561938958707, "step": 93180}, {"loss": 0.5998, "grad_norm": 1.0116214752197266, "learning_rate": 0.0002, "epoch": 6.692280071813285, "step": 93190}, {"loss": 0.5925, "grad_norm": 1.2671175003051758, "learning_rate": 0.0002, "epoch": 6.692998204667863, "step": 93200}, {"loss": 0.5982, "grad_norm": 1.0676039457321167, "learning_rate": 0.0002, "epoch": 6.693716337522441, "step": 93210}, {"loss": 0.5815, "grad_norm": 1.3277634382247925, "learning_rate": 0.0002, "epoch": 6.69443447037702, "step": 93220}, {"loss": 0.5621, "grad_norm": 0.9312936663627625, "learning_rate": 0.0002, "epoch": 6.695152603231598, "step": 93230}, {"loss": 0.5727, "grad_norm": 1.410414457321167, "learning_rate": 0.0002, "epoch": 6.695870736086176, "step": 93240}, {"loss": 0.5793, "grad_norm": 1.014519453048706, "learning_rate": 0.0002, "epoch": 6.696588868940754, "step": 93250}, {"loss": 0.5801, "grad_norm": 0.9211319088935852, "learning_rate": 0.0002, "epoch": 6.697307001795332, "step": 93260}, {"loss": 0.5472, "grad_norm": 1.1027755737304688, "learning_rate": 0.0002, "epoch": 6.69802513464991, "step": 93270}, {"loss": 0.5908, "grad_norm": 1.0538618564605713, "learning_rate": 0.0002, "epoch": 6.698743267504488, "step": 93280}, {"loss": 0.5694, "grad_norm": 1.159927248954773, "learning_rate": 0.0002, "epoch": 6.699461400359066, "step": 93290}, {"loss": 0.601, "grad_norm": 1.1329137086868286, "learning_rate": 0.0002, "epoch": 6.700179533213644, "step": 93300}, {"loss": 0.5702, "grad_norm": 0.9797694683074951, "learning_rate": 0.0002, "epoch": 6.700897666068222, "step": 93310}, {"loss": 0.6145, "grad_norm": 1.0968587398529053, "learning_rate": 0.0002, "epoch": 6.7016157989228, "step": 93320}, {"loss": 0.5737, "grad_norm": 0.9620516896247864, "learning_rate": 0.0002, "epoch": 6.702333931777379, "step": 93330}, {"loss": 0.5469, "grad_norm": 1.048879623413086, "learning_rate": 0.0002, "epoch": 6.703052064631957, "step": 93340}, {"loss": 0.5641, "grad_norm": 1.086421012878418, "learning_rate": 0.0002, "epoch": 6.703770197486535, "step": 93350}, {"loss": 0.5905, "grad_norm": 1.1045429706573486, "learning_rate": 0.0002, "epoch": 6.704488330341113, "step": 93360}, {"loss": 0.5602, "grad_norm": 1.081629991531372, "learning_rate": 0.0002, "epoch": 6.705206463195691, "step": 93370}, {"loss": 0.5644, "grad_norm": 0.9947898387908936, "learning_rate": 0.0002, "epoch": 6.705924596050269, "step": 93380}, {"loss": 0.5624, "grad_norm": 0.8837184309959412, "learning_rate": 0.0002, "epoch": 6.706642728904847, "step": 93390}, {"loss": 0.6168, "grad_norm": 1.1838666200637817, "learning_rate": 0.0002, "epoch": 6.707360861759425, "step": 93400}, {"loss": 0.5586, "grad_norm": 0.9221062064170837, "learning_rate": 0.0002, "epoch": 6.708078994614004, "step": 93410}, {"loss": 0.5481, "grad_norm": 1.0049937963485718, "learning_rate": 0.0002, "epoch": 6.708797127468582, "step": 93420}, {"loss": 0.5608, "grad_norm": 0.8895014524459839, "learning_rate": 0.0002, "epoch": 6.70951526032316, "step": 93430}, {"loss": 0.6043, "grad_norm": 1.2572799921035767, "learning_rate": 0.0002, "epoch": 6.710233393177738, "step": 93440}, {"loss": 0.5763, "grad_norm": 1.082982063293457, "learning_rate": 0.0002, "epoch": 6.710951526032316, "step": 93450}, {"loss": 0.5326, "grad_norm": 1.1520570516586304, "learning_rate": 0.0002, "epoch": 6.711669658886894, "step": 93460}, {"loss": 0.6059, "grad_norm": 1.0604512691497803, "learning_rate": 0.0002, "epoch": 6.712387791741472, "step": 93470}, {"loss": 0.5683, "grad_norm": 0.9887481331825256, "learning_rate": 0.0002, "epoch": 6.71310592459605, "step": 93480}, {"loss": 0.5741, "grad_norm": 1.0163664817810059, "learning_rate": 0.0002, "epoch": 6.713824057450628, "step": 93490}, {"loss": 0.5704, "grad_norm": 1.187687873840332, "learning_rate": 0.0002, "epoch": 6.714542190305206, "step": 93500}, {"loss": 0.5841, "grad_norm": 0.8770190477371216, "learning_rate": 0.0002, "epoch": 6.715260323159785, "step": 93510}, {"loss": 0.5758, "grad_norm": 1.1552737951278687, "learning_rate": 0.0002, "epoch": 6.715978456014363, "step": 93520}, {"loss": 0.5708, "grad_norm": 1.168770432472229, "learning_rate": 0.0002, "epoch": 6.716696588868941, "step": 93530}, {"loss": 0.5653, "grad_norm": 1.1071383953094482, "learning_rate": 0.0002, "epoch": 6.717414721723519, "step": 93540}, {"loss": 0.5813, "grad_norm": 0.8549296259880066, "learning_rate": 0.0002, "epoch": 6.718132854578097, "step": 93550}, {"loss": 0.6108, "grad_norm": 1.1576329469680786, "learning_rate": 0.0002, "epoch": 6.718850987432675, "step": 93560}, {"loss": 0.5605, "grad_norm": 1.1610777378082275, "learning_rate": 0.0002, "epoch": 6.719569120287253, "step": 93570}, {"loss": 0.6055, "grad_norm": 1.0316133499145508, "learning_rate": 0.0002, "epoch": 6.720287253141831, "step": 93580}, {"loss": 0.5889, "grad_norm": 1.1048495769500732, "learning_rate": 0.0002, "epoch": 6.721005385996409, "step": 93590}, {"loss": 0.5431, "grad_norm": 1.1212984323501587, "learning_rate": 0.0002, "epoch": 6.721723518850988, "step": 93600}, {"loss": 0.5971, "grad_norm": 1.1465938091278076, "learning_rate": 0.0002, "epoch": 6.722441651705566, "step": 93610}, {"loss": 0.5881, "grad_norm": 0.8978183269500732, "learning_rate": 0.0002, "epoch": 6.723159784560144, "step": 93620}, {"loss": 0.5292, "grad_norm": 1.0475369691848755, "learning_rate": 0.0002, "epoch": 6.723877917414722, "step": 93630}, {"loss": 0.5565, "grad_norm": 1.0717675685882568, "learning_rate": 0.0002, "epoch": 6.7245960502693, "step": 93640}, {"loss": 0.5594, "grad_norm": 1.2429792881011963, "learning_rate": 0.0002, "epoch": 6.725314183123878, "step": 93650}, {"loss": 0.5939, "grad_norm": 1.0333678722381592, "learning_rate": 0.0002, "epoch": 6.726032315978456, "step": 93660}, {"loss": 0.5264, "grad_norm": 1.211590051651001, "learning_rate": 0.0002, "epoch": 6.726750448833034, "step": 93670}, {"loss": 0.6022, "grad_norm": 1.0022165775299072, "learning_rate": 0.0002, "epoch": 6.727468581687612, "step": 93680}, {"loss": 0.5909, "grad_norm": 1.0192183256149292, "learning_rate": 0.0002, "epoch": 6.72818671454219, "step": 93690}, {"loss": 0.5283, "grad_norm": 0.9370006322860718, "learning_rate": 0.0002, "epoch": 6.728904847396769, "step": 93700}, {"loss": 0.5796, "grad_norm": 0.7869033813476562, "learning_rate": 0.0002, "epoch": 6.729622980251347, "step": 93710}, {"loss": 0.5481, "grad_norm": 0.899703860282898, "learning_rate": 0.0002, "epoch": 6.730341113105925, "step": 93720}, {"loss": 0.623, "grad_norm": 1.1216487884521484, "learning_rate": 0.0002, "epoch": 6.731059245960503, "step": 93730}, {"loss": 0.5974, "grad_norm": 0.9117740988731384, "learning_rate": 0.0002, "epoch": 6.731777378815081, "step": 93740}, {"loss": 0.6382, "grad_norm": 1.070947289466858, "learning_rate": 0.0002, "epoch": 6.732495511669659, "step": 93750}, {"loss": 0.6014, "grad_norm": 1.0529371500015259, "learning_rate": 0.0002, "epoch": 6.733213644524237, "step": 93760}, {"loss": 0.5177, "grad_norm": 0.7950748801231384, "learning_rate": 0.0002, "epoch": 6.733931777378815, "step": 93770}, {"loss": 0.6239, "grad_norm": 1.0469520092010498, "learning_rate": 0.0002, "epoch": 6.734649910233394, "step": 93780}, {"loss": 0.6177, "grad_norm": 1.4734543561935425, "learning_rate": 0.0002, "epoch": 6.735368043087972, "step": 93790}, {"loss": 0.583, "grad_norm": 0.8239574432373047, "learning_rate": 0.0002, "epoch": 6.73608617594255, "step": 93800}, {"loss": 0.557, "grad_norm": 1.1228505373001099, "learning_rate": 0.0002, "epoch": 6.736804308797128, "step": 93810}, {"loss": 0.5162, "grad_norm": 1.0902183055877686, "learning_rate": 0.0002, "epoch": 6.737522441651706, "step": 93820}, {"loss": 0.6094, "grad_norm": 1.220467209815979, "learning_rate": 0.0002, "epoch": 6.738240574506284, "step": 93830}, {"loss": 0.5963, "grad_norm": 1.199582815170288, "learning_rate": 0.0002, "epoch": 6.738958707360862, "step": 93840}, {"loss": 0.6004, "grad_norm": 1.1008597612380981, "learning_rate": 0.0002, "epoch": 6.73967684021544, "step": 93850}, {"loss": 0.5582, "grad_norm": 0.8596068620681763, "learning_rate": 0.0002, "epoch": 6.740394973070018, "step": 93860}, {"loss": 0.5661, "grad_norm": 1.220947027206421, "learning_rate": 0.0002, "epoch": 6.741113105924596, "step": 93870}, {"loss": 0.5425, "grad_norm": 1.2840452194213867, "learning_rate": 0.0002, "epoch": 6.741831238779174, "step": 93880}, {"loss": 0.5713, "grad_norm": 1.1923094987869263, "learning_rate": 0.0002, "epoch": 6.742549371633753, "step": 93890}, {"loss": 0.5523, "grad_norm": 1.1287206411361694, "learning_rate": 0.0002, "epoch": 6.743267504488331, "step": 93900}, {"loss": 0.5473, "grad_norm": 0.9465082287788391, "learning_rate": 0.0002, "epoch": 6.743985637342909, "step": 93910}, {"loss": 0.5795, "grad_norm": 0.9888480305671692, "learning_rate": 0.0002, "epoch": 6.744703770197487, "step": 93920}, {"loss": 0.5968, "grad_norm": 1.1438485383987427, "learning_rate": 0.0002, "epoch": 6.745421903052065, "step": 93930}, {"loss": 0.5711, "grad_norm": 0.8203039169311523, "learning_rate": 0.0002, "epoch": 6.746140035906643, "step": 93940}, {"loss": 0.5787, "grad_norm": 1.217855453491211, "learning_rate": 0.0002, "epoch": 6.746858168761221, "step": 93950}, {"loss": 0.5488, "grad_norm": 1.245977520942688, "learning_rate": 0.0002, "epoch": 6.747576301615799, "step": 93960}, {"loss": 0.5849, "grad_norm": 1.240097165107727, "learning_rate": 0.0002, "epoch": 6.7482944344703775, "step": 93970}, {"loss": 0.5717, "grad_norm": 0.9436663389205933, "learning_rate": 0.0002, "epoch": 6.7490125673249555, "step": 93980}, {"loss": 0.5717, "grad_norm": 0.9331963062286377, "learning_rate": 0.0002, "epoch": 6.7497307001795335, "step": 93990}, {"loss": 0.5777, "grad_norm": 0.9809562563896179, "learning_rate": 0.0002, "epoch": 6.7504488330341115, "step": 94000}, {"loss": 0.6237, "grad_norm": 1.1596009731292725, "learning_rate": 0.0002, "epoch": 6.7511669658886895, "step": 94010}, {"loss": 0.61, "grad_norm": 1.082684874534607, "learning_rate": 0.0002, "epoch": 6.7518850987432675, "step": 94020}, {"loss": 0.6285, "grad_norm": 0.9931458234786987, "learning_rate": 0.0002, "epoch": 6.7526032315978455, "step": 94030}, {"loss": 0.5606, "grad_norm": 0.8717518448829651, "learning_rate": 0.0002, "epoch": 6.7533213644524235, "step": 94040}, {"loss": 0.5504, "grad_norm": 0.9379602074623108, "learning_rate": 0.0002, "epoch": 6.7540394973070015, "step": 94050}, {"loss": 0.5942, "grad_norm": 0.8819605708122253, "learning_rate": 0.0002, "epoch": 6.7547576301615795, "step": 94060}, {"loss": 0.5989, "grad_norm": 1.111547589302063, "learning_rate": 0.0002, "epoch": 6.755475763016158, "step": 94070}, {"loss": 0.5898, "grad_norm": 1.0755881071090698, "learning_rate": 0.0002, "epoch": 6.756193895870736, "step": 94080}, {"loss": 0.5494, "grad_norm": 1.0734093189239502, "learning_rate": 0.0002, "epoch": 6.756912028725314, "step": 94090}, {"loss": 0.5979, "grad_norm": 1.0390300750732422, "learning_rate": 0.0002, "epoch": 6.757630161579892, "step": 94100}, {"loss": 0.5478, "grad_norm": 0.9557124972343445, "learning_rate": 0.0002, "epoch": 6.75834829443447, "step": 94110}, {"loss": 0.5613, "grad_norm": 1.0970680713653564, "learning_rate": 0.0002, "epoch": 6.759066427289048, "step": 94120}, {"loss": 0.5828, "grad_norm": 1.0715644359588623, "learning_rate": 0.0002, "epoch": 6.759784560143626, "step": 94130}, {"loss": 0.5424, "grad_norm": 1.1311662197113037, "learning_rate": 0.0002, "epoch": 6.760502692998204, "step": 94140}, {"loss": 0.6033, "grad_norm": 0.9891370534896851, "learning_rate": 0.0002, "epoch": 6.761220825852782, "step": 94150}, {"loss": 0.577, "grad_norm": 0.9472686648368835, "learning_rate": 0.0002, "epoch": 6.761938958707361, "step": 94160}, {"loss": 0.5935, "grad_norm": 1.1044381856918335, "learning_rate": 0.0002, "epoch": 6.762657091561939, "step": 94170}, {"loss": 0.6254, "grad_norm": 1.2088780403137207, "learning_rate": 0.0002, "epoch": 6.763375224416517, "step": 94180}, {"loss": 0.554, "grad_norm": 0.9210726618766785, "learning_rate": 0.0002, "epoch": 6.764093357271095, "step": 94190}, {"loss": 0.54, "grad_norm": 1.0969771146774292, "learning_rate": 0.0002, "epoch": 6.764811490125673, "step": 94200}, {"loss": 0.5414, "grad_norm": 1.1030265092849731, "learning_rate": 0.0002, "epoch": 6.765529622980251, "step": 94210}, {"loss": 0.5973, "grad_norm": 0.9451745748519897, "learning_rate": 0.0002, "epoch": 6.766247755834829, "step": 94220}, {"loss": 0.616, "grad_norm": 1.0216296911239624, "learning_rate": 0.0002, "epoch": 6.766965888689407, "step": 94230}, {"loss": 0.5402, "grad_norm": 1.4021092653274536, "learning_rate": 0.0002, "epoch": 6.767684021543985, "step": 94240}, {"loss": 0.5991, "grad_norm": 1.2341269254684448, "learning_rate": 0.0002, "epoch": 6.768402154398563, "step": 94250}, {"loss": 0.5743, "grad_norm": 1.1086686849594116, "learning_rate": 0.0002, "epoch": 6.769120287253142, "step": 94260}, {"loss": 0.551, "grad_norm": 0.8565682172775269, "learning_rate": 0.0002, "epoch": 6.76983842010772, "step": 94270}, {"loss": 0.6026, "grad_norm": 0.9314411878585815, "learning_rate": 0.0002, "epoch": 6.770556552962298, "step": 94280}, {"loss": 0.5972, "grad_norm": 1.0592315196990967, "learning_rate": 0.0002, "epoch": 6.771274685816876, "step": 94290}, {"loss": 0.5947, "grad_norm": 1.086379885673523, "learning_rate": 0.0002, "epoch": 6.771992818671454, "step": 94300}, {"loss": 0.5484, "grad_norm": 1.13401198387146, "learning_rate": 0.0002, "epoch": 6.772710951526032, "step": 94310}, {"loss": 0.5738, "grad_norm": 1.0137985944747925, "learning_rate": 0.0002, "epoch": 6.77342908438061, "step": 94320}, {"loss": 0.5972, "grad_norm": 1.0459709167480469, "learning_rate": 0.0002, "epoch": 6.774147217235188, "step": 94330}, {"loss": 0.6279, "grad_norm": 1.2213165760040283, "learning_rate": 0.0002, "epoch": 6.774865350089767, "step": 94340}, {"loss": 0.5522, "grad_norm": 1.099478006362915, "learning_rate": 0.0002, "epoch": 6.775583482944345, "step": 94350}, {"loss": 0.5694, "grad_norm": 1.124526858329773, "learning_rate": 0.0002, "epoch": 6.776301615798923, "step": 94360}, {"loss": 0.6393, "grad_norm": 1.0199998617172241, "learning_rate": 0.0002, "epoch": 6.777019748653501, "step": 94370}, {"loss": 0.5662, "grad_norm": 1.1849408149719238, "learning_rate": 0.0002, "epoch": 6.777737881508079, "step": 94380}, {"loss": 0.5856, "grad_norm": 1.2265552282333374, "learning_rate": 0.0002, "epoch": 6.778456014362657, "step": 94390}, {"loss": 0.5817, "grad_norm": 0.7576864361763, "learning_rate": 0.0002, "epoch": 6.779174147217235, "step": 94400}, {"loss": 0.5495, "grad_norm": 0.8172970414161682, "learning_rate": 0.0002, "epoch": 6.779892280071813, "step": 94410}, {"loss": 0.5902, "grad_norm": 1.1105220317840576, "learning_rate": 0.0002, "epoch": 6.780610412926391, "step": 94420}, {"loss": 0.5918, "grad_norm": 1.0542421340942383, "learning_rate": 0.0002, "epoch": 6.781328545780969, "step": 94430}, {"loss": 0.5911, "grad_norm": 1.0088121891021729, "learning_rate": 0.0002, "epoch": 6.782046678635547, "step": 94440}, {"loss": 0.5866, "grad_norm": 0.9872488379478455, "learning_rate": 0.0002, "epoch": 6.782764811490126, "step": 94450}, {"loss": 0.5524, "grad_norm": 1.2545148134231567, "learning_rate": 0.0002, "epoch": 6.783482944344704, "step": 94460}, {"loss": 0.5365, "grad_norm": 0.8847712278366089, "learning_rate": 0.0002, "epoch": 6.784201077199282, "step": 94470}, {"loss": 0.5999, "grad_norm": 0.7758765816688538, "learning_rate": 0.0002, "epoch": 6.78491921005386, "step": 94480}, {"loss": 0.5654, "grad_norm": 1.0454037189483643, "learning_rate": 0.0002, "epoch": 6.785637342908438, "step": 94490}, {"loss": 0.5943, "grad_norm": 1.1336725950241089, "learning_rate": 0.0002, "epoch": 6.786355475763016, "step": 94500}, {"loss": 0.6091, "grad_norm": 1.081356406211853, "learning_rate": 0.0002, "epoch": 6.787073608617594, "step": 94510}, {"loss": 0.5634, "grad_norm": 1.126288890838623, "learning_rate": 0.0002, "epoch": 6.787791741472172, "step": 94520}, {"loss": 0.5771, "grad_norm": 1.1156792640686035, "learning_rate": 0.0002, "epoch": 6.788509874326751, "step": 94530}, {"loss": 0.599, "grad_norm": 1.0243451595306396, "learning_rate": 0.0002, "epoch": 6.789228007181329, "step": 94540}, {"loss": 0.5949, "grad_norm": 0.9778338670730591, "learning_rate": 0.0002, "epoch": 6.789946140035907, "step": 94550}, {"loss": 0.6, "grad_norm": 0.9668094515800476, "learning_rate": 0.0002, "epoch": 6.790664272890485, "step": 94560}, {"loss": 0.6285, "grad_norm": 1.121848464012146, "learning_rate": 0.0002, "epoch": 6.791382405745063, "step": 94570}, {"loss": 0.5878, "grad_norm": 1.105825662612915, "learning_rate": 0.0002, "epoch": 6.792100538599641, "step": 94580}, {"loss": 0.5478, "grad_norm": 1.1236833333969116, "learning_rate": 0.0002, "epoch": 6.792818671454219, "step": 94590}, {"loss": 0.5854, "grad_norm": 1.0655126571655273, "learning_rate": 0.0002, "epoch": 6.793536804308797, "step": 94600}, {"loss": 0.5271, "grad_norm": 0.9249289631843567, "learning_rate": 0.0002, "epoch": 6.794254937163375, "step": 94610}, {"loss": 0.5767, "grad_norm": 1.0177690982818604, "learning_rate": 0.0002, "epoch": 6.794973070017953, "step": 94620}, {"loss": 0.6323, "grad_norm": 1.1961153745651245, "learning_rate": 0.0002, "epoch": 6.795691202872531, "step": 94630}, {"loss": 0.5623, "grad_norm": 1.0987505912780762, "learning_rate": 0.0002, "epoch": 6.79640933572711, "step": 94640}, {"loss": 0.5672, "grad_norm": 1.0165259838104248, "learning_rate": 0.0002, "epoch": 6.797127468581688, "step": 94650}, {"loss": 0.5777, "grad_norm": 1.1336601972579956, "learning_rate": 0.0002, "epoch": 6.797845601436266, "step": 94660}, {"loss": 0.6252, "grad_norm": 1.0786010026931763, "learning_rate": 0.0002, "epoch": 6.798563734290844, "step": 94670}, {"loss": 0.5755, "grad_norm": 1.2896602153778076, "learning_rate": 0.0002, "epoch": 6.799281867145422, "step": 94680}, {"loss": 0.5858, "grad_norm": 1.0934168100357056, "learning_rate": 0.0002, "epoch": 6.8, "step": 94690}, {"loss": 0.5381, "grad_norm": 1.1080414056777954, "learning_rate": 0.0002, "epoch": 6.800718132854578, "step": 94700}, {"loss": 0.5896, "grad_norm": 1.1141704320907593, "learning_rate": 0.0002, "epoch": 6.801436265709156, "step": 94710}, {"loss": 0.5487, "grad_norm": 0.9571144580841064, "learning_rate": 0.0002, "epoch": 6.802154398563735, "step": 94720}, {"loss": 0.5487, "grad_norm": 0.8907591700553894, "learning_rate": 0.0002, "epoch": 6.802872531418313, "step": 94730}, {"loss": 0.5551, "grad_norm": 1.0547759532928467, "learning_rate": 0.0002, "epoch": 6.803590664272891, "step": 94740}, {"loss": 0.5799, "grad_norm": 0.973573625087738, "learning_rate": 0.0002, "epoch": 6.804308797127469, "step": 94750}, {"loss": 0.6073, "grad_norm": 0.7889130711555481, "learning_rate": 0.0002, "epoch": 6.805026929982047, "step": 94760}, {"loss": 0.6004, "grad_norm": 0.9414647221565247, "learning_rate": 0.0002, "epoch": 6.805745062836625, "step": 94770}, {"loss": 0.5533, "grad_norm": 0.9452534317970276, "learning_rate": 0.0002, "epoch": 6.806463195691203, "step": 94780}, {"loss": 0.5379, "grad_norm": 1.2215145826339722, "learning_rate": 0.0002, "epoch": 6.807181328545781, "step": 94790}, {"loss": 0.6045, "grad_norm": 1.116302490234375, "learning_rate": 0.0002, "epoch": 6.807899461400359, "step": 94800}, {"loss": 0.5595, "grad_norm": 0.850916862487793, "learning_rate": 0.0002, "epoch": 6.808617594254937, "step": 94810}, {"loss": 0.5411, "grad_norm": 0.8699719905853271, "learning_rate": 0.0002, "epoch": 6.809335727109516, "step": 94820}, {"loss": 0.5334, "grad_norm": 1.0958143472671509, "learning_rate": 0.0002, "epoch": 6.810053859964094, "step": 94830}, {"loss": 0.5687, "grad_norm": 1.128580927848816, "learning_rate": 0.0002, "epoch": 6.810771992818672, "step": 94840}, {"loss": 0.5622, "grad_norm": 0.9490674138069153, "learning_rate": 0.0002, "epoch": 6.81149012567325, "step": 94850}, {"loss": 0.5779, "grad_norm": 0.9294022917747498, "learning_rate": 0.0002, "epoch": 6.812208258527828, "step": 94860}, {"loss": 0.5738, "grad_norm": 1.048378348350525, "learning_rate": 0.0002, "epoch": 6.812926391382406, "step": 94870}, {"loss": 0.5634, "grad_norm": 1.1972805261611938, "learning_rate": 0.0002, "epoch": 6.813644524236984, "step": 94880}, {"loss": 0.5732, "grad_norm": 0.7709503769874573, "learning_rate": 0.0002, "epoch": 6.814362657091562, "step": 94890}, {"loss": 0.5854, "grad_norm": 1.0244873762130737, "learning_rate": 0.0002, "epoch": 6.8150807899461405, "step": 94900}, {"loss": 0.581, "grad_norm": 1.0576984882354736, "learning_rate": 0.0002, "epoch": 6.8157989228007185, "step": 94910}, {"loss": 0.5812, "grad_norm": 1.3478775024414062, "learning_rate": 0.0002, "epoch": 6.8165170556552965, "step": 94920}, {"loss": 0.597, "grad_norm": 0.982311487197876, "learning_rate": 0.0002, "epoch": 6.8172351885098745, "step": 94930}, {"loss": 0.5703, "grad_norm": 1.1846535205841064, "learning_rate": 0.0002, "epoch": 6.8179533213644525, "step": 94940}, {"loss": 0.578, "grad_norm": 0.9255896210670471, "learning_rate": 0.0002, "epoch": 6.8186714542190305, "step": 94950}, {"loss": 0.5255, "grad_norm": 0.9418646693229675, "learning_rate": 0.0002, "epoch": 6.8193895870736085, "step": 94960}, {"loss": 0.6163, "grad_norm": 1.189335584640503, "learning_rate": 0.0002, "epoch": 6.8201077199281865, "step": 94970}, {"loss": 0.5646, "grad_norm": 1.1003406047821045, "learning_rate": 0.0002, "epoch": 6.8208258527827645, "step": 94980}, {"loss": 0.5677, "grad_norm": 0.9203724265098572, "learning_rate": 0.0002, "epoch": 6.8215439856373425, "step": 94990}, {"loss": 0.5862, "grad_norm": 1.093252182006836, "learning_rate": 0.0002, "epoch": 6.8222621184919205, "step": 95000}, {"loss": 0.6286, "grad_norm": 1.2737812995910645, "learning_rate": 0.0002, "epoch": 6.822980251346499, "step": 95010}, {"loss": 0.5726, "grad_norm": 1.1859848499298096, "learning_rate": 0.0002, "epoch": 6.823698384201077, "step": 95020}, {"loss": 0.5936, "grad_norm": 0.9591164588928223, "learning_rate": 0.0002, "epoch": 6.824416517055655, "step": 95030}, {"loss": 0.5401, "grad_norm": 1.0144239664077759, "learning_rate": 0.0002, "epoch": 6.825134649910233, "step": 95040}, {"loss": 0.6106, "grad_norm": 1.2520356178283691, "learning_rate": 0.0002, "epoch": 6.825852782764811, "step": 95050}, {"loss": 0.6206, "grad_norm": 1.003438115119934, "learning_rate": 0.0002, "epoch": 6.8265709156193894, "step": 95060}, {"loss": 0.5507, "grad_norm": 0.9512312412261963, "learning_rate": 0.0002, "epoch": 6.8272890484739674, "step": 95070}, {"loss": 0.5874, "grad_norm": 0.9984938502311707, "learning_rate": 0.0002, "epoch": 6.8280071813285454, "step": 95080}, {"loss": 0.5654, "grad_norm": 0.9630827307701111, "learning_rate": 0.0002, "epoch": 6.828725314183124, "step": 95090}, {"loss": 0.5749, "grad_norm": 0.8859394192695618, "learning_rate": 0.0002, "epoch": 6.829443447037702, "step": 95100}, {"loss": 0.5888, "grad_norm": 0.9082155227661133, "learning_rate": 0.0002, "epoch": 6.83016157989228, "step": 95110}, {"loss": 0.5773, "grad_norm": 1.0707300901412964, "learning_rate": 0.0002, "epoch": 6.830879712746858, "step": 95120}, {"loss": 0.5663, "grad_norm": 1.2023502588272095, "learning_rate": 0.0002, "epoch": 6.831597845601436, "step": 95130}, {"loss": 0.5843, "grad_norm": 1.0189216136932373, "learning_rate": 0.0002, "epoch": 6.832315978456014, "step": 95140}, {"loss": 0.5881, "grad_norm": 1.1216851472854614, "learning_rate": 0.0002, "epoch": 6.833034111310592, "step": 95150}, {"loss": 0.5852, "grad_norm": 1.124589204788208, "learning_rate": 0.0002, "epoch": 6.83375224416517, "step": 95160}, {"loss": 0.5374, "grad_norm": 1.1183217763900757, "learning_rate": 0.0002, "epoch": 6.834470377019748, "step": 95170}, {"loss": 0.6106, "grad_norm": 1.0307188034057617, "learning_rate": 0.0002, "epoch": 6.835188509874326, "step": 95180}, {"loss": 0.5978, "grad_norm": 1.2438706159591675, "learning_rate": 0.0002, "epoch": 6.835906642728904, "step": 95190}, {"loss": 0.5935, "grad_norm": 1.117887258529663, "learning_rate": 0.0002, "epoch": 6.836624775583483, "step": 95200}, {"loss": 0.5965, "grad_norm": 0.8934445381164551, "learning_rate": 0.0002, "epoch": 6.837342908438061, "step": 95210}, {"loss": 0.5384, "grad_norm": 1.097379207611084, "learning_rate": 0.0002, "epoch": 6.838061041292639, "step": 95220}, {"loss": 0.5792, "grad_norm": 1.1034258604049683, "learning_rate": 0.0002, "epoch": 6.838779174147217, "step": 95230}, {"loss": 0.5846, "grad_norm": 1.052120327949524, "learning_rate": 0.0002, "epoch": 6.839497307001795, "step": 95240}, {"loss": 0.5812, "grad_norm": 1.0844687223434448, "learning_rate": 0.0002, "epoch": 6.840215439856373, "step": 95250}, {"loss": 0.5746, "grad_norm": 1.1553566455841064, "learning_rate": 0.0002, "epoch": 6.840933572710951, "step": 95260}, {"loss": 0.5881, "grad_norm": 1.1977533102035522, "learning_rate": 0.0002, "epoch": 6.841651705565529, "step": 95270}, {"loss": 0.5562, "grad_norm": 0.9635998010635376, "learning_rate": 0.0002, "epoch": 6.842369838420108, "step": 95280}, {"loss": 0.6043, "grad_norm": 1.0867844820022583, "learning_rate": 0.0002, "epoch": 6.843087971274686, "step": 95290}, {"loss": 0.618, "grad_norm": 1.1252882480621338, "learning_rate": 0.0002, "epoch": 6.843806104129264, "step": 95300}, {"loss": 0.5468, "grad_norm": 1.1130266189575195, "learning_rate": 0.0002, "epoch": 6.844524236983842, "step": 95310}, {"loss": 0.6368, "grad_norm": 1.058863878250122, "learning_rate": 0.0002, "epoch": 6.84524236983842, "step": 95320}, {"loss": 0.6138, "grad_norm": 1.173840880393982, "learning_rate": 0.0002, "epoch": 6.845960502692998, "step": 95330}, {"loss": 0.5904, "grad_norm": 1.09446120262146, "learning_rate": 0.0002, "epoch": 6.846678635547576, "step": 95340}, {"loss": 0.5658, "grad_norm": 1.0762465000152588, "learning_rate": 0.0002, "epoch": 6.847396768402154, "step": 95350}, {"loss": 0.5601, "grad_norm": 1.0056897401809692, "learning_rate": 0.0002, "epoch": 6.848114901256732, "step": 95360}, {"loss": 0.6129, "grad_norm": 0.929190456867218, "learning_rate": 0.0002, "epoch": 6.84883303411131, "step": 95370}, {"loss": 0.5996, "grad_norm": 1.1152058839797974, "learning_rate": 0.0002, "epoch": 6.849551166965889, "step": 95380}, {"loss": 0.5939, "grad_norm": 1.0163987874984741, "learning_rate": 0.0002, "epoch": 6.850269299820467, "step": 95390}, {"loss": 0.56, "grad_norm": 1.1169452667236328, "learning_rate": 0.0002, "epoch": 6.850987432675045, "step": 95400}, {"loss": 0.5376, "grad_norm": 1.2225226163864136, "learning_rate": 0.0002, "epoch": 6.851705565529623, "step": 95410}, {"loss": 0.5937, "grad_norm": 1.0833172798156738, "learning_rate": 0.0002, "epoch": 6.852423698384201, "step": 95420}, {"loss": 0.5551, "grad_norm": 1.0159578323364258, "learning_rate": 0.0002, "epoch": 6.853141831238779, "step": 95430}, {"loss": 0.5599, "grad_norm": 1.1164990663528442, "learning_rate": 0.0002, "epoch": 6.853859964093357, "step": 95440}, {"loss": 0.6329, "grad_norm": 1.1340656280517578, "learning_rate": 0.0002, "epoch": 6.854578096947935, "step": 95450}, {"loss": 0.5686, "grad_norm": 1.1228697299957275, "learning_rate": 0.0002, "epoch": 6.855296229802514, "step": 95460}, {"loss": 0.6323, "grad_norm": 1.0189276933670044, "learning_rate": 0.0002, "epoch": 6.856014362657092, "step": 95470}, {"loss": 0.5366, "grad_norm": 1.1692779064178467, "learning_rate": 0.0002, "epoch": 6.85673249551167, "step": 95480}, {"loss": 0.5634, "grad_norm": 1.0779703855514526, "learning_rate": 0.0002, "epoch": 6.857450628366248, "step": 95490}, {"loss": 0.6031, "grad_norm": 1.0127906799316406, "learning_rate": 0.0002, "epoch": 6.858168761220826, "step": 95500}, {"loss": 0.5264, "grad_norm": 1.2124756574630737, "learning_rate": 0.0002, "epoch": 6.858886894075404, "step": 95510}, {"loss": 0.6361, "grad_norm": 1.0948219299316406, "learning_rate": 0.0002, "epoch": 6.859605026929982, "step": 95520}, {"loss": 0.5874, "grad_norm": 0.8796268701553345, "learning_rate": 0.0002, "epoch": 6.86032315978456, "step": 95530}, {"loss": 0.5824, "grad_norm": 1.0725175142288208, "learning_rate": 0.0002, "epoch": 6.861041292639138, "step": 95540}, {"loss": 0.5748, "grad_norm": 0.9067171812057495, "learning_rate": 0.0002, "epoch": 6.861759425493716, "step": 95550}, {"loss": 0.5882, "grad_norm": 1.0576670169830322, "learning_rate": 0.0002, "epoch": 6.862477558348294, "step": 95560}, {"loss": 0.5742, "grad_norm": 0.9622264504432678, "learning_rate": 0.0002, "epoch": 6.863195691202873, "step": 95570}, {"loss": 0.5824, "grad_norm": 1.0197248458862305, "learning_rate": 0.0002, "epoch": 6.863913824057451, "step": 95580}, {"loss": 0.5842, "grad_norm": 0.9197335243225098, "learning_rate": 0.0002, "epoch": 6.864631956912029, "step": 95590}, {"loss": 0.5768, "grad_norm": 1.0169627666473389, "learning_rate": 0.0002, "epoch": 6.865350089766607, "step": 95600}, {"loss": 0.5475, "grad_norm": 0.9868543744087219, "learning_rate": 0.0002, "epoch": 6.866068222621185, "step": 95610}, {"loss": 0.5702, "grad_norm": 0.9861942529678345, "learning_rate": 0.0002, "epoch": 6.866786355475763, "step": 95620}, {"loss": 0.5753, "grad_norm": 1.0906847715377808, "learning_rate": 0.0002, "epoch": 6.867504488330341, "step": 95630}, {"loss": 0.5492, "grad_norm": 1.2462674379348755, "learning_rate": 0.0002, "epoch": 6.868222621184919, "step": 95640}, {"loss": 0.5849, "grad_norm": 0.9801536202430725, "learning_rate": 0.0002, "epoch": 6.868940754039498, "step": 95650}, {"loss": 0.5849, "grad_norm": 1.0568761825561523, "learning_rate": 0.0002, "epoch": 6.869658886894076, "step": 95660}, {"loss": 0.5467, "grad_norm": 0.8431015014648438, "learning_rate": 0.0002, "epoch": 6.870377019748654, "step": 95670}, {"loss": 0.5887, "grad_norm": 1.2253447771072388, "learning_rate": 0.0002, "epoch": 6.871095152603232, "step": 95680}, {"loss": 0.594, "grad_norm": 0.8862479329109192, "learning_rate": 0.0002, "epoch": 6.87181328545781, "step": 95690}, {"loss": 0.6266, "grad_norm": 1.0733704566955566, "learning_rate": 0.0002, "epoch": 6.872531418312388, "step": 95700}, {"loss": 0.5816, "grad_norm": 0.9327288269996643, "learning_rate": 0.0002, "epoch": 6.873249551166966, "step": 95710}, {"loss": 0.5686, "grad_norm": 0.9877831339836121, "learning_rate": 0.0002, "epoch": 6.873967684021544, "step": 95720}, {"loss": 0.5423, "grad_norm": 0.9772239327430725, "learning_rate": 0.0002, "epoch": 6.874685816876122, "step": 95730}, {"loss": 0.5942, "grad_norm": 0.9799681901931763, "learning_rate": 0.0002, "epoch": 6.8754039497307, "step": 95740}, {"loss": 0.5667, "grad_norm": 1.0650758743286133, "learning_rate": 0.0002, "epoch": 6.876122082585278, "step": 95750}, {"loss": 0.5787, "grad_norm": 1.068557858467102, "learning_rate": 0.0002, "epoch": 6.876840215439857, "step": 95760}, {"loss": 0.5863, "grad_norm": 1.1335437297821045, "learning_rate": 0.0002, "epoch": 6.877558348294435, "step": 95770}, {"loss": 0.5496, "grad_norm": 0.8993158936500549, "learning_rate": 0.0002, "epoch": 6.878276481149013, "step": 95780}, {"loss": 0.5581, "grad_norm": 1.0593502521514893, "learning_rate": 0.0002, "epoch": 6.878994614003591, "step": 95790}, {"loss": 0.5691, "grad_norm": 1.2181397676467896, "learning_rate": 0.0002, "epoch": 6.879712746858169, "step": 95800}, {"loss": 0.5762, "grad_norm": 0.9614198207855225, "learning_rate": 0.0002, "epoch": 6.880430879712747, "step": 95810}, {"loss": 0.5893, "grad_norm": 1.021591067314148, "learning_rate": 0.0002, "epoch": 6.881149012567325, "step": 95820}, {"loss": 0.6063, "grad_norm": 1.3752840757369995, "learning_rate": 0.0002, "epoch": 6.881867145421903, "step": 95830}, {"loss": 0.5758, "grad_norm": 1.236355185508728, "learning_rate": 0.0002, "epoch": 6.882585278276482, "step": 95840}, {"loss": 0.5714, "grad_norm": 1.1957523822784424, "learning_rate": 0.0002, "epoch": 6.88330341113106, "step": 95850}, {"loss": 0.5738, "grad_norm": 0.8793587684631348, "learning_rate": 0.0002, "epoch": 6.884021543985638, "step": 95860}, {"loss": 0.6482, "grad_norm": 1.202054738998413, "learning_rate": 0.0002, "epoch": 6.884739676840216, "step": 95870}, {"loss": 0.5713, "grad_norm": 0.8061116337776184, "learning_rate": 0.0002, "epoch": 6.885457809694794, "step": 95880}, {"loss": 0.6138, "grad_norm": 1.0037956237792969, "learning_rate": 0.0002, "epoch": 6.886175942549372, "step": 95890}, {"loss": 0.5756, "grad_norm": 1.006435751914978, "learning_rate": 0.0002, "epoch": 6.88689407540395, "step": 95900}, {"loss": 0.6145, "grad_norm": 1.141200304031372, "learning_rate": 0.0002, "epoch": 6.887612208258528, "step": 95910}, {"loss": 0.6168, "grad_norm": 0.9017927050590515, "learning_rate": 0.0002, "epoch": 6.888330341113106, "step": 95920}, {"loss": 0.5843, "grad_norm": 0.9288154244422913, "learning_rate": 0.0002, "epoch": 6.889048473967684, "step": 95930}, {"loss": 0.564, "grad_norm": 1.2263801097869873, "learning_rate": 0.0002, "epoch": 6.8897666068222625, "step": 95940}, {"loss": 0.5884, "grad_norm": 1.2005410194396973, "learning_rate": 0.0002, "epoch": 6.8904847396768405, "step": 95950}, {"loss": 0.5625, "grad_norm": 1.0801531076431274, "learning_rate": 0.0002, "epoch": 6.8912028725314185, "step": 95960}, {"loss": 0.5671, "grad_norm": 1.1115456819534302, "learning_rate": 0.0002, "epoch": 6.8919210053859965, "step": 95970}, {"loss": 0.5774, "grad_norm": 1.062920093536377, "learning_rate": 0.0002, "epoch": 6.8926391382405745, "step": 95980}, {"loss": 0.5542, "grad_norm": 0.9343897700309753, "learning_rate": 0.0002, "epoch": 6.8933572710951525, "step": 95990}, {"loss": 0.5774, "grad_norm": 1.0236390829086304, "learning_rate": 0.0002, "epoch": 6.8940754039497305, "step": 96000}, {"loss": 0.6062, "grad_norm": 1.0680996179580688, "learning_rate": 0.0002, "epoch": 6.8947935368043085, "step": 96010}, {"loss": 0.563, "grad_norm": 1.1796760559082031, "learning_rate": 0.0002, "epoch": 6.8955116696588865, "step": 96020}, {"loss": 0.5401, "grad_norm": 0.9805570840835571, "learning_rate": 0.0002, "epoch": 6.896229802513465, "step": 96030}, {"loss": 0.5848, "grad_norm": 1.245386004447937, "learning_rate": 0.0002, "epoch": 6.896947935368043, "step": 96040}, {"loss": 0.578, "grad_norm": 1.0306174755096436, "learning_rate": 0.0002, "epoch": 6.897666068222621, "step": 96050}, {"loss": 0.6198, "grad_norm": 1.0599836111068726, "learning_rate": 0.0002, "epoch": 6.898384201077199, "step": 96060}, {"loss": 0.6029, "grad_norm": 1.1438795328140259, "learning_rate": 0.0002, "epoch": 6.899102333931777, "step": 96070}, {"loss": 0.5611, "grad_norm": 0.9044751524925232, "learning_rate": 0.0002, "epoch": 6.899820466786355, "step": 96080}, {"loss": 0.5623, "grad_norm": 0.9689591526985168, "learning_rate": 0.0002, "epoch": 6.900538599640933, "step": 96090}, {"loss": 0.5645, "grad_norm": 1.003217339515686, "learning_rate": 0.0002, "epoch": 6.901256732495511, "step": 96100}, {"loss": 0.5999, "grad_norm": 1.1630250215530396, "learning_rate": 0.0002, "epoch": 6.901974865350089, "step": 96110}, {"loss": 0.5661, "grad_norm": 1.0304425954818726, "learning_rate": 0.0002, "epoch": 6.902692998204667, "step": 96120}, {"loss": 0.5584, "grad_norm": 1.0148587226867676, "learning_rate": 0.0002, "epoch": 6.903411131059246, "step": 96130}, {"loss": 0.6235, "grad_norm": 1.3722255229949951, "learning_rate": 0.0002, "epoch": 6.904129263913824, "step": 96140}, {"loss": 0.6124, "grad_norm": 1.1518549919128418, "learning_rate": 0.0002, "epoch": 6.904847396768402, "step": 96150}, {"loss": 0.5388, "grad_norm": 1.0342949628829956, "learning_rate": 0.0002, "epoch": 6.90556552962298, "step": 96160}, {"loss": 0.5691, "grad_norm": 1.0178996324539185, "learning_rate": 0.0002, "epoch": 6.906283662477558, "step": 96170}, {"loss": 0.6578, "grad_norm": 1.3429099321365356, "learning_rate": 0.0002, "epoch": 6.907001795332136, "step": 96180}, {"loss": 0.5263, "grad_norm": 1.2281367778778076, "learning_rate": 0.0002, "epoch": 6.907719928186714, "step": 96190}, {"loss": 0.6072, "grad_norm": 0.8190469145774841, "learning_rate": 0.0002, "epoch": 6.908438061041292, "step": 96200}, {"loss": 0.5929, "grad_norm": 1.1344635486602783, "learning_rate": 0.0002, "epoch": 6.909156193895871, "step": 96210}, {"loss": 0.5793, "grad_norm": 1.0540097951889038, "learning_rate": 0.0002, "epoch": 6.909874326750449, "step": 96220}, {"loss": 0.5575, "grad_norm": 1.044974446296692, "learning_rate": 0.0002, "epoch": 6.910592459605027, "step": 96230}, {"loss": 0.5782, "grad_norm": 0.6890087723731995, "learning_rate": 0.0002, "epoch": 6.911310592459605, "step": 96240}, {"loss": 0.5615, "grad_norm": 1.1266905069351196, "learning_rate": 0.0002, "epoch": 6.912028725314183, "step": 96250}, {"loss": 0.5922, "grad_norm": 1.3173121213912964, "learning_rate": 0.0002, "epoch": 6.912746858168761, "step": 96260}, {"loss": 0.5336, "grad_norm": 1.0043895244598389, "learning_rate": 0.0002, "epoch": 6.913464991023339, "step": 96270}, {"loss": 0.5642, "grad_norm": 1.0634605884552002, "learning_rate": 0.0002, "epoch": 6.914183123877917, "step": 96280}, {"loss": 0.5241, "grad_norm": 1.234516978263855, "learning_rate": 0.0002, "epoch": 6.914901256732495, "step": 96290}, {"loss": 0.5791, "grad_norm": 1.042026162147522, "learning_rate": 0.0002, "epoch": 6.915619389587073, "step": 96300}, {"loss": 0.5396, "grad_norm": 1.063632845878601, "learning_rate": 0.0002, "epoch": 6.916337522441651, "step": 96310}, {"loss": 0.6265, "grad_norm": 1.0733225345611572, "learning_rate": 0.0002, "epoch": 6.91705565529623, "step": 96320}, {"loss": 0.6003, "grad_norm": 1.4382662773132324, "learning_rate": 0.0002, "epoch": 6.917773788150808, "step": 96330}, {"loss": 0.5732, "grad_norm": 1.19964599609375, "learning_rate": 0.0002, "epoch": 6.918491921005386, "step": 96340}, {"loss": 0.6177, "grad_norm": 0.9012235403060913, "learning_rate": 0.0002, "epoch": 6.919210053859964, "step": 96350}, {"loss": 0.6113, "grad_norm": 0.8663099408149719, "learning_rate": 0.0002, "epoch": 6.919928186714542, "step": 96360}, {"loss": 0.5164, "grad_norm": 0.8944193124771118, "learning_rate": 0.0002, "epoch": 6.92064631956912, "step": 96370}, {"loss": 0.5556, "grad_norm": 1.1201437711715698, "learning_rate": 0.0002, "epoch": 6.921364452423698, "step": 96380}, {"loss": 0.6219, "grad_norm": 1.0434664487838745, "learning_rate": 0.0002, "epoch": 6.922082585278276, "step": 96390}, {"loss": 0.5978, "grad_norm": 1.2666915655136108, "learning_rate": 0.0002, "epoch": 6.922800718132855, "step": 96400}, {"loss": 0.6231, "grad_norm": 0.9610332250595093, "learning_rate": 0.0002, "epoch": 6.923518850987433, "step": 96410}, {"loss": 0.5657, "grad_norm": 1.1521750688552856, "learning_rate": 0.0002, "epoch": 6.924236983842011, "step": 96420}, {"loss": 0.5682, "grad_norm": 0.921970546245575, "learning_rate": 0.0002, "epoch": 6.924955116696589, "step": 96430}, {"loss": 0.5761, "grad_norm": 1.1277226209640503, "learning_rate": 0.0002, "epoch": 6.925673249551167, "step": 96440}, {"loss": 0.5978, "grad_norm": 1.147425889968872, "learning_rate": 0.0002, "epoch": 6.926391382405745, "step": 96450}, {"loss": 0.6032, "grad_norm": 1.0128270387649536, "learning_rate": 0.0002, "epoch": 6.927109515260323, "step": 96460}, {"loss": 0.5747, "grad_norm": 1.0726343393325806, "learning_rate": 0.0002, "epoch": 6.927827648114901, "step": 96470}, {"loss": 0.6005, "grad_norm": 0.9902656078338623, "learning_rate": 0.0002, "epoch": 6.928545780969479, "step": 96480}, {"loss": 0.5477, "grad_norm": 0.9662004709243774, "learning_rate": 0.0002, "epoch": 6.929263913824057, "step": 96490}, {"loss": 0.5871, "grad_norm": 0.9595714807510376, "learning_rate": 0.0002, "epoch": 6.929982046678636, "step": 96500}, {"loss": 0.6144, "grad_norm": 1.0666614770889282, "learning_rate": 0.0002, "epoch": 6.930700179533214, "step": 96510}, {"loss": 0.5752, "grad_norm": 0.8744403123855591, "learning_rate": 0.0002, "epoch": 6.931418312387792, "step": 96520}, {"loss": 0.6124, "grad_norm": 1.0382628440856934, "learning_rate": 0.0002, "epoch": 6.93213644524237, "step": 96530}, {"loss": 0.5445, "grad_norm": 0.9165884256362915, "learning_rate": 0.0002, "epoch": 6.932854578096948, "step": 96540}, {"loss": 0.5936, "grad_norm": 0.9073842763900757, "learning_rate": 0.0002, "epoch": 6.933572710951526, "step": 96550}, {"loss": 0.5934, "grad_norm": 1.100635051727295, "learning_rate": 0.0002, "epoch": 6.934290843806104, "step": 96560}, {"loss": 0.5869, "grad_norm": 1.1503266096115112, "learning_rate": 0.0002, "epoch": 6.935008976660682, "step": 96570}, {"loss": 0.571, "grad_norm": 0.9526805281639099, "learning_rate": 0.0002, "epoch": 6.93572710951526, "step": 96580}, {"loss": 0.5959, "grad_norm": 1.115716814994812, "learning_rate": 0.0002, "epoch": 6.936445242369839, "step": 96590}, {"loss": 0.6071, "grad_norm": 1.0669193267822266, "learning_rate": 0.0002, "epoch": 6.937163375224417, "step": 96600}, {"loss": 0.6151, "grad_norm": 1.0191189050674438, "learning_rate": 0.0002, "epoch": 6.937881508078995, "step": 96610}, {"loss": 0.5803, "grad_norm": 1.1885946989059448, "learning_rate": 0.0002, "epoch": 6.938599640933573, "step": 96620}, {"loss": 0.5476, "grad_norm": 0.9806031584739685, "learning_rate": 0.0002, "epoch": 6.939317773788151, "step": 96630}, {"loss": 0.5994, "grad_norm": 0.9700000286102295, "learning_rate": 0.0002, "epoch": 6.940035906642729, "step": 96640}, {"loss": 0.5627, "grad_norm": 1.0870105028152466, "learning_rate": 0.0002, "epoch": 6.940754039497307, "step": 96650}, {"loss": 0.6031, "grad_norm": 0.7441867589950562, "learning_rate": 0.0002, "epoch": 6.941472172351885, "step": 96660}, {"loss": 0.5989, "grad_norm": 0.8631957173347473, "learning_rate": 0.0002, "epoch": 6.942190305206463, "step": 96670}, {"loss": 0.6299, "grad_norm": 1.0538444519042969, "learning_rate": 0.0002, "epoch": 6.942908438061041, "step": 96680}, {"loss": 0.5858, "grad_norm": 1.0235437154769897, "learning_rate": 0.0002, "epoch": 6.94362657091562, "step": 96690}, {"loss": 0.5978, "grad_norm": 1.069114089012146, "learning_rate": 0.0002, "epoch": 6.944344703770198, "step": 96700}, {"loss": 0.5613, "grad_norm": 1.0421861410140991, "learning_rate": 0.0002, "epoch": 6.945062836624776, "step": 96710}, {"loss": 0.5694, "grad_norm": 0.9244136810302734, "learning_rate": 0.0002, "epoch": 6.945780969479354, "step": 96720}, {"loss": 0.6043, "grad_norm": 0.962041437625885, "learning_rate": 0.0002, "epoch": 6.946499102333932, "step": 96730}, {"loss": 0.5968, "grad_norm": 1.049677848815918, "learning_rate": 0.0002, "epoch": 6.94721723518851, "step": 96740}, {"loss": 0.5989, "grad_norm": 1.0276710987091064, "learning_rate": 0.0002, "epoch": 6.947935368043088, "step": 96750}, {"loss": 0.5799, "grad_norm": 1.036650538444519, "learning_rate": 0.0002, "epoch": 6.948653500897666, "step": 96760}, {"loss": 0.5631, "grad_norm": 1.0379945039749146, "learning_rate": 0.0002, "epoch": 6.949371633752245, "step": 96770}, {"loss": 0.5439, "grad_norm": 0.9768070578575134, "learning_rate": 0.0002, "epoch": 6.950089766606823, "step": 96780}, {"loss": 0.5646, "grad_norm": 1.0515118837356567, "learning_rate": 0.0002, "epoch": 6.950807899461401, "step": 96790}, {"loss": 0.5513, "grad_norm": 0.9186223149299622, "learning_rate": 0.0002, "epoch": 6.951526032315979, "step": 96800}, {"loss": 0.6109, "grad_norm": 1.0430902242660522, "learning_rate": 0.0002, "epoch": 6.952244165170557, "step": 96810}, {"loss": 0.5823, "grad_norm": 0.7750678658485413, "learning_rate": 0.0002, "epoch": 6.952962298025135, "step": 96820}, {"loss": 0.6031, "grad_norm": 1.1721138954162598, "learning_rate": 0.0002, "epoch": 6.953680430879713, "step": 96830}, {"loss": 0.5527, "grad_norm": 1.2088165283203125, "learning_rate": 0.0002, "epoch": 6.954398563734291, "step": 96840}, {"loss": 0.5768, "grad_norm": 0.9956802129745483, "learning_rate": 0.0002, "epoch": 6.955116696588869, "step": 96850}, {"loss": 0.6052, "grad_norm": 1.0444421768188477, "learning_rate": 0.0002, "epoch": 6.955834829443447, "step": 96860}, {"loss": 0.5615, "grad_norm": 1.2420955896377563, "learning_rate": 0.0002, "epoch": 6.956552962298025, "step": 96870}, {"loss": 0.5377, "grad_norm": 1.0187203884124756, "learning_rate": 0.0002, "epoch": 6.957271095152604, "step": 96880}, {"loss": 0.5683, "grad_norm": 1.0883756875991821, "learning_rate": 0.0002, "epoch": 6.957989228007182, "step": 96890}, {"loss": 0.5406, "grad_norm": 1.1869568824768066, "learning_rate": 0.0002, "epoch": 6.95870736086176, "step": 96900}, {"loss": 0.5901, "grad_norm": 1.242119312286377, "learning_rate": 0.0002, "epoch": 6.959425493716338, "step": 96910}, {"loss": 0.5901, "grad_norm": 1.0262869596481323, "learning_rate": 0.0002, "epoch": 6.960143626570916, "step": 96920}, {"loss": 0.5633, "grad_norm": 0.9577149152755737, "learning_rate": 0.0002, "epoch": 6.960861759425494, "step": 96930}, {"loss": 0.5805, "grad_norm": 0.9224622249603271, "learning_rate": 0.0002, "epoch": 6.961579892280072, "step": 96940}, {"loss": 0.6157, "grad_norm": 1.0761854648590088, "learning_rate": 0.0002, "epoch": 6.96229802513465, "step": 96950}, {"loss": 0.6142, "grad_norm": 1.1029279232025146, "learning_rate": 0.0002, "epoch": 6.9630161579892285, "step": 96960}, {"loss": 0.5857, "grad_norm": 1.1132091283798218, "learning_rate": 0.0002, "epoch": 6.9637342908438065, "step": 96970}, {"loss": 0.5777, "grad_norm": 0.9723706245422363, "learning_rate": 0.0002, "epoch": 6.9644524236983845, "step": 96980}, {"loss": 0.5966, "grad_norm": 1.0453037023544312, "learning_rate": 0.0002, "epoch": 6.9651705565529625, "step": 96990}, {"loss": 0.5808, "grad_norm": 1.16423499584198, "learning_rate": 0.0002, "epoch": 6.9658886894075405, "step": 97000}, {"loss": 0.5734, "grad_norm": 1.1522771120071411, "learning_rate": 0.0002, "epoch": 6.9666068222621185, "step": 97010}, {"loss": 0.6009, "grad_norm": 1.020828127861023, "learning_rate": 0.0002, "epoch": 6.9673249551166965, "step": 97020}, {"loss": 0.6043, "grad_norm": 1.0301889181137085, "learning_rate": 0.0002, "epoch": 6.9680430879712745, "step": 97030}, {"loss": 0.6041, "grad_norm": 1.0615862607955933, "learning_rate": 0.0002, "epoch": 6.9687612208258525, "step": 97040}, {"loss": 0.5875, "grad_norm": 1.1750848293304443, "learning_rate": 0.0002, "epoch": 6.9694793536804305, "step": 97050}, {"loss": 0.5812, "grad_norm": 0.916283905506134, "learning_rate": 0.0002, "epoch": 6.9701974865350085, "step": 97060}, {"loss": 0.6158, "grad_norm": 1.0715203285217285, "learning_rate": 0.0002, "epoch": 6.970915619389587, "step": 97070}, {"loss": 0.6152, "grad_norm": 1.1171340942382812, "learning_rate": 0.0002, "epoch": 6.971633752244165, "step": 97080}, {"loss": 0.6361, "grad_norm": 0.886015772819519, "learning_rate": 0.0002, "epoch": 6.972351885098743, "step": 97090}, {"loss": 0.5934, "grad_norm": 0.9498746991157532, "learning_rate": 0.0002, "epoch": 6.973070017953321, "step": 97100}, {"loss": 0.5951, "grad_norm": 1.1563011407852173, "learning_rate": 0.0002, "epoch": 6.973788150807899, "step": 97110}, {"loss": 0.5966, "grad_norm": 0.9086321592330933, "learning_rate": 0.0002, "epoch": 6.974506283662477, "step": 97120}, {"loss": 0.6268, "grad_norm": 0.9804864525794983, "learning_rate": 0.0002, "epoch": 6.975224416517055, "step": 97130}, {"loss": 0.5282, "grad_norm": 1.5005993843078613, "learning_rate": 0.0002, "epoch": 6.975942549371633, "step": 97140}, {"loss": 0.5446, "grad_norm": 1.1720819473266602, "learning_rate": 0.0002, "epoch": 6.976660682226212, "step": 97150}, {"loss": 0.5325, "grad_norm": 1.095572590827942, "learning_rate": 0.0002, "epoch": 6.97737881508079, "step": 97160}, {"loss": 0.5721, "grad_norm": 1.1880861520767212, "learning_rate": 0.0002, "epoch": 6.978096947935368, "step": 97170}, {"loss": 0.5611, "grad_norm": 1.0959832668304443, "learning_rate": 0.0002, "epoch": 6.978815080789946, "step": 97180}, {"loss": 0.5834, "grad_norm": 1.2158745527267456, "learning_rate": 0.0002, "epoch": 6.979533213644524, "step": 97190}, {"loss": 0.5937, "grad_norm": 1.0073821544647217, "learning_rate": 0.0002, "epoch": 6.980251346499102, "step": 97200}, {"loss": 0.6035, "grad_norm": 0.8503464460372925, "learning_rate": 0.0002, "epoch": 6.98096947935368, "step": 97210}, {"loss": 0.651, "grad_norm": 0.9399861097335815, "learning_rate": 0.0002, "epoch": 6.981687612208258, "step": 97220}, {"loss": 0.6135, "grad_norm": 1.1167447566986084, "learning_rate": 0.0002, "epoch": 6.982405745062836, "step": 97230}, {"loss": 0.5575, "grad_norm": 1.2710384130477905, "learning_rate": 0.0002, "epoch": 6.983123877917414, "step": 97240}, {"loss": 0.5905, "grad_norm": 0.8514767289161682, "learning_rate": 0.0002, "epoch": 6.983842010771993, "step": 97250}, {"loss": 0.5932, "grad_norm": 0.9983348846435547, "learning_rate": 0.0002, "epoch": 6.984560143626571, "step": 97260}, {"loss": 0.5975, "grad_norm": 1.1713277101516724, "learning_rate": 0.0002, "epoch": 6.985278276481149, "step": 97270}, {"loss": 0.5297, "grad_norm": 1.346272349357605, "learning_rate": 0.0002, "epoch": 6.985996409335727, "step": 97280}, {"loss": 0.5847, "grad_norm": 1.0687556266784668, "learning_rate": 0.0002, "epoch": 6.986714542190305, "step": 97290}, {"loss": 0.5938, "grad_norm": 1.035805106163025, "learning_rate": 0.0002, "epoch": 6.987432675044883, "step": 97300}, {"loss": 0.5907, "grad_norm": 1.149027705192566, "learning_rate": 0.0002, "epoch": 6.988150807899461, "step": 97310}, {"loss": 0.5534, "grad_norm": 0.9672921895980835, "learning_rate": 0.0002, "epoch": 6.988868940754039, "step": 97320}, {"loss": 0.552, "grad_norm": 1.0306763648986816, "learning_rate": 0.0002, "epoch": 6.989587073608618, "step": 97330}, {"loss": 0.5705, "grad_norm": 1.1457809209823608, "learning_rate": 0.0002, "epoch": 6.990305206463196, "step": 97340}, {"loss": 0.5767, "grad_norm": 0.9718224406242371, "learning_rate": 0.0002, "epoch": 6.991023339317774, "step": 97350}, {"loss": 0.571, "grad_norm": 0.9872630834579468, "learning_rate": 0.0002, "epoch": 6.991741472172352, "step": 97360}, {"loss": 0.611, "grad_norm": 1.0302132368087769, "learning_rate": 0.0002, "epoch": 6.99245960502693, "step": 97370}, {"loss": 0.6, "grad_norm": 1.001103162765503, "learning_rate": 0.0002, "epoch": 6.993177737881508, "step": 97380}, {"loss": 0.5612, "grad_norm": 0.9207047820091248, "learning_rate": 0.0002, "epoch": 6.993895870736086, "step": 97390}, {"loss": 0.5752, "grad_norm": 1.1986219882965088, "learning_rate": 0.0002, "epoch": 6.994614003590664, "step": 97400}, {"loss": 0.5938, "grad_norm": 1.343885064125061, "learning_rate": 0.0002, "epoch": 6.995332136445242, "step": 97410}, {"loss": 0.5869, "grad_norm": 1.0611628293991089, "learning_rate": 0.0002, "epoch": 6.99605026929982, "step": 97420}, {"loss": 0.6378, "grad_norm": 0.9514605402946472, "learning_rate": 0.0002, "epoch": 6.996768402154398, "step": 97430}, {"loss": 0.5726, "grad_norm": 1.0259917974472046, "learning_rate": 0.0002, "epoch": 6.997486535008977, "step": 97440}, {"loss": 0.5762, "grad_norm": 1.0735033750534058, "learning_rate": 0.0002, "epoch": 6.998204667863555, "step": 97450}, {"loss": 0.6173, "grad_norm": 1.053984522819519, "learning_rate": 0.0002, "epoch": 6.998922800718133, "step": 97460}, {"loss": 0.581, "grad_norm": 1.0285807847976685, "learning_rate": 0.0002, "epoch": 6.999640933572711, "step": 97470}]} +{"epoch": 8.0, "step": 111400, "epoch_duration": 14997.547347307205, "total_accumulated_duration": 123763.49267339706, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-115177-sd-42/checkpoint-27850", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.5816, "grad_norm": 1.0291756391525269, "learning_rate": 0.0002, "epoch": 0.000718132854578097, "step": 10}, {"loss": 1.1527, "grad_norm": 0.6570823192596436, "learning_rate": 0.0002, "epoch": 0.001436265709156194, "step": 20}, {"loss": 1.0014, "grad_norm": 0.693844199180603, "learning_rate": 0.0002, "epoch": 0.0021543985637342907, "step": 30}, {"loss": 0.9377, "grad_norm": 0.5608532428741455, "learning_rate": 0.0002, "epoch": 0.002872531418312388, "step": 40}, {"loss": 0.9533, "grad_norm": 0.549075722694397, "learning_rate": 0.0002, "epoch": 0.003590664272890485, "step": 50}, {"loss": 0.9164, "grad_norm": 0.47189879417419434, "learning_rate": 0.0002, "epoch": 0.004308797127468581, "step": 60}, {"loss": 0.8898, "grad_norm": 0.5799676775932312, "learning_rate": 0.0002, "epoch": 0.005026929982046679, "step": 70}, {"loss": 0.859, "grad_norm": 0.45907193422317505, "learning_rate": 0.0002, "epoch": 0.005745062836624776, "step": 80}, {"loss": 0.8697, "grad_norm": 0.4373045861721039, "learning_rate": 0.0002, "epoch": 0.006463195691202872, "step": 90}, {"loss": 0.8879, "grad_norm": 0.5636304020881653, "learning_rate": 0.0002, "epoch": 0.00718132854578097, "step": 100}, {"loss": 0.8397, "grad_norm": 0.5248253345489502, "learning_rate": 0.0002, "epoch": 0.007899461400359067, "step": 110}, {"loss": 0.9021, "grad_norm": 0.5082874298095703, "learning_rate": 0.0002, "epoch": 0.008617594254937163, "step": 120}, {"loss": 0.8678, "grad_norm": 0.42670881748199463, "learning_rate": 0.0002, "epoch": 0.00933572710951526, "step": 130}, {"loss": 0.7847, "grad_norm": 0.43311649560928345, "learning_rate": 0.0002, "epoch": 0.010053859964093357, "step": 140}, {"loss": 0.9252, "grad_norm": 0.43456509709358215, "learning_rate": 0.0002, "epoch": 0.010771992818671455, "step": 150}, {"loss": 0.8812, "grad_norm": 0.9222815632820129, "learning_rate": 0.0002, "epoch": 0.011490125673249552, "step": 160}, {"loss": 0.8651, "grad_norm": 0.42752256989479065, "learning_rate": 0.0002, "epoch": 0.012208258527827648, "step": 170}, {"loss": 0.8898, "grad_norm": 0.4175542891025543, "learning_rate": 0.0002, "epoch": 0.012926391382405745, "step": 180}, {"loss": 0.8519, "grad_norm": 0.4377831518650055, "learning_rate": 0.0002, "epoch": 0.013644524236983842, "step": 190}, {"loss": 0.8849, "grad_norm": 0.47263655066490173, "learning_rate": 0.0002, "epoch": 0.01436265709156194, "step": 200}, {"loss": 0.8764, "grad_norm": 0.3870520293712616, "learning_rate": 0.0002, "epoch": 0.015080789946140035, "step": 210}, {"loss": 0.833, "grad_norm": 0.4950464963912964, "learning_rate": 0.0002, "epoch": 0.015798922800718134, "step": 220}, {"loss": 0.8323, "grad_norm": 0.4643295407295227, "learning_rate": 0.0002, "epoch": 0.01651705565529623, "step": 230}, {"loss": 0.8363, "grad_norm": 0.5152903199195862, "learning_rate": 0.0002, "epoch": 0.017235188509874325, "step": 240}, {"loss": 0.873, "grad_norm": 0.3800727427005768, "learning_rate": 0.0002, "epoch": 0.017953321364452424, "step": 250}, {"loss": 0.8252, "grad_norm": 0.43700528144836426, "learning_rate": 0.0002, "epoch": 0.01867145421903052, "step": 260}, {"loss": 0.8686, "grad_norm": 0.3712887763977051, "learning_rate": 0.0002, "epoch": 0.01938958707360862, "step": 270}, {"loss": 0.8329, "grad_norm": 0.4202553629875183, "learning_rate": 0.0002, "epoch": 0.020107719928186715, "step": 280}, {"loss": 0.8143, "grad_norm": 0.40585094690322876, "learning_rate": 0.0002, "epoch": 0.02082585278276481, "step": 290}, {"loss": 0.8463, "grad_norm": 0.4685470759868622, "learning_rate": 0.0002, "epoch": 0.02154398563734291, "step": 300}, {"loss": 0.8321, "grad_norm": 0.373169481754303, "learning_rate": 0.0002, "epoch": 0.022262118491921005, "step": 310}, {"loss": 0.8031, "grad_norm": 0.39681482315063477, "learning_rate": 0.0002, "epoch": 0.022980251346499104, "step": 320}, {"loss": 0.8667, "grad_norm": 0.3919322192668915, "learning_rate": 0.0002, "epoch": 0.0236983842010772, "step": 330}, {"loss": 0.8196, "grad_norm": 0.4728981554508209, "learning_rate": 0.0002, "epoch": 0.024416517055655295, "step": 340}, {"loss": 0.8662, "grad_norm": 0.42439374327659607, "learning_rate": 0.0002, "epoch": 0.025134649910233394, "step": 350}, {"loss": 0.8618, "grad_norm": 0.425650030374527, "learning_rate": 0.0002, "epoch": 0.02585278276481149, "step": 360}, {"loss": 0.8249, "grad_norm": 0.4076762795448303, "learning_rate": 0.0002, "epoch": 0.02657091561938959, "step": 370}, {"loss": 0.8293, "grad_norm": 0.44335922598838806, "learning_rate": 0.0002, "epoch": 0.027289048473967684, "step": 380}, {"loss": 0.8288, "grad_norm": 0.5313619375228882, "learning_rate": 0.0002, "epoch": 0.02800718132854578, "step": 390}, {"loss": 0.8431, "grad_norm": 0.37089797854423523, "learning_rate": 0.0002, "epoch": 0.02872531418312388, "step": 400}, {"loss": 0.7644, "grad_norm": 0.5193604826927185, "learning_rate": 0.0002, "epoch": 0.029443447037701975, "step": 410}, {"loss": 0.7853, "grad_norm": 0.4428552985191345, "learning_rate": 0.0002, "epoch": 0.03016157989228007, "step": 420}, {"loss": 0.8641, "grad_norm": 0.384171724319458, "learning_rate": 0.0002, "epoch": 0.03087971274685817, "step": 430}, {"loss": 0.8236, "grad_norm": 0.3906913101673126, "learning_rate": 0.0002, "epoch": 0.03159784560143627, "step": 440}, {"loss": 0.8215, "grad_norm": 0.5365669131278992, "learning_rate": 0.0002, "epoch": 0.03231597845601436, "step": 450}, {"loss": 0.8376, "grad_norm": 0.4785287380218506, "learning_rate": 0.0002, "epoch": 0.03303411131059246, "step": 460}, {"loss": 0.8439, "grad_norm": 0.40048182010650635, "learning_rate": 0.0002, "epoch": 0.03375224416517056, "step": 470}, {"loss": 0.8306, "grad_norm": 0.49529239535331726, "learning_rate": 0.0002, "epoch": 0.03447037701974865, "step": 480}, {"loss": 0.8653, "grad_norm": 0.5853474140167236, "learning_rate": 0.0002, "epoch": 0.03518850987432675, "step": 490}, {"loss": 0.7952, "grad_norm": 0.3802863359451294, "learning_rate": 0.0002, "epoch": 0.03590664272890485, "step": 500}, {"loss": 0.8986, "grad_norm": 0.40374308824539185, "learning_rate": 0.0002, "epoch": 0.03662477558348295, "step": 510}, {"loss": 0.8495, "grad_norm": 0.4320009648799896, "learning_rate": 0.0002, "epoch": 0.03734290843806104, "step": 520}, {"loss": 0.8838, "grad_norm": 0.5198846459388733, "learning_rate": 0.0002, "epoch": 0.03806104129263914, "step": 530}, {"loss": 0.8343, "grad_norm": 0.4136947989463806, "learning_rate": 0.0002, "epoch": 0.03877917414721724, "step": 540}, {"loss": 0.8752, "grad_norm": 0.39344364404678345, "learning_rate": 0.0002, "epoch": 0.03949730700179533, "step": 550}, {"loss": 0.8088, "grad_norm": 0.4659644067287445, "learning_rate": 0.0002, "epoch": 0.04021543985637343, "step": 560}, {"loss": 0.766, "grad_norm": 0.3898842930793762, "learning_rate": 0.0002, "epoch": 0.04093357271095153, "step": 570}, {"loss": 0.7806, "grad_norm": 0.3964841961860657, "learning_rate": 0.0002, "epoch": 0.04165170556552962, "step": 580}, {"loss": 0.801, "grad_norm": 0.5172179341316223, "learning_rate": 0.0002, "epoch": 0.04236983842010772, "step": 590}, {"loss": 0.8253, "grad_norm": 0.5362544059753418, "learning_rate": 0.0002, "epoch": 0.04308797127468582, "step": 600}, {"loss": 0.8701, "grad_norm": 0.3975909948348999, "learning_rate": 0.0002, "epoch": 0.04380610412926391, "step": 610}, {"loss": 0.844, "grad_norm": 0.3905031085014343, "learning_rate": 0.0002, "epoch": 0.04452423698384201, "step": 620}, {"loss": 0.7723, "grad_norm": 0.5148088932037354, "learning_rate": 0.0002, "epoch": 0.04524236983842011, "step": 630}, {"loss": 0.8309, "grad_norm": 0.38826194405555725, "learning_rate": 0.0002, "epoch": 0.04596050269299821, "step": 640}, {"loss": 0.8379, "grad_norm": 0.5432049036026001, "learning_rate": 0.0002, "epoch": 0.0466786355475763, "step": 650}, {"loss": 0.838, "grad_norm": 0.42048221826553345, "learning_rate": 0.0002, "epoch": 0.0473967684021544, "step": 660}, {"loss": 0.8337, "grad_norm": 0.4683088958263397, "learning_rate": 0.0002, "epoch": 0.0481149012567325, "step": 670}, {"loss": 0.7982, "grad_norm": 0.4623735249042511, "learning_rate": 0.0002, "epoch": 0.04883303411131059, "step": 680}, {"loss": 0.8905, "grad_norm": 0.509128212928772, "learning_rate": 0.0002, "epoch": 0.04955116696588869, "step": 690}, {"loss": 0.8193, "grad_norm": 0.45767295360565186, "learning_rate": 0.0002, "epoch": 0.05026929982046679, "step": 700}, {"loss": 0.7658, "grad_norm": 0.4023726284503937, "learning_rate": 0.0002, "epoch": 0.05098743267504488, "step": 710}, {"loss": 0.8552, "grad_norm": 0.4407201409339905, "learning_rate": 0.0002, "epoch": 0.05170556552962298, "step": 720}, {"loss": 0.8369, "grad_norm": 0.41862091422080994, "learning_rate": 0.0002, "epoch": 0.05242369838420108, "step": 730}, {"loss": 0.8856, "grad_norm": 0.37473055720329285, "learning_rate": 0.0002, "epoch": 0.05314183123877918, "step": 740}, {"loss": 0.8282, "grad_norm": 0.4882921576499939, "learning_rate": 0.0002, "epoch": 0.05385996409335727, "step": 750}, {"loss": 0.8257, "grad_norm": 0.47890132665634155, "learning_rate": 0.0002, "epoch": 0.05457809694793537, "step": 760}, {"loss": 0.9222, "grad_norm": 0.5811166167259216, "learning_rate": 0.0002, "epoch": 0.05529622980251347, "step": 770}, {"loss": 0.7943, "grad_norm": 0.41113588213920593, "learning_rate": 0.0002, "epoch": 0.05601436265709156, "step": 780}, {"loss": 0.791, "grad_norm": 0.4120602607727051, "learning_rate": 0.0002, "epoch": 0.05673249551166966, "step": 790}, {"loss": 0.9038, "grad_norm": 0.39287394285202026, "learning_rate": 0.0002, "epoch": 0.05745062836624776, "step": 800}, {"loss": 0.8131, "grad_norm": 0.3986941874027252, "learning_rate": 0.0002, "epoch": 0.05816876122082585, "step": 810}, {"loss": 0.8268, "grad_norm": 0.4264012575149536, "learning_rate": 0.0002, "epoch": 0.05888689407540395, "step": 820}, {"loss": 0.7881, "grad_norm": 0.481139600276947, "learning_rate": 0.0002, "epoch": 0.05960502692998205, "step": 830}, {"loss": 0.8477, "grad_norm": 0.5561784505844116, "learning_rate": 0.0002, "epoch": 0.06032315978456014, "step": 840}, {"loss": 0.7817, "grad_norm": 0.4787197411060333, "learning_rate": 0.0002, "epoch": 0.06104129263913824, "step": 850}, {"loss": 0.8567, "grad_norm": 0.46454647183418274, "learning_rate": 0.0002, "epoch": 0.06175942549371634, "step": 860}, {"loss": 0.8429, "grad_norm": 0.5929669141769409, "learning_rate": 0.0002, "epoch": 0.06247755834829444, "step": 870}, {"loss": 0.8019, "grad_norm": 0.4561384618282318, "learning_rate": 0.0002, "epoch": 0.06319569120287254, "step": 880}, {"loss": 0.8686, "grad_norm": 0.45767998695373535, "learning_rate": 0.0002, "epoch": 0.06391382405745062, "step": 890}, {"loss": 0.818, "grad_norm": 0.42475444078445435, "learning_rate": 0.0002, "epoch": 0.06463195691202872, "step": 900}, {"loss": 0.8579, "grad_norm": 0.4911022484302521, "learning_rate": 0.0002, "epoch": 0.06535008976660682, "step": 910}, {"loss": 0.8067, "grad_norm": 0.5229166746139526, "learning_rate": 0.0002, "epoch": 0.06606822262118492, "step": 920}, {"loss": 0.8563, "grad_norm": 0.38134580850601196, "learning_rate": 0.0002, "epoch": 0.06678635547576302, "step": 930}, {"loss": 0.815, "grad_norm": 0.4171486496925354, "learning_rate": 0.0002, "epoch": 0.06750448833034112, "step": 940}, {"loss": 0.8122, "grad_norm": 0.45171529054641724, "learning_rate": 0.0002, "epoch": 0.06822262118491922, "step": 950}, {"loss": 0.8436, "grad_norm": 0.44889307022094727, "learning_rate": 0.0002, "epoch": 0.0689407540394973, "step": 960}, {"loss": 0.8149, "grad_norm": 0.44902464747428894, "learning_rate": 0.0002, "epoch": 0.0696588868940754, "step": 970}, {"loss": 0.7916, "grad_norm": 0.4671969413757324, "learning_rate": 0.0002, "epoch": 0.0703770197486535, "step": 980}, {"loss": 0.8147, "grad_norm": 0.4686984717845917, "learning_rate": 0.0002, "epoch": 0.0710951526032316, "step": 990}, {"loss": 0.806, "grad_norm": 0.4513658583164215, "learning_rate": 0.0002, "epoch": 0.0718132854578097, "step": 1000}, {"loss": 0.8348, "grad_norm": 0.48861828446388245, "learning_rate": 0.0002, "epoch": 0.0725314183123878, "step": 1010}, {"loss": 0.8038, "grad_norm": 0.7603165507316589, "learning_rate": 0.0002, "epoch": 0.0732495511669659, "step": 1020}, {"loss": 0.7844, "grad_norm": 0.501654863357544, "learning_rate": 0.0002, "epoch": 0.07396768402154398, "step": 1030}, {"loss": 0.7623, "grad_norm": 0.45291560888290405, "learning_rate": 0.0002, "epoch": 0.07468581687612208, "step": 1040}, {"loss": 0.8174, "grad_norm": 0.42454713582992554, "learning_rate": 0.0002, "epoch": 0.07540394973070018, "step": 1050}, {"loss": 0.7874, "grad_norm": 0.4655592441558838, "learning_rate": 0.0002, "epoch": 0.07612208258527828, "step": 1060}, {"loss": 0.8855, "grad_norm": 0.5011071562767029, "learning_rate": 0.0002, "epoch": 0.07684021543985638, "step": 1070}, {"loss": 0.8502, "grad_norm": 0.37221577763557434, "learning_rate": 0.0002, "epoch": 0.07755834829443448, "step": 1080}, {"loss": 0.8623, "grad_norm": 0.5123572945594788, "learning_rate": 0.0002, "epoch": 0.07827648114901256, "step": 1090}, {"loss": 0.8527, "grad_norm": 0.44138720631599426, "learning_rate": 0.0002, "epoch": 0.07899461400359066, "step": 1100}, {"loss": 0.7949, "grad_norm": 0.38932886719703674, "learning_rate": 0.0002, "epoch": 0.07971274685816876, "step": 1110}, {"loss": 0.8289, "grad_norm": 0.435820072889328, "learning_rate": 0.0002, "epoch": 0.08043087971274686, "step": 1120}, {"loss": 0.787, "grad_norm": 0.3820142149925232, "learning_rate": 0.0002, "epoch": 0.08114901256732496, "step": 1130}, {"loss": 0.8617, "grad_norm": 0.39680808782577515, "learning_rate": 0.0002, "epoch": 0.08186714542190306, "step": 1140}, {"loss": 0.8047, "grad_norm": 0.4833722412586212, "learning_rate": 0.0002, "epoch": 0.08258527827648116, "step": 1150}, {"loss": 0.8513, "grad_norm": 0.5045956969261169, "learning_rate": 0.0002, "epoch": 0.08330341113105924, "step": 1160}, {"loss": 0.8366, "grad_norm": 0.3652207553386688, "learning_rate": 0.0002, "epoch": 0.08402154398563734, "step": 1170}, {"loss": 0.8464, "grad_norm": 0.44447052478790283, "learning_rate": 0.0002, "epoch": 0.08473967684021544, "step": 1180}, {"loss": 0.8362, "grad_norm": 0.44942694902420044, "learning_rate": 0.0002, "epoch": 0.08545780969479354, "step": 1190}, {"loss": 0.7932, "grad_norm": 0.48789075016975403, "learning_rate": 0.0002, "epoch": 0.08617594254937164, "step": 1200}, {"loss": 0.8008, "grad_norm": 0.3981451094150543, "learning_rate": 0.0002, "epoch": 0.08689407540394974, "step": 1210}, {"loss": 0.8296, "grad_norm": 0.45545220375061035, "learning_rate": 0.0002, "epoch": 0.08761220825852782, "step": 1220}, {"loss": 0.8406, "grad_norm": 0.562138557434082, "learning_rate": 0.0002, "epoch": 0.08833034111310592, "step": 1230}, {"loss": 0.808, "grad_norm": 0.48523494601249695, "learning_rate": 0.0002, "epoch": 0.08904847396768402, "step": 1240}, {"loss": 0.8024, "grad_norm": 0.35054388642311096, "learning_rate": 0.0002, "epoch": 0.08976660682226212, "step": 1250}, {"loss": 0.8635, "grad_norm": 0.4148605167865753, "learning_rate": 0.0002, "epoch": 0.09048473967684022, "step": 1260}, {"loss": 0.8379, "grad_norm": 0.50171959400177, "learning_rate": 0.0002, "epoch": 0.09120287253141832, "step": 1270}, {"loss": 0.8466, "grad_norm": 0.41747573018074036, "learning_rate": 0.0002, "epoch": 0.09192100538599642, "step": 1280}, {"loss": 0.7905, "grad_norm": 0.43028751015663147, "learning_rate": 0.0002, "epoch": 0.0926391382405745, "step": 1290}, {"loss": 0.8071, "grad_norm": 0.41274991631507874, "learning_rate": 0.0002, "epoch": 0.0933572710951526, "step": 1300}, {"loss": 0.8214, "grad_norm": 0.5399569272994995, "learning_rate": 0.0002, "epoch": 0.0940754039497307, "step": 1310}, {"loss": 0.8108, "grad_norm": 0.44284379482269287, "learning_rate": 0.0002, "epoch": 0.0947935368043088, "step": 1320}, {"loss": 0.8301, "grad_norm": 0.42511969804763794, "learning_rate": 0.0002, "epoch": 0.0955116696588869, "step": 1330}, {"loss": 0.8527, "grad_norm": 0.5717929005622864, "learning_rate": 0.0002, "epoch": 0.096229802513465, "step": 1340}, {"loss": 0.8232, "grad_norm": 0.4104631245136261, "learning_rate": 0.0002, "epoch": 0.09694793536804308, "step": 1350}, {"loss": 0.8697, "grad_norm": 0.4144339859485626, "learning_rate": 0.0002, "epoch": 0.09766606822262118, "step": 1360}, {"loss": 0.7909, "grad_norm": 0.43676936626434326, "learning_rate": 0.0002, "epoch": 0.09838420107719928, "step": 1370}, {"loss": 0.8757, "grad_norm": 0.5297161340713501, "learning_rate": 0.0002, "epoch": 0.09910233393177738, "step": 1380}, {"loss": 0.7772, "grad_norm": 0.5319193601608276, "learning_rate": 0.0002, "epoch": 0.09982046678635548, "step": 1390}, {"loss": 0.8167, "grad_norm": 0.4083728492259979, "learning_rate": 0.0002, "epoch": 0.10053859964093358, "step": 1400}, {"loss": 0.8436, "grad_norm": 0.4193868339061737, "learning_rate": 0.0002, "epoch": 0.10125673249551168, "step": 1410}, {"loss": 0.8634, "grad_norm": 0.4062198996543884, "learning_rate": 0.0002, "epoch": 0.10197486535008976, "step": 1420}, {"loss": 0.7984, "grad_norm": 0.43972232937812805, "learning_rate": 0.0002, "epoch": 0.10269299820466786, "step": 1430}, {"loss": 0.8278, "grad_norm": 0.4598410725593567, "learning_rate": 0.0002, "epoch": 0.10341113105924596, "step": 1440}, {"loss": 0.8527, "grad_norm": 0.571662187576294, "learning_rate": 0.0002, "epoch": 0.10412926391382406, "step": 1450}, {"loss": 0.8485, "grad_norm": 0.5437791347503662, "learning_rate": 0.0002, "epoch": 0.10484739676840216, "step": 1460}, {"loss": 0.8172, "grad_norm": 0.4241923391819, "learning_rate": 0.0002, "epoch": 0.10556552962298026, "step": 1470}, {"loss": 0.8224, "grad_norm": 0.5185145735740662, "learning_rate": 0.0002, "epoch": 0.10628366247755835, "step": 1480}, {"loss": 0.8292, "grad_norm": 0.537626326084137, "learning_rate": 0.0002, "epoch": 0.10700179533213644, "step": 1490}, {"loss": 0.8227, "grad_norm": 0.4573661983013153, "learning_rate": 0.0002, "epoch": 0.10771992818671454, "step": 1500}, {"loss": 0.8318, "grad_norm": 0.4521017074584961, "learning_rate": 0.0002, "epoch": 0.10843806104129264, "step": 1510}, {"loss": 0.8107, "grad_norm": 0.6835159063339233, "learning_rate": 0.0002, "epoch": 0.10915619389587074, "step": 1520}, {"loss": 0.8256, "grad_norm": 0.43522894382476807, "learning_rate": 0.0002, "epoch": 0.10987432675044884, "step": 1530}, {"loss": 0.8211, "grad_norm": 0.685547411441803, "learning_rate": 0.0002, "epoch": 0.11059245960502694, "step": 1540}, {"loss": 0.8393, "grad_norm": 0.5283669233322144, "learning_rate": 0.0002, "epoch": 0.11131059245960502, "step": 1550}, {"loss": 0.8493, "grad_norm": 0.4869283437728882, "learning_rate": 0.0002, "epoch": 0.11202872531418312, "step": 1560}, {"loss": 0.8614, "grad_norm": 0.43024054169654846, "learning_rate": 0.0002, "epoch": 0.11274685816876122, "step": 1570}, {"loss": 0.8026, "grad_norm": 0.46726059913635254, "learning_rate": 0.0002, "epoch": 0.11346499102333932, "step": 1580}, {"loss": 0.8103, "grad_norm": 0.5046039819717407, "learning_rate": 0.0002, "epoch": 0.11418312387791742, "step": 1590}, {"loss": 0.8242, "grad_norm": 0.48972827196121216, "learning_rate": 0.0002, "epoch": 0.11490125673249552, "step": 1600}, {"loss": 0.8114, "grad_norm": 0.5221049189567566, "learning_rate": 0.0002, "epoch": 0.11561938958707361, "step": 1610}, {"loss": 0.8022, "grad_norm": 0.49169477820396423, "learning_rate": 0.0002, "epoch": 0.1163375224416517, "step": 1620}, {"loss": 0.8223, "grad_norm": 0.48462188243865967, "learning_rate": 0.0002, "epoch": 0.1170556552962298, "step": 1630}, {"loss": 0.8409, "grad_norm": 0.9001021981239319, "learning_rate": 0.0002, "epoch": 0.1177737881508079, "step": 1640}, {"loss": 0.8037, "grad_norm": 0.47555917501449585, "learning_rate": 0.0002, "epoch": 0.118491921005386, "step": 1650}, {"loss": 0.8047, "grad_norm": 0.4523521959781647, "learning_rate": 0.0002, "epoch": 0.1192100538599641, "step": 1660}, {"loss": 0.8552, "grad_norm": 0.510956346988678, "learning_rate": 0.0002, "epoch": 0.1199281867145422, "step": 1670}, {"loss": 0.8081, "grad_norm": 0.48063746094703674, "learning_rate": 0.0002, "epoch": 0.12064631956912028, "step": 1680}, {"loss": 0.7712, "grad_norm": 0.5209490060806274, "learning_rate": 0.0002, "epoch": 0.12136445242369838, "step": 1690}, {"loss": 0.8019, "grad_norm": 0.5488983988761902, "learning_rate": 0.0002, "epoch": 0.12208258527827648, "step": 1700}, {"loss": 0.829, "grad_norm": 0.5263523459434509, "learning_rate": 0.0002, "epoch": 0.12280071813285458, "step": 1710}, {"loss": 0.7761, "grad_norm": 0.45365768671035767, "learning_rate": 0.0002, "epoch": 0.12351885098743268, "step": 1720}, {"loss": 0.8432, "grad_norm": 0.4366922378540039, "learning_rate": 0.0002, "epoch": 0.12423698384201078, "step": 1730}, {"loss": 0.8261, "grad_norm": 0.4841083884239197, "learning_rate": 0.0002, "epoch": 0.12495511669658887, "step": 1740}, {"loss": 0.7834, "grad_norm": 0.46546968817710876, "learning_rate": 0.0002, "epoch": 0.12567324955116696, "step": 1750}, {"loss": 0.7874, "grad_norm": 0.39987099170684814, "learning_rate": 0.0002, "epoch": 0.12639138240574507, "step": 1760}, {"loss": 0.813, "grad_norm": 0.4661678969860077, "learning_rate": 0.0002, "epoch": 0.12710951526032316, "step": 1770}, {"loss": 0.8516, "grad_norm": 0.46716657280921936, "learning_rate": 0.0002, "epoch": 0.12782764811490124, "step": 1780}, {"loss": 0.8065, "grad_norm": 0.46164995431900024, "learning_rate": 0.0002, "epoch": 0.12854578096947936, "step": 1790}, {"loss": 0.8911, "grad_norm": 0.4910370111465454, "learning_rate": 0.0002, "epoch": 0.12926391382405744, "step": 1800}, {"loss": 0.7773, "grad_norm": 0.5615737438201904, "learning_rate": 0.0002, "epoch": 0.12998204667863555, "step": 1810}, {"loss": 0.7726, "grad_norm": 0.5739728808403015, "learning_rate": 0.0002, "epoch": 0.13070017953321364, "step": 1820}, {"loss": 0.8307, "grad_norm": 0.44104722142219543, "learning_rate": 0.0002, "epoch": 0.13141831238779175, "step": 1830}, {"loss": 0.7533, "grad_norm": 0.46373724937438965, "learning_rate": 0.0002, "epoch": 0.13213644524236984, "step": 1840}, {"loss": 0.8181, "grad_norm": 0.4481196403503418, "learning_rate": 0.0002, "epoch": 0.13285457809694792, "step": 1850}, {"loss": 0.8508, "grad_norm": 0.5689327716827393, "learning_rate": 0.0002, "epoch": 0.13357271095152604, "step": 1860}, {"loss": 0.8364, "grad_norm": 0.5334849953651428, "learning_rate": 0.0002, "epoch": 0.13429084380610412, "step": 1870}, {"loss": 0.8018, "grad_norm": 0.5177253484725952, "learning_rate": 0.0002, "epoch": 0.13500897666068223, "step": 1880}, {"loss": 0.869, "grad_norm": 0.4919368326663971, "learning_rate": 0.0002, "epoch": 0.13572710951526032, "step": 1890}, {"loss": 0.7647, "grad_norm": 0.5987576842308044, "learning_rate": 0.0002, "epoch": 0.13644524236983843, "step": 1900}, {"loss": 0.8546, "grad_norm": 0.49790486693382263, "learning_rate": 0.0002, "epoch": 0.13716337522441652, "step": 1910}, {"loss": 0.8402, "grad_norm": 0.5337542295455933, "learning_rate": 0.0002, "epoch": 0.1378815080789946, "step": 1920}, {"loss": 0.815, "grad_norm": 0.5171598792076111, "learning_rate": 0.0002, "epoch": 0.13859964093357272, "step": 1930}, {"loss": 0.843, "grad_norm": 0.5003953576087952, "learning_rate": 0.0002, "epoch": 0.1393177737881508, "step": 1940}, {"loss": 0.7867, "grad_norm": 0.5147887468338013, "learning_rate": 0.0002, "epoch": 0.1400359066427289, "step": 1950}, {"loss": 0.8215, "grad_norm": 0.6365984678268433, "learning_rate": 0.0002, "epoch": 0.140754039497307, "step": 1960}, {"loss": 0.8397, "grad_norm": 0.5449512004852295, "learning_rate": 0.0002, "epoch": 0.1414721723518851, "step": 1970}, {"loss": 0.8177, "grad_norm": 0.4062703847885132, "learning_rate": 0.0002, "epoch": 0.1421903052064632, "step": 1980}, {"loss": 0.8058, "grad_norm": 0.4446912705898285, "learning_rate": 0.0002, "epoch": 0.14290843806104128, "step": 1990}, {"loss": 0.7854, "grad_norm": 0.49001234769821167, "learning_rate": 0.0002, "epoch": 0.1436265709156194, "step": 2000}, {"loss": 0.8136, "grad_norm": 0.5591765642166138, "learning_rate": 0.0002, "epoch": 0.14434470377019748, "step": 2010}, {"loss": 0.7808, "grad_norm": 0.6476696133613586, "learning_rate": 0.0002, "epoch": 0.1450628366247756, "step": 2020}, {"loss": 0.8137, "grad_norm": 0.44688376784324646, "learning_rate": 0.0002, "epoch": 0.14578096947935368, "step": 2030}, {"loss": 0.8253, "grad_norm": 0.4437490701675415, "learning_rate": 0.0002, "epoch": 0.1464991023339318, "step": 2040}, {"loss": 0.7654, "grad_norm": 0.59927898645401, "learning_rate": 0.0002, "epoch": 0.14721723518850988, "step": 2050}, {"loss": 0.825, "grad_norm": 0.4356591999530792, "learning_rate": 0.0002, "epoch": 0.14793536804308796, "step": 2060}, {"loss": 0.8038, "grad_norm": 0.5560822486877441, "learning_rate": 0.0002, "epoch": 0.14865350089766607, "step": 2070}, {"loss": 0.838, "grad_norm": 0.43027108907699585, "learning_rate": 0.0002, "epoch": 0.14937163375224416, "step": 2080}, {"loss": 0.8317, "grad_norm": 0.41215455532073975, "learning_rate": 0.0002, "epoch": 0.15008976660682227, "step": 2090}, {"loss": 0.7948, "grad_norm": 0.4607839584350586, "learning_rate": 0.0002, "epoch": 0.15080789946140036, "step": 2100}, {"loss": 0.7981, "grad_norm": 0.4699854254722595, "learning_rate": 0.0002, "epoch": 0.15152603231597844, "step": 2110}, {"loss": 0.8464, "grad_norm": 0.5111975073814392, "learning_rate": 0.0002, "epoch": 0.15224416517055656, "step": 2120}, {"loss": 0.7672, "grad_norm": 0.4713742733001709, "learning_rate": 0.0002, "epoch": 0.15296229802513464, "step": 2130}, {"loss": 0.7692, "grad_norm": 0.3816622793674469, "learning_rate": 0.0002, "epoch": 0.15368043087971275, "step": 2140}, {"loss": 0.7824, "grad_norm": 0.4637526273727417, "learning_rate": 0.0002, "epoch": 0.15439856373429084, "step": 2150}, {"loss": 0.8185, "grad_norm": 0.3691818118095398, "learning_rate": 0.0002, "epoch": 0.15511669658886895, "step": 2160}, {"loss": 0.8298, "grad_norm": 0.4435218274593353, "learning_rate": 0.0002, "epoch": 0.15583482944344704, "step": 2170}, {"loss": 0.7917, "grad_norm": 0.5282211899757385, "learning_rate": 0.0002, "epoch": 0.15655296229802512, "step": 2180}, {"loss": 0.8006, "grad_norm": 0.7611056566238403, "learning_rate": 0.0002, "epoch": 0.15727109515260324, "step": 2190}, {"loss": 0.8039, "grad_norm": 0.5951169729232788, "learning_rate": 0.0002, "epoch": 0.15798922800718132, "step": 2200}, {"loss": 0.8314, "grad_norm": 0.5243265628814697, "learning_rate": 0.0002, "epoch": 0.15870736086175943, "step": 2210}, {"loss": 0.7817, "grad_norm": 0.518944501876831, "learning_rate": 0.0002, "epoch": 0.15942549371633752, "step": 2220}, {"loss": 0.8187, "grad_norm": 0.4264616072177887, "learning_rate": 0.0002, "epoch": 0.16014362657091563, "step": 2230}, {"loss": 0.7916, "grad_norm": 0.4619045853614807, "learning_rate": 0.0002, "epoch": 0.16086175942549372, "step": 2240}, {"loss": 0.84, "grad_norm": 0.4047030508518219, "learning_rate": 0.0002, "epoch": 0.1615798922800718, "step": 2250}, {"loss": 0.8133, "grad_norm": 0.47133687138557434, "learning_rate": 0.0002, "epoch": 0.16229802513464991, "step": 2260}, {"loss": 0.8032, "grad_norm": 0.4990246593952179, "learning_rate": 0.0002, "epoch": 0.163016157989228, "step": 2270}, {"loss": 0.752, "grad_norm": 0.5145298838615417, "learning_rate": 0.0002, "epoch": 0.1637342908438061, "step": 2280}, {"loss": 0.8441, "grad_norm": 0.5354352593421936, "learning_rate": 0.0002, "epoch": 0.1644524236983842, "step": 2290}, {"loss": 0.8099, "grad_norm": 0.47621065378189087, "learning_rate": 0.0002, "epoch": 0.1651705565529623, "step": 2300}, {"loss": 0.8105, "grad_norm": 0.45333582162857056, "learning_rate": 0.0002, "epoch": 0.1658886894075404, "step": 2310}, {"loss": 0.8126, "grad_norm": 0.4832790493965149, "learning_rate": 0.0002, "epoch": 0.16660682226211848, "step": 2320}, {"loss": 0.8271, "grad_norm": 0.4922761619091034, "learning_rate": 0.0002, "epoch": 0.1673249551166966, "step": 2330}, {"loss": 0.8324, "grad_norm": 0.5701655149459839, "learning_rate": 0.0002, "epoch": 0.16804308797127468, "step": 2340}, {"loss": 0.844, "grad_norm": 0.5170459151268005, "learning_rate": 0.0002, "epoch": 0.1687612208258528, "step": 2350}, {"loss": 0.7995, "grad_norm": 0.6562373638153076, "learning_rate": 0.0002, "epoch": 0.16947935368043088, "step": 2360}, {"loss": 0.7733, "grad_norm": 0.5350262522697449, "learning_rate": 0.0002, "epoch": 0.170197486535009, "step": 2370}, {"loss": 0.8501, "grad_norm": 0.5163491368293762, "learning_rate": 0.0002, "epoch": 0.17091561938958708, "step": 2380}, {"loss": 0.7708, "grad_norm": 0.48841530084609985, "learning_rate": 0.0002, "epoch": 0.17163375224416516, "step": 2390}, {"loss": 0.7969, "grad_norm": 0.44912993907928467, "learning_rate": 0.0002, "epoch": 0.17235188509874327, "step": 2400}, {"loss": 0.7706, "grad_norm": 0.5770647525787354, "learning_rate": 0.0002, "epoch": 0.17307001795332136, "step": 2410}, {"loss": 0.8233, "grad_norm": 0.4716179072856903, "learning_rate": 0.0002, "epoch": 0.17378815080789947, "step": 2420}, {"loss": 0.7802, "grad_norm": 0.5465078949928284, "learning_rate": 0.0002, "epoch": 0.17450628366247756, "step": 2430}, {"loss": 0.8191, "grad_norm": 0.40810713171958923, "learning_rate": 0.0002, "epoch": 0.17522441651705564, "step": 2440}, {"loss": 0.7971, "grad_norm": 0.3789578080177307, "learning_rate": 0.0002, "epoch": 0.17594254937163376, "step": 2450}, {"loss": 0.7437, "grad_norm": 0.4615110158920288, "learning_rate": 0.0002, "epoch": 0.17666068222621184, "step": 2460}, {"loss": 0.8102, "grad_norm": 0.4400235712528229, "learning_rate": 0.0002, "epoch": 0.17737881508078995, "step": 2470}, {"loss": 0.8254, "grad_norm": 0.5935020446777344, "learning_rate": 0.0002, "epoch": 0.17809694793536804, "step": 2480}, {"loss": 0.7886, "grad_norm": 0.5672990679740906, "learning_rate": 0.0002, "epoch": 0.17881508078994615, "step": 2490}, {"loss": 0.7829, "grad_norm": 0.4132838845252991, "learning_rate": 0.0002, "epoch": 0.17953321364452424, "step": 2500}, {"loss": 0.8056, "grad_norm": 0.5373716950416565, "learning_rate": 0.0002, "epoch": 0.18025134649910232, "step": 2510}, {"loss": 0.8061, "grad_norm": 0.5335832834243774, "learning_rate": 0.0002, "epoch": 0.18096947935368043, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5705642700195312, "learning_rate": 0.0002, "epoch": 0.18168761220825852, "step": 2530}, {"loss": 0.7779, "grad_norm": 0.4807959496974945, "learning_rate": 0.0002, "epoch": 0.18240574506283663, "step": 2540}, {"loss": 0.7767, "grad_norm": 0.4430573880672455, "learning_rate": 0.0002, "epoch": 0.18312387791741472, "step": 2550}, {"loss": 0.7921, "grad_norm": 0.5294728875160217, "learning_rate": 0.0002, "epoch": 0.18384201077199283, "step": 2560}, {"loss": 0.8102, "grad_norm": 0.661173403263092, "learning_rate": 0.0002, "epoch": 0.18456014362657092, "step": 2570}, {"loss": 0.803, "grad_norm": 0.5044304728507996, "learning_rate": 0.0002, "epoch": 0.185278276481149, "step": 2580}, {"loss": 0.7833, "grad_norm": 0.48929551243782043, "learning_rate": 0.0002, "epoch": 0.18599640933572711, "step": 2590}, {"loss": 0.8252, "grad_norm": 0.5054438710212708, "learning_rate": 0.0002, "epoch": 0.1867145421903052, "step": 2600}, {"loss": 0.7665, "grad_norm": 0.5613677501678467, "learning_rate": 0.0002, "epoch": 0.1874326750448833, "step": 2610}, {"loss": 0.7954, "grad_norm": 0.5762478709220886, "learning_rate": 0.0002, "epoch": 0.1881508078994614, "step": 2620}, {"loss": 0.8312, "grad_norm": 0.4523695409297943, "learning_rate": 0.0002, "epoch": 0.1888689407540395, "step": 2630}, {"loss": 0.8098, "grad_norm": 0.5235317945480347, "learning_rate": 0.0002, "epoch": 0.1895870736086176, "step": 2640}, {"loss": 0.8281, "grad_norm": 0.4894576370716095, "learning_rate": 0.0002, "epoch": 0.19030520646319568, "step": 2650}, {"loss": 0.7923, "grad_norm": 0.45731106400489807, "learning_rate": 0.0002, "epoch": 0.1910233393177738, "step": 2660}, {"loss": 0.7942, "grad_norm": 0.4726541042327881, "learning_rate": 0.0002, "epoch": 0.19174147217235188, "step": 2670}, {"loss": 0.7979, "grad_norm": 0.4281631410121918, "learning_rate": 0.0002, "epoch": 0.19245960502693, "step": 2680}, {"loss": 0.8076, "grad_norm": 0.48011314868927, "learning_rate": 0.0002, "epoch": 0.19317773788150808, "step": 2690}, {"loss": 0.7785, "grad_norm": 0.45785006880760193, "learning_rate": 0.0002, "epoch": 0.19389587073608616, "step": 2700}, {"loss": 0.7726, "grad_norm": 0.5244625210762024, "learning_rate": 0.0002, "epoch": 0.19461400359066428, "step": 2710}, {"loss": 0.8674, "grad_norm": 0.4674883186817169, "learning_rate": 0.0002, "epoch": 0.19533213644524236, "step": 2720}, {"loss": 0.8465, "grad_norm": 0.5969558358192444, "learning_rate": 0.0002, "epoch": 0.19605026929982047, "step": 2730}, {"loss": 0.8238, "grad_norm": 0.44413265585899353, "learning_rate": 0.0002, "epoch": 0.19676840215439856, "step": 2740}, {"loss": 0.8181, "grad_norm": 0.5094553828239441, "learning_rate": 0.0002, "epoch": 0.19748653500897667, "step": 2750}, {"loss": 0.7593, "grad_norm": 0.4931736886501312, "learning_rate": 0.0002, "epoch": 0.19820466786355476, "step": 2760}, {"loss": 0.8535, "grad_norm": 0.4766625463962555, "learning_rate": 0.0002, "epoch": 0.19892280071813284, "step": 2770}, {"loss": 0.754, "grad_norm": 0.4196971654891968, "learning_rate": 0.0002, "epoch": 0.19964093357271095, "step": 2780}, {"loss": 0.7794, "grad_norm": 0.4693375825881958, "learning_rate": 0.0002, "epoch": 0.20035906642728904, "step": 2790}, {"loss": 0.8336, "grad_norm": 0.5407108664512634, "learning_rate": 0.0002, "epoch": 0.20107719928186715, "step": 2800}, {"loss": 0.7938, "grad_norm": 0.42864227294921875, "learning_rate": 0.0002, "epoch": 0.20179533213644524, "step": 2810}, {"loss": 0.8059, "grad_norm": 0.4928833246231079, "learning_rate": 0.0002, "epoch": 0.20251346499102335, "step": 2820}, {"loss": 0.8221, "grad_norm": 0.5575131773948669, "learning_rate": 0.0002, "epoch": 0.20323159784560144, "step": 2830}, {"loss": 0.7712, "grad_norm": 0.505114734172821, "learning_rate": 0.0002, "epoch": 0.20394973070017952, "step": 2840}, {"loss": 0.7986, "grad_norm": 0.4727420210838318, "learning_rate": 0.0002, "epoch": 0.20466786355475763, "step": 2850}, {"loss": 0.7662, "grad_norm": 0.48218145966529846, "learning_rate": 0.0002, "epoch": 0.20538599640933572, "step": 2860}, {"loss": 0.8055, "grad_norm": 0.5196906328201294, "learning_rate": 0.0002, "epoch": 0.20610412926391383, "step": 2870}, {"loss": 0.8401, "grad_norm": 0.4927639067173004, "learning_rate": 0.0002, "epoch": 0.20682226211849192, "step": 2880}, {"loss": 0.8067, "grad_norm": 0.5076990127563477, "learning_rate": 0.0002, "epoch": 0.20754039497307003, "step": 2890}, {"loss": 0.789, "grad_norm": 0.4606800079345703, "learning_rate": 0.0002, "epoch": 0.20825852782764812, "step": 2900}, {"loss": 0.8381, "grad_norm": 0.6184319257736206, "learning_rate": 0.0002, "epoch": 0.2089766606822262, "step": 2910}, {"loss": 0.8019, "grad_norm": 0.5237935781478882, "learning_rate": 0.0002, "epoch": 0.2096947935368043, "step": 2920}, {"loss": 0.7763, "grad_norm": 0.43966251611709595, "learning_rate": 0.0002, "epoch": 0.2104129263913824, "step": 2930}, {"loss": 0.7915, "grad_norm": 0.48786666989326477, "learning_rate": 0.0002, "epoch": 0.2111310592459605, "step": 2940}, {"loss": 0.7549, "grad_norm": 0.4397817552089691, "learning_rate": 0.0002, "epoch": 0.2118491921005386, "step": 2950}, {"loss": 0.8342, "grad_norm": 0.5155336260795593, "learning_rate": 0.0002, "epoch": 0.2125673249551167, "step": 2960}, {"loss": 0.7885, "grad_norm": 0.48058274388313293, "learning_rate": 0.0002, "epoch": 0.2132854578096948, "step": 2970}, {"loss": 0.8208, "grad_norm": 0.5022647976875305, "learning_rate": 0.0002, "epoch": 0.21400359066427288, "step": 2980}, {"loss": 0.784, "grad_norm": 0.5417225360870361, "learning_rate": 0.0002, "epoch": 0.214721723518851, "step": 2990}, {"loss": 0.8518, "grad_norm": 0.46300315856933594, "learning_rate": 0.0002, "epoch": 0.21543985637342908, "step": 3000}, {"loss": 0.764, "grad_norm": 0.5375089049339294, "learning_rate": 0.0002, "epoch": 0.2161579892280072, "step": 3010}, {"loss": 0.8459, "grad_norm": 0.5050022602081299, "learning_rate": 0.0002, "epoch": 0.21687612208258528, "step": 3020}, {"loss": 0.797, "grad_norm": 0.46347716450691223, "learning_rate": 0.0002, "epoch": 0.21759425493716336, "step": 3030}, {"loss": 0.8014, "grad_norm": 0.544874370098114, "learning_rate": 0.0002, "epoch": 0.21831238779174147, "step": 3040}, {"loss": 0.802, "grad_norm": 0.4268142580986023, "learning_rate": 0.0002, "epoch": 0.21903052064631956, "step": 3050}, {"loss": 0.8224, "grad_norm": 0.5527157187461853, "learning_rate": 0.0002, "epoch": 0.21974865350089767, "step": 3060}, {"loss": 0.771, "grad_norm": 0.5565235018730164, "learning_rate": 0.0002, "epoch": 0.22046678635547576, "step": 3070}, {"loss": 0.7807, "grad_norm": 0.4900645613670349, "learning_rate": 0.0002, "epoch": 0.22118491921005387, "step": 3080}, {"loss": 0.8321, "grad_norm": 0.4951242208480835, "learning_rate": 0.0002, "epoch": 0.22190305206463196, "step": 3090}, {"loss": 0.8301, "grad_norm": 0.5831719636917114, "learning_rate": 0.0002, "epoch": 0.22262118491921004, "step": 3100}, {"loss": 0.8011, "grad_norm": 0.417576402425766, "learning_rate": 0.0002, "epoch": 0.22333931777378815, "step": 3110}, {"loss": 0.8226, "grad_norm": 0.4715117812156677, "learning_rate": 0.0002, "epoch": 0.22405745062836624, "step": 3120}, {"loss": 0.778, "grad_norm": 0.5956445336341858, "learning_rate": 0.0002, "epoch": 0.22477558348294435, "step": 3130}, {"loss": 0.788, "grad_norm": 0.408184289932251, "learning_rate": 0.0002, "epoch": 0.22549371633752244, "step": 3140}, {"loss": 0.8096, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 0.22621184919210055, "step": 3150}, {"loss": 0.7722, "grad_norm": 0.5631294846534729, "learning_rate": 0.0002, "epoch": 0.22692998204667864, "step": 3160}, {"loss": 0.7933, "grad_norm": 0.5054665803909302, "learning_rate": 0.0002, "epoch": 0.22764811490125672, "step": 3170}, {"loss": 0.8572, "grad_norm": 0.47388020157814026, "learning_rate": 0.0002, "epoch": 0.22836624775583483, "step": 3180}, {"loss": 0.8148, "grad_norm": 0.45871609449386597, "learning_rate": 0.0002, "epoch": 0.22908438061041292, "step": 3190}, {"loss": 0.8373, "grad_norm": 0.42431211471557617, "learning_rate": 0.0002, "epoch": 0.22980251346499103, "step": 3200}, {"loss": 0.7847, "grad_norm": 0.584872305393219, "learning_rate": 0.0002, "epoch": 0.23052064631956912, "step": 3210}, {"loss": 0.8118, "grad_norm": 0.5489653944969177, "learning_rate": 0.0002, "epoch": 0.23123877917414723, "step": 3220}, {"loss": 0.8552, "grad_norm": 0.5803213119506836, "learning_rate": 0.0002, "epoch": 0.23195691202872532, "step": 3230}, {"loss": 0.7702, "grad_norm": 0.906505823135376, "learning_rate": 0.0002, "epoch": 0.2326750448833034, "step": 3240}, {"loss": 0.8454, "grad_norm": 0.4569525718688965, "learning_rate": 0.0002, "epoch": 0.2333931777378815, "step": 3250}, {"loss": 0.7641, "grad_norm": 0.5566741228103638, "learning_rate": 0.0002, "epoch": 0.2341113105924596, "step": 3260}, {"loss": 0.7964, "grad_norm": 0.5059959888458252, "learning_rate": 0.0002, "epoch": 0.2348294434470377, "step": 3270}, {"loss": 0.7965, "grad_norm": 0.530828058719635, "learning_rate": 0.0002, "epoch": 0.2355475763016158, "step": 3280}, {"loss": 0.807, "grad_norm": 0.5149409174919128, "learning_rate": 0.0002, "epoch": 0.2362657091561939, "step": 3290}, {"loss": 0.8067, "grad_norm": 0.7323763966560364, "learning_rate": 0.0002, "epoch": 0.236983842010772, "step": 3300}, {"loss": 0.774, "grad_norm": 0.6794836521148682, "learning_rate": 0.0002, "epoch": 0.23770197486535008, "step": 3310}, {"loss": 0.7902, "grad_norm": 0.5176534056663513, "learning_rate": 0.0002, "epoch": 0.2384201077199282, "step": 3320}, {"loss": 0.8119, "grad_norm": 0.42245906591415405, "learning_rate": 0.0002, "epoch": 0.23913824057450628, "step": 3330}, {"loss": 0.868, "grad_norm": 0.43535107374191284, "learning_rate": 0.0002, "epoch": 0.2398563734290844, "step": 3340}, {"loss": 0.825, "grad_norm": 0.7038307785987854, "learning_rate": 0.0002, "epoch": 0.24057450628366248, "step": 3350}, {"loss": 0.7818, "grad_norm": 0.5689977407455444, "learning_rate": 0.0002, "epoch": 0.24129263913824056, "step": 3360}, {"loss": 0.7958, "grad_norm": 0.538136899471283, "learning_rate": 0.0002, "epoch": 0.24201077199281867, "step": 3370}, {"loss": 0.7995, "grad_norm": 0.7433661222457886, "learning_rate": 0.0002, "epoch": 0.24272890484739676, "step": 3380}, {"loss": 0.8564, "grad_norm": 0.6996734738349915, "learning_rate": 0.0002, "epoch": 0.24344703770197487, "step": 3390}, {"loss": 0.8288, "grad_norm": 0.5055703520774841, "learning_rate": 0.0002, "epoch": 0.24416517055655296, "step": 3400}, {"loss": 0.7741, "grad_norm": 0.5218513607978821, "learning_rate": 0.0002, "epoch": 0.24488330341113107, "step": 3410}, {"loss": 0.7903, "grad_norm": 0.42782822251319885, "learning_rate": 0.0002, "epoch": 0.24560143626570916, "step": 3420}, {"loss": 0.8005, "grad_norm": 0.4991157650947571, "learning_rate": 0.0002, "epoch": 0.24631956912028724, "step": 3430}, {"loss": 0.8151, "grad_norm": 0.5063165426254272, "learning_rate": 0.0002, "epoch": 0.24703770197486535, "step": 3440}, {"loss": 0.7722, "grad_norm": 0.45863136649131775, "learning_rate": 0.0002, "epoch": 0.24775583482944344, "step": 3450}, {"loss": 0.8236, "grad_norm": 0.474728524684906, "learning_rate": 0.0002, "epoch": 0.24847396768402155, "step": 3460}, {"loss": 0.7698, "grad_norm": 0.522570013999939, "learning_rate": 0.0002, "epoch": 0.24919210053859964, "step": 3470}, {"loss": 0.7448, "grad_norm": 0.5474396347999573, "learning_rate": 0.0002, "epoch": 0.24991023339317775, "step": 3480}, {"loss": 0.8339, "grad_norm": 0.49094662070274353, "learning_rate": 0.0002, "epoch": 0.2506283662477558, "step": 3490}, {"loss": 0.7864, "grad_norm": 0.6399132609367371, "learning_rate": 0.0002, "epoch": 0.2513464991023339, "step": 3500}, {"loss": 0.7988, "grad_norm": 0.5910066366195679, "learning_rate": 0.0002, "epoch": 0.25206463195691203, "step": 3510}, {"loss": 0.813, "grad_norm": 0.4761259853839874, "learning_rate": 0.0002, "epoch": 0.25278276481149015, "step": 3520}, {"loss": 0.812, "grad_norm": 0.5124502182006836, "learning_rate": 0.0002, "epoch": 0.2535008976660682, "step": 3530}, {"loss": 0.7699, "grad_norm": 0.4329150915145874, "learning_rate": 0.0002, "epoch": 0.2542190305206463, "step": 3540}, {"loss": 0.8205, "grad_norm": 0.4839608371257782, "learning_rate": 0.0002, "epoch": 0.25493716337522443, "step": 3550}, {"loss": 0.8279, "grad_norm": 0.5413459539413452, "learning_rate": 0.0002, "epoch": 0.2556552962298025, "step": 3560}, {"loss": 0.8253, "grad_norm": 0.5761468410491943, "learning_rate": 0.0002, "epoch": 0.2563734290843806, "step": 3570}, {"loss": 0.8473, "grad_norm": 0.49266132712364197, "learning_rate": 0.0002, "epoch": 0.2570915619389587, "step": 3580}, {"loss": 0.7946, "grad_norm": 0.7377930879592896, "learning_rate": 0.0002, "epoch": 0.2578096947935368, "step": 3590}, {"loss": 0.799, "grad_norm": 0.543541431427002, "learning_rate": 0.0002, "epoch": 0.2585278276481149, "step": 3600}, {"loss": 0.8044, "grad_norm": 0.48385897278785706, "learning_rate": 0.0002, "epoch": 0.259245960502693, "step": 3610}, {"loss": 0.7686, "grad_norm": 0.5152639746665955, "learning_rate": 0.0002, "epoch": 0.2599640933572711, "step": 3620}, {"loss": 0.7438, "grad_norm": 0.5601988434791565, "learning_rate": 0.0002, "epoch": 0.26068222621184917, "step": 3630}, {"loss": 0.7915, "grad_norm": 0.4349626302719116, "learning_rate": 0.0002, "epoch": 0.2614003590664273, "step": 3640}, {"loss": 0.7825, "grad_norm": 0.5487161874771118, "learning_rate": 0.0002, "epoch": 0.2621184919210054, "step": 3650}, {"loss": 0.8085, "grad_norm": 0.45603805780410767, "learning_rate": 0.0002, "epoch": 0.2628366247755835, "step": 3660}, {"loss": 0.7858, "grad_norm": 0.5012730956077576, "learning_rate": 0.0002, "epoch": 0.26355475763016156, "step": 3670}, {"loss": 0.8022, "grad_norm": 0.4523845314979553, "learning_rate": 0.0002, "epoch": 0.2642728904847397, "step": 3680}, {"loss": 0.7932, "grad_norm": 0.5756664872169495, "learning_rate": 0.0002, "epoch": 0.2649910233393178, "step": 3690}, {"loss": 0.816, "grad_norm": 0.48467493057250977, "learning_rate": 0.0002, "epoch": 0.26570915619389585, "step": 3700}, {"loss": 0.7825, "grad_norm": 0.4860585927963257, "learning_rate": 0.0002, "epoch": 0.26642728904847396, "step": 3710}, {"loss": 0.7903, "grad_norm": 0.5067077875137329, "learning_rate": 0.0002, "epoch": 0.26714542190305207, "step": 3720}, {"loss": 0.8155, "grad_norm": 0.5490895509719849, "learning_rate": 0.0002, "epoch": 0.2678635547576302, "step": 3730}, {"loss": 0.7542, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.26858168761220824, "step": 3740}, {"loss": 0.7991, "grad_norm": 0.5026951432228088, "learning_rate": 0.0002, "epoch": 0.26929982046678635, "step": 3750}, {"loss": 0.8152, "grad_norm": 0.49474090337753296, "learning_rate": 0.0002, "epoch": 0.27001795332136447, "step": 3760}, {"loss": 0.8235, "grad_norm": 0.6381985545158386, "learning_rate": 0.0002, "epoch": 0.2707360861759425, "step": 3770}, {"loss": 0.8024, "grad_norm": 0.4784011244773865, "learning_rate": 0.0002, "epoch": 0.27145421903052064, "step": 3780}, {"loss": 0.7746, "grad_norm": 0.5126543045043945, "learning_rate": 0.0002, "epoch": 0.27217235188509875, "step": 3790}, {"loss": 0.841, "grad_norm": 0.5428652763366699, "learning_rate": 0.0002, "epoch": 0.27289048473967686, "step": 3800}, {"loss": 0.8137, "grad_norm": 0.5427033305168152, "learning_rate": 0.0002, "epoch": 0.2736086175942549, "step": 3810}, {"loss": 0.7274, "grad_norm": 0.46467480063438416, "learning_rate": 0.0002, "epoch": 0.27432675044883303, "step": 3820}, {"loss": 0.8414, "grad_norm": 0.494367390871048, "learning_rate": 0.0002, "epoch": 0.27504488330341115, "step": 3830}, {"loss": 0.8151, "grad_norm": 0.59856778383255, "learning_rate": 0.0002, "epoch": 0.2757630161579892, "step": 3840}, {"loss": 0.7899, "grad_norm": 0.422128826379776, "learning_rate": 0.0002, "epoch": 0.2764811490125673, "step": 3850}, {"loss": 0.8153, "grad_norm": 0.5757306814193726, "learning_rate": 0.0002, "epoch": 0.27719928186714543, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5850930213928223, "learning_rate": 0.0002, "epoch": 0.27791741472172354, "step": 3870}, {"loss": 0.8044, "grad_norm": 0.5633023977279663, "learning_rate": 0.0002, "epoch": 0.2786355475763016, "step": 3880}, {"loss": 0.8402, "grad_norm": 0.5037940144538879, "learning_rate": 0.0002, "epoch": 0.2793536804308797, "step": 3890}, {"loss": 0.822, "grad_norm": 0.5255506038665771, "learning_rate": 0.0002, "epoch": 0.2800718132854578, "step": 3900}, {"loss": 0.7625, "grad_norm": 0.44584617018699646, "learning_rate": 0.0002, "epoch": 0.2807899461400359, "step": 3910}, {"loss": 0.8131, "grad_norm": 0.4803239405155182, "learning_rate": 0.0002, "epoch": 0.281508078994614, "step": 3920}, {"loss": 0.8122, "grad_norm": 0.5206008553504944, "learning_rate": 0.0002, "epoch": 0.2822262118491921, "step": 3930}, {"loss": 0.8988, "grad_norm": 0.5596373081207275, "learning_rate": 0.0002, "epoch": 0.2829443447037702, "step": 3940}, {"loss": 0.8091, "grad_norm": 0.4487258493900299, "learning_rate": 0.0002, "epoch": 0.2836624775583483, "step": 3950}, {"loss": 0.7933, "grad_norm": 0.4774281978607178, "learning_rate": 0.0002, "epoch": 0.2843806104129264, "step": 3960}, {"loss": 0.8994, "grad_norm": 0.571829617023468, "learning_rate": 0.0002, "epoch": 0.2850987432675045, "step": 3970}, {"loss": 0.7971, "grad_norm": 0.45251455903053284, "learning_rate": 0.0002, "epoch": 0.28581687612208256, "step": 3980}, {"loss": 0.8007, "grad_norm": 0.5119943618774414, "learning_rate": 0.0002, "epoch": 0.2865350089766607, "step": 3990}, {"loss": 0.8087, "grad_norm": 0.42333969473838806, "learning_rate": 0.0002, "epoch": 0.2872531418312388, "step": 4000}, {"loss": 0.7978, "grad_norm": 0.5694096684455872, "learning_rate": 0.0002, "epoch": 0.2879712746858169, "step": 4010}, {"loss": 0.845, "grad_norm": 0.44457492232322693, "learning_rate": 0.0002, "epoch": 0.28868940754039496, "step": 4020}, {"loss": 0.7268, "grad_norm": 0.496545672416687, "learning_rate": 0.0002, "epoch": 0.2894075403949731, "step": 4030}, {"loss": 0.7908, "grad_norm": 0.5092352032661438, "learning_rate": 0.0002, "epoch": 0.2901256732495512, "step": 4040}, {"loss": 0.807, "grad_norm": 0.5124567151069641, "learning_rate": 0.0002, "epoch": 0.29084380610412924, "step": 4050}, {"loss": 0.8191, "grad_norm": 0.5148161053657532, "learning_rate": 0.0002, "epoch": 0.29156193895870736, "step": 4060}, {"loss": 0.7893, "grad_norm": 0.48183947801589966, "learning_rate": 0.0002, "epoch": 0.29228007181328547, "step": 4070}, {"loss": 0.8397, "grad_norm": 0.47728800773620605, "learning_rate": 0.0002, "epoch": 0.2929982046678636, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.5073143243789673, "learning_rate": 0.0002, "epoch": 0.29371633752244164, "step": 4090}, {"loss": 0.8019, "grad_norm": 0.5343585014343262, "learning_rate": 0.0002, "epoch": 0.29443447037701975, "step": 4100}, {"loss": 0.7933, "grad_norm": 0.5760312676429749, "learning_rate": 0.0002, "epoch": 0.29515260323159787, "step": 4110}, {"loss": 0.811, "grad_norm": 0.5894787907600403, "learning_rate": 0.0002, "epoch": 0.2958707360861759, "step": 4120}, {"loss": 0.7375, "grad_norm": 0.4528578817844391, "learning_rate": 0.0002, "epoch": 0.29658886894075404, "step": 4130}, {"loss": 0.7761, "grad_norm": 0.6027235388755798, "learning_rate": 0.0002, "epoch": 0.29730700179533215, "step": 4140}, {"loss": 0.7636, "grad_norm": 0.5060310959815979, "learning_rate": 0.0002, "epoch": 0.2980251346499102, "step": 4150}, {"loss": 0.8122, "grad_norm": 0.475252628326416, "learning_rate": 0.0002, "epoch": 0.2987432675044883, "step": 4160}, {"loss": 0.8306, "grad_norm": 0.4855351448059082, "learning_rate": 0.0002, "epoch": 0.29946140035906643, "step": 4170}, {"loss": 0.7863, "grad_norm": 0.6720767021179199, "learning_rate": 0.0002, "epoch": 0.30017953321364454, "step": 4180}, {"loss": 0.7755, "grad_norm": 0.6409553289413452, "learning_rate": 0.0002, "epoch": 0.3008976660682226, "step": 4190}, {"loss": 0.8333, "grad_norm": 0.5508167147636414, "learning_rate": 0.0002, "epoch": 0.3016157989228007, "step": 4200}, {"loss": 0.8138, "grad_norm": 0.45958149433135986, "learning_rate": 0.0002, "epoch": 0.30233393177737883, "step": 4210}, {"loss": 0.8266, "grad_norm": 0.5201641321182251, "learning_rate": 0.0002, "epoch": 0.3030520646319569, "step": 4220}, {"loss": 0.8246, "grad_norm": 0.5440032482147217, "learning_rate": 0.0002, "epoch": 0.303770197486535, "step": 4230}, {"loss": 0.7863, "grad_norm": 0.43566814064979553, "learning_rate": 0.0002, "epoch": 0.3044883303411131, "step": 4240}, {"loss": 0.7835, "grad_norm": 0.4479893445968628, "learning_rate": 0.0002, "epoch": 0.3052064631956912, "step": 4250}, {"loss": 0.7646, "grad_norm": 0.40390217304229736, "learning_rate": 0.0002, "epoch": 0.3059245960502693, "step": 4260}, {"loss": 0.8382, "grad_norm": 0.5143486261367798, "learning_rate": 0.0002, "epoch": 0.3066427289048474, "step": 4270}, {"loss": 0.799, "grad_norm": 0.5289962887763977, "learning_rate": 0.0002, "epoch": 0.3073608617594255, "step": 4280}, {"loss": 0.7706, "grad_norm": 0.609561026096344, "learning_rate": 0.0002, "epoch": 0.30807899461400357, "step": 4290}, {"loss": 0.809, "grad_norm": 0.5967493653297424, "learning_rate": 0.0002, "epoch": 0.3087971274685817, "step": 4300}, {"loss": 0.8034, "grad_norm": 0.5323672890663147, "learning_rate": 0.0002, "epoch": 0.3095152603231598, "step": 4310}, {"loss": 0.8463, "grad_norm": 0.4996737241744995, "learning_rate": 0.0002, "epoch": 0.3102333931777379, "step": 4320}, {"loss": 0.7879, "grad_norm": 0.5528829097747803, "learning_rate": 0.0002, "epoch": 0.31095152603231596, "step": 4330}, {"loss": 0.8383, "grad_norm": 0.5394268035888672, "learning_rate": 0.0002, "epoch": 0.3116696588868941, "step": 4340}, {"loss": 0.8258, "grad_norm": 0.4654628038406372, "learning_rate": 0.0002, "epoch": 0.3123877917414722, "step": 4350}, {"loss": 0.8045, "grad_norm": 0.4933706521987915, "learning_rate": 0.0002, "epoch": 0.31310592459605024, "step": 4360}, {"loss": 0.7891, "grad_norm": 0.5310598611831665, "learning_rate": 0.0002, "epoch": 0.31382405745062836, "step": 4370}, {"loss": 0.8362, "grad_norm": 0.5558765530586243, "learning_rate": 0.0002, "epoch": 0.31454219030520647, "step": 4380}, {"loss": 0.8013, "grad_norm": 0.5281313061714172, "learning_rate": 0.0002, "epoch": 0.3152603231597846, "step": 4390}, {"loss": 0.8034, "grad_norm": 0.5100293755531311, "learning_rate": 0.0002, "epoch": 0.31597845601436264, "step": 4400}, {"loss": 0.795, "grad_norm": 0.48762813210487366, "learning_rate": 0.0002, "epoch": 0.31669658886894075, "step": 4410}, {"loss": 0.7941, "grad_norm": 0.5211702585220337, "learning_rate": 0.0002, "epoch": 0.31741472172351887, "step": 4420}, {"loss": 0.8079, "grad_norm": 0.696747899055481, "learning_rate": 0.0002, "epoch": 0.3181328545780969, "step": 4430}, {"loss": 0.77, "grad_norm": 0.6334946751594543, "learning_rate": 0.0002, "epoch": 0.31885098743267504, "step": 4440}, {"loss": 0.7871, "grad_norm": 0.5333067178726196, "learning_rate": 0.0002, "epoch": 0.31956912028725315, "step": 4450}, {"loss": 0.7846, "grad_norm": 0.500091552734375, "learning_rate": 0.0002, "epoch": 0.32028725314183126, "step": 4460}, {"loss": 0.7884, "grad_norm": 0.5190957188606262, "learning_rate": 0.0002, "epoch": 0.3210053859964093, "step": 4470}, {"loss": 0.7988, "grad_norm": 0.6702370047569275, "learning_rate": 0.0002, "epoch": 0.32172351885098743, "step": 4480}, {"loss": 0.8014, "grad_norm": 0.4393869638442993, "learning_rate": 0.0002, "epoch": 0.32244165170556555, "step": 4490}, {"loss": 0.8373, "grad_norm": 0.4766499400138855, "learning_rate": 0.0002, "epoch": 0.3231597845601436, "step": 4500}, {"loss": 0.7567, "grad_norm": 0.561836838722229, "learning_rate": 0.0002, "epoch": 0.3238779174147217, "step": 4510}, {"loss": 0.7727, "grad_norm": 0.44366541504859924, "learning_rate": 0.0002, "epoch": 0.32459605026929983, "step": 4520}, {"loss": 0.8109, "grad_norm": 0.46504274010658264, "learning_rate": 0.0002, "epoch": 0.32531418312387794, "step": 4530}, {"loss": 0.7868, "grad_norm": 0.5498034954071045, "learning_rate": 0.0002, "epoch": 0.326032315978456, "step": 4540}, {"loss": 0.7638, "grad_norm": 0.5901338458061218, "learning_rate": 0.0002, "epoch": 0.3267504488330341, "step": 4550}, {"loss": 0.8016, "grad_norm": 0.5485442876815796, "learning_rate": 0.0002, "epoch": 0.3274685816876122, "step": 4560}, {"loss": 0.7944, "grad_norm": 0.512584924697876, "learning_rate": 0.0002, "epoch": 0.3281867145421903, "step": 4570}, {"loss": 0.8193, "grad_norm": 0.5208188891410828, "learning_rate": 0.0002, "epoch": 0.3289048473967684, "step": 4580}, {"loss": 0.7833, "grad_norm": 0.4923836886882782, "learning_rate": 0.0002, "epoch": 0.3296229802513465, "step": 4590}, {"loss": 0.8102, "grad_norm": 0.49258530139923096, "learning_rate": 0.0002, "epoch": 0.3303411131059246, "step": 4600}, {"loss": 0.7874, "grad_norm": 0.4788922667503357, "learning_rate": 0.0002, "epoch": 0.3310592459605027, "step": 4610}, {"loss": 0.8298, "grad_norm": 0.48276954889297485, "learning_rate": 0.0002, "epoch": 0.3317773788150808, "step": 4620}, {"loss": 0.8519, "grad_norm": 0.6300732493400574, "learning_rate": 0.0002, "epoch": 0.3324955116696589, "step": 4630}, {"loss": 0.8434, "grad_norm": 0.47594770789146423, "learning_rate": 0.0002, "epoch": 0.33321364452423696, "step": 4640}, {"loss": 0.8123, "grad_norm": 0.4728924632072449, "learning_rate": 0.0002, "epoch": 0.3339317773788151, "step": 4650}, {"loss": 0.8113, "grad_norm": 0.5586788654327393, "learning_rate": 0.0002, "epoch": 0.3346499102333932, "step": 4660}, {"loss": 0.7949, "grad_norm": 0.4573180377483368, "learning_rate": 0.0002, "epoch": 0.3353680430879713, "step": 4670}, {"loss": 0.8341, "grad_norm": 0.6391524076461792, "learning_rate": 0.0002, "epoch": 0.33608617594254936, "step": 4680}, {"loss": 0.8126, "grad_norm": 0.6570921540260315, "learning_rate": 0.0002, "epoch": 0.33680430879712747, "step": 4690}, {"loss": 0.796, "grad_norm": 0.4601454734802246, "learning_rate": 0.0002, "epoch": 0.3375224416517056, "step": 4700}, {"loss": 0.8158, "grad_norm": 0.5640755295753479, "learning_rate": 0.0002, "epoch": 0.33824057450628364, "step": 4710}, {"loss": 0.8326, "grad_norm": 0.43475520610809326, "learning_rate": 0.0002, "epoch": 0.33895870736086176, "step": 4720}, {"loss": 0.7684, "grad_norm": 0.4785807132720947, "learning_rate": 0.0002, "epoch": 0.33967684021543987, "step": 4730}, {"loss": 0.8257, "grad_norm": 0.4934665262699127, "learning_rate": 0.0002, "epoch": 0.340394973070018, "step": 4740}, {"loss": 0.7713, "grad_norm": 0.45327693223953247, "learning_rate": 0.0002, "epoch": 0.34111310592459604, "step": 4750}, {"loss": 0.7944, "grad_norm": 0.4710456430912018, "learning_rate": 0.0002, "epoch": 0.34183123877917415, "step": 4760}, {"loss": 0.7689, "grad_norm": 0.5591559410095215, "learning_rate": 0.0002, "epoch": 0.34254937163375226, "step": 4770}, {"loss": 0.8204, "grad_norm": 0.48958835005760193, "learning_rate": 0.0002, "epoch": 0.3432675044883303, "step": 4780}, {"loss": 0.8232, "grad_norm": 0.4613766670227051, "learning_rate": 0.0002, "epoch": 0.34398563734290843, "step": 4790}, {"loss": 0.8339, "grad_norm": 0.5425335764884949, "learning_rate": 0.0002, "epoch": 0.34470377019748655, "step": 4800}, {"loss": 0.828, "grad_norm": 0.4964924156665802, "learning_rate": 0.0002, "epoch": 0.3454219030520646, "step": 4810}, {"loss": 0.8264, "grad_norm": 0.613449215888977, "learning_rate": 0.0002, "epoch": 0.3461400359066427, "step": 4820}, {"loss": 0.846, "grad_norm": 0.6553348898887634, "learning_rate": 0.0002, "epoch": 0.34685816876122083, "step": 4830}, {"loss": 0.8181, "grad_norm": 0.5863470435142517, "learning_rate": 0.0002, "epoch": 0.34757630161579894, "step": 4840}, {"loss": 0.8205, "grad_norm": 0.5338097810745239, "learning_rate": 0.0002, "epoch": 0.348294434470377, "step": 4850}, {"loss": 0.7926, "grad_norm": 0.6129760146141052, "learning_rate": 0.0002, "epoch": 0.3490125673249551, "step": 4860}, {"loss": 0.7745, "grad_norm": 0.6100956797599792, "learning_rate": 0.0002, "epoch": 0.3497307001795332, "step": 4870}, {"loss": 0.7642, "grad_norm": 0.5478541254997253, "learning_rate": 0.0002, "epoch": 0.3504488330341113, "step": 4880}, {"loss": 0.7558, "grad_norm": 0.5725359916687012, "learning_rate": 0.0002, "epoch": 0.3511669658886894, "step": 4890}, {"loss": 0.8208, "grad_norm": 0.6141043901443481, "learning_rate": 0.0002, "epoch": 0.3518850987432675, "step": 4900}, {"loss": 0.841, "grad_norm": 0.597191572189331, "learning_rate": 0.0002, "epoch": 0.3526032315978456, "step": 4910}, {"loss": 0.8234, "grad_norm": 0.5988389253616333, "learning_rate": 0.0002, "epoch": 0.3533213644524237, "step": 4920}, {"loss": 0.7775, "grad_norm": 0.5503361821174622, "learning_rate": 0.0002, "epoch": 0.3540394973070018, "step": 4930}, {"loss": 0.8315, "grad_norm": 0.5932779312133789, "learning_rate": 0.0002, "epoch": 0.3547576301615799, "step": 4940}, {"loss": 0.8407, "grad_norm": 0.48911359906196594, "learning_rate": 0.0002, "epoch": 0.35547576301615796, "step": 4950}, {"loss": 0.8191, "grad_norm": 0.5435750484466553, "learning_rate": 0.0002, "epoch": 0.3561938958707361, "step": 4960}, {"loss": 0.7551, "grad_norm": 0.4786977767944336, "learning_rate": 0.0002, "epoch": 0.3569120287253142, "step": 4970}, {"loss": 0.7845, "grad_norm": 0.4022316336631775, "learning_rate": 0.0002, "epoch": 0.3576301615798923, "step": 4980}, {"loss": 0.8032, "grad_norm": 0.4848504364490509, "learning_rate": 0.0002, "epoch": 0.35834829443447036, "step": 4990}, {"loss": 0.809, "grad_norm": 0.5093459486961365, "learning_rate": 0.0002, "epoch": 0.3590664272890485, "step": 5000}, {"loss": 0.8424, "grad_norm": 0.47368478775024414, "learning_rate": 0.0002, "epoch": 0.3597845601436266, "step": 5010}, {"loss": 0.811, "grad_norm": 0.6041097044944763, "learning_rate": 0.0002, "epoch": 0.36050269299820464, "step": 5020}, {"loss": 0.8023, "grad_norm": 0.5384424924850464, "learning_rate": 0.0002, "epoch": 0.36122082585278276, "step": 5030}, {"loss": 0.826, "grad_norm": 0.4668518602848053, "learning_rate": 0.0002, "epoch": 0.36193895870736087, "step": 5040}, {"loss": 0.7785, "grad_norm": 0.5471060276031494, "learning_rate": 0.0002, "epoch": 0.362657091561939, "step": 5050}, {"loss": 0.7511, "grad_norm": 0.731369137763977, "learning_rate": 0.0002, "epoch": 0.36337522441651704, "step": 5060}, {"loss": 0.8646, "grad_norm": 0.5119590759277344, "learning_rate": 0.0002, "epoch": 0.36409335727109515, "step": 5070}, {"loss": 0.8125, "grad_norm": 0.567428469657898, "learning_rate": 0.0002, "epoch": 0.36481149012567327, "step": 5080}, {"loss": 0.7616, "grad_norm": 0.5139971375465393, "learning_rate": 0.0002, "epoch": 0.3655296229802513, "step": 5090}, {"loss": 0.8091, "grad_norm": 0.5701581835746765, "learning_rate": 0.0002, "epoch": 0.36624775583482944, "step": 5100}, {"loss": 0.821, "grad_norm": 0.5022063851356506, "learning_rate": 0.0002, "epoch": 0.36696588868940755, "step": 5110}, {"loss": 0.7879, "grad_norm": 0.4684354364871979, "learning_rate": 0.0002, "epoch": 0.36768402154398566, "step": 5120}, {"loss": 0.8028, "grad_norm": 0.5423495769500732, "learning_rate": 0.0002, "epoch": 0.3684021543985637, "step": 5130}, {"loss": 0.7763, "grad_norm": 0.46262967586517334, "learning_rate": 0.0002, "epoch": 0.36912028725314183, "step": 5140}, {"loss": 0.8485, "grad_norm": 0.4720141589641571, "learning_rate": 0.0002, "epoch": 0.36983842010771995, "step": 5150}, {"loss": 0.7778, "grad_norm": 0.5113096833229065, "learning_rate": 0.0002, "epoch": 0.370556552962298, "step": 5160}, {"loss": 0.7854, "grad_norm": 0.5253350138664246, "learning_rate": 0.0002, "epoch": 0.3712746858168761, "step": 5170}, {"loss": 0.8539, "grad_norm": 0.5799776315689087, "learning_rate": 0.0002, "epoch": 0.37199281867145423, "step": 5180}, {"loss": 0.78, "grad_norm": 0.5166001319885254, "learning_rate": 0.0002, "epoch": 0.37271095152603234, "step": 5190}, {"loss": 0.7939, "grad_norm": 0.5658290386199951, "learning_rate": 0.0002, "epoch": 0.3734290843806104, "step": 5200}, {"loss": 0.8059, "grad_norm": 0.45811113715171814, "learning_rate": 0.0002, "epoch": 0.3741472172351885, "step": 5210}, {"loss": 0.8024, "grad_norm": 0.5509489178657532, "learning_rate": 0.0002, "epoch": 0.3748653500897666, "step": 5220}, {"loss": 0.7537, "grad_norm": 0.47473257780075073, "learning_rate": 0.0002, "epoch": 0.3755834829443447, "step": 5230}, {"loss": 0.8159, "grad_norm": 0.3858596086502075, "learning_rate": 0.0002, "epoch": 0.3763016157989228, "step": 5240}, {"loss": 0.8592, "grad_norm": 0.6941536068916321, "learning_rate": 0.0002, "epoch": 0.3770197486535009, "step": 5250}, {"loss": 0.8489, "grad_norm": 0.46940872073173523, "learning_rate": 0.0002, "epoch": 0.377737881508079, "step": 5260}, {"loss": 0.7818, "grad_norm": 0.5413833260536194, "learning_rate": 0.0002, "epoch": 0.3784560143626571, "step": 5270}, {"loss": 0.8202, "grad_norm": 0.5165658593177795, "learning_rate": 0.0002, "epoch": 0.3791741472172352, "step": 5280}, {"loss": 0.7837, "grad_norm": 0.6567398309707642, "learning_rate": 0.0002, "epoch": 0.3798922800718133, "step": 5290}, {"loss": 0.7991, "grad_norm": 0.5466915965080261, "learning_rate": 0.0002, "epoch": 0.38061041292639136, "step": 5300}, {"loss": 0.7683, "grad_norm": 0.4800598621368408, "learning_rate": 0.0002, "epoch": 0.3813285457809695, "step": 5310}, {"loss": 0.8653, "grad_norm": 0.4551742970943451, "learning_rate": 0.0002, "epoch": 0.3820466786355476, "step": 5320}, {"loss": 0.8283, "grad_norm": 0.5561164617538452, "learning_rate": 0.0002, "epoch": 0.3827648114901257, "step": 5330}, {"loss": 0.8192, "grad_norm": 0.6170380115509033, "learning_rate": 0.0002, "epoch": 0.38348294434470376, "step": 5340}, {"loss": 0.8015, "grad_norm": 0.465762197971344, "learning_rate": 0.0002, "epoch": 0.38420107719928187, "step": 5350}, {"loss": 0.7561, "grad_norm": 0.6176838874816895, "learning_rate": 0.0002, "epoch": 0.38491921005386, "step": 5360}, {"loss": 0.7571, "grad_norm": 0.657926082611084, "learning_rate": 0.0002, "epoch": 0.38563734290843804, "step": 5370}, {"loss": 0.7366, "grad_norm": 0.5063281655311584, "learning_rate": 0.0002, "epoch": 0.38635547576301615, "step": 5380}, {"loss": 0.8259, "grad_norm": 0.6960828304290771, "learning_rate": 0.0002, "epoch": 0.38707360861759427, "step": 5390}, {"loss": 0.8058, "grad_norm": 0.46712034940719604, "learning_rate": 0.0002, "epoch": 0.3877917414721723, "step": 5400}, {"loss": 0.7674, "grad_norm": 0.598114013671875, "learning_rate": 0.0002, "epoch": 0.38850987432675044, "step": 5410}, {"loss": 0.8256, "grad_norm": 0.6798132061958313, "learning_rate": 0.0002, "epoch": 0.38922800718132855, "step": 5420}, {"loss": 0.844, "grad_norm": 0.5194289088249207, "learning_rate": 0.0002, "epoch": 0.38994614003590666, "step": 5430}, {"loss": 0.7666, "grad_norm": 0.48175323009490967, "learning_rate": 0.0002, "epoch": 0.3906642728904847, "step": 5440}, {"loss": 0.8089, "grad_norm": 0.4979408085346222, "learning_rate": 0.0002, "epoch": 0.39138240574506283, "step": 5450}, {"loss": 0.7938, "grad_norm": 0.6440972685813904, "learning_rate": 0.0002, "epoch": 0.39210053859964095, "step": 5460}, {"loss": 0.8531, "grad_norm": 0.5977227091789246, "learning_rate": 0.0002, "epoch": 0.392818671454219, "step": 5470}, {"loss": 0.8384, "grad_norm": 0.4735909104347229, "learning_rate": 0.0002, "epoch": 0.3935368043087971, "step": 5480}, {"loss": 0.8579, "grad_norm": 0.48181721568107605, "learning_rate": 0.0002, "epoch": 0.39425493716337523, "step": 5490}, {"loss": 0.8113, "grad_norm": 0.6339454650878906, "learning_rate": 0.0002, "epoch": 0.39497307001795334, "step": 5500}, {"loss": 0.7682, "grad_norm": 0.5364336371421814, "learning_rate": 0.0002, "epoch": 0.3956912028725314, "step": 5510}, {"loss": 0.8198, "grad_norm": 0.5499233603477478, "learning_rate": 0.0002, "epoch": 0.3964093357271095, "step": 5520}, {"loss": 0.7981, "grad_norm": 0.47249847650527954, "learning_rate": 0.0002, "epoch": 0.3971274685816876, "step": 5530}, {"loss": 0.8207, "grad_norm": 0.5692135095596313, "learning_rate": 0.0002, "epoch": 0.3978456014362657, "step": 5540}, {"loss": 0.8173, "grad_norm": 0.6009272933006287, "learning_rate": 0.0002, "epoch": 0.3985637342908438, "step": 5550}, {"loss": 0.7622, "grad_norm": 0.5198255181312561, "learning_rate": 0.0002, "epoch": 0.3992818671454219, "step": 5560}, {"loss": 0.8597, "grad_norm": 0.5474766492843628, "learning_rate": 0.0002, "epoch": 0.4, "step": 5570}, {"loss": 0.841, "grad_norm": 0.5577479600906372, "learning_rate": 0.0002, "epoch": 0.4007181328545781, "step": 5580}, {"loss": 0.7986, "grad_norm": 0.5350302457809448, "learning_rate": 0.0002, "epoch": 0.4014362657091562, "step": 5590}, {"loss": 0.7892, "grad_norm": 0.6310991048812866, "learning_rate": 0.0002, "epoch": 0.4021543985637343, "step": 5600}, {"loss": 0.7834, "grad_norm": 0.5695762038230896, "learning_rate": 0.0002, "epoch": 0.40287253141831236, "step": 5610}, {"loss": 0.7508, "grad_norm": 0.5431827306747437, "learning_rate": 0.0002, "epoch": 0.4035906642728905, "step": 5620}, {"loss": 0.8743, "grad_norm": 0.4923325777053833, "learning_rate": 0.0002, "epoch": 0.4043087971274686, "step": 5630}, {"loss": 0.7745, "grad_norm": 0.531399667263031, "learning_rate": 0.0002, "epoch": 0.4050269299820467, "step": 5640}, {"loss": 0.7982, "grad_norm": 0.5854769349098206, "learning_rate": 0.0002, "epoch": 0.40574506283662476, "step": 5650}, {"loss": 0.8225, "grad_norm": 0.6684802174568176, "learning_rate": 0.0002, "epoch": 0.40646319569120287, "step": 5660}, {"loss": 0.7405, "grad_norm": 0.6618620753288269, "learning_rate": 0.0002, "epoch": 0.407181328545781, "step": 5670}, {"loss": 0.7707, "grad_norm": 0.4930776059627533, "learning_rate": 0.0002, "epoch": 0.40789946140035904, "step": 5680}, {"loss": 0.7846, "grad_norm": 0.506628155708313, "learning_rate": 0.0002, "epoch": 0.40861759425493716, "step": 5690}, {"loss": 0.7827, "grad_norm": 0.5250783562660217, "learning_rate": 0.0002, "epoch": 0.40933572710951527, "step": 5700}, {"loss": 0.8386, "grad_norm": 0.6773046851158142, "learning_rate": 0.0002, "epoch": 0.4100538599640934, "step": 5710}, {"loss": 0.8096, "grad_norm": 0.6750592589378357, "learning_rate": 0.0002, "epoch": 0.41077199281867144, "step": 5720}, {"loss": 0.7873, "grad_norm": 0.5277232527732849, "learning_rate": 0.0002, "epoch": 0.41149012567324955, "step": 5730}, {"loss": 0.762, "grad_norm": 0.5155990719795227, "learning_rate": 0.0002, "epoch": 0.41220825852782766, "step": 5740}, {"loss": 0.871, "grad_norm": 0.5236294865608215, "learning_rate": 0.0002, "epoch": 0.4129263913824057, "step": 5750}, {"loss": 0.7753, "grad_norm": 0.5073592066764832, "learning_rate": 0.0002, "epoch": 0.41364452423698383, "step": 5760}, {"loss": 0.7984, "grad_norm": 0.6997184753417969, "learning_rate": 0.0002, "epoch": 0.41436265709156195, "step": 5770}, {"loss": 0.7579, "grad_norm": 0.5282439589500427, "learning_rate": 0.0002, "epoch": 0.41508078994614006, "step": 5780}, {"loss": 0.7831, "grad_norm": 0.4997355341911316, "learning_rate": 0.0002, "epoch": 0.4157989228007181, "step": 5790}, {"loss": 0.8022, "grad_norm": 0.6081610321998596, "learning_rate": 0.0002, "epoch": 0.41651705565529623, "step": 5800}, {"loss": 0.8068, "grad_norm": 0.5640295147895813, "learning_rate": 0.0002, "epoch": 0.41723518850987434, "step": 5810}, {"loss": 0.7819, "grad_norm": 0.6443586349487305, "learning_rate": 0.0002, "epoch": 0.4179533213644524, "step": 5820}, {"loss": 0.8132, "grad_norm": 0.6456229090690613, "learning_rate": 0.0002, "epoch": 0.4186714542190305, "step": 5830}, {"loss": 0.785, "grad_norm": 0.5422267317771912, "learning_rate": 0.0002, "epoch": 0.4193895870736086, "step": 5840}, {"loss": 0.7962, "grad_norm": 0.45251885056495667, "learning_rate": 0.0002, "epoch": 0.42010771992818674, "step": 5850}, {"loss": 0.7945, "grad_norm": 0.781165599822998, "learning_rate": 0.0002, "epoch": 0.4208258527827648, "step": 5860}, {"loss": 0.8171, "grad_norm": 0.5359160900115967, "learning_rate": 0.0002, "epoch": 0.4215439856373429, "step": 5870}, {"loss": 0.8012, "grad_norm": 0.6201958656311035, "learning_rate": 0.0002, "epoch": 0.422262118491921, "step": 5880}, {"loss": 0.8363, "grad_norm": 0.5985850691795349, "learning_rate": 0.0002, "epoch": 0.4229802513464991, "step": 5890}, {"loss": 0.7842, "grad_norm": 0.5550961494445801, "learning_rate": 0.0002, "epoch": 0.4236983842010772, "step": 5900}, {"loss": 0.7717, "grad_norm": 0.6284893155097961, "learning_rate": 0.0002, "epoch": 0.4244165170556553, "step": 5910}, {"loss": 0.8165, "grad_norm": 0.6143685579299927, "learning_rate": 0.0002, "epoch": 0.4251346499102334, "step": 5920}, {"loss": 0.7986, "grad_norm": 0.5065329670906067, "learning_rate": 0.0002, "epoch": 0.4258527827648115, "step": 5930}, {"loss": 0.7883, "grad_norm": 0.7274345755577087, "learning_rate": 0.0002, "epoch": 0.4265709156193896, "step": 5940}, {"loss": 0.8126, "grad_norm": 0.606531023979187, "learning_rate": 0.0002, "epoch": 0.4272890484739677, "step": 5950}, {"loss": 0.7805, "grad_norm": 0.5983648300170898, "learning_rate": 0.0002, "epoch": 0.42800718132854576, "step": 5960}, {"loss": 0.8124, "grad_norm": 0.5546031594276428, "learning_rate": 0.0002, "epoch": 0.4287253141831239, "step": 5970}, {"loss": 0.8184, "grad_norm": 0.666868269443512, "learning_rate": 0.0002, "epoch": 0.429443447037702, "step": 5980}, {"loss": 0.8171, "grad_norm": 0.41438576579093933, "learning_rate": 0.0002, "epoch": 0.4301615798922801, "step": 5990}, {"loss": 0.8456, "grad_norm": 0.5012526512145996, "learning_rate": 0.0002, "epoch": 0.43087971274685816, "step": 6000}, {"loss": 0.7837, "grad_norm": 0.6071694493293762, "learning_rate": 0.0002, "epoch": 0.43159784560143627, "step": 6010}, {"loss": 0.8364, "grad_norm": 0.5538384914398193, "learning_rate": 0.0002, "epoch": 0.4323159784560144, "step": 6020}, {"loss": 0.7888, "grad_norm": 0.5798718929290771, "learning_rate": 0.0002, "epoch": 0.43303411131059244, "step": 6030}, {"loss": 0.8196, "grad_norm": 0.5442442893981934, "learning_rate": 0.0002, "epoch": 0.43375224416517055, "step": 6040}, {"loss": 0.8041, "grad_norm": 0.6895565390586853, "learning_rate": 0.0002, "epoch": 0.43447037701974867, "step": 6050}, {"loss": 0.8154, "grad_norm": 0.6498045325279236, "learning_rate": 0.0002, "epoch": 0.4351885098743267, "step": 6060}, {"loss": 0.782, "grad_norm": 0.5225510001182556, "learning_rate": 0.0002, "epoch": 0.43590664272890484, "step": 6070}, {"loss": 0.7809, "grad_norm": 0.6366992592811584, "learning_rate": 0.0002, "epoch": 0.43662477558348295, "step": 6080}, {"loss": 0.7715, "grad_norm": 0.47929027676582336, "learning_rate": 0.0002, "epoch": 0.43734290843806106, "step": 6090}, {"loss": 0.7481, "grad_norm": 0.5722405910491943, "learning_rate": 0.0002, "epoch": 0.4380610412926391, "step": 6100}, {"loss": 0.765, "grad_norm": 0.6008004546165466, "learning_rate": 0.0002, "epoch": 0.43877917414721723, "step": 6110}, {"loss": 0.7795, "grad_norm": 0.5922580361366272, "learning_rate": 0.0002, "epoch": 0.43949730700179535, "step": 6120}, {"loss": 0.8542, "grad_norm": 0.7051905393600464, "learning_rate": 0.0002, "epoch": 0.4402154398563734, "step": 6130}, {"loss": 0.8159, "grad_norm": 0.5146450400352478, "learning_rate": 0.0002, "epoch": 0.4409335727109515, "step": 6140}, {"loss": 0.8178, "grad_norm": 0.5605781674385071, "learning_rate": 0.0002, "epoch": 0.44165170556552963, "step": 6150}, {"loss": 0.8409, "grad_norm": 0.8008661866188049, "learning_rate": 0.0002, "epoch": 0.44236983842010774, "step": 6160}, {"loss": 0.797, "grad_norm": 0.47406497597694397, "learning_rate": 0.0002, "epoch": 0.4430879712746858, "step": 6170}, {"loss": 0.7853, "grad_norm": 0.612287700176239, "learning_rate": 0.0002, "epoch": 0.4438061041292639, "step": 6180}, {"loss": 0.835, "grad_norm": 0.561188280582428, "learning_rate": 0.0002, "epoch": 0.444524236983842, "step": 6190}, {"loss": 0.7604, "grad_norm": 0.6233669519424438, "learning_rate": 0.0002, "epoch": 0.4452423698384201, "step": 6200}, {"loss": 0.7539, "grad_norm": 0.45546263456344604, "learning_rate": 0.0002, "epoch": 0.4459605026929982, "step": 6210}, {"loss": 0.8183, "grad_norm": 0.5947871208190918, "learning_rate": 0.0002, "epoch": 0.4466786355475763, "step": 6220}, {"loss": 0.789, "grad_norm": 0.6109753847122192, "learning_rate": 0.0002, "epoch": 0.4473967684021544, "step": 6230}, {"loss": 0.7811, "grad_norm": 0.6380727887153625, "learning_rate": 0.0002, "epoch": 0.4481149012567325, "step": 6240}, {"loss": 0.7845, "grad_norm": 0.5225699543952942, "learning_rate": 0.0002, "epoch": 0.4488330341113106, "step": 6250}, {"loss": 0.8217, "grad_norm": 0.521503210067749, "learning_rate": 0.0002, "epoch": 0.4495511669658887, "step": 6260}, {"loss": 0.8392, "grad_norm": 0.5523216128349304, "learning_rate": 0.0002, "epoch": 0.45026929982046676, "step": 6270}, {"loss": 0.8228, "grad_norm": 0.5954921841621399, "learning_rate": 0.0002, "epoch": 0.4509874326750449, "step": 6280}, {"loss": 0.7798, "grad_norm": 0.702751100063324, "learning_rate": 0.0002, "epoch": 0.451705565529623, "step": 6290}, {"loss": 0.7865, "grad_norm": 0.5756356120109558, "learning_rate": 0.0002, "epoch": 0.4524236983842011, "step": 6300}, {"loss": 0.8128, "grad_norm": 0.45365944504737854, "learning_rate": 0.0002, "epoch": 0.45314183123877916, "step": 6310}, {"loss": 0.8027, "grad_norm": 0.5027855038642883, "learning_rate": 0.0002, "epoch": 0.45385996409335727, "step": 6320}, {"loss": 0.8052, "grad_norm": 0.6551687121391296, "learning_rate": 0.0002, "epoch": 0.4545780969479354, "step": 6330}, {"loss": 0.7507, "grad_norm": 0.5296684503555298, "learning_rate": 0.0002, "epoch": 0.45529622980251344, "step": 6340}, {"loss": 0.8209, "grad_norm": 0.5762032866477966, "learning_rate": 0.0002, "epoch": 0.45601436265709155, "step": 6350}, {"loss": 0.8209, "grad_norm": 0.5234073996543884, "learning_rate": 0.0002, "epoch": 0.45673249551166967, "step": 6360}, {"loss": 0.8412, "grad_norm": 0.5090946555137634, "learning_rate": 0.0002, "epoch": 0.4574506283662478, "step": 6370}, {"loss": 0.787, "grad_norm": 0.6515111327171326, "learning_rate": 0.0002, "epoch": 0.45816876122082584, "step": 6380}, {"loss": 0.7351, "grad_norm": 0.7904898524284363, "learning_rate": 0.0002, "epoch": 0.45888689407540395, "step": 6390}, {"loss": 0.841, "grad_norm": 0.6379680037498474, "learning_rate": 0.0002, "epoch": 0.45960502692998206, "step": 6400}, {"loss": 0.7727, "grad_norm": 0.641759991645813, "learning_rate": 0.0002, "epoch": 0.4603231597845601, "step": 6410}, {"loss": 0.8346, "grad_norm": 0.5273829698562622, "learning_rate": 0.0002, "epoch": 0.46104129263913823, "step": 6420}, {"loss": 0.7722, "grad_norm": 0.5668497681617737, "learning_rate": 0.0002, "epoch": 0.46175942549371635, "step": 6430}, {"loss": 0.8157, "grad_norm": 0.5862061381340027, "learning_rate": 0.0002, "epoch": 0.46247755834829446, "step": 6440}, {"loss": 0.818, "grad_norm": 0.5239592790603638, "learning_rate": 0.0002, "epoch": 0.4631956912028725, "step": 6450}, {"loss": 0.7803, "grad_norm": 0.5078722834587097, "learning_rate": 0.0002, "epoch": 0.46391382405745063, "step": 6460}, {"loss": 0.7934, "grad_norm": 0.566509485244751, "learning_rate": 0.0002, "epoch": 0.46463195691202874, "step": 6470}, {"loss": 0.7746, "grad_norm": 0.5952697396278381, "learning_rate": 0.0002, "epoch": 0.4653500897666068, "step": 6480}, {"loss": 0.8088, "grad_norm": 0.6548156142234802, "learning_rate": 0.0002, "epoch": 0.4660682226211849, "step": 6490}, {"loss": 0.8303, "grad_norm": 0.4768427908420563, "learning_rate": 0.0002, "epoch": 0.466786355475763, "step": 6500}, {"loss": 0.805, "grad_norm": 0.5588273406028748, "learning_rate": 0.0002, "epoch": 0.46750448833034114, "step": 6510}, {"loss": 0.7774, "grad_norm": 0.5348677039146423, "learning_rate": 0.0002, "epoch": 0.4682226211849192, "step": 6520}, {"loss": 0.7969, "grad_norm": 0.4784318804740906, "learning_rate": 0.0002, "epoch": 0.4689407540394973, "step": 6530}, {"loss": 0.8073, "grad_norm": 0.5112265944480896, "learning_rate": 0.0002, "epoch": 0.4696588868940754, "step": 6540}, {"loss": 0.8289, "grad_norm": 0.7250495553016663, "learning_rate": 0.0002, "epoch": 0.4703770197486535, "step": 6550}, {"loss": 0.808, "grad_norm": 0.538608968257904, "learning_rate": 0.0002, "epoch": 0.4710951526032316, "step": 6560}, {"loss": 0.7977, "grad_norm": 0.5981247425079346, "learning_rate": 0.0002, "epoch": 0.4718132854578097, "step": 6570}, {"loss": 0.8092, "grad_norm": 0.5466762781143188, "learning_rate": 0.0002, "epoch": 0.4725314183123878, "step": 6580}, {"loss": 0.8136, "grad_norm": 0.5609987378120422, "learning_rate": 0.0002, "epoch": 0.4732495511669659, "step": 6590}, {"loss": 0.8575, "grad_norm": 0.6091027855873108, "learning_rate": 0.0002, "epoch": 0.473967684021544, "step": 6600}, {"loss": 0.7741, "grad_norm": 0.5542886853218079, "learning_rate": 0.0002, "epoch": 0.4746858168761221, "step": 6610}, {"loss": 0.7867, "grad_norm": 0.5656579732894897, "learning_rate": 0.0002, "epoch": 0.47540394973070016, "step": 6620}, {"loss": 0.7647, "grad_norm": 0.47507357597351074, "learning_rate": 0.0002, "epoch": 0.4761220825852783, "step": 6630}, {"loss": 0.8323, "grad_norm": 0.6039174199104309, "learning_rate": 0.0002, "epoch": 0.4768402154398564, "step": 6640}, {"loss": 0.7812, "grad_norm": 0.7129740715026855, "learning_rate": 0.0002, "epoch": 0.47755834829443444, "step": 6650}, {"loss": 0.8001, "grad_norm": 0.5189188718795776, "learning_rate": 0.0002, "epoch": 0.47827648114901256, "step": 6660}, {"loss": 0.7467, "grad_norm": 0.7548696398735046, "learning_rate": 0.0002, "epoch": 0.47899461400359067, "step": 6670}, {"loss": 0.7694, "grad_norm": 0.4729466438293457, "learning_rate": 0.0002, "epoch": 0.4797127468581688, "step": 6680}, {"loss": 0.7497, "grad_norm": 0.6190000772476196, "learning_rate": 0.0002, "epoch": 0.48043087971274684, "step": 6690}, {"loss": 0.7691, "grad_norm": 0.6276983022689819, "learning_rate": 0.0002, "epoch": 0.48114901256732495, "step": 6700}, {"loss": 0.7947, "grad_norm": 0.6097590923309326, "learning_rate": 0.0002, "epoch": 0.48186714542190306, "step": 6710}, {"loss": 0.7735, "grad_norm": 0.6507330536842346, "learning_rate": 0.0002, "epoch": 0.4825852782764811, "step": 6720}, {"loss": 0.817, "grad_norm": 0.5501991510391235, "learning_rate": 0.0002, "epoch": 0.48330341113105924, "step": 6730}, {"loss": 0.7998, "grad_norm": 0.5928015112876892, "learning_rate": 0.0002, "epoch": 0.48402154398563735, "step": 6740}, {"loss": 0.7717, "grad_norm": 0.5523008704185486, "learning_rate": 0.0002, "epoch": 0.48473967684021546, "step": 6750}, {"loss": 0.7821, "grad_norm": 0.5997263789176941, "learning_rate": 0.0002, "epoch": 0.4854578096947935, "step": 6760}, {"loss": 0.7619, "grad_norm": 0.6201002597808838, "learning_rate": 0.0002, "epoch": 0.48617594254937163, "step": 6770}, {"loss": 0.8018, "grad_norm": 0.6338862776756287, "learning_rate": 0.0002, "epoch": 0.48689407540394974, "step": 6780}, {"loss": 0.7547, "grad_norm": 0.5542550086975098, "learning_rate": 0.0002, "epoch": 0.4876122082585278, "step": 6790}, {"loss": 0.7754, "grad_norm": 0.5587872862815857, "learning_rate": 0.0002, "epoch": 0.4883303411131059, "step": 6800}, {"loss": 0.7913, "grad_norm": 0.5895681977272034, "learning_rate": 0.0002, "epoch": 0.489048473967684, "step": 6810}, {"loss": 0.7799, "grad_norm": 0.4948221743106842, "learning_rate": 0.0002, "epoch": 0.48976660682226214, "step": 6820}, {"loss": 0.8057, "grad_norm": 0.44546931982040405, "learning_rate": 0.0002, "epoch": 0.4904847396768402, "step": 6830}, {"loss": 0.8124, "grad_norm": 0.632046103477478, "learning_rate": 0.0002, "epoch": 0.4912028725314183, "step": 6840}, {"loss": 0.8014, "grad_norm": 0.49396243691444397, "learning_rate": 0.0002, "epoch": 0.4919210053859964, "step": 6850}, {"loss": 0.7127, "grad_norm": 0.497745156288147, "learning_rate": 0.0002, "epoch": 0.4926391382405745, "step": 6860}, {"loss": 0.8306, "grad_norm": 0.7336170077323914, "learning_rate": 0.0002, "epoch": 0.4933572710951526, "step": 6870}, {"loss": 0.8342, "grad_norm": 0.6723181009292603, "learning_rate": 0.0002, "epoch": 0.4940754039497307, "step": 6880}, {"loss": 0.8251, "grad_norm": 0.5887754559516907, "learning_rate": 0.0002, "epoch": 0.4947935368043088, "step": 6890}, {"loss": 0.7904, "grad_norm": 0.6580226421356201, "learning_rate": 0.0002, "epoch": 0.4955116696588869, "step": 6900}, {"loss": 0.8203, "grad_norm": 0.7385056614875793, "learning_rate": 0.0002, "epoch": 0.496229802513465, "step": 6910}, {"loss": 0.87, "grad_norm": 0.48736000061035156, "learning_rate": 0.0002, "epoch": 0.4969479353680431, "step": 6920}, {"loss": 0.8045, "grad_norm": 0.6304559111595154, "learning_rate": 0.0002, "epoch": 0.49766606822262116, "step": 6930}, {"loss": 0.8323, "grad_norm": 0.607148289680481, "learning_rate": 0.0002, "epoch": 0.4983842010771993, "step": 6940}, {"loss": 0.8277, "grad_norm": 0.5467981696128845, "learning_rate": 0.0002, "epoch": 0.4991023339317774, "step": 6950}, {"loss": 0.804, "grad_norm": 0.7046723961830139, "learning_rate": 0.0002, "epoch": 0.4998204667863555, "step": 6960}, {"loss": 0.7836, "grad_norm": 0.5487921833992004, "learning_rate": 0.0002, "epoch": 0.5005385996409336, "step": 6970}, {"loss": 0.8445, "grad_norm": 0.5706006288528442, "learning_rate": 0.0002, "epoch": 0.5012567324955116, "step": 6980}, {"loss": 0.8216, "grad_norm": 0.539536714553833, "learning_rate": 0.0002, "epoch": 0.5019748653500897, "step": 6990}, {"loss": 0.7829, "grad_norm": 0.5527397394180298, "learning_rate": 0.0002, "epoch": 0.5026929982046678, "step": 7000}, {"loss": 0.8342, "grad_norm": 0.5498567223548889, "learning_rate": 0.0002, "epoch": 0.503411131059246, "step": 7010}, {"loss": 0.8073, "grad_norm": 0.5878575444221497, "learning_rate": 0.0002, "epoch": 0.5041292639138241, "step": 7020}, {"loss": 0.8284, "grad_norm": 0.646153450012207, "learning_rate": 0.0002, "epoch": 0.5048473967684022, "step": 7030}, {"loss": 0.7758, "grad_norm": 0.5603899359703064, "learning_rate": 0.0002, "epoch": 0.5055655296229803, "step": 7040}, {"loss": 0.8002, "grad_norm": 0.5849952697753906, "learning_rate": 0.0002, "epoch": 0.5062836624775583, "step": 7050}, {"loss": 0.7953, "grad_norm": 0.6082724928855896, "learning_rate": 0.0002, "epoch": 0.5070017953321364, "step": 7060}, {"loss": 0.8046, "grad_norm": 0.5900670289993286, "learning_rate": 0.0002, "epoch": 0.5077199281867145, "step": 7070}, {"loss": 0.8612, "grad_norm": 0.5856624841690063, "learning_rate": 0.0002, "epoch": 0.5084380610412926, "step": 7080}, {"loss": 0.8289, "grad_norm": 0.6177338361740112, "learning_rate": 0.0002, "epoch": 0.5091561938958707, "step": 7090}, {"loss": 0.8139, "grad_norm": 0.5559300184249878, "learning_rate": 0.0002, "epoch": 0.5098743267504489, "step": 7100}, {"loss": 0.8083, "grad_norm": 0.62027907371521, "learning_rate": 0.0002, "epoch": 0.510592459605027, "step": 7110}, {"loss": 0.8037, "grad_norm": 0.6334301829338074, "learning_rate": 0.0002, "epoch": 0.511310592459605, "step": 7120}, {"loss": 0.8107, "grad_norm": 0.513795018196106, "learning_rate": 0.0002, "epoch": 0.5120287253141831, "step": 7130}, {"loss": 0.7566, "grad_norm": 0.7004675269126892, "learning_rate": 0.0002, "epoch": 0.5127468581687612, "step": 7140}, {"loss": 0.7893, "grad_norm": 0.5614308714866638, "learning_rate": 0.0002, "epoch": 0.5134649910233393, "step": 7150}, {"loss": 0.7868, "grad_norm": 0.5037539601325989, "learning_rate": 0.0002, "epoch": 0.5141831238779174, "step": 7160}, {"loss": 0.7981, "grad_norm": 0.5568661093711853, "learning_rate": 0.0002, "epoch": 0.5149012567324955, "step": 7170}, {"loss": 0.8333, "grad_norm": 0.7513397336006165, "learning_rate": 0.0002, "epoch": 0.5156193895870737, "step": 7180}, {"loss": 0.792, "grad_norm": 0.7264583706855774, "learning_rate": 0.0002, "epoch": 0.5163375224416517, "step": 7190}, {"loss": 0.8671, "grad_norm": 0.6355819702148438, "learning_rate": 0.0002, "epoch": 0.5170556552962298, "step": 7200}, {"loss": 0.7734, "grad_norm": 0.6063222289085388, "learning_rate": 0.0002, "epoch": 0.5177737881508079, "step": 7210}, {"loss": 0.812, "grad_norm": 0.6484307646751404, "learning_rate": 0.0002, "epoch": 0.518491921005386, "step": 7220}, {"loss": 0.7852, "grad_norm": 0.5260455012321472, "learning_rate": 0.0002, "epoch": 0.5192100538599641, "step": 7230}, {"loss": 0.8301, "grad_norm": 0.6718002557754517, "learning_rate": 0.0002, "epoch": 0.5199281867145422, "step": 7240}, {"loss": 0.8178, "grad_norm": 0.5997617244720459, "learning_rate": 0.0002, "epoch": 0.5206463195691203, "step": 7250}, {"loss": 0.7631, "grad_norm": 0.5838589668273926, "learning_rate": 0.0002, "epoch": 0.5213644524236983, "step": 7260}, {"loss": 0.7853, "grad_norm": 0.5755977630615234, "learning_rate": 0.0002, "epoch": 0.5220825852782764, "step": 7270}, {"loss": 0.8233, "grad_norm": 0.6442093253135681, "learning_rate": 0.0002, "epoch": 0.5228007181328546, "step": 7280}, {"loss": 0.822, "grad_norm": 0.6128416657447815, "learning_rate": 0.0002, "epoch": 0.5235188509874327, "step": 7290}, {"loss": 0.802, "grad_norm": 0.509742796421051, "learning_rate": 0.0002, "epoch": 0.5242369838420108, "step": 7300}, {"loss": 0.7438, "grad_norm": 0.5450230836868286, "learning_rate": 0.0002, "epoch": 0.5249551166965889, "step": 7310}, {"loss": 0.7881, "grad_norm": 0.5437141060829163, "learning_rate": 0.0002, "epoch": 0.525673249551167, "step": 7320}, {"loss": 0.795, "grad_norm": 0.5291738510131836, "learning_rate": 0.0002, "epoch": 0.526391382405745, "step": 7330}, {"loss": 0.8204, "grad_norm": 0.5101743936538696, "learning_rate": 0.0002, "epoch": 0.5271095152603231, "step": 7340}, {"loss": 0.856, "grad_norm": 0.5678408145904541, "learning_rate": 0.0002, "epoch": 0.5278276481149012, "step": 7350}, {"loss": 0.8435, "grad_norm": 0.6332360506057739, "learning_rate": 0.0002, "epoch": 0.5285457809694794, "step": 7360}, {"loss": 0.8521, "grad_norm": 0.4935058653354645, "learning_rate": 0.0002, "epoch": 0.5292639138240575, "step": 7370}, {"loss": 0.7699, "grad_norm": 0.6399656534194946, "learning_rate": 0.0002, "epoch": 0.5299820466786356, "step": 7380}, {"loss": 0.7956, "grad_norm": 0.5986794233322144, "learning_rate": 0.0002, "epoch": 0.5307001795332137, "step": 7390}, {"loss": 0.774, "grad_norm": 0.6948414444923401, "learning_rate": 0.0002, "epoch": 0.5314183123877917, "step": 7400}, {"loss": 0.8267, "grad_norm": 0.5337842106819153, "learning_rate": 0.0002, "epoch": 0.5321364452423698, "step": 7410}, {"loss": 0.7634, "grad_norm": 0.6897268295288086, "learning_rate": 0.0002, "epoch": 0.5328545780969479, "step": 7420}, {"loss": 0.7606, "grad_norm": 0.6361175179481506, "learning_rate": 0.0002, "epoch": 0.533572710951526, "step": 7430}, {"loss": 0.7592, "grad_norm": 0.5242252945899963, "learning_rate": 0.0002, "epoch": 0.5342908438061041, "step": 7440}, {"loss": 0.7387, "grad_norm": 0.5731322765350342, "learning_rate": 0.0002, "epoch": 0.5350089766606823, "step": 7450}, {"loss": 0.8215, "grad_norm": 0.5790955424308777, "learning_rate": 0.0002, "epoch": 0.5357271095152604, "step": 7460}, {"loss": 0.7714, "grad_norm": 0.4979061782360077, "learning_rate": 0.0002, "epoch": 0.5364452423698384, "step": 7470}, {"loss": 0.794, "grad_norm": 0.7335101962089539, "learning_rate": 0.0002, "epoch": 0.5371633752244165, "step": 7480}, {"loss": 0.787, "grad_norm": 0.592521071434021, "learning_rate": 0.0002, "epoch": 0.5378815080789946, "step": 7490}, {"loss": 0.7421, "grad_norm": 0.5784769654273987, "learning_rate": 0.0002, "epoch": 0.5385996409335727, "step": 7500}, {"loss": 0.789, "grad_norm": 0.8148589730262756, "learning_rate": 0.0002, "epoch": 0.5393177737881508, "step": 7510}, {"loss": 0.7777, "grad_norm": 0.5727689862251282, "learning_rate": 0.0002, "epoch": 0.5400359066427289, "step": 7520}, {"loss": 0.8321, "grad_norm": 0.6958279609680176, "learning_rate": 0.0002, "epoch": 0.540754039497307, "step": 7530}, {"loss": 0.7678, "grad_norm": 0.6302788257598877, "learning_rate": 0.0002, "epoch": 0.541472172351885, "step": 7540}, {"loss": 0.7772, "grad_norm": 0.5950970649719238, "learning_rate": 0.0002, "epoch": 0.5421903052064632, "step": 7550}, {"loss": 0.8076, "grad_norm": 0.4275270104408264, "learning_rate": 0.0002, "epoch": 0.5429084380610413, "step": 7560}, {"loss": 0.8158, "grad_norm": 0.7579900622367859, "learning_rate": 0.0002, "epoch": 0.5436265709156194, "step": 7570}, {"loss": 0.8036, "grad_norm": 0.5835317969322205, "learning_rate": 0.0002, "epoch": 0.5443447037701975, "step": 7580}, {"loss": 0.7947, "grad_norm": 0.5305142998695374, "learning_rate": 0.0002, "epoch": 0.5450628366247756, "step": 7590}, {"loss": 0.8043, "grad_norm": 0.6076129674911499, "learning_rate": 0.0002, "epoch": 0.5457809694793537, "step": 7600}, {"loss": 0.8197, "grad_norm": 0.5341935753822327, "learning_rate": 0.0002, "epoch": 0.5464991023339317, "step": 7610}, {"loss": 0.7424, "grad_norm": 0.6070826053619385, "learning_rate": 0.0002, "epoch": 0.5472172351885098, "step": 7620}, {"loss": 0.7801, "grad_norm": 0.6193035840988159, "learning_rate": 0.0002, "epoch": 0.547935368043088, "step": 7630}, {"loss": 0.7639, "grad_norm": 0.6171614527702332, "learning_rate": 0.0002, "epoch": 0.5486535008976661, "step": 7640}, {"loss": 0.7655, "grad_norm": 0.5700938105583191, "learning_rate": 0.0002, "epoch": 0.5493716337522442, "step": 7650}, {"loss": 0.8289, "grad_norm": 0.5742418169975281, "learning_rate": 0.0002, "epoch": 0.5500897666068223, "step": 7660}, {"loss": 0.7942, "grad_norm": 0.6450320482254028, "learning_rate": 0.0002, "epoch": 0.5508078994614004, "step": 7670}, {"loss": 0.807, "grad_norm": 0.542860209941864, "learning_rate": 0.0002, "epoch": 0.5515260323159784, "step": 7680}, {"loss": 0.8298, "grad_norm": 0.538007915019989, "learning_rate": 0.0002, "epoch": 0.5522441651705565, "step": 7690}, {"loss": 0.8301, "grad_norm": 0.5846288204193115, "learning_rate": 0.0002, "epoch": 0.5529622980251346, "step": 7700}, {"loss": 0.7893, "grad_norm": 0.623315155506134, "learning_rate": 0.0002, "epoch": 0.5536804308797127, "step": 7710}, {"loss": 0.8043, "grad_norm": 0.6607962250709534, "learning_rate": 0.0002, "epoch": 0.5543985637342909, "step": 7720}, {"loss": 0.7615, "grad_norm": 0.5258557200431824, "learning_rate": 0.0002, "epoch": 0.555116696588869, "step": 7730}, {"loss": 0.8177, "grad_norm": 0.6464316844940186, "learning_rate": 0.0002, "epoch": 0.5558348294434471, "step": 7740}, {"loss": 0.7683, "grad_norm": 0.6390621662139893, "learning_rate": 0.0002, "epoch": 0.5565529622980251, "step": 7750}, {"loss": 0.8447, "grad_norm": 0.5327560305595398, "learning_rate": 0.0002, "epoch": 0.5572710951526032, "step": 7760}, {"loss": 0.7833, "grad_norm": 0.8202064633369446, "learning_rate": 0.0002, "epoch": 0.5579892280071813, "step": 7770}, {"loss": 0.7818, "grad_norm": 0.45350968837738037, "learning_rate": 0.0002, "epoch": 0.5587073608617594, "step": 7780}, {"loss": 0.7299, "grad_norm": 0.5031413435935974, "learning_rate": 0.0002, "epoch": 0.5594254937163375, "step": 7790}, {"loss": 0.7542, "grad_norm": 0.5047417879104614, "learning_rate": 0.0002, "epoch": 0.5601436265709157, "step": 7800}, {"loss": 0.7989, "grad_norm": 0.668912410736084, "learning_rate": 0.0002, "epoch": 0.5608617594254938, "step": 7810}, {"loss": 0.8226, "grad_norm": 0.6106061339378357, "learning_rate": 0.0002, "epoch": 0.5615798922800718, "step": 7820}, {"loss": 0.7489, "grad_norm": 0.5558443665504456, "learning_rate": 0.0002, "epoch": 0.5622980251346499, "step": 7830}, {"loss": 0.79, "grad_norm": 0.5937177538871765, "learning_rate": 0.0002, "epoch": 0.563016157989228, "step": 7840}, {"loss": 0.7857, "grad_norm": 0.67307448387146, "learning_rate": 0.0002, "epoch": 0.5637342908438061, "step": 7850}, {"loss": 0.8037, "grad_norm": 0.4615475833415985, "learning_rate": 0.0002, "epoch": 0.5644524236983842, "step": 7860}, {"loss": 0.7519, "grad_norm": 0.5462577939033508, "learning_rate": 0.0002, "epoch": 0.5651705565529623, "step": 7870}, {"loss": 0.7821, "grad_norm": 0.6422402858734131, "learning_rate": 0.0002, "epoch": 0.5658886894075404, "step": 7880}, {"loss": 0.8327, "grad_norm": 0.5313532948493958, "learning_rate": 0.0002, "epoch": 0.5666068222621184, "step": 7890}, {"loss": 0.7771, "grad_norm": 0.5647847056388855, "learning_rate": 0.0002, "epoch": 0.5673249551166966, "step": 7900}, {"loss": 0.8126, "grad_norm": 0.6581610441207886, "learning_rate": 0.0002, "epoch": 0.5680430879712747, "step": 7910}, {"loss": 0.7549, "grad_norm": 0.46947669982910156, "learning_rate": 0.0002, "epoch": 0.5687612208258528, "step": 7920}, {"loss": 0.8333, "grad_norm": 0.6420038342475891, "learning_rate": 0.0002, "epoch": 0.5694793536804309, "step": 7930}, {"loss": 0.7921, "grad_norm": 0.6730441451072693, "learning_rate": 0.0002, "epoch": 0.570197486535009, "step": 7940}, {"loss": 0.7668, "grad_norm": 0.3849070966243744, "learning_rate": 0.0002, "epoch": 0.5709156193895871, "step": 7950}, {"loss": 0.8297, "grad_norm": 0.6076335906982422, "learning_rate": 0.0002, "epoch": 0.5716337522441651, "step": 7960}, {"loss": 0.7932, "grad_norm": 0.6446982026100159, "learning_rate": 0.0002, "epoch": 0.5723518850987432, "step": 7970}, {"loss": 0.7988, "grad_norm": 0.6019234657287598, "learning_rate": 0.0002, "epoch": 0.5730700179533214, "step": 7980}, {"loss": 0.8103, "grad_norm": 0.620880663394928, "learning_rate": 0.0002, "epoch": 0.5737881508078995, "step": 7990}, {"loss": 0.7712, "grad_norm": 0.4927573502063751, "learning_rate": 0.0002, "epoch": 0.5745062836624776, "step": 8000}, {"loss": 0.7499, "grad_norm": 0.6276804804801941, "learning_rate": 0.0002, "epoch": 0.5752244165170557, "step": 8010}, {"loss": 0.8232, "grad_norm": 0.484518826007843, "learning_rate": 0.0002, "epoch": 0.5759425493716338, "step": 8020}, {"loss": 0.7658, "grad_norm": 0.5019962787628174, "learning_rate": 0.0002, "epoch": 0.5766606822262118, "step": 8030}, {"loss": 0.7827, "grad_norm": 0.6685234308242798, "learning_rate": 0.0002, "epoch": 0.5773788150807899, "step": 8040}, {"loss": 0.7811, "grad_norm": 0.5762107372283936, "learning_rate": 0.0002, "epoch": 0.578096947935368, "step": 8050}, {"loss": 0.8256, "grad_norm": 0.6402477025985718, "learning_rate": 0.0002, "epoch": 0.5788150807899461, "step": 8060}, {"loss": 0.779, "grad_norm": 0.5919345617294312, "learning_rate": 0.0002, "epoch": 0.5795332136445243, "step": 8070}, {"loss": 0.8179, "grad_norm": 0.47100913524627686, "learning_rate": 0.0002, "epoch": 0.5802513464991024, "step": 8080}, {"loss": 0.7832, "grad_norm": 0.6029118895530701, "learning_rate": 0.0002, "epoch": 0.5809694793536805, "step": 8090}, {"loss": 0.8061, "grad_norm": 0.5896338820457458, "learning_rate": 0.0002, "epoch": 0.5816876122082585, "step": 8100}, {"loss": 0.7991, "grad_norm": 0.49017754197120667, "learning_rate": 0.0002, "epoch": 0.5824057450628366, "step": 8110}, {"loss": 0.8148, "grad_norm": 0.5049256086349487, "learning_rate": 0.0002, "epoch": 0.5831238779174147, "step": 8120}, {"loss": 0.7561, "grad_norm": 0.6874517798423767, "learning_rate": 0.0002, "epoch": 0.5838420107719928, "step": 8130}, {"loss": 0.7908, "grad_norm": 0.5429391264915466, "learning_rate": 0.0002, "epoch": 0.5845601436265709, "step": 8140}, {"loss": 0.7834, "grad_norm": 0.5533722639083862, "learning_rate": 0.0002, "epoch": 0.585278276481149, "step": 8150}, {"loss": 0.7725, "grad_norm": 0.5827956199645996, "learning_rate": 0.0002, "epoch": 0.5859964093357272, "step": 8160}, {"loss": 0.7758, "grad_norm": 0.6670212149620056, "learning_rate": 0.0002, "epoch": 0.5867145421903052, "step": 8170}, {"loss": 0.7625, "grad_norm": 0.5231172442436218, "learning_rate": 0.0002, "epoch": 0.5874326750448833, "step": 8180}, {"loss": 0.7975, "grad_norm": 0.567447304725647, "learning_rate": 0.0002, "epoch": 0.5881508078994614, "step": 8190}, {"loss": 0.7463, "grad_norm": 0.5318575501441956, "learning_rate": 0.0002, "epoch": 0.5888689407540395, "step": 8200}, {"loss": 0.7961, "grad_norm": 0.6959463357925415, "learning_rate": 0.0002, "epoch": 0.5895870736086176, "step": 8210}, {"loss": 0.7575, "grad_norm": 0.6964931488037109, "learning_rate": 0.0002, "epoch": 0.5903052064631957, "step": 8220}, {"loss": 0.8382, "grad_norm": 0.5164617896080017, "learning_rate": 0.0002, "epoch": 0.5910233393177737, "step": 8230}, {"loss": 0.8152, "grad_norm": 0.5456110239028931, "learning_rate": 0.0002, "epoch": 0.5917414721723518, "step": 8240}, {"loss": 0.7627, "grad_norm": 0.6553666591644287, "learning_rate": 0.0002, "epoch": 0.59245960502693, "step": 8250}, {"loss": 0.8134, "grad_norm": 0.6185845732688904, "learning_rate": 0.0002, "epoch": 0.5931777378815081, "step": 8260}, {"loss": 0.8216, "grad_norm": 0.6110545992851257, "learning_rate": 0.0002, "epoch": 0.5938958707360862, "step": 8270}, {"loss": 0.805, "grad_norm": 0.5186824202537537, "learning_rate": 0.0002, "epoch": 0.5946140035906643, "step": 8280}, {"loss": 0.7934, "grad_norm": 0.7003735303878784, "learning_rate": 0.0002, "epoch": 0.5953321364452424, "step": 8290}, {"loss": 0.8095, "grad_norm": 0.4606216549873352, "learning_rate": 0.0002, "epoch": 0.5960502692998204, "step": 8300}, {"loss": 0.8051, "grad_norm": 0.5903441309928894, "learning_rate": 0.0002, "epoch": 0.5967684021543985, "step": 8310}, {"loss": 0.7861, "grad_norm": 0.7916744947433472, "learning_rate": 0.0002, "epoch": 0.5974865350089766, "step": 8320}, {"loss": 0.8234, "grad_norm": 0.5506401062011719, "learning_rate": 0.0002, "epoch": 0.5982046678635548, "step": 8330}, {"loss": 0.8137, "grad_norm": 0.5749204158782959, "learning_rate": 0.0002, "epoch": 0.5989228007181329, "step": 8340}, {"loss": 0.8133, "grad_norm": 0.6807544827461243, "learning_rate": 0.0002, "epoch": 0.599640933572711, "step": 8350}, {"loss": 0.8089, "grad_norm": 0.5782986283302307, "learning_rate": 0.0002, "epoch": 0.6003590664272891, "step": 8360}, {"loss": 0.8725, "grad_norm": 0.7336342334747314, "learning_rate": 0.0002, "epoch": 0.6010771992818671, "step": 8370}, {"loss": 0.7992, "grad_norm": 0.5762712955474854, "learning_rate": 0.0002, "epoch": 0.6017953321364452, "step": 8380}, {"loss": 0.8037, "grad_norm": 0.5726776719093323, "learning_rate": 0.0002, "epoch": 0.6025134649910233, "step": 8390}, {"loss": 0.7918, "grad_norm": 0.5355535745620728, "learning_rate": 0.0002, "epoch": 0.6032315978456014, "step": 8400}, {"loss": 0.8138, "grad_norm": 0.6762161254882812, "learning_rate": 0.0002, "epoch": 0.6039497307001795, "step": 8410}, {"loss": 0.8357, "grad_norm": 0.8200717568397522, "learning_rate": 0.0002, "epoch": 0.6046678635547577, "step": 8420}, {"loss": 0.79, "grad_norm": 0.5600009560585022, "learning_rate": 0.0002, "epoch": 0.6053859964093358, "step": 8430}, {"loss": 0.7387, "grad_norm": 0.6465966105461121, "learning_rate": 0.0002, "epoch": 0.6061041292639138, "step": 8440}, {"loss": 0.838, "grad_norm": 0.5176072120666504, "learning_rate": 0.0002, "epoch": 0.6068222621184919, "step": 8450}, {"loss": 0.7855, "grad_norm": 0.5777280926704407, "learning_rate": 0.0002, "epoch": 0.60754039497307, "step": 8460}, {"loss": 0.7776, "grad_norm": 0.5989252924919128, "learning_rate": 0.0002, "epoch": 0.6082585278276481, "step": 8470}, {"loss": 0.8216, "grad_norm": 0.5207306742668152, "learning_rate": 0.0002, "epoch": 0.6089766606822262, "step": 8480}, {"loss": 0.8092, "grad_norm": 0.5242675542831421, "learning_rate": 0.0002, "epoch": 0.6096947935368043, "step": 8490}, {"loss": 0.7546, "grad_norm": 0.5631455183029175, "learning_rate": 0.0002, "epoch": 0.6104129263913824, "step": 8500}, {"loss": 0.7495, "grad_norm": 0.65207439661026, "learning_rate": 0.0002, "epoch": 0.6111310592459605, "step": 8510}, {"loss": 0.8023, "grad_norm": 0.5808899998664856, "learning_rate": 0.0002, "epoch": 0.6118491921005386, "step": 8520}, {"loss": 0.7763, "grad_norm": 0.558127760887146, "learning_rate": 0.0002, "epoch": 0.6125673249551167, "step": 8530}, {"loss": 0.8012, "grad_norm": 0.6063143014907837, "learning_rate": 0.0002, "epoch": 0.6132854578096948, "step": 8540}, {"loss": 0.7496, "grad_norm": 0.5491744875907898, "learning_rate": 0.0002, "epoch": 0.6140035906642729, "step": 8550}, {"loss": 0.779, "grad_norm": 0.5105780959129333, "learning_rate": 0.0002, "epoch": 0.614721723518851, "step": 8560}, {"loss": 0.7983, "grad_norm": 0.6892395615577698, "learning_rate": 0.0002, "epoch": 0.6154398563734291, "step": 8570}, {"loss": 0.7563, "grad_norm": 0.7411758899688721, "learning_rate": 0.0002, "epoch": 0.6161579892280071, "step": 8580}, {"loss": 0.7455, "grad_norm": 0.6745429635047913, "learning_rate": 0.0002, "epoch": 0.6168761220825852, "step": 8590}, {"loss": 0.8213, "grad_norm": 0.596007227897644, "learning_rate": 0.0002, "epoch": 0.6175942549371634, "step": 8600}, {"loss": 0.7963, "grad_norm": 0.6751060485839844, "learning_rate": 0.0002, "epoch": 0.6183123877917415, "step": 8610}, {"loss": 0.7343, "grad_norm": 0.711124837398529, "learning_rate": 0.0002, "epoch": 0.6190305206463196, "step": 8620}, {"loss": 0.773, "grad_norm": 0.6110914945602417, "learning_rate": 0.0002, "epoch": 0.6197486535008977, "step": 8630}, {"loss": 0.7497, "grad_norm": 0.5687659978866577, "learning_rate": 0.0002, "epoch": 0.6204667863554758, "step": 8640}, {"loss": 0.7754, "grad_norm": 0.7025772929191589, "learning_rate": 0.0002, "epoch": 0.6211849192100538, "step": 8650}, {"loss": 0.7423, "grad_norm": 0.6456184983253479, "learning_rate": 0.0002, "epoch": 0.6219030520646319, "step": 8660}, {"loss": 0.7449, "grad_norm": 0.5317023992538452, "learning_rate": 0.0002, "epoch": 0.62262118491921, "step": 8670}, {"loss": 0.8146, "grad_norm": 0.5531691908836365, "learning_rate": 0.0002, "epoch": 0.6233393177737881, "step": 8680}, {"loss": 0.8171, "grad_norm": 0.6063531637191772, "learning_rate": 0.0002, "epoch": 0.6240574506283663, "step": 8690}, {"loss": 0.7943, "grad_norm": 1.094390630722046, "learning_rate": 0.0002, "epoch": 0.6247755834829444, "step": 8700}, {"loss": 0.7993, "grad_norm": 0.5558148622512817, "learning_rate": 0.0002, "epoch": 0.6254937163375225, "step": 8710}, {"loss": 0.7747, "grad_norm": 0.5470370054244995, "learning_rate": 0.0002, "epoch": 0.6262118491921005, "step": 8720}, {"loss": 0.8252, "grad_norm": 0.5852634310722351, "learning_rate": 0.0002, "epoch": 0.6269299820466786, "step": 8730}, {"loss": 0.8712, "grad_norm": 0.6120240092277527, "learning_rate": 0.0002, "epoch": 0.6276481149012567, "step": 8740}, {"loss": 0.8367, "grad_norm": 0.5608004927635193, "learning_rate": 0.0002, "epoch": 0.6283662477558348, "step": 8750}, {"loss": 0.7711, "grad_norm": 0.5980432033538818, "learning_rate": 0.0002, "epoch": 0.6290843806104129, "step": 8760}, {"loss": 0.7903, "grad_norm": 0.5670580863952637, "learning_rate": 0.0002, "epoch": 0.629802513464991, "step": 8770}, {"loss": 0.7765, "grad_norm": 0.5931687951087952, "learning_rate": 0.0002, "epoch": 0.6305206463195692, "step": 8780}, {"loss": 0.7752, "grad_norm": 0.7872577905654907, "learning_rate": 0.0002, "epoch": 0.6312387791741472, "step": 8790}, {"loss": 0.8045, "grad_norm": 0.6355181336402893, "learning_rate": 0.0002, "epoch": 0.6319569120287253, "step": 8800}, {"loss": 0.7651, "grad_norm": 0.501913845539093, "learning_rate": 0.0002, "epoch": 0.6326750448833034, "step": 8810}, {"loss": 0.8023, "grad_norm": 0.5956716537475586, "learning_rate": 0.0002, "epoch": 0.6333931777378815, "step": 8820}, {"loss": 0.798, "grad_norm": 0.6448253393173218, "learning_rate": 0.0002, "epoch": 0.6341113105924596, "step": 8830}, {"loss": 0.7878, "grad_norm": 0.6139631271362305, "learning_rate": 0.0002, "epoch": 0.6348294434470377, "step": 8840}, {"loss": 0.7767, "grad_norm": 0.5894306302070618, "learning_rate": 0.0002, "epoch": 0.6355475763016158, "step": 8850}, {"loss": 0.7516, "grad_norm": 0.8724799752235413, "learning_rate": 0.0002, "epoch": 0.6362657091561938, "step": 8860}, {"loss": 0.7715, "grad_norm": 0.5413858890533447, "learning_rate": 0.0002, "epoch": 0.636983842010772, "step": 8870}, {"loss": 0.8175, "grad_norm": 0.5993430614471436, "learning_rate": 0.0002, "epoch": 0.6377019748653501, "step": 8880}, {"loss": 0.7865, "grad_norm": 0.539415717124939, "learning_rate": 0.0002, "epoch": 0.6384201077199282, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.600125789642334, "learning_rate": 0.0002, "epoch": 0.6391382405745063, "step": 8900}, {"loss": 0.7886, "grad_norm": 0.5597978234291077, "learning_rate": 0.0002, "epoch": 0.6398563734290844, "step": 8910}, {"loss": 0.8468, "grad_norm": 0.6262031197547913, "learning_rate": 0.0002, "epoch": 0.6405745062836625, "step": 8920}, {"loss": 0.7523, "grad_norm": 0.72662752866745, "learning_rate": 0.0002, "epoch": 0.6412926391382405, "step": 8930}, {"loss": 0.8099, "grad_norm": 0.613002598285675, "learning_rate": 0.0002, "epoch": 0.6420107719928186, "step": 8940}, {"loss": 0.8112, "grad_norm": 0.6511827707290649, "learning_rate": 0.0002, "epoch": 0.6427289048473968, "step": 8950}, {"loss": 0.7479, "grad_norm": 0.5383973717689514, "learning_rate": 0.0002, "epoch": 0.6434470377019749, "step": 8960}, {"loss": 0.764, "grad_norm": 0.5236184597015381, "learning_rate": 0.0002, "epoch": 0.644165170556553, "step": 8970}, {"loss": 0.7515, "grad_norm": 0.5938544273376465, "learning_rate": 0.0002, "epoch": 0.6448833034111311, "step": 8980}, {"loss": 0.8103, "grad_norm": 0.4594680964946747, "learning_rate": 0.0002, "epoch": 0.6456014362657092, "step": 8990}, {"loss": 0.7495, "grad_norm": 0.6314211487770081, "learning_rate": 0.0002, "epoch": 0.6463195691202872, "step": 9000}, {"loss": 0.8162, "grad_norm": 0.6291103363037109, "learning_rate": 0.0002, "epoch": 0.6470377019748653, "step": 9010}, {"loss": 0.8167, "grad_norm": 0.5888266563415527, "learning_rate": 0.0002, "epoch": 0.6477558348294434, "step": 9020}, {"loss": 0.7685, "grad_norm": 0.5613022446632385, "learning_rate": 0.0002, "epoch": 0.6484739676840215, "step": 9030}, {"loss": 0.8142, "grad_norm": 0.7219604253768921, "learning_rate": 0.0002, "epoch": 0.6491921005385997, "step": 9040}, {"loss": 0.805, "grad_norm": 0.5846529006958008, "learning_rate": 0.0002, "epoch": 0.6499102333931778, "step": 9050}, {"loss": 0.8471, "grad_norm": 0.7264063954353333, "learning_rate": 0.0002, "epoch": 0.6506283662477559, "step": 9060}, {"loss": 0.7925, "grad_norm": 0.5797538757324219, "learning_rate": 0.0002, "epoch": 0.6513464991023339, "step": 9070}, {"loss": 0.7961, "grad_norm": 0.4857395887374878, "learning_rate": 0.0002, "epoch": 0.652064631956912, "step": 9080}, {"loss": 0.7567, "grad_norm": 0.5044030547142029, "learning_rate": 0.0002, "epoch": 0.6527827648114901, "step": 9090}, {"loss": 0.7889, "grad_norm": 0.6105342507362366, "learning_rate": 0.0002, "epoch": 0.6535008976660682, "step": 9100}, {"loss": 0.7692, "grad_norm": 0.6408740282058716, "learning_rate": 0.0002, "epoch": 0.6542190305206463, "step": 9110}, {"loss": 0.7788, "grad_norm": 0.7474880814552307, "learning_rate": 0.0002, "epoch": 0.6549371633752245, "step": 9120}, {"loss": 0.7694, "grad_norm": 0.584768533706665, "learning_rate": 0.0002, "epoch": 0.6556552962298026, "step": 9130}, {"loss": 0.8273, "grad_norm": 0.6368113160133362, "learning_rate": 0.0002, "epoch": 0.6563734290843806, "step": 9140}, {"loss": 0.7493, "grad_norm": 0.693631649017334, "learning_rate": 0.0002, "epoch": 0.6570915619389587, "step": 9150}, {"loss": 0.7636, "grad_norm": 0.6094512343406677, "learning_rate": 0.0002, "epoch": 0.6578096947935368, "step": 9160}, {"loss": 0.8269, "grad_norm": 0.7154942750930786, "learning_rate": 0.0002, "epoch": 0.6585278276481149, "step": 9170}, {"loss": 0.7623, "grad_norm": 0.5749237537384033, "learning_rate": 0.0002, "epoch": 0.659245960502693, "step": 9180}, {"loss": 0.799, "grad_norm": 0.6214450001716614, "learning_rate": 0.0002, "epoch": 0.6599640933572711, "step": 9190}, {"loss": 0.7973, "grad_norm": 0.6357814073562622, "learning_rate": 0.0002, "epoch": 0.6606822262118492, "step": 9200}, {"loss": 0.773, "grad_norm": 0.5677326917648315, "learning_rate": 0.0002, "epoch": 0.6614003590664272, "step": 9210}, {"loss": 0.8173, "grad_norm": 0.5432633757591248, "learning_rate": 0.0002, "epoch": 0.6621184919210054, "step": 9220}, {"loss": 0.7573, "grad_norm": 0.43935060501098633, "learning_rate": 0.0002, "epoch": 0.6628366247755835, "step": 9230}, {"loss": 0.848, "grad_norm": 0.5350922346115112, "learning_rate": 0.0002, "epoch": 0.6635547576301616, "step": 9240}, {"loss": 0.7409, "grad_norm": 0.7745687365531921, "learning_rate": 0.0002, "epoch": 0.6642728904847397, "step": 9250}, {"loss": 0.7412, "grad_norm": 0.5767113566398621, "learning_rate": 0.0002, "epoch": 0.6649910233393178, "step": 9260}, {"loss": 0.8197, "grad_norm": 0.49304983019828796, "learning_rate": 0.0002, "epoch": 0.6657091561938959, "step": 9270}, {"loss": 0.7856, "grad_norm": 0.6355269551277161, "learning_rate": 0.0002, "epoch": 0.6664272890484739, "step": 9280}, {"loss": 0.7659, "grad_norm": 0.5539451241493225, "learning_rate": 0.0002, "epoch": 0.667145421903052, "step": 9290}, {"loss": 0.7888, "grad_norm": 0.5225138068199158, "learning_rate": 0.0002, "epoch": 0.6678635547576302, "step": 9300}, {"loss": 0.8048, "grad_norm": 0.5435736179351807, "learning_rate": 0.0002, "epoch": 0.6685816876122083, "step": 9310}, {"loss": 0.8284, "grad_norm": 0.611266553401947, "learning_rate": 0.0002, "epoch": 0.6692998204667864, "step": 9320}, {"loss": 0.8081, "grad_norm": 0.5880926251411438, "learning_rate": 0.0002, "epoch": 0.6700179533213645, "step": 9330}, {"loss": 0.7781, "grad_norm": 0.5301468372344971, "learning_rate": 0.0002, "epoch": 0.6707360861759426, "step": 9340}, {"loss": 0.7586, "grad_norm": 0.5614377856254578, "learning_rate": 0.0002, "epoch": 0.6714542190305206, "step": 9350}, {"loss": 0.7538, "grad_norm": 0.7177342176437378, "learning_rate": 0.0002, "epoch": 0.6721723518850987, "step": 9360}, {"loss": 0.7412, "grad_norm": 0.5187423825263977, "learning_rate": 0.0002, "epoch": 0.6728904847396768, "step": 9370}, {"loss": 0.7456, "grad_norm": 0.49305087327957153, "learning_rate": 0.0002, "epoch": 0.6736086175942549, "step": 9380}, {"loss": 0.7926, "grad_norm": 0.555867612361908, "learning_rate": 0.0002, "epoch": 0.6743267504488331, "step": 9390}, {"loss": 0.7486, "grad_norm": 0.8308040499687195, "learning_rate": 0.0002, "epoch": 0.6750448833034112, "step": 9400}, {"loss": 0.8225, "grad_norm": 0.6522438526153564, "learning_rate": 0.0002, "epoch": 0.6757630161579893, "step": 9410}, {"loss": 0.8283, "grad_norm": 0.5768371224403381, "learning_rate": 0.0002, "epoch": 0.6764811490125673, "step": 9420}, {"loss": 0.7815, "grad_norm": 0.783802330493927, "learning_rate": 0.0002, "epoch": 0.6771992818671454, "step": 9430}, {"loss": 0.7511, "grad_norm": 0.5246656537055969, "learning_rate": 0.0002, "epoch": 0.6779174147217235, "step": 9440}, {"loss": 0.7866, "grad_norm": 0.6630974411964417, "learning_rate": 0.0002, "epoch": 0.6786355475763016, "step": 9450}, {"loss": 0.7961, "grad_norm": 0.5012770295143127, "learning_rate": 0.0002, "epoch": 0.6793536804308797, "step": 9460}, {"loss": 0.7762, "grad_norm": 0.6208643317222595, "learning_rate": 0.0002, "epoch": 0.6800718132854578, "step": 9470}, {"loss": 0.7229, "grad_norm": 0.6033898591995239, "learning_rate": 0.0002, "epoch": 0.680789946140036, "step": 9480}, {"loss": 0.8315, "grad_norm": 0.6613174080848694, "learning_rate": 0.0002, "epoch": 0.681508078994614, "step": 9490}, {"loss": 0.7874, "grad_norm": 0.6417899131774902, "learning_rate": 0.0002, "epoch": 0.6822262118491921, "step": 9500}, {"loss": 0.7979, "grad_norm": 0.5060321092605591, "learning_rate": 0.0002, "epoch": 0.6829443447037702, "step": 9510}, {"loss": 0.7908, "grad_norm": 0.586670458316803, "learning_rate": 0.0002, "epoch": 0.6836624775583483, "step": 9520}, {"loss": 0.7652, "grad_norm": 0.6607828736305237, "learning_rate": 0.0002, "epoch": 0.6843806104129264, "step": 9530}, {"loss": 0.7645, "grad_norm": 0.5142775177955627, "learning_rate": 0.0002, "epoch": 0.6850987432675045, "step": 9540}, {"loss": 0.7553, "grad_norm": 0.741000771522522, "learning_rate": 0.0002, "epoch": 0.6858168761220825, "step": 9550}, {"loss": 0.8453, "grad_norm": 0.4687826335430145, "learning_rate": 0.0002, "epoch": 0.6865350089766606, "step": 9560}, {"loss": 0.7582, "grad_norm": 0.6452056169509888, "learning_rate": 0.0002, "epoch": 0.6872531418312388, "step": 9570}, {"loss": 0.7965, "grad_norm": 0.6393555402755737, "learning_rate": 0.0002, "epoch": 0.6879712746858169, "step": 9580}, {"loss": 0.802, "grad_norm": 0.4907757043838501, "learning_rate": 0.0002, "epoch": 0.688689407540395, "step": 9590}, {"loss": 0.7813, "grad_norm": 0.5380825996398926, "learning_rate": 0.0002, "epoch": 0.6894075403949731, "step": 9600}, {"loss": 0.8188, "grad_norm": 0.5657393932342529, "learning_rate": 0.0002, "epoch": 0.6901256732495512, "step": 9610}, {"loss": 0.7581, "grad_norm": 0.8505447506904602, "learning_rate": 0.0002, "epoch": 0.6908438061041292, "step": 9620}, {"loss": 0.7631, "grad_norm": 0.5389836430549622, "learning_rate": 0.0002, "epoch": 0.6915619389587073, "step": 9630}, {"loss": 0.8015, "grad_norm": 0.4977441728115082, "learning_rate": 0.0002, "epoch": 0.6922800718132854, "step": 9640}, {"loss": 0.8057, "grad_norm": 0.5855389833450317, "learning_rate": 0.0002, "epoch": 0.6929982046678635, "step": 9650}, {"loss": 0.7735, "grad_norm": 0.633994996547699, "learning_rate": 0.0002, "epoch": 0.6937163375224417, "step": 9660}, {"loss": 0.7918, "grad_norm": 0.5592191815376282, "learning_rate": 0.0002, "epoch": 0.6944344703770198, "step": 9670}, {"loss": 0.7883, "grad_norm": 0.6030594706535339, "learning_rate": 0.0002, "epoch": 0.6951526032315979, "step": 9680}, {"loss": 0.7472, "grad_norm": 0.6782388687133789, "learning_rate": 0.0002, "epoch": 0.6958707360861759, "step": 9690}, {"loss": 0.8097, "grad_norm": 0.6777627468109131, "learning_rate": 0.0002, "epoch": 0.696588868940754, "step": 9700}, {"loss": 0.7958, "grad_norm": 0.5674123764038086, "learning_rate": 0.0002, "epoch": 0.6973070017953321, "step": 9710}, {"loss": 0.7743, "grad_norm": 0.5280387997627258, "learning_rate": 0.0002, "epoch": 0.6980251346499102, "step": 9720}, {"loss": 0.7496, "grad_norm": 0.5471981763839722, "learning_rate": 0.0002, "epoch": 0.6987432675044883, "step": 9730}, {"loss": 0.7837, "grad_norm": 0.6751061677932739, "learning_rate": 0.0002, "epoch": 0.6994614003590665, "step": 9740}, {"loss": 0.7686, "grad_norm": 0.5942487716674805, "learning_rate": 0.0002, "epoch": 0.7001795332136446, "step": 9750}, {"loss": 0.757, "grad_norm": 0.6165713667869568, "learning_rate": 0.0002, "epoch": 0.7008976660682226, "step": 9760}, {"loss": 0.7864, "grad_norm": 0.5745091438293457, "learning_rate": 0.0002, "epoch": 0.7016157989228007, "step": 9770}, {"loss": 0.8079, "grad_norm": 0.600308358669281, "learning_rate": 0.0002, "epoch": 0.7023339317773788, "step": 9780}, {"loss": 0.7527, "grad_norm": 0.6448577046394348, "learning_rate": 0.0002, "epoch": 0.7030520646319569, "step": 9790}, {"loss": 0.7725, "grad_norm": 0.5662767291069031, "learning_rate": 0.0002, "epoch": 0.703770197486535, "step": 9800}, {"loss": 0.8028, "grad_norm": 0.6490433812141418, "learning_rate": 0.0002, "epoch": 0.7044883303411131, "step": 9810}, {"loss": 0.8006, "grad_norm": 0.6126134991645813, "learning_rate": 0.0002, "epoch": 0.7052064631956912, "step": 9820}, {"loss": 0.8034, "grad_norm": 0.7181116938591003, "learning_rate": 0.0002, "epoch": 0.7059245960502692, "step": 9830}, {"loss": 0.7937, "grad_norm": 0.7805212140083313, "learning_rate": 0.0002, "epoch": 0.7066427289048474, "step": 9840}, {"loss": 0.7781, "grad_norm": 0.7521958947181702, "learning_rate": 0.0002, "epoch": 0.7073608617594255, "step": 9850}, {"loss": 0.7412, "grad_norm": 0.5610787868499756, "learning_rate": 0.0002, "epoch": 0.7080789946140036, "step": 9860}, {"loss": 0.7627, "grad_norm": 0.7026229500770569, "learning_rate": 0.0002, "epoch": 0.7087971274685817, "step": 9870}, {"loss": 0.8085, "grad_norm": 0.551691472530365, "learning_rate": 0.0002, "epoch": 0.7095152603231598, "step": 9880}, {"loss": 0.7874, "grad_norm": 0.5841995477676392, "learning_rate": 0.0002, "epoch": 0.7102333931777379, "step": 9890}, {"loss": 0.7749, "grad_norm": 0.7170061469078064, "learning_rate": 0.0002, "epoch": 0.7109515260323159, "step": 9900}, {"loss": 0.7917, "grad_norm": 0.49836990237236023, "learning_rate": 0.0002, "epoch": 0.711669658886894, "step": 9910}, {"loss": 0.7667, "grad_norm": 0.5234556794166565, "learning_rate": 0.0002, "epoch": 0.7123877917414722, "step": 9920}, {"loss": 0.8438, "grad_norm": 0.7590384483337402, "learning_rate": 0.0002, "epoch": 0.7131059245960503, "step": 9930}, {"loss": 0.7725, "grad_norm": 0.5657515525817871, "learning_rate": 0.0002, "epoch": 0.7138240574506284, "step": 9940}, {"loss": 0.8184, "grad_norm": 0.5969128012657166, "learning_rate": 0.0002, "epoch": 0.7145421903052065, "step": 9950}, {"loss": 0.7375, "grad_norm": 0.7136867046356201, "learning_rate": 0.0002, "epoch": 0.7152603231597846, "step": 9960}, {"loss": 0.7883, "grad_norm": 0.6774699091911316, "learning_rate": 0.0002, "epoch": 0.7159784560143626, "step": 9970}, {"loss": 0.7629, "grad_norm": 0.6066371202468872, "learning_rate": 0.0002, "epoch": 0.7166965888689407, "step": 9980}, {"loss": 0.7767, "grad_norm": 0.7355279922485352, "learning_rate": 0.0002, "epoch": 0.7174147217235188, "step": 9990}, {"loss": 0.7643, "grad_norm": 0.7996646761894226, "learning_rate": 0.0002, "epoch": 0.718132854578097, "step": 10000}, {"loss": 0.8304, "grad_norm": 0.628839910030365, "learning_rate": 0.0002, "epoch": 0.7188509874326751, "step": 10010}, {"loss": 0.7292, "grad_norm": 0.5472931265830994, "learning_rate": 0.0002, "epoch": 0.7195691202872532, "step": 10020}, {"loss": 0.7787, "grad_norm": 0.5776344537734985, "learning_rate": 0.0002, "epoch": 0.7202872531418313, "step": 10030}, {"loss": 0.7432, "grad_norm": 0.5041707158088684, "learning_rate": 0.0002, "epoch": 0.7210053859964093, "step": 10040}, {"loss": 0.7923, "grad_norm": 0.5965308547019958, "learning_rate": 0.0002, "epoch": 0.7217235188509874, "step": 10050}, {"loss": 0.8131, "grad_norm": 0.5892689228057861, "learning_rate": 0.0002, "epoch": 0.7224416517055655, "step": 10060}, {"loss": 0.7961, "grad_norm": 0.5695884227752686, "learning_rate": 0.0002, "epoch": 0.7231597845601436, "step": 10070}, {"loss": 0.7806, "grad_norm": 0.6547690629959106, "learning_rate": 0.0002, "epoch": 0.7238779174147217, "step": 10080}, {"loss": 0.7978, "grad_norm": 0.6759928464889526, "learning_rate": 0.0002, "epoch": 0.7245960502692999, "step": 10090}, {"loss": 0.7547, "grad_norm": 0.6829725503921509, "learning_rate": 0.0002, "epoch": 0.725314183123878, "step": 10100}, {"loss": 0.7507, "grad_norm": 0.5242751240730286, "learning_rate": 0.0002, "epoch": 0.726032315978456, "step": 10110}, {"loss": 0.8042, "grad_norm": 0.6947014927864075, "learning_rate": 0.0002, "epoch": 0.7267504488330341, "step": 10120}, {"loss": 0.7621, "grad_norm": 0.6094982624053955, "learning_rate": 0.0002, "epoch": 0.7274685816876122, "step": 10130}, {"loss": 0.7911, "grad_norm": 0.628461480140686, "learning_rate": 0.0002, "epoch": 0.7281867145421903, "step": 10140}, {"loss": 0.7839, "grad_norm": 0.4952087104320526, "learning_rate": 0.0002, "epoch": 0.7289048473967684, "step": 10150}, {"loss": 0.7582, "grad_norm": 0.6917221546173096, "learning_rate": 0.0002, "epoch": 0.7296229802513465, "step": 10160}, {"loss": 0.7791, "grad_norm": 0.6866413354873657, "learning_rate": 0.0002, "epoch": 0.7303411131059246, "step": 10170}, {"loss": 0.7628, "grad_norm": 0.5505863428115845, "learning_rate": 0.0002, "epoch": 0.7310592459605026, "step": 10180}, {"loss": 0.7941, "grad_norm": 0.5903199911117554, "learning_rate": 0.0002, "epoch": 0.7317773788150808, "step": 10190}, {"loss": 0.8072, "grad_norm": 0.5001798272132874, "learning_rate": 0.0002, "epoch": 0.7324955116696589, "step": 10200}, {"loss": 0.7934, "grad_norm": 0.5117581486701965, "learning_rate": 0.0002, "epoch": 0.733213644524237, "step": 10210}, {"loss": 0.8364, "grad_norm": 0.7716088891029358, "learning_rate": 0.0002, "epoch": 0.7339317773788151, "step": 10220}, {"loss": 0.7775, "grad_norm": 0.5973874926567078, "learning_rate": 0.0002, "epoch": 0.7346499102333932, "step": 10230}, {"loss": 0.7689, "grad_norm": 0.6433483362197876, "learning_rate": 0.0002, "epoch": 0.7353680430879713, "step": 10240}, {"loss": 0.8307, "grad_norm": 0.6241081357002258, "learning_rate": 0.0002, "epoch": 0.7360861759425493, "step": 10250}, {"loss": 0.7432, "grad_norm": 0.7198845744132996, "learning_rate": 0.0002, "epoch": 0.7368043087971274, "step": 10260}, {"loss": 0.7545, "grad_norm": 0.5879023671150208, "learning_rate": 0.0002, "epoch": 0.7375224416517056, "step": 10270}, {"loss": 0.7526, "grad_norm": 0.5810162425041199, "learning_rate": 0.0002, "epoch": 0.7382405745062837, "step": 10280}, {"loss": 0.7839, "grad_norm": 0.6336500644683838, "learning_rate": 0.0002, "epoch": 0.7389587073608618, "step": 10290}, {"loss": 0.7597, "grad_norm": 0.5627583861351013, "learning_rate": 0.0002, "epoch": 0.7396768402154399, "step": 10300}, {"loss": 0.8166, "grad_norm": 0.5396066904067993, "learning_rate": 0.0002, "epoch": 0.740394973070018, "step": 10310}, {"loss": 0.7698, "grad_norm": 0.5519505143165588, "learning_rate": 0.0002, "epoch": 0.741113105924596, "step": 10320}, {"loss": 0.7953, "grad_norm": 0.628710925579071, "learning_rate": 0.0002, "epoch": 0.7418312387791741, "step": 10330}, {"loss": 0.805, "grad_norm": 0.6466957926750183, "learning_rate": 0.0002, "epoch": 0.7425493716337522, "step": 10340}, {"loss": 0.8173, "grad_norm": 0.6269286274909973, "learning_rate": 0.0002, "epoch": 0.7432675044883303, "step": 10350}, {"loss": 0.8315, "grad_norm": 0.6985455751419067, "learning_rate": 0.0002, "epoch": 0.7439856373429085, "step": 10360}, {"loss": 0.7598, "grad_norm": 0.6203648447990417, "learning_rate": 0.0002, "epoch": 0.7447037701974866, "step": 10370}, {"loss": 0.7937, "grad_norm": 0.6524295210838318, "learning_rate": 0.0002, "epoch": 0.7454219030520647, "step": 10380}, {"loss": 0.8005, "grad_norm": 0.6108002662658691, "learning_rate": 0.0002, "epoch": 0.7461400359066427, "step": 10390}, {"loss": 0.7592, "grad_norm": 0.5196276903152466, "learning_rate": 0.0002, "epoch": 0.7468581687612208, "step": 10400}, {"loss": 0.7769, "grad_norm": 0.6207506656646729, "learning_rate": 0.0002, "epoch": 0.7475763016157989, "step": 10410}, {"loss": 0.8066, "grad_norm": 0.6015686988830566, "learning_rate": 0.0002, "epoch": 0.748294434470377, "step": 10420}, {"loss": 0.7993, "grad_norm": 0.6402649879455566, "learning_rate": 0.0002, "epoch": 0.7490125673249551, "step": 10430}, {"loss": 0.802, "grad_norm": 0.7816081047058105, "learning_rate": 0.0002, "epoch": 0.7497307001795332, "step": 10440}, {"loss": 0.8021, "grad_norm": 0.6148143410682678, "learning_rate": 0.0002, "epoch": 0.7504488330341114, "step": 10450}, {"loss": 0.7986, "grad_norm": 0.6496613621711731, "learning_rate": 0.0002, "epoch": 0.7511669658886894, "step": 10460}, {"loss": 0.8152, "grad_norm": 0.49158045649528503, "learning_rate": 0.0002, "epoch": 0.7518850987432675, "step": 10470}, {"loss": 0.8098, "grad_norm": 0.8629217743873596, "learning_rate": 0.0002, "epoch": 0.7526032315978456, "step": 10480}, {"loss": 0.807, "grad_norm": 0.6800066828727722, "learning_rate": 0.0002, "epoch": 0.7533213644524237, "step": 10490}, {"loss": 0.7238, "grad_norm": 0.6480063199996948, "learning_rate": 0.0002, "epoch": 0.7540394973070018, "step": 10500}, {"loss": 0.7818, "grad_norm": 0.5740751028060913, "learning_rate": 0.0002, "epoch": 0.7547576301615799, "step": 10510}, {"loss": 0.7732, "grad_norm": 0.7182627320289612, "learning_rate": 0.0002, "epoch": 0.755475763016158, "step": 10520}, {"loss": 0.7752, "grad_norm": 0.6482816934585571, "learning_rate": 0.0002, "epoch": 0.756193895870736, "step": 10530}, {"loss": 0.7564, "grad_norm": 0.4937674105167389, "learning_rate": 0.0002, "epoch": 0.7569120287253142, "step": 10540}, {"loss": 0.7783, "grad_norm": 0.6818482875823975, "learning_rate": 0.0002, "epoch": 0.7576301615798923, "step": 10550}, {"loss": 0.8303, "grad_norm": 0.6375173926353455, "learning_rate": 0.0002, "epoch": 0.7583482944344704, "step": 10560}, {"loss": 0.77, "grad_norm": 0.528798520565033, "learning_rate": 0.0002, "epoch": 0.7590664272890485, "step": 10570}, {"loss": 0.8435, "grad_norm": 0.42099910974502563, "learning_rate": 0.0002, "epoch": 0.7597845601436266, "step": 10580}, {"loss": 0.8218, "grad_norm": 0.529604434967041, "learning_rate": 0.0002, "epoch": 0.7605026929982047, "step": 10590}, {"loss": 0.7833, "grad_norm": 0.6236841082572937, "learning_rate": 0.0002, "epoch": 0.7612208258527827, "step": 10600}, {"loss": 0.777, "grad_norm": 0.6194891929626465, "learning_rate": 0.0002, "epoch": 0.7619389587073608, "step": 10610}, {"loss": 0.7967, "grad_norm": 0.5206209421157837, "learning_rate": 0.0002, "epoch": 0.762657091561939, "step": 10620}, {"loss": 0.811, "grad_norm": 0.7981295585632324, "learning_rate": 0.0002, "epoch": 0.7633752244165171, "step": 10630}, {"loss": 0.8016, "grad_norm": 0.6113479137420654, "learning_rate": 0.0002, "epoch": 0.7640933572710952, "step": 10640}, {"loss": 0.7642, "grad_norm": 0.7025435566902161, "learning_rate": 0.0002, "epoch": 0.7648114901256733, "step": 10650}, {"loss": 0.7293, "grad_norm": 0.46914348006248474, "learning_rate": 0.0002, "epoch": 0.7655296229802514, "step": 10660}, {"loss": 0.8079, "grad_norm": 0.6134725213050842, "learning_rate": 0.0002, "epoch": 0.7662477558348294, "step": 10670}, {"loss": 0.7469, "grad_norm": 0.583859920501709, "learning_rate": 0.0002, "epoch": 0.7669658886894075, "step": 10680}, {"loss": 0.843, "grad_norm": 0.511349081993103, "learning_rate": 0.0002, "epoch": 0.7676840215439856, "step": 10690}, {"loss": 0.8355, "grad_norm": 0.6467110514640808, "learning_rate": 0.0002, "epoch": 0.7684021543985637, "step": 10700}, {"loss": 0.7935, "grad_norm": 0.7210163474082947, "learning_rate": 0.0002, "epoch": 0.7691202872531419, "step": 10710}, {"loss": 0.7807, "grad_norm": 0.6034521460533142, "learning_rate": 0.0002, "epoch": 0.76983842010772, "step": 10720}, {"loss": 0.7742, "grad_norm": 0.6237271428108215, "learning_rate": 0.0002, "epoch": 0.7705565529622981, "step": 10730}, {"loss": 0.8185, "grad_norm": 0.664328396320343, "learning_rate": 0.0002, "epoch": 0.7712746858168761, "step": 10740}, {"loss": 0.8096, "grad_norm": 0.6550520062446594, "learning_rate": 0.0002, "epoch": 0.7719928186714542, "step": 10750}, {"loss": 0.7538, "grad_norm": 0.5103325843811035, "learning_rate": 0.0002, "epoch": 0.7727109515260323, "step": 10760}, {"loss": 0.7777, "grad_norm": 0.7171200513839722, "learning_rate": 0.0002, "epoch": 0.7734290843806104, "step": 10770}, {"loss": 0.7743, "grad_norm": 0.5947384834289551, "learning_rate": 0.0002, "epoch": 0.7741472172351885, "step": 10780}, {"loss": 0.781, "grad_norm": 0.5293096899986267, "learning_rate": 0.0002, "epoch": 0.7748653500897666, "step": 10790}, {"loss": 0.777, "grad_norm": 0.6372577548027039, "learning_rate": 0.0002, "epoch": 0.7755834829443446, "step": 10800}, {"loss": 0.7972, "grad_norm": 0.5738261938095093, "learning_rate": 0.0002, "epoch": 0.7763016157989228, "step": 10810}, {"loss": 0.7877, "grad_norm": 0.7309247255325317, "learning_rate": 0.0002, "epoch": 0.7770197486535009, "step": 10820}, {"loss": 0.7745, "grad_norm": 0.8867193460464478, "learning_rate": 0.0002, "epoch": 0.777737881508079, "step": 10830}, {"loss": 0.7959, "grad_norm": 0.6151437759399414, "learning_rate": 0.0002, "epoch": 0.7784560143626571, "step": 10840}, {"loss": 0.7897, "grad_norm": 0.5645464658737183, "learning_rate": 0.0002, "epoch": 0.7791741472172352, "step": 10850}, {"loss": 0.7858, "grad_norm": 0.5118698477745056, "learning_rate": 0.0002, "epoch": 0.7798922800718133, "step": 10860}, {"loss": 0.8064, "grad_norm": 0.618181049823761, "learning_rate": 0.0002, "epoch": 0.7806104129263913, "step": 10870}, {"loss": 0.7675, "grad_norm": 0.7206462025642395, "learning_rate": 0.0002, "epoch": 0.7813285457809694, "step": 10880}, {"loss": 0.8162, "grad_norm": 0.7993820905685425, "learning_rate": 0.0002, "epoch": 0.7820466786355476, "step": 10890}, {"loss": 0.781, "grad_norm": 0.5072754621505737, "learning_rate": 0.0002, "epoch": 0.7827648114901257, "step": 10900}, {"loss": 0.7575, "grad_norm": 0.5829088687896729, "learning_rate": 0.0002, "epoch": 0.7834829443447038, "step": 10910}, {"loss": 0.7552, "grad_norm": 0.5778957605361938, "learning_rate": 0.0002, "epoch": 0.7842010771992819, "step": 10920}, {"loss": 0.7652, "grad_norm": 0.7237067222595215, "learning_rate": 0.0002, "epoch": 0.78491921005386, "step": 10930}, {"loss": 0.8357, "grad_norm": 0.5778013467788696, "learning_rate": 0.0002, "epoch": 0.785637342908438, "step": 10940}, {"loss": 0.7464, "grad_norm": 0.6129629611968994, "learning_rate": 0.0002, "epoch": 0.7863554757630161, "step": 10950}, {"loss": 0.7863, "grad_norm": 0.5637320876121521, "learning_rate": 0.0002, "epoch": 0.7870736086175942, "step": 10960}, {"loss": 0.7645, "grad_norm": 0.6253715753555298, "learning_rate": 0.0002, "epoch": 0.7877917414721723, "step": 10970}, {"loss": 0.8307, "grad_norm": 0.6209888458251953, "learning_rate": 0.0002, "epoch": 0.7885098743267505, "step": 10980}, {"loss": 0.7899, "grad_norm": 1.0841948986053467, "learning_rate": 0.0002, "epoch": 0.7892280071813286, "step": 10990}, {"loss": 0.7659, "grad_norm": 0.6570560336112976, "learning_rate": 0.0002, "epoch": 0.7899461400359067, "step": 11000}, {"loss": 0.7839, "grad_norm": 0.4830388128757477, "learning_rate": 0.0002, "epoch": 0.7906642728904847, "step": 11010}, {"loss": 0.8064, "grad_norm": 0.7607520222663879, "learning_rate": 0.0002, "epoch": 0.7913824057450628, "step": 11020}, {"loss": 0.8009, "grad_norm": 0.8202590346336365, "learning_rate": 0.0002, "epoch": 0.7921005385996409, "step": 11030}, {"loss": 0.7788, "grad_norm": 0.5640848278999329, "learning_rate": 0.0002, "epoch": 0.792818671454219, "step": 11040}, {"loss": 0.8298, "grad_norm": 0.7773675322532654, "learning_rate": 0.0002, "epoch": 0.7935368043087971, "step": 11050}, {"loss": 0.793, "grad_norm": 0.664139986038208, "learning_rate": 0.0002, "epoch": 0.7942549371633753, "step": 11060}, {"loss": 0.7886, "grad_norm": 0.6097795367240906, "learning_rate": 0.0002, "epoch": 0.7949730700179534, "step": 11070}, {"loss": 0.7989, "grad_norm": 0.9208881258964539, "learning_rate": 0.0002, "epoch": 0.7956912028725314, "step": 11080}, {"loss": 0.8045, "grad_norm": 0.6210731863975525, "learning_rate": 0.0002, "epoch": 0.7964093357271095, "step": 11090}, {"loss": 0.7868, "grad_norm": 0.7060235738754272, "learning_rate": 0.0002, "epoch": 0.7971274685816876, "step": 11100}, {"loss": 0.8041, "grad_norm": 0.48695266246795654, "learning_rate": 0.0002, "epoch": 0.7978456014362657, "step": 11110}, {"loss": 0.7885, "grad_norm": 0.6458830833435059, "learning_rate": 0.0002, "epoch": 0.7985637342908438, "step": 11120}, {"loss": 0.7773, "grad_norm": 0.572545051574707, "learning_rate": 0.0002, "epoch": 0.7992818671454219, "step": 11130}, {"loss": 0.7984, "grad_norm": 0.5925027132034302, "learning_rate": 0.0002, "epoch": 0.8, "step": 11140}, {"loss": 0.7571, "grad_norm": 0.569622278213501, "learning_rate": 0.0002, "epoch": 0.800718132854578, "step": 11150}, {"loss": 0.7765, "grad_norm": 0.537146806716919, "learning_rate": 0.0002, "epoch": 0.8014362657091562, "step": 11160}, {"loss": 0.7896, "grad_norm": 0.7118613719940186, "learning_rate": 0.0002, "epoch": 0.8021543985637343, "step": 11170}, {"loss": 0.7398, "grad_norm": 0.6183688044548035, "learning_rate": 0.0002, "epoch": 0.8028725314183124, "step": 11180}, {"loss": 0.7545, "grad_norm": 0.5187385082244873, "learning_rate": 0.0002, "epoch": 0.8035906642728905, "step": 11190}, {"loss": 0.766, "grad_norm": 0.5422571301460266, "learning_rate": 0.0002, "epoch": 0.8043087971274686, "step": 11200}, {"loss": 0.756, "grad_norm": 0.635050892829895, "learning_rate": 0.0002, "epoch": 0.8050269299820467, "step": 11210}, {"loss": 0.7337, "grad_norm": 0.6584872007369995, "learning_rate": 0.0002, "epoch": 0.8057450628366247, "step": 11220}, {"loss": 0.7467, "grad_norm": 0.624921977519989, "learning_rate": 0.0002, "epoch": 0.8064631956912028, "step": 11230}, {"loss": 0.7559, "grad_norm": 0.6837546229362488, "learning_rate": 0.0002, "epoch": 0.807181328545781, "step": 11240}, {"loss": 0.7861, "grad_norm": 0.5861160755157471, "learning_rate": 0.0002, "epoch": 0.8078994614003591, "step": 11250}, {"loss": 0.7883, "grad_norm": 0.5751383900642395, "learning_rate": 0.0002, "epoch": 0.8086175942549372, "step": 11260}, {"loss": 0.8103, "grad_norm": 0.7181510329246521, "learning_rate": 0.0002, "epoch": 0.8093357271095153, "step": 11270}, {"loss": 0.8066, "grad_norm": 0.5862139463424683, "learning_rate": 0.0002, "epoch": 0.8100538599640934, "step": 11280}, {"loss": 0.7692, "grad_norm": 0.4880113899707794, "learning_rate": 0.0002, "epoch": 0.8107719928186714, "step": 11290}, {"loss": 0.8154, "grad_norm": 0.565590500831604, "learning_rate": 0.0002, "epoch": 0.8114901256732495, "step": 11300}, {"loss": 0.7893, "grad_norm": 0.6171264052391052, "learning_rate": 0.0002, "epoch": 0.8122082585278276, "step": 11310}, {"loss": 0.816, "grad_norm": 0.5815969109535217, "learning_rate": 0.0002, "epoch": 0.8129263913824057, "step": 11320}, {"loss": 0.7462, "grad_norm": 0.5407653450965881, "learning_rate": 0.0002, "epoch": 0.8136445242369839, "step": 11330}, {"loss": 0.7647, "grad_norm": 0.6990084648132324, "learning_rate": 0.0002, "epoch": 0.814362657091562, "step": 11340}, {"loss": 0.783, "grad_norm": 0.5845068097114563, "learning_rate": 0.0002, "epoch": 0.8150807899461401, "step": 11350}, {"loss": 0.7839, "grad_norm": 0.5978701114654541, "learning_rate": 0.0002, "epoch": 0.8157989228007181, "step": 11360}, {"loss": 0.7342, "grad_norm": 0.6873053312301636, "learning_rate": 0.0002, "epoch": 0.8165170556552962, "step": 11370}, {"loss": 0.7656, "grad_norm": 0.7048654556274414, "learning_rate": 0.0002, "epoch": 0.8172351885098743, "step": 11380}, {"loss": 0.7293, "grad_norm": 0.7631531953811646, "learning_rate": 0.0002, "epoch": 0.8179533213644524, "step": 11390}, {"loss": 0.8606, "grad_norm": 0.704922080039978, "learning_rate": 0.0002, "epoch": 0.8186714542190305, "step": 11400}, {"loss": 0.8066, "grad_norm": 0.595460832118988, "learning_rate": 0.0002, "epoch": 0.8193895870736086, "step": 11410}, {"loss": 0.809, "grad_norm": 0.5882242918014526, "learning_rate": 0.0002, "epoch": 0.8201077199281868, "step": 11420}, {"loss": 0.7639, "grad_norm": 0.6433175206184387, "learning_rate": 0.0002, "epoch": 0.8208258527827648, "step": 11430}, {"loss": 0.7522, "grad_norm": 0.6047986149787903, "learning_rate": 0.0002, "epoch": 0.8215439856373429, "step": 11440}, {"loss": 0.8305, "grad_norm": 0.6462088823318481, "learning_rate": 0.0002, "epoch": 0.822262118491921, "step": 11450}, {"loss": 0.8144, "grad_norm": 0.5558379888534546, "learning_rate": 0.0002, "epoch": 0.8229802513464991, "step": 11460}, {"loss": 0.7916, "grad_norm": 0.6745542287826538, "learning_rate": 0.0002, "epoch": 0.8236983842010772, "step": 11470}, {"loss": 0.7853, "grad_norm": 0.7082334756851196, "learning_rate": 0.0002, "epoch": 0.8244165170556553, "step": 11480}, {"loss": 0.7533, "grad_norm": 0.703889787197113, "learning_rate": 0.0002, "epoch": 0.8251346499102334, "step": 11490}, {"loss": 0.8085, "grad_norm": 0.5261096358299255, "learning_rate": 0.0002, "epoch": 0.8258527827648114, "step": 11500}, {"loss": 0.7903, "grad_norm": 0.6009393930435181, "learning_rate": 0.0002, "epoch": 0.8265709156193896, "step": 11510}, {"loss": 0.7377, "grad_norm": 0.584274172782898, "learning_rate": 0.0002, "epoch": 0.8272890484739677, "step": 11520}, {"loss": 0.7926, "grad_norm": 0.6803238987922668, "learning_rate": 0.0002, "epoch": 0.8280071813285458, "step": 11530}, {"loss": 0.7948, "grad_norm": 0.6230084896087646, "learning_rate": 0.0002, "epoch": 0.8287253141831239, "step": 11540}, {"loss": 0.7902, "grad_norm": 0.6090595722198486, "learning_rate": 0.0002, "epoch": 0.829443447037702, "step": 11550}, {"loss": 0.7514, "grad_norm": 0.5292693376541138, "learning_rate": 0.0002, "epoch": 0.8301615798922801, "step": 11560}, {"loss": 0.7979, "grad_norm": 0.5675389766693115, "learning_rate": 0.0002, "epoch": 0.8308797127468581, "step": 11570}, {"loss": 0.7851, "grad_norm": 0.554874062538147, "learning_rate": 0.0002, "epoch": 0.8315978456014362, "step": 11580}, {"loss": 0.8004, "grad_norm": 0.8582373261451721, "learning_rate": 0.0002, "epoch": 0.8323159784560143, "step": 11590}, {"loss": 0.7864, "grad_norm": 0.5743035674095154, "learning_rate": 0.0002, "epoch": 0.8330341113105925, "step": 11600}, {"loss": 0.7714, "grad_norm": 0.5749582648277283, "learning_rate": 0.0002, "epoch": 0.8337522441651706, "step": 11610}, {"loss": 0.8131, "grad_norm": 0.5207278728485107, "learning_rate": 0.0002, "epoch": 0.8344703770197487, "step": 11620}, {"loss": 0.785, "grad_norm": 0.6262611150741577, "learning_rate": 0.0002, "epoch": 0.8351885098743268, "step": 11630}, {"loss": 0.7699, "grad_norm": 0.5490066409111023, "learning_rate": 0.0002, "epoch": 0.8359066427289048, "step": 11640}, {"loss": 0.7779, "grad_norm": 0.6283167600631714, "learning_rate": 0.0002, "epoch": 0.8366247755834829, "step": 11650}, {"loss": 0.7508, "grad_norm": 0.7701452374458313, "learning_rate": 0.0002, "epoch": 0.837342908438061, "step": 11660}, {"loss": 0.7662, "grad_norm": 0.5825072526931763, "learning_rate": 0.0002, "epoch": 0.8380610412926391, "step": 11670}, {"loss": 0.758, "grad_norm": 0.6119720935821533, "learning_rate": 0.0002, "epoch": 0.8387791741472173, "step": 11680}, {"loss": 0.7995, "grad_norm": 0.689383327960968, "learning_rate": 0.0002, "epoch": 0.8394973070017954, "step": 11690}, {"loss": 0.7615, "grad_norm": 0.5396560430526733, "learning_rate": 0.0002, "epoch": 0.8402154398563735, "step": 11700}, {"loss": 0.8073, "grad_norm": 0.577178955078125, "learning_rate": 0.0002, "epoch": 0.8409335727109515, "step": 11710}, {"loss": 0.7911, "grad_norm": 0.6652564406394958, "learning_rate": 0.0002, "epoch": 0.8416517055655296, "step": 11720}, {"loss": 0.7708, "grad_norm": 0.588377058506012, "learning_rate": 0.0002, "epoch": 0.8423698384201077, "step": 11730}, {"loss": 0.8245, "grad_norm": 0.6180438995361328, "learning_rate": 0.0002, "epoch": 0.8430879712746858, "step": 11740}, {"loss": 0.729, "grad_norm": 0.6897811889648438, "learning_rate": 0.0002, "epoch": 0.8438061041292639, "step": 11750}, {"loss": 0.8026, "grad_norm": 0.5826608538627625, "learning_rate": 0.0002, "epoch": 0.844524236983842, "step": 11760}, {"loss": 0.7959, "grad_norm": 0.6511976718902588, "learning_rate": 0.0002, "epoch": 0.8452423698384202, "step": 11770}, {"loss": 0.7705, "grad_norm": 0.4738382399082184, "learning_rate": 0.0002, "epoch": 0.8459605026929982, "step": 11780}, {"loss": 0.8317, "grad_norm": 0.541780948638916, "learning_rate": 0.0002, "epoch": 0.8466786355475763, "step": 11790}, {"loss": 0.774, "grad_norm": 0.6115241050720215, "learning_rate": 0.0002, "epoch": 0.8473967684021544, "step": 11800}, {"loss": 0.834, "grad_norm": 0.7067801356315613, "learning_rate": 0.0002, "epoch": 0.8481149012567325, "step": 11810}, {"loss": 0.7725, "grad_norm": 0.5602791905403137, "learning_rate": 0.0002, "epoch": 0.8488330341113106, "step": 11820}, {"loss": 0.7832, "grad_norm": 0.6968005299568176, "learning_rate": 0.0002, "epoch": 0.8495511669658887, "step": 11830}, {"loss": 0.7556, "grad_norm": 0.621132493019104, "learning_rate": 0.0002, "epoch": 0.8502692998204668, "step": 11840}, {"loss": 0.8036, "grad_norm": 0.5777568817138672, "learning_rate": 0.0002, "epoch": 0.8509874326750448, "step": 11850}, {"loss": 0.8071, "grad_norm": 0.6468178629875183, "learning_rate": 0.0002, "epoch": 0.851705565529623, "step": 11860}, {"loss": 0.8074, "grad_norm": 0.6216070652008057, "learning_rate": 0.0002, "epoch": 0.8524236983842011, "step": 11870}, {"loss": 0.7736, "grad_norm": 0.7402005791664124, "learning_rate": 0.0002, "epoch": 0.8531418312387792, "step": 11880}, {"loss": 0.7877, "grad_norm": 0.5192958116531372, "learning_rate": 0.0002, "epoch": 0.8538599640933573, "step": 11890}, {"loss": 0.7113, "grad_norm": 0.6050501465797424, "learning_rate": 0.0002, "epoch": 0.8545780969479354, "step": 11900}, {"loss": 0.8131, "grad_norm": 0.5363124012947083, "learning_rate": 0.0002, "epoch": 0.8552962298025135, "step": 11910}, {"loss": 0.7861, "grad_norm": 0.525288462638855, "learning_rate": 0.0002, "epoch": 0.8560143626570915, "step": 11920}, {"loss": 0.726, "grad_norm": 0.6129848957061768, "learning_rate": 0.0002, "epoch": 0.8567324955116696, "step": 11930}, {"loss": 0.7921, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 0.8574506283662477, "step": 11940}, {"loss": 0.772, "grad_norm": 0.5862830281257629, "learning_rate": 0.0002, "epoch": 0.8581687612208259, "step": 11950}, {"loss": 0.7272, "grad_norm": 0.7078025341033936, "learning_rate": 0.0002, "epoch": 0.858886894075404, "step": 11960}, {"loss": 0.7733, "grad_norm": 0.6600908637046814, "learning_rate": 0.0002, "epoch": 0.8596050269299821, "step": 11970}, {"loss": 0.7784, "grad_norm": 0.5914377570152283, "learning_rate": 0.0002, "epoch": 0.8603231597845602, "step": 11980}, {"loss": 0.8222, "grad_norm": 0.7844575047492981, "learning_rate": 0.0002, "epoch": 0.8610412926391382, "step": 11990}, {"loss": 0.8059, "grad_norm": 0.6605148315429688, "learning_rate": 0.0002, "epoch": 0.8617594254937163, "step": 12000}, {"loss": 0.8066, "grad_norm": 0.6320111155509949, "learning_rate": 0.0002, "epoch": 0.8624775583482944, "step": 12010}, {"loss": 0.7844, "grad_norm": 0.5833557844161987, "learning_rate": 0.0002, "epoch": 0.8631956912028725, "step": 12020}, {"loss": 0.8016, "grad_norm": 0.5322666764259338, "learning_rate": 0.0002, "epoch": 0.8639138240574507, "step": 12030}, {"loss": 0.8142, "grad_norm": 0.568696141242981, "learning_rate": 0.0002, "epoch": 0.8646319569120288, "step": 12040}, {"loss": 0.7929, "grad_norm": 0.5739135146141052, "learning_rate": 0.0002, "epoch": 0.8653500897666068, "step": 12050}, {"loss": 0.7877, "grad_norm": 0.6667993068695068, "learning_rate": 0.0002, "epoch": 0.8660682226211849, "step": 12060}, {"loss": 0.7538, "grad_norm": 0.5393701195716858, "learning_rate": 0.0002, "epoch": 0.866786355475763, "step": 12070}, {"loss": 0.8014, "grad_norm": 0.7036312818527222, "learning_rate": 0.0002, "epoch": 0.8675044883303411, "step": 12080}, {"loss": 0.7937, "grad_norm": 0.5851739048957825, "learning_rate": 0.0002, "epoch": 0.8682226211849192, "step": 12090}, {"loss": 0.8121, "grad_norm": 0.6554462909698486, "learning_rate": 0.0002, "epoch": 0.8689407540394973, "step": 12100}, {"loss": 0.8541, "grad_norm": 0.8224838376045227, "learning_rate": 0.0002, "epoch": 0.8696588868940754, "step": 12110}, {"loss": 0.73, "grad_norm": 0.513981819152832, "learning_rate": 0.0002, "epoch": 0.8703770197486534, "step": 12120}, {"loss": 0.7371, "grad_norm": 0.6913988590240479, "learning_rate": 0.0002, "epoch": 0.8710951526032316, "step": 12130}, {"loss": 0.762, "grad_norm": 0.5539003610610962, "learning_rate": 0.0002, "epoch": 0.8718132854578097, "step": 12140}, {"loss": 0.7535, "grad_norm": 0.6216937303543091, "learning_rate": 0.0002, "epoch": 0.8725314183123878, "step": 12150}, {"loss": 0.7344, "grad_norm": 0.5594495534896851, "learning_rate": 0.0002, "epoch": 0.8732495511669659, "step": 12160}, {"loss": 0.7342, "grad_norm": 0.6025309562683105, "learning_rate": 0.0002, "epoch": 0.873967684021544, "step": 12170}, {"loss": 0.7561, "grad_norm": 0.5285239815711975, "learning_rate": 0.0002, "epoch": 0.8746858168761221, "step": 12180}, {"loss": 0.7619, "grad_norm": 1.0394607782363892, "learning_rate": 0.0002, "epoch": 0.8754039497307001, "step": 12190}, {"loss": 0.8111, "grad_norm": 0.5128031373023987, "learning_rate": 0.0002, "epoch": 0.8761220825852782, "step": 12200}, {"loss": 0.8113, "grad_norm": 0.5883685946464539, "learning_rate": 0.0002, "epoch": 0.8768402154398564, "step": 12210}, {"loss": 0.7493, "grad_norm": 0.593204915523529, "learning_rate": 0.0002, "epoch": 0.8775583482944345, "step": 12220}, {"loss": 0.7739, "grad_norm": 0.7141679525375366, "learning_rate": 0.0002, "epoch": 0.8782764811490126, "step": 12230}, {"loss": 0.8155, "grad_norm": 0.6381585597991943, "learning_rate": 0.0002, "epoch": 0.8789946140035907, "step": 12240}, {"loss": 0.7756, "grad_norm": 0.7076981067657471, "learning_rate": 0.0002, "epoch": 0.8797127468581688, "step": 12250}, {"loss": 0.8186, "grad_norm": 0.8046461939811707, "learning_rate": 0.0002, "epoch": 0.8804308797127468, "step": 12260}, {"loss": 0.7615, "grad_norm": 0.635160505771637, "learning_rate": 0.0002, "epoch": 0.8811490125673249, "step": 12270}, {"loss": 0.7695, "grad_norm": 0.6388354301452637, "learning_rate": 0.0002, "epoch": 0.881867145421903, "step": 12280}, {"loss": 0.81, "grad_norm": 0.5612906217575073, "learning_rate": 0.0002, "epoch": 0.8825852782764811, "step": 12290}, {"loss": 0.8055, "grad_norm": 0.6716228723526001, "learning_rate": 0.0002, "epoch": 0.8833034111310593, "step": 12300}, {"loss": 0.757, "grad_norm": 0.6488762497901917, "learning_rate": 0.0002, "epoch": 0.8840215439856374, "step": 12310}, {"loss": 0.7794, "grad_norm": 0.5770853757858276, "learning_rate": 0.0002, "epoch": 0.8847396768402155, "step": 12320}, {"loss": 0.7617, "grad_norm": 0.5006616711616516, "learning_rate": 0.0002, "epoch": 0.8854578096947935, "step": 12330}, {"loss": 0.7512, "grad_norm": 0.6428417563438416, "learning_rate": 0.0002, "epoch": 0.8861759425493716, "step": 12340}, {"loss": 0.796, "grad_norm": 0.5721977949142456, "learning_rate": 0.0002, "epoch": 0.8868940754039497, "step": 12350}, {"loss": 0.7764, "grad_norm": 0.7000266313552856, "learning_rate": 0.0002, "epoch": 0.8876122082585278, "step": 12360}, {"loss": 0.7524, "grad_norm": 0.5252631306648254, "learning_rate": 0.0002, "epoch": 0.8883303411131059, "step": 12370}, {"loss": 0.7635, "grad_norm": 0.5788044929504395, "learning_rate": 0.0002, "epoch": 0.889048473967684, "step": 12380}, {"loss": 0.7856, "grad_norm": 0.6730653643608093, "learning_rate": 0.0002, "epoch": 0.8897666068222622, "step": 12390}, {"loss": 0.7925, "grad_norm": 0.5556851029396057, "learning_rate": 0.0002, "epoch": 0.8904847396768402, "step": 12400}, {"loss": 0.6958, "grad_norm": 0.616189181804657, "learning_rate": 0.0002, "epoch": 0.8912028725314183, "step": 12410}, {"loss": 0.7468, "grad_norm": 0.6360940337181091, "learning_rate": 0.0002, "epoch": 0.8919210053859964, "step": 12420}, {"loss": 0.8088, "grad_norm": 0.5832887887954712, "learning_rate": 0.0002, "epoch": 0.8926391382405745, "step": 12430}, {"loss": 0.7383, "grad_norm": 0.8319168090820312, "learning_rate": 0.0002, "epoch": 0.8933572710951526, "step": 12440}, {"loss": 0.8597, "grad_norm": 0.5415005087852478, "learning_rate": 0.0002, "epoch": 0.8940754039497307, "step": 12450}, {"loss": 0.7439, "grad_norm": 0.4959808588027954, "learning_rate": 0.0002, "epoch": 0.8947935368043088, "step": 12460}, {"loss": 0.8493, "grad_norm": 0.5102260708808899, "learning_rate": 0.0002, "epoch": 0.8955116696588868, "step": 12470}, {"loss": 0.7274, "grad_norm": 0.773972749710083, "learning_rate": 0.0002, "epoch": 0.896229802513465, "step": 12480}, {"loss": 0.7797, "grad_norm": 0.6314513087272644, "learning_rate": 0.0002, "epoch": 0.8969479353680431, "step": 12490}, {"loss": 0.7839, "grad_norm": 0.6503705382347107, "learning_rate": 0.0002, "epoch": 0.8976660682226212, "step": 12500}, {"loss": 0.8177, "grad_norm": 0.5702024102210999, "learning_rate": 0.0002, "epoch": 0.8983842010771993, "step": 12510}, {"loss": 0.7448, "grad_norm": 0.7222756743431091, "learning_rate": 0.0002, "epoch": 0.8991023339317774, "step": 12520}, {"loss": 0.7779, "grad_norm": 0.7242336869239807, "learning_rate": 0.0002, "epoch": 0.8998204667863555, "step": 12530}, {"loss": 0.7577, "grad_norm": 0.625769317150116, "learning_rate": 0.0002, "epoch": 0.9005385996409335, "step": 12540}, {"loss": 0.8528, "grad_norm": 0.6003357172012329, "learning_rate": 0.0002, "epoch": 0.9012567324955116, "step": 12550}, {"loss": 0.7871, "grad_norm": 0.6089374423027039, "learning_rate": 0.0002, "epoch": 0.9019748653500897, "step": 12560}, {"loss": 0.74, "grad_norm": 0.6232544183731079, "learning_rate": 0.0002, "epoch": 0.9026929982046679, "step": 12570}, {"loss": 0.7993, "grad_norm": 0.5426769256591797, "learning_rate": 0.0002, "epoch": 0.903411131059246, "step": 12580}, {"loss": 0.8023, "grad_norm": 0.5711943507194519, "learning_rate": 0.0002, "epoch": 0.9041292639138241, "step": 12590}, {"loss": 0.7915, "grad_norm": 0.5287838578224182, "learning_rate": 0.0002, "epoch": 0.9048473967684022, "step": 12600}, {"loss": 0.7394, "grad_norm": 0.6192951798439026, "learning_rate": 0.0002, "epoch": 0.9055655296229802, "step": 12610}, {"loss": 0.7547, "grad_norm": 0.493082195520401, "learning_rate": 0.0002, "epoch": 0.9062836624775583, "step": 12620}, {"loss": 0.7604, "grad_norm": 0.7668463587760925, "learning_rate": 0.0002, "epoch": 0.9070017953321364, "step": 12630}, {"loss": 0.8079, "grad_norm": 0.6298037767410278, "learning_rate": 0.0002, "epoch": 0.9077199281867145, "step": 12640}, {"loss": 0.7451, "grad_norm": 0.5502580404281616, "learning_rate": 0.0002, "epoch": 0.9084380610412927, "step": 12650}, {"loss": 0.763, "grad_norm": 0.5525170564651489, "learning_rate": 0.0002, "epoch": 0.9091561938958708, "step": 12660}, {"loss": 0.7579, "grad_norm": 0.9753695726394653, "learning_rate": 0.0002, "epoch": 0.9098743267504489, "step": 12670}, {"loss": 0.872, "grad_norm": 0.611427366733551, "learning_rate": 0.0002, "epoch": 0.9105924596050269, "step": 12680}, {"loss": 0.7786, "grad_norm": 0.5141594409942627, "learning_rate": 0.0002, "epoch": 0.911310592459605, "step": 12690}, {"loss": 0.7384, "grad_norm": 0.6739137172698975, "learning_rate": 0.0002, "epoch": 0.9120287253141831, "step": 12700}, {"loss": 0.8579, "grad_norm": 0.5759707689285278, "learning_rate": 0.0002, "epoch": 0.9127468581687612, "step": 12710}, {"loss": 0.7559, "grad_norm": 0.5548733472824097, "learning_rate": 0.0002, "epoch": 0.9134649910233393, "step": 12720}, {"loss": 0.8225, "grad_norm": 0.7014280557632446, "learning_rate": 0.0002, "epoch": 0.9141831238779174, "step": 12730}, {"loss": 0.7936, "grad_norm": 0.5939958691596985, "learning_rate": 0.0002, "epoch": 0.9149012567324956, "step": 12740}, {"loss": 0.7756, "grad_norm": 0.5995593667030334, "learning_rate": 0.0002, "epoch": 0.9156193895870736, "step": 12750}, {"loss": 0.7423, "grad_norm": 0.6686680316925049, "learning_rate": 0.0002, "epoch": 0.9163375224416517, "step": 12760}, {"loss": 0.8057, "grad_norm": 0.4742372930049896, "learning_rate": 0.0002, "epoch": 0.9170556552962298, "step": 12770}, {"loss": 0.7795, "grad_norm": 0.5493217706680298, "learning_rate": 0.0002, "epoch": 0.9177737881508079, "step": 12780}, {"loss": 0.7859, "grad_norm": 0.5641885995864868, "learning_rate": 0.0002, "epoch": 0.918491921005386, "step": 12790}, {"loss": 0.7775, "grad_norm": 0.5814061164855957, "learning_rate": 0.0002, "epoch": 0.9192100538599641, "step": 12800}, {"loss": 0.8204, "grad_norm": 0.6774331331253052, "learning_rate": 0.0002, "epoch": 0.9199281867145422, "step": 12810}, {"loss": 0.8205, "grad_norm": 0.5592127442359924, "learning_rate": 0.0002, "epoch": 0.9206463195691202, "step": 12820}, {"loss": 0.7788, "grad_norm": 0.5246456861495972, "learning_rate": 0.0002, "epoch": 0.9213644524236984, "step": 12830}, {"loss": 0.7886, "grad_norm": 0.6524264812469482, "learning_rate": 0.0002, "epoch": 0.9220825852782765, "step": 12840}, {"loss": 0.796, "grad_norm": 0.6010791063308716, "learning_rate": 0.0002, "epoch": 0.9228007181328546, "step": 12850}, {"loss": 0.7998, "grad_norm": 0.5289866924285889, "learning_rate": 0.0002, "epoch": 0.9235188509874327, "step": 12860}, {"loss": 0.7582, "grad_norm": 0.6850762367248535, "learning_rate": 0.0002, "epoch": 0.9242369838420108, "step": 12870}, {"loss": 0.7894, "grad_norm": 0.5293797850608826, "learning_rate": 0.0002, "epoch": 0.9249551166965889, "step": 12880}, {"loss": 0.7738, "grad_norm": 0.6045399308204651, "learning_rate": 0.0002, "epoch": 0.9256732495511669, "step": 12890}, {"loss": 0.7207, "grad_norm": 0.7026739716529846, "learning_rate": 0.0002, "epoch": 0.926391382405745, "step": 12900}, {"loss": 0.7726, "grad_norm": 0.6884756684303284, "learning_rate": 0.0002, "epoch": 0.9271095152603231, "step": 12910}, {"loss": 0.7913, "grad_norm": 0.637884795665741, "learning_rate": 0.0002, "epoch": 0.9278276481149013, "step": 12920}, {"loss": 0.7513, "grad_norm": 0.513913631439209, "learning_rate": 0.0002, "epoch": 0.9285457809694794, "step": 12930}, {"loss": 0.8, "grad_norm": 0.6642340421676636, "learning_rate": 0.0002, "epoch": 0.9292639138240575, "step": 12940}, {"loss": 0.8026, "grad_norm": 0.5708861947059631, "learning_rate": 0.0002, "epoch": 0.9299820466786356, "step": 12950}, {"loss": 0.8234, "grad_norm": 0.5896512866020203, "learning_rate": 0.0002, "epoch": 0.9307001795332136, "step": 12960}, {"loss": 0.77, "grad_norm": 0.5754874348640442, "learning_rate": 0.0002, "epoch": 0.9314183123877917, "step": 12970}, {"loss": 0.7594, "grad_norm": 0.6363751888275146, "learning_rate": 0.0002, "epoch": 0.9321364452423698, "step": 12980}, {"loss": 0.7898, "grad_norm": 0.7660197019577026, "learning_rate": 0.0002, "epoch": 0.9328545780969479, "step": 12990}, {"loss": 0.792, "grad_norm": 0.607728898525238, "learning_rate": 0.0002, "epoch": 0.933572710951526, "step": 13000}, {"loss": 0.734, "grad_norm": 0.5257042050361633, "learning_rate": 0.0002, "epoch": 0.9342908438061042, "step": 13010}, {"loss": 0.8129, "grad_norm": 0.7916908264160156, "learning_rate": 0.0002, "epoch": 0.9350089766606823, "step": 13020}, {"loss": 0.81, "grad_norm": 0.8310123085975647, "learning_rate": 0.0002, "epoch": 0.9357271095152603, "step": 13030}, {"loss": 0.7738, "grad_norm": 0.6543728113174438, "learning_rate": 0.0002, "epoch": 0.9364452423698384, "step": 13040}, {"loss": 0.7797, "grad_norm": 0.7153878808021545, "learning_rate": 0.0002, "epoch": 0.9371633752244165, "step": 13050}, {"loss": 0.779, "grad_norm": 0.7510694265365601, "learning_rate": 0.0002, "epoch": 0.9378815080789946, "step": 13060}, {"loss": 0.7761, "grad_norm": 0.5524464249610901, "learning_rate": 0.0002, "epoch": 0.9385996409335727, "step": 13070}, {"loss": 0.8635, "grad_norm": 0.6657140254974365, "learning_rate": 0.0002, "epoch": 0.9393177737881508, "step": 13080}, {"loss": 0.8097, "grad_norm": 0.5757394433021545, "learning_rate": 0.0002, "epoch": 0.940035906642729, "step": 13090}, {"loss": 0.7967, "grad_norm": 0.6171187162399292, "learning_rate": 0.0002, "epoch": 0.940754039497307, "step": 13100}, {"loss": 0.8197, "grad_norm": 0.5946314334869385, "learning_rate": 0.0002, "epoch": 0.9414721723518851, "step": 13110}, {"loss": 0.7184, "grad_norm": 0.5727229714393616, "learning_rate": 0.0002, "epoch": 0.9421903052064632, "step": 13120}, {"loss": 0.7981, "grad_norm": 0.7805224061012268, "learning_rate": 0.0002, "epoch": 0.9429084380610413, "step": 13130}, {"loss": 0.8045, "grad_norm": 0.5763523578643799, "learning_rate": 0.0002, "epoch": 0.9436265709156194, "step": 13140}, {"loss": 0.7462, "grad_norm": 0.8310899138450623, "learning_rate": 0.0002, "epoch": 0.9443447037701975, "step": 13150}, {"loss": 0.7818, "grad_norm": 0.7531784772872925, "learning_rate": 0.0002, "epoch": 0.9450628366247756, "step": 13160}, {"loss": 0.8418, "grad_norm": 0.678779661655426, "learning_rate": 0.0002, "epoch": 0.9457809694793536, "step": 13170}, {"loss": 0.8064, "grad_norm": 0.8096453547477722, "learning_rate": 0.0002, "epoch": 0.9464991023339318, "step": 13180}, {"loss": 0.7676, "grad_norm": 0.6743921637535095, "learning_rate": 0.0002, "epoch": 0.9472172351885099, "step": 13190}, {"loss": 0.7949, "grad_norm": 0.606852114200592, "learning_rate": 0.0002, "epoch": 0.947935368043088, "step": 13200}, {"loss": 0.7908, "grad_norm": 0.6550270915031433, "learning_rate": 0.0002, "epoch": 0.9486535008976661, "step": 13210}, {"loss": 0.7564, "grad_norm": 0.6494552493095398, "learning_rate": 0.0002, "epoch": 0.9493716337522442, "step": 13220}, {"loss": 0.7974, "grad_norm": 0.5867666602134705, "learning_rate": 0.0002, "epoch": 0.9500897666068223, "step": 13230}, {"loss": 0.8117, "grad_norm": 0.6283786296844482, "learning_rate": 0.0002, "epoch": 0.9508078994614003, "step": 13240}, {"loss": 0.7775, "grad_norm": 0.6824573278427124, "learning_rate": 0.0002, "epoch": 0.9515260323159784, "step": 13250}, {"loss": 0.7674, "grad_norm": 0.6945744156837463, "learning_rate": 0.0002, "epoch": 0.9522441651705565, "step": 13260}, {"loss": 0.7384, "grad_norm": 0.6468575596809387, "learning_rate": 0.0002, "epoch": 0.9529622980251347, "step": 13270}, {"loss": 0.7548, "grad_norm": 0.6819407939910889, "learning_rate": 0.0002, "epoch": 0.9536804308797128, "step": 13280}, {"loss": 0.7933, "grad_norm": 0.6660491824150085, "learning_rate": 0.0002, "epoch": 0.9543985637342909, "step": 13290}, {"loss": 0.7293, "grad_norm": 0.6320462226867676, "learning_rate": 0.0002, "epoch": 0.9551166965888689, "step": 13300}, {"loss": 0.8122, "grad_norm": 0.46753761172294617, "learning_rate": 0.0002, "epoch": 0.955834829443447, "step": 13310}, {"loss": 0.7953, "grad_norm": 0.6608774065971375, "learning_rate": 0.0002, "epoch": 0.9565529622980251, "step": 13320}, {"loss": 0.8217, "grad_norm": 0.607448935508728, "learning_rate": 0.0002, "epoch": 0.9572710951526032, "step": 13330}, {"loss": 0.7278, "grad_norm": 0.6796701550483704, "learning_rate": 0.0002, "epoch": 0.9579892280071813, "step": 13340}, {"loss": 0.7979, "grad_norm": 0.7655861377716064, "learning_rate": 0.0002, "epoch": 0.9587073608617595, "step": 13350}, {"loss": 0.7822, "grad_norm": 0.5881335735321045, "learning_rate": 0.0002, "epoch": 0.9594254937163376, "step": 13360}, {"loss": 0.815, "grad_norm": 0.6855270862579346, "learning_rate": 0.0002, "epoch": 0.9601436265709156, "step": 13370}, {"loss": 0.8025, "grad_norm": 0.6072475910186768, "learning_rate": 0.0002, "epoch": 0.9608617594254937, "step": 13380}, {"loss": 0.7756, "grad_norm": 0.5983994603157043, "learning_rate": 0.0002, "epoch": 0.9615798922800718, "step": 13390}, {"loss": 0.8121, "grad_norm": 0.6141189932823181, "learning_rate": 0.0002, "epoch": 0.9622980251346499, "step": 13400}, {"loss": 0.8059, "grad_norm": 0.6539722084999084, "learning_rate": 0.0002, "epoch": 0.963016157989228, "step": 13410}, {"loss": 0.8085, "grad_norm": 0.5425801277160645, "learning_rate": 0.0002, "epoch": 0.9637342908438061, "step": 13420}, {"loss": 0.7687, "grad_norm": 0.8038925528526306, "learning_rate": 0.0002, "epoch": 0.9644524236983842, "step": 13430}, {"loss": 0.8015, "grad_norm": 0.5729590058326721, "learning_rate": 0.0002, "epoch": 0.9651705565529622, "step": 13440}, {"loss": 0.782, "grad_norm": 0.5695241689682007, "learning_rate": 0.0002, "epoch": 0.9658886894075404, "step": 13450}, {"loss": 0.7984, "grad_norm": 0.5913681387901306, "learning_rate": 0.0002, "epoch": 0.9666068222621185, "step": 13460}, {"loss": 0.7947, "grad_norm": 1.1798994541168213, "learning_rate": 0.0002, "epoch": 0.9673249551166966, "step": 13470}, {"loss": 0.7342, "grad_norm": 0.5931369066238403, "learning_rate": 0.0002, "epoch": 0.9680430879712747, "step": 13480}, {"loss": 0.8432, "grad_norm": 0.6269514560699463, "learning_rate": 0.0002, "epoch": 0.9687612208258528, "step": 13490}, {"loss": 0.7357, "grad_norm": 0.7380245327949524, "learning_rate": 0.0002, "epoch": 0.9694793536804309, "step": 13500}, {"loss": 0.8006, "grad_norm": 0.5668187141418457, "learning_rate": 0.0002, "epoch": 0.9701974865350089, "step": 13510}, {"loss": 0.7562, "grad_norm": 0.547149121761322, "learning_rate": 0.0002, "epoch": 0.970915619389587, "step": 13520}, {"loss": 0.8239, "grad_norm": 0.49131739139556885, "learning_rate": 0.0002, "epoch": 0.9716337522441651, "step": 13530}, {"loss": 0.8159, "grad_norm": 0.6385366320610046, "learning_rate": 0.0002, "epoch": 0.9723518850987433, "step": 13540}, {"loss": 0.7882, "grad_norm": 0.5962417125701904, "learning_rate": 0.0002, "epoch": 0.9730700179533214, "step": 13550}, {"loss": 0.7353, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9737881508078995, "step": 13560}, {"loss": 0.7511, "grad_norm": 0.5757403373718262, "learning_rate": 0.0002, "epoch": 0.9745062836624776, "step": 13570}, {"loss": 0.7858, "grad_norm": 0.7214667201042175, "learning_rate": 0.0002, "epoch": 0.9752244165170556, "step": 13580}, {"loss": 0.7492, "grad_norm": 0.5902701020240784, "learning_rate": 0.0002, "epoch": 0.9759425493716337, "step": 13590}, {"loss": 0.8177, "grad_norm": 0.752805769443512, "learning_rate": 0.0002, "epoch": 0.9766606822262118, "step": 13600}, {"loss": 0.7622, "grad_norm": 0.5943595767021179, "learning_rate": 0.0002, "epoch": 0.9773788150807899, "step": 13610}, {"loss": 0.7781, "grad_norm": 0.6752488613128662, "learning_rate": 0.0002, "epoch": 0.978096947935368, "step": 13620}, {"loss": 0.8022, "grad_norm": 0.5295413732528687, "learning_rate": 0.0002, "epoch": 0.9788150807899462, "step": 13630}, {"loss": 0.7462, "grad_norm": 0.732549250125885, "learning_rate": 0.0002, "epoch": 0.9795332136445243, "step": 13640}, {"loss": 0.7939, "grad_norm": 0.5701823830604553, "learning_rate": 0.0002, "epoch": 0.9802513464991023, "step": 13650}, {"loss": 0.7609, "grad_norm": 0.576898455619812, "learning_rate": 0.0002, "epoch": 0.9809694793536804, "step": 13660}, {"loss": 0.7576, "grad_norm": 0.5916832089424133, "learning_rate": 0.0002, "epoch": 0.9816876122082585, "step": 13670}, {"loss": 0.7587, "grad_norm": 0.5554524660110474, "learning_rate": 0.0002, "epoch": 0.9824057450628366, "step": 13680}, {"loss": 0.8274, "grad_norm": 0.6988440752029419, "learning_rate": 0.0002, "epoch": 0.9831238779174147, "step": 13690}, {"loss": 0.7485, "grad_norm": 0.6660445332527161, "learning_rate": 0.0002, "epoch": 0.9838420107719928, "step": 13700}, {"loss": 0.7609, "grad_norm": 2.421210289001465, "learning_rate": 0.0002, "epoch": 0.984560143626571, "step": 13710}, {"loss": 0.784, "grad_norm": 0.6307598948478699, "learning_rate": 0.0002, "epoch": 0.985278276481149, "step": 13720}, {"loss": 0.7757, "grad_norm": 0.6832480430603027, "learning_rate": 0.0002, "epoch": 0.9859964093357271, "step": 13730}, {"loss": 0.8064, "grad_norm": 0.5974255204200745, "learning_rate": 0.0002, "epoch": 0.9867145421903052, "step": 13740}, {"loss": 0.7871, "grad_norm": 0.6540380716323853, "learning_rate": 0.0002, "epoch": 0.9874326750448833, "step": 13750}, {"loss": 0.7735, "grad_norm": 0.7532727122306824, "learning_rate": 0.0002, "epoch": 0.9881508078994614, "step": 13760}, {"loss": 0.7392, "grad_norm": 0.6776283383369446, "learning_rate": 0.0002, "epoch": 0.9888689407540395, "step": 13770}, {"loss": 0.7852, "grad_norm": 0.5776281356811523, "learning_rate": 0.0002, "epoch": 0.9895870736086176, "step": 13780}, {"loss": 0.8216, "grad_norm": 0.5473008751869202, "learning_rate": 0.0002, "epoch": 0.9903052064631956, "step": 13790}, {"loss": 0.7776, "grad_norm": 0.5428591370582581, "learning_rate": 0.0002, "epoch": 0.9910233393177738, "step": 13800}, {"loss": 0.7823, "grad_norm": 0.5173406004905701, "learning_rate": 0.0002, "epoch": 0.9917414721723519, "step": 13810}, {"loss": 0.762, "grad_norm": 0.6462617516517639, "learning_rate": 0.0002, "epoch": 0.99245960502693, "step": 13820}, {"loss": 0.7656, "grad_norm": 0.5800426006317139, "learning_rate": 0.0002, "epoch": 0.9931777378815081, "step": 13830}, {"loss": 0.8028, "grad_norm": 0.5015466809272766, "learning_rate": 0.0002, "epoch": 0.9938958707360862, "step": 13840}, {"loss": 0.7782, "grad_norm": 0.59474778175354, "learning_rate": 0.0002, "epoch": 0.9946140035906643, "step": 13850}, {"loss": 0.7891, "grad_norm": 0.5609583258628845, "learning_rate": 0.0002, "epoch": 0.9953321364452423, "step": 13860}, {"loss": 0.7647, "grad_norm": 0.5762063264846802, "learning_rate": 0.0002, "epoch": 0.9960502692998204, "step": 13870}, {"loss": 0.7594, "grad_norm": 0.6419214010238647, "learning_rate": 0.0002, "epoch": 0.9967684021543985, "step": 13880}, {"loss": 0.7599, "grad_norm": 0.7821950316429138, "learning_rate": 0.0002, "epoch": 0.9974865350089767, "step": 13890}, {"loss": 0.7529, "grad_norm": 0.6216017007827759, "learning_rate": 0.0002, "epoch": 0.9982046678635548, "step": 13900}, {"loss": 0.7621, "grad_norm": 0.5446485877037048, "learning_rate": 0.0002, "epoch": 0.9989228007181329, "step": 13910}, {"loss": 0.74, "grad_norm": 0.5037565231323242, "learning_rate": 0.0002, "epoch": 0.999640933572711, "step": 13920}, {"eval_loss": 1.09147310256958, "eval_runtime": 55.1915, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 1.0, "step": 13925}, {"loss": 0.7479, "grad_norm": 0.5808277130126953, "learning_rate": 0.0002, "epoch": 1.000359066427289, "step": 13930}, {"loss": 0.7147, "grad_norm": 0.47258496284484863, "learning_rate": 0.0002, "epoch": 1.0010771992818672, "step": 13940}, {"loss": 0.7075, "grad_norm": 0.8921670317649841, "learning_rate": 0.0002, "epoch": 1.0017953321364452, "step": 13950}, {"loss": 0.7737, "grad_norm": 0.746729850769043, "learning_rate": 0.0002, "epoch": 1.0025134649910232, "step": 13960}, {"loss": 0.6912, "grad_norm": 0.6243796944618225, "learning_rate": 0.0002, "epoch": 1.0032315978456015, "step": 13970}, {"loss": 0.7171, "grad_norm": 0.6725090742111206, "learning_rate": 0.0002, "epoch": 1.0039497307001795, "step": 13980}, {"loss": 0.7094, "grad_norm": 0.8762497305870056, "learning_rate": 0.0002, "epoch": 1.0046678635547577, "step": 13990}, {"loss": 0.7183, "grad_norm": 0.7694411873817444, "learning_rate": 0.0002, "epoch": 1.0053859964093357, "step": 14000}, {"loss": 0.7741, "grad_norm": 0.6208822727203369, "learning_rate": 0.0002, "epoch": 1.006104129263914, "step": 14010}, {"loss": 0.7291, "grad_norm": 0.8503357768058777, "learning_rate": 0.0002, "epoch": 1.006822262118492, "step": 14020}, {"loss": 0.7189, "grad_norm": 0.5813316106796265, "learning_rate": 0.0002, "epoch": 1.00754039497307, "step": 14030}, {"loss": 0.751, "grad_norm": 0.8186036348342896, "learning_rate": 0.0002, "epoch": 1.0082585278276481, "step": 14040}, {"loss": 0.7205, "grad_norm": 0.759873628616333, "learning_rate": 0.0002, "epoch": 1.0089766606822261, "step": 14050}, {"loss": 0.7517, "grad_norm": 0.8437777161598206, "learning_rate": 0.0002, "epoch": 1.0096947935368044, "step": 14060}, {"loss": 0.7205, "grad_norm": 0.5750975012779236, "learning_rate": 0.0002, "epoch": 1.0104129263913824, "step": 14070}, {"loss": 0.7079, "grad_norm": 0.5873221158981323, "learning_rate": 0.0002, "epoch": 1.0111310592459606, "step": 14080}, {"loss": 0.7645, "grad_norm": 0.6381314396858215, "learning_rate": 0.0002, "epoch": 1.0118491921005386, "step": 14090}, {"loss": 0.7246, "grad_norm": 0.6510405540466309, "learning_rate": 0.0002, "epoch": 1.0125673249551166, "step": 14100}, {"loss": 0.6906, "grad_norm": 0.7698671221733093, "learning_rate": 0.0002, "epoch": 1.0132854578096948, "step": 14110}, {"loss": 0.7008, "grad_norm": 0.646180272102356, "learning_rate": 0.0002, "epoch": 1.0140035906642728, "step": 14120}, {"loss": 0.7446, "grad_norm": 0.6183205246925354, "learning_rate": 0.0002, "epoch": 1.014721723518851, "step": 14130}, {"loss": 0.747, "grad_norm": 0.5082563757896423, "learning_rate": 0.0002, "epoch": 1.015439856373429, "step": 14140}, {"loss": 0.7229, "grad_norm": 0.7285500764846802, "learning_rate": 0.0002, "epoch": 1.0161579892280073, "step": 14150}, {"loss": 0.6879, "grad_norm": 0.6368175148963928, "learning_rate": 0.0002, "epoch": 1.0168761220825853, "step": 14160}, {"loss": 0.712, "grad_norm": 0.44868743419647217, "learning_rate": 0.0002, "epoch": 1.0175942549371633, "step": 14170}, {"loss": 0.7299, "grad_norm": 0.6346513628959656, "learning_rate": 0.0002, "epoch": 1.0183123877917415, "step": 14180}, {"loss": 0.7099, "grad_norm": 0.7287803292274475, "learning_rate": 0.0002, "epoch": 1.0190305206463195, "step": 14190}, {"loss": 0.6915, "grad_norm": 0.6701363325119019, "learning_rate": 0.0002, "epoch": 1.0197486535008977, "step": 14200}, {"loss": 0.7389, "grad_norm": 0.6419289112091064, "learning_rate": 0.0002, "epoch": 1.0204667863554757, "step": 14210}, {"loss": 0.7386, "grad_norm": 0.7703002095222473, "learning_rate": 0.0002, "epoch": 1.021184919210054, "step": 14220}, {"loss": 0.6819, "grad_norm": 0.6803670525550842, "learning_rate": 0.0002, "epoch": 1.021903052064632, "step": 14230}, {"loss": 0.74, "grad_norm": 0.5780976414680481, "learning_rate": 0.0002, "epoch": 1.02262118491921, "step": 14240}, {"loss": 0.6912, "grad_norm": 0.5096051096916199, "learning_rate": 0.0002, "epoch": 1.0233393177737882, "step": 14250}, {"loss": 0.7585, "grad_norm": 0.6058611869812012, "learning_rate": 0.0002, "epoch": 1.0240574506283662, "step": 14260}, {"loss": 0.7542, "grad_norm": 0.6703311204910278, "learning_rate": 0.0002, "epoch": 1.0247755834829444, "step": 14270}, {"loss": 0.7541, "grad_norm": 0.7143640518188477, "learning_rate": 0.0002, "epoch": 1.0254937163375224, "step": 14280}, {"loss": 0.7411, "grad_norm": 0.6730744242668152, "learning_rate": 0.0002, "epoch": 1.0262118491921006, "step": 14290}, {"loss": 0.7072, "grad_norm": 0.8180603384971619, "learning_rate": 0.0002, "epoch": 1.0269299820466786, "step": 14300}, {"loss": 0.6944, "grad_norm": 0.6752267479896545, "learning_rate": 0.0002, "epoch": 1.0276481149012566, "step": 14310}, {"loss": 0.7105, "grad_norm": 0.678428590297699, "learning_rate": 0.0002, "epoch": 1.0283662477558349, "step": 14320}, {"loss": 0.7496, "grad_norm": 0.5959973931312561, "learning_rate": 0.0002, "epoch": 1.0290843806104129, "step": 14330}, {"loss": 0.7196, "grad_norm": 0.5797176957130432, "learning_rate": 0.0002, "epoch": 1.029802513464991, "step": 14340}, {"loss": 0.7853, "grad_norm": 0.6415652632713318, "learning_rate": 0.0002, "epoch": 1.030520646319569, "step": 14350}, {"loss": 0.7297, "grad_norm": 0.6984175443649292, "learning_rate": 0.0002, "epoch": 1.0312387791741473, "step": 14360}, {"loss": 0.7715, "grad_norm": 0.7158452272415161, "learning_rate": 0.0002, "epoch": 1.0319569120287253, "step": 14370}, {"loss": 0.7526, "grad_norm": 0.6066089272499084, "learning_rate": 0.0002, "epoch": 1.0326750448833033, "step": 14380}, {"loss": 0.7639, "grad_norm": 0.7359582781791687, "learning_rate": 0.0002, "epoch": 1.0333931777378815, "step": 14390}, {"loss": 0.7445, "grad_norm": 0.7372373938560486, "learning_rate": 0.0002, "epoch": 1.0341113105924595, "step": 14400}, {"loss": 0.7262, "grad_norm": 0.7511868476867676, "learning_rate": 0.0002, "epoch": 1.0348294434470378, "step": 14410}, {"loss": 0.7145, "grad_norm": 0.5449917912483215, "learning_rate": 0.0002, "epoch": 1.0355475763016158, "step": 14420}, {"loss": 0.6908, "grad_norm": 0.6700817346572876, "learning_rate": 0.0002, "epoch": 1.036265709156194, "step": 14430}, {"loss": 0.7237, "grad_norm": 0.7061316967010498, "learning_rate": 0.0002, "epoch": 1.036983842010772, "step": 14440}, {"loss": 0.7166, "grad_norm": 0.7582663893699646, "learning_rate": 0.0002, "epoch": 1.03770197486535, "step": 14450}, {"loss": 0.7447, "grad_norm": 0.6408873200416565, "learning_rate": 0.0002, "epoch": 1.0384201077199282, "step": 14460}, {"loss": 0.728, "grad_norm": 0.7645436525344849, "learning_rate": 0.0002, "epoch": 1.0391382405745062, "step": 14470}, {"loss": 0.7764, "grad_norm": 0.6522644758224487, "learning_rate": 0.0002, "epoch": 1.0398563734290844, "step": 14480}, {"loss": 0.7249, "grad_norm": 0.784273624420166, "learning_rate": 0.0002, "epoch": 1.0405745062836624, "step": 14490}, {"loss": 0.7173, "grad_norm": 0.673891544342041, "learning_rate": 0.0002, "epoch": 1.0412926391382407, "step": 14500}, {"loss": 0.6647, "grad_norm": 0.6566316485404968, "learning_rate": 0.0002, "epoch": 1.0420107719928187, "step": 14510}, {"loss": 0.7626, "grad_norm": 0.6062059998512268, "learning_rate": 0.0002, "epoch": 1.0427289048473967, "step": 14520}, {"loss": 0.7061, "grad_norm": 0.6884504556655884, "learning_rate": 0.0002, "epoch": 1.0434470377019749, "step": 14530}, {"loss": 0.7293, "grad_norm": 0.6642231345176697, "learning_rate": 0.0002, "epoch": 1.044165170556553, "step": 14540}, {"loss": 0.7084, "grad_norm": 0.6989523768424988, "learning_rate": 0.0002, "epoch": 1.0448833034111311, "step": 14550}, {"loss": 0.7751, "grad_norm": 0.8179892301559448, "learning_rate": 0.0002, "epoch": 1.0456014362657091, "step": 14560}, {"loss": 0.7225, "grad_norm": 0.6426970362663269, "learning_rate": 0.0002, "epoch": 1.0463195691202873, "step": 14570}, {"loss": 0.7756, "grad_norm": 0.678445041179657, "learning_rate": 0.0002, "epoch": 1.0470377019748653, "step": 14580}, {"loss": 0.7172, "grad_norm": 0.7573820352554321, "learning_rate": 0.0002, "epoch": 1.0477558348294433, "step": 14590}, {"loss": 0.8092, "grad_norm": 0.734443724155426, "learning_rate": 0.0002, "epoch": 1.0484739676840216, "step": 14600}, {"loss": 0.7205, "grad_norm": 0.7333676218986511, "learning_rate": 0.0002, "epoch": 1.0491921005385996, "step": 14610}, {"loss": 0.7276, "grad_norm": 0.6122187972068787, "learning_rate": 0.0002, "epoch": 1.0499102333931778, "step": 14620}, {"loss": 0.7051, "grad_norm": 0.6916412711143494, "learning_rate": 0.0002, "epoch": 1.0506283662477558, "step": 14630}, {"loss": 0.7315, "grad_norm": 0.5898127555847168, "learning_rate": 0.0002, "epoch": 1.051346499102334, "step": 14640}, {"loss": 0.7293, "grad_norm": 0.6071873307228088, "learning_rate": 0.0002, "epoch": 1.052064631956912, "step": 14650}, {"loss": 0.7924, "grad_norm": 0.6530455946922302, "learning_rate": 0.0002, "epoch": 1.05278276481149, "step": 14660}, {"loss": 0.7055, "grad_norm": 0.6919314861297607, "learning_rate": 0.0002, "epoch": 1.0535008976660682, "step": 14670}, {"loss": 0.7481, "grad_norm": 0.7843509912490845, "learning_rate": 0.0002, "epoch": 1.0542190305206462, "step": 14680}, {"loss": 0.7253, "grad_norm": 0.6106747388839722, "learning_rate": 0.0002, "epoch": 1.0549371633752245, "step": 14690}, {"loss": 0.7206, "grad_norm": 0.7828368544578552, "learning_rate": 0.0002, "epoch": 1.0556552962298025, "step": 14700}, {"loss": 0.6933, "grad_norm": 0.6772044897079468, "learning_rate": 0.0002, "epoch": 1.0563734290843807, "step": 14710}, {"loss": 0.6851, "grad_norm": 0.5430962443351746, "learning_rate": 0.0002, "epoch": 1.0570915619389587, "step": 14720}, {"loss": 0.7306, "grad_norm": 0.7364194989204407, "learning_rate": 0.0002, "epoch": 1.0578096947935367, "step": 14730}, {"loss": 0.703, "grad_norm": 0.5607585310935974, "learning_rate": 0.0002, "epoch": 1.058527827648115, "step": 14740}, {"loss": 0.7488, "grad_norm": 0.7917081713676453, "learning_rate": 0.0002, "epoch": 1.059245960502693, "step": 14750}, {"loss": 0.71, "grad_norm": 0.7852025628089905, "learning_rate": 0.0002, "epoch": 1.0599640933572712, "step": 14760}, {"loss": 0.7093, "grad_norm": 0.6329161524772644, "learning_rate": 0.0002, "epoch": 1.0606822262118492, "step": 14770}, {"loss": 0.7244, "grad_norm": 0.7607306838035583, "learning_rate": 0.0002, "epoch": 1.0614003590664274, "step": 14780}, {"loss": 0.7237, "grad_norm": 0.7236617207527161, "learning_rate": 0.0002, "epoch": 1.0621184919210054, "step": 14790}, {"loss": 0.7133, "grad_norm": 0.793542206287384, "learning_rate": 0.0002, "epoch": 1.0628366247755834, "step": 14800}, {"loss": 0.7482, "grad_norm": 0.53999263048172, "learning_rate": 0.0002, "epoch": 1.0635547576301616, "step": 14810}, {"loss": 0.732, "grad_norm": 0.5821034908294678, "learning_rate": 0.0002, "epoch": 1.0642728904847396, "step": 14820}, {"loss": 0.7066, "grad_norm": 0.6593600511550903, "learning_rate": 0.0002, "epoch": 1.0649910233393178, "step": 14830}, {"loss": 0.7458, "grad_norm": 0.70230633020401, "learning_rate": 0.0002, "epoch": 1.0657091561938958, "step": 14840}, {"loss": 0.7244, "grad_norm": 0.5715264081954956, "learning_rate": 0.0002, "epoch": 1.066427289048474, "step": 14850}, {"loss": 0.723, "grad_norm": 0.6610119938850403, "learning_rate": 0.0002, "epoch": 1.067145421903052, "step": 14860}, {"loss": 0.745, "grad_norm": 0.5470091700553894, "learning_rate": 0.0002, "epoch": 1.06786355475763, "step": 14870}, {"loss": 0.7464, "grad_norm": 0.7529906630516052, "learning_rate": 0.0002, "epoch": 1.0685816876122083, "step": 14880}, {"loss": 0.7421, "grad_norm": 0.7532844543457031, "learning_rate": 0.0002, "epoch": 1.0692998204667863, "step": 14890}, {"loss": 0.6706, "grad_norm": 0.6439316868782043, "learning_rate": 0.0002, "epoch": 1.0700179533213645, "step": 14900}, {"loss": 0.7276, "grad_norm": 0.5580114126205444, "learning_rate": 0.0002, "epoch": 1.0707360861759425, "step": 14910}, {"loss": 0.7478, "grad_norm": 0.6299236416816711, "learning_rate": 0.0002, "epoch": 1.0714542190305207, "step": 14920}, {"loss": 0.7927, "grad_norm": 0.6934021711349487, "learning_rate": 0.0002, "epoch": 1.0721723518850987, "step": 14930}, {"loss": 0.6766, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 1.0728904847396767, "step": 14940}, {"loss": 0.7072, "grad_norm": 0.8921014070510864, "learning_rate": 0.0002, "epoch": 1.073608617594255, "step": 14950}, {"loss": 0.7127, "grad_norm": 0.5934301614761353, "learning_rate": 0.0002, "epoch": 1.074326750448833, "step": 14960}, {"loss": 0.7595, "grad_norm": 0.8379642367362976, "learning_rate": 0.0002, "epoch": 1.0750448833034112, "step": 14970}, {"loss": 0.7231, "grad_norm": 0.6842767596244812, "learning_rate": 0.0002, "epoch": 1.0757630161579892, "step": 14980}, {"loss": 0.7362, "grad_norm": 0.7296533584594727, "learning_rate": 0.0002, "epoch": 1.0764811490125674, "step": 14990}, {"loss": 0.688, "grad_norm": 0.6821087002754211, "learning_rate": 0.0002, "epoch": 1.0771992818671454, "step": 15000}, {"loss": 0.6808, "grad_norm": 0.6133626699447632, "learning_rate": 0.0002, "epoch": 1.0779174147217234, "step": 15010}, {"loss": 0.7351, "grad_norm": 0.6774773001670837, "learning_rate": 0.0002, "epoch": 1.0786355475763016, "step": 15020}, {"loss": 0.7403, "grad_norm": 0.6818786859512329, "learning_rate": 0.0002, "epoch": 1.0793536804308796, "step": 15030}, {"loss": 0.7005, "grad_norm": 0.7763522863388062, "learning_rate": 0.0002, "epoch": 1.0800718132854579, "step": 15040}, {"loss": 0.7028, "grad_norm": 0.7259193658828735, "learning_rate": 0.0002, "epoch": 1.0807899461400359, "step": 15050}, {"loss": 0.7232, "grad_norm": 0.6797525882720947, "learning_rate": 0.0002, "epoch": 1.081508078994614, "step": 15060}, {"loss": 0.7051, "grad_norm": 0.5775881409645081, "learning_rate": 0.0002, "epoch": 1.082226211849192, "step": 15070}, {"loss": 0.745, "grad_norm": 0.7055524587631226, "learning_rate": 0.0002, "epoch": 1.08294434470377, "step": 15080}, {"loss": 0.7539, "grad_norm": 0.8018748760223389, "learning_rate": 0.0002, "epoch": 1.0836624775583483, "step": 15090}, {"loss": 0.6833, "grad_norm": 0.6738115549087524, "learning_rate": 0.0002, "epoch": 1.0843806104129263, "step": 15100}, {"loss": 0.7014, "grad_norm": 0.6586359143257141, "learning_rate": 0.0002, "epoch": 1.0850987432675046, "step": 15110}, {"loss": 0.7391, "grad_norm": 0.7396895885467529, "learning_rate": 0.0002, "epoch": 1.0858168761220826, "step": 15120}, {"loss": 0.7473, "grad_norm": 0.7224817276000977, "learning_rate": 0.0002, "epoch": 1.0865350089766608, "step": 15130}, {"loss": 0.7137, "grad_norm": 0.798514187335968, "learning_rate": 0.0002, "epoch": 1.0872531418312388, "step": 15140}, {"loss": 0.757, "grad_norm": 0.79301518201828, "learning_rate": 0.0002, "epoch": 1.0879712746858168, "step": 15150}, {"loss": 0.7, "grad_norm": 0.7106764316558838, "learning_rate": 0.0002, "epoch": 1.088689407540395, "step": 15160}, {"loss": 0.7515, "grad_norm": 0.6525473594665527, "learning_rate": 0.0002, "epoch": 1.089407540394973, "step": 15170}, {"loss": 0.7067, "grad_norm": 0.6001671552658081, "learning_rate": 0.0002, "epoch": 1.0901256732495512, "step": 15180}, {"loss": 0.722, "grad_norm": 0.6949557662010193, "learning_rate": 0.0002, "epoch": 1.0908438061041292, "step": 15190}, {"loss": 0.7165, "grad_norm": 0.5713186860084534, "learning_rate": 0.0002, "epoch": 1.0915619389587075, "step": 15200}, {"loss": 0.7073, "grad_norm": 0.8773220181465149, "learning_rate": 0.0002, "epoch": 1.0922800718132855, "step": 15210}, {"loss": 0.7332, "grad_norm": 0.5837785601615906, "learning_rate": 0.0002, "epoch": 1.0929982046678635, "step": 15220}, {"loss": 0.7451, "grad_norm": 0.7243856191635132, "learning_rate": 0.0002, "epoch": 1.0937163375224417, "step": 15230}, {"loss": 0.6885, "grad_norm": 0.7008263468742371, "learning_rate": 0.0002, "epoch": 1.0944344703770197, "step": 15240}, {"loss": 0.7259, "grad_norm": 0.7061941623687744, "learning_rate": 0.0002, "epoch": 1.095152603231598, "step": 15250}, {"loss": 0.7482, "grad_norm": 0.575903594493866, "learning_rate": 0.0002, "epoch": 1.095870736086176, "step": 15260}, {"loss": 0.7001, "grad_norm": 0.6794043183326721, "learning_rate": 0.0002, "epoch": 1.0965888689407541, "step": 15270}, {"loss": 0.708, "grad_norm": 0.7194870710372925, "learning_rate": 0.0002, "epoch": 1.0973070017953321, "step": 15280}, {"loss": 0.7248, "grad_norm": 0.8063322305679321, "learning_rate": 0.0002, "epoch": 1.0980251346499101, "step": 15290}, {"loss": 0.7128, "grad_norm": 0.786101758480072, "learning_rate": 0.0002, "epoch": 1.0987432675044884, "step": 15300}, {"loss": 0.7523, "grad_norm": 0.827474057674408, "learning_rate": 0.0002, "epoch": 1.0994614003590664, "step": 15310}, {"loss": 0.7624, "grad_norm": 0.6514455080032349, "learning_rate": 0.0002, "epoch": 1.1001795332136446, "step": 15320}, {"loss": 0.745, "grad_norm": 0.7534348368644714, "learning_rate": 0.0002, "epoch": 1.1008976660682226, "step": 15330}, {"loss": 0.7359, "grad_norm": 0.6991367340087891, "learning_rate": 0.0002, "epoch": 1.1016157989228008, "step": 15340}, {"loss": 0.717, "grad_norm": 0.6742196679115295, "learning_rate": 0.0002, "epoch": 1.1023339317773788, "step": 15350}, {"loss": 0.737, "grad_norm": 0.7373757362365723, "learning_rate": 0.0002, "epoch": 1.1030520646319568, "step": 15360}, {"loss": 0.7421, "grad_norm": 0.6834485530853271, "learning_rate": 0.0002, "epoch": 1.103770197486535, "step": 15370}, {"loss": 0.7015, "grad_norm": 0.6454901099205017, "learning_rate": 0.0002, "epoch": 1.104488330341113, "step": 15380}, {"loss": 0.7276, "grad_norm": 0.7764508128166199, "learning_rate": 0.0002, "epoch": 1.1052064631956913, "step": 15390}, {"loss": 0.747, "grad_norm": 0.668560802936554, "learning_rate": 0.0002, "epoch": 1.1059245960502693, "step": 15400}, {"loss": 0.6705, "grad_norm": 0.579655110836029, "learning_rate": 0.0002, "epoch": 1.1066427289048475, "step": 15410}, {"loss": 0.7101, "grad_norm": 0.7196493148803711, "learning_rate": 0.0002, "epoch": 1.1073608617594255, "step": 15420}, {"loss": 0.8027, "grad_norm": 0.5530232191085815, "learning_rate": 0.0002, "epoch": 1.1080789946140035, "step": 15430}, {"loss": 0.7369, "grad_norm": 0.6542958617210388, "learning_rate": 0.0002, "epoch": 1.1087971274685817, "step": 15440}, {"loss": 0.7475, "grad_norm": 0.7468852400779724, "learning_rate": 0.0002, "epoch": 1.1095152603231597, "step": 15450}, {"loss": 0.6898, "grad_norm": 0.8119780421257019, "learning_rate": 0.0002, "epoch": 1.110233393177738, "step": 15460}, {"loss": 0.7652, "grad_norm": 0.7807733416557312, "learning_rate": 0.0002, "epoch": 1.110951526032316, "step": 15470}, {"loss": 0.697, "grad_norm": 0.7352553009986877, "learning_rate": 0.0002, "epoch": 1.1116696588868942, "step": 15480}, {"loss": 0.7509, "grad_norm": 0.8455224633216858, "learning_rate": 0.0002, "epoch": 1.1123877917414722, "step": 15490}, {"loss": 0.7757, "grad_norm": 0.635308563709259, "learning_rate": 0.0002, "epoch": 1.1131059245960502, "step": 15500}, {"loss": 0.685, "grad_norm": 0.6268794536590576, "learning_rate": 0.0002, "epoch": 1.1138240574506284, "step": 15510}, {"loss": 0.7174, "grad_norm": 0.6829593181610107, "learning_rate": 0.0002, "epoch": 1.1145421903052064, "step": 15520}, {"loss": 0.7264, "grad_norm": 0.5997796058654785, "learning_rate": 0.0002, "epoch": 1.1152603231597846, "step": 15530}, {"loss": 0.7167, "grad_norm": 0.7500942349433899, "learning_rate": 0.0002, "epoch": 1.1159784560143626, "step": 15540}, {"loss": 0.7275, "grad_norm": 0.7052047848701477, "learning_rate": 0.0002, "epoch": 1.1166965888689409, "step": 15550}, {"loss": 0.7832, "grad_norm": 0.6698189377784729, "learning_rate": 0.0002, "epoch": 1.1174147217235189, "step": 15560}, {"loss": 0.7587, "grad_norm": 0.7890462875366211, "learning_rate": 0.0002, "epoch": 1.1181328545780969, "step": 15570}, {"loss": 0.7092, "grad_norm": 0.7002465128898621, "learning_rate": 0.0002, "epoch": 1.118850987432675, "step": 15580}, {"loss": 0.6903, "grad_norm": 0.7456073760986328, "learning_rate": 0.0002, "epoch": 1.119569120287253, "step": 15590}, {"loss": 0.7577, "grad_norm": 0.7997385263442993, "learning_rate": 0.0002, "epoch": 1.1202872531418313, "step": 15600}, {"loss": 0.7005, "grad_norm": 0.6640482544898987, "learning_rate": 0.0002, "epoch": 1.1210053859964093, "step": 15610}, {"loss": 0.7334, "grad_norm": 0.7765318155288696, "learning_rate": 0.0002, "epoch": 1.1217235188509875, "step": 15620}, {"loss": 0.6977, "grad_norm": 0.7184962630271912, "learning_rate": 0.0002, "epoch": 1.1224416517055655, "step": 15630}, {"loss": 0.7362, "grad_norm": 0.7310904264450073, "learning_rate": 0.0002, "epoch": 1.1231597845601435, "step": 15640}, {"loss": 0.7278, "grad_norm": 0.7406452298164368, "learning_rate": 0.0002, "epoch": 1.1238779174147218, "step": 15650}, {"loss": 0.7074, "grad_norm": 0.7546738982200623, "learning_rate": 0.0002, "epoch": 1.1245960502692998, "step": 15660}, {"loss": 0.7641, "grad_norm": 0.7069764733314514, "learning_rate": 0.0002, "epoch": 1.125314183123878, "step": 15670}, {"loss": 0.76, "grad_norm": 0.6309521198272705, "learning_rate": 0.0002, "epoch": 1.126032315978456, "step": 15680}, {"loss": 0.7862, "grad_norm": 0.8050156831741333, "learning_rate": 0.0002, "epoch": 1.1267504488330342, "step": 15690}, {"loss": 0.7553, "grad_norm": 0.726556122303009, "learning_rate": 0.0002, "epoch": 1.1274685816876122, "step": 15700}, {"loss": 0.7763, "grad_norm": 0.77745521068573, "learning_rate": 0.0002, "epoch": 1.1281867145421902, "step": 15710}, {"loss": 0.7703, "grad_norm": 0.7467634677886963, "learning_rate": 0.0002, "epoch": 1.1289048473967684, "step": 15720}, {"loss": 0.7676, "grad_norm": 0.8207895755767822, "learning_rate": 0.0002, "epoch": 1.1296229802513464, "step": 15730}, {"loss": 0.6747, "grad_norm": 0.8253937363624573, "learning_rate": 0.0002, "epoch": 1.1303411131059247, "step": 15740}, {"loss": 0.6983, "grad_norm": 0.6313983798027039, "learning_rate": 0.0002, "epoch": 1.1310592459605027, "step": 15750}, {"loss": 0.6916, "grad_norm": 0.8040992021560669, "learning_rate": 0.0002, "epoch": 1.1317773788150807, "step": 15760}, {"loss": 0.7295, "grad_norm": 0.5937064290046692, "learning_rate": 0.0002, "epoch": 1.132495511669659, "step": 15770}, {"loss": 0.7494, "grad_norm": 0.6486281156539917, "learning_rate": 0.0002, "epoch": 1.133213644524237, "step": 15780}, {"loss": 0.7029, "grad_norm": 0.6161853075027466, "learning_rate": 0.0002, "epoch": 1.1339317773788151, "step": 15790}, {"loss": 0.7019, "grad_norm": 0.6926610469818115, "learning_rate": 0.0002, "epoch": 1.1346499102333931, "step": 15800}, {"loss": 0.6906, "grad_norm": 0.6084047555923462, "learning_rate": 0.0002, "epoch": 1.1353680430879713, "step": 15810}, {"loss": 0.7091, "grad_norm": 0.6928383111953735, "learning_rate": 0.0002, "epoch": 1.1360861759425493, "step": 15820}, {"loss": 0.7238, "grad_norm": 0.7784243822097778, "learning_rate": 0.0002, "epoch": 1.1368043087971276, "step": 15830}, {"loss": 0.6943, "grad_norm": 0.7169384956359863, "learning_rate": 0.0002, "epoch": 1.1375224416517056, "step": 15840}, {"loss": 0.7287, "grad_norm": 0.6953616142272949, "learning_rate": 0.0002, "epoch": 1.1382405745062836, "step": 15850}, {"loss": 0.7489, "grad_norm": 0.7345215082168579, "learning_rate": 0.0002, "epoch": 1.1389587073608618, "step": 15860}, {"loss": 0.683, "grad_norm": 0.5469502806663513, "learning_rate": 0.0002, "epoch": 1.1396768402154398, "step": 15870}, {"loss": 0.717, "grad_norm": 0.687680721282959, "learning_rate": 0.0002, "epoch": 1.140394973070018, "step": 15880}, {"loss": 0.7171, "grad_norm": 0.6879996657371521, "learning_rate": 0.0002, "epoch": 1.141113105924596, "step": 15890}, {"loss": 0.7321, "grad_norm": 0.728886067867279, "learning_rate": 0.0002, "epoch": 1.141831238779174, "step": 15900}, {"loss": 0.7752, "grad_norm": 0.929531455039978, "learning_rate": 0.0002, "epoch": 1.1425493716337523, "step": 15910}, {"loss": 0.7353, "grad_norm": 0.8122507333755493, "learning_rate": 0.0002, "epoch": 1.1432675044883303, "step": 15920}, {"loss": 0.7138, "grad_norm": 0.6494652628898621, "learning_rate": 0.0002, "epoch": 1.1439856373429085, "step": 15930}, {"loss": 0.7489, "grad_norm": 0.7307567596435547, "learning_rate": 0.0002, "epoch": 1.1447037701974865, "step": 15940}, {"loss": 0.7385, "grad_norm": 0.548678994178772, "learning_rate": 0.0002, "epoch": 1.1454219030520647, "step": 15950}, {"loss": 0.7152, "grad_norm": 0.8011603951454163, "learning_rate": 0.0002, "epoch": 1.1461400359066427, "step": 15960}, {"loss": 0.7324, "grad_norm": 0.7026647329330444, "learning_rate": 0.0002, "epoch": 1.146858168761221, "step": 15970}, {"loss": 0.7464, "grad_norm": 0.7338995933532715, "learning_rate": 0.0002, "epoch": 1.147576301615799, "step": 15980}, {"loss": 0.7416, "grad_norm": 0.8453443646430969, "learning_rate": 0.0002, "epoch": 1.148294434470377, "step": 15990}, {"loss": 0.7419, "grad_norm": 0.6787207126617432, "learning_rate": 0.0002, "epoch": 1.1490125673249552, "step": 16000}, {"loss": 0.7487, "grad_norm": 0.6314631104469299, "learning_rate": 0.0002, "epoch": 1.1497307001795332, "step": 16010}, {"loss": 0.7165, "grad_norm": 0.8812752962112427, "learning_rate": 0.0002, "epoch": 1.1504488330341114, "step": 16020}, {"loss": 0.774, "grad_norm": 0.6528969407081604, "learning_rate": 0.0002, "epoch": 1.1511669658886894, "step": 16030}, {"loss": 0.7321, "grad_norm": 0.7843571305274963, "learning_rate": 0.0002, "epoch": 1.1518850987432674, "step": 16040}, {"loss": 0.7769, "grad_norm": 0.7095080018043518, "learning_rate": 0.0002, "epoch": 1.1526032315978456, "step": 16050}, {"loss": 0.744, "grad_norm": 0.7495582103729248, "learning_rate": 0.0002, "epoch": 1.1533213644524236, "step": 16060}, {"loss": 0.7813, "grad_norm": 0.6002049446105957, "learning_rate": 0.0002, "epoch": 1.1540394973070018, "step": 16070}, {"loss": 0.7117, "grad_norm": 0.565014123916626, "learning_rate": 0.0002, "epoch": 1.1547576301615798, "step": 16080}, {"loss": 0.7664, "grad_norm": 0.8209971785545349, "learning_rate": 0.0002, "epoch": 1.155475763016158, "step": 16090}, {"loss": 0.7486, "grad_norm": 0.7137531042098999, "learning_rate": 0.0002, "epoch": 1.156193895870736, "step": 16100}, {"loss": 0.7197, "grad_norm": 0.7307516932487488, "learning_rate": 0.0002, "epoch": 1.1569120287253143, "step": 16110}, {"loss": 0.7351, "grad_norm": 0.6686444878578186, "learning_rate": 0.0002, "epoch": 1.1576301615798923, "step": 16120}, {"loss": 0.7407, "grad_norm": 0.7977298498153687, "learning_rate": 0.0002, "epoch": 1.1583482944344703, "step": 16130}, {"loss": 0.6696, "grad_norm": 0.6980607509613037, "learning_rate": 0.0002, "epoch": 1.1590664272890485, "step": 16140}, {"loss": 0.7513, "grad_norm": 0.6622613668441772, "learning_rate": 0.0002, "epoch": 1.1597845601436265, "step": 16150}, {"loss": 0.7162, "grad_norm": 0.6598347425460815, "learning_rate": 0.0002, "epoch": 1.1605026929982047, "step": 16160}, {"loss": 0.7418, "grad_norm": 0.6686234474182129, "learning_rate": 0.0002, "epoch": 1.1612208258527827, "step": 16170}, {"loss": 0.7104, "grad_norm": 0.7308177947998047, "learning_rate": 0.0002, "epoch": 1.1619389587073607, "step": 16180}, {"loss": 0.7337, "grad_norm": 0.939537525177002, "learning_rate": 0.0002, "epoch": 1.162657091561939, "step": 16190}, {"loss": 0.7054, "grad_norm": 0.5514758825302124, "learning_rate": 0.0002, "epoch": 1.163375224416517, "step": 16200}, {"loss": 0.7449, "grad_norm": 0.589142918586731, "learning_rate": 0.0002, "epoch": 1.1640933572710952, "step": 16210}, {"loss": 0.7438, "grad_norm": 0.6888012290000916, "learning_rate": 0.0002, "epoch": 1.1648114901256732, "step": 16220}, {"loss": 0.719, "grad_norm": 0.82566899061203, "learning_rate": 0.0002, "epoch": 1.1655296229802514, "step": 16230}, {"loss": 0.7274, "grad_norm": 0.6107817888259888, "learning_rate": 0.0002, "epoch": 1.1662477558348294, "step": 16240}, {"loss": 0.6849, "grad_norm": 0.7831398844718933, "learning_rate": 0.0002, "epoch": 1.1669658886894076, "step": 16250}, {"loss": 0.7077, "grad_norm": 0.6468397974967957, "learning_rate": 0.0002, "epoch": 1.1676840215439857, "step": 16260}, {"loss": 0.7056, "grad_norm": 0.7284161448478699, "learning_rate": 0.0002, "epoch": 1.1684021543985637, "step": 16270}, {"loss": 0.7476, "grad_norm": 0.6182818412780762, "learning_rate": 0.0002, "epoch": 1.1691202872531419, "step": 16280}, {"loss": 0.7608, "grad_norm": 0.7091781497001648, "learning_rate": 0.0002, "epoch": 1.1698384201077199, "step": 16290}, {"loss": 0.7235, "grad_norm": 0.7327643632888794, "learning_rate": 0.0002, "epoch": 1.170556552962298, "step": 16300}, {"loss": 0.7304, "grad_norm": 0.5864694118499756, "learning_rate": 0.0002, "epoch": 1.171274685816876, "step": 16310}, {"loss": 0.7011, "grad_norm": 0.7049986720085144, "learning_rate": 0.0002, "epoch": 1.171992818671454, "step": 16320}, {"loss": 0.7234, "grad_norm": 0.7563399076461792, "learning_rate": 0.0002, "epoch": 1.1727109515260323, "step": 16330}, {"loss": 0.7313, "grad_norm": 0.5888143181800842, "learning_rate": 0.0002, "epoch": 1.1734290843806103, "step": 16340}, {"loss": 0.7078, "grad_norm": 0.8670049905776978, "learning_rate": 0.0002, "epoch": 1.1741472172351886, "step": 16350}, {"loss": 0.7656, "grad_norm": 0.8045654296875, "learning_rate": 0.0002, "epoch": 1.1748653500897666, "step": 16360}, {"loss": 0.7942, "grad_norm": 0.9115668535232544, "learning_rate": 0.0002, "epoch": 1.1755834829443448, "step": 16370}, {"loss": 0.6807, "grad_norm": 0.6943584084510803, "learning_rate": 0.0002, "epoch": 1.1763016157989228, "step": 16380}, {"loss": 0.7558, "grad_norm": 0.7931740283966064, "learning_rate": 0.0002, "epoch": 1.177019748653501, "step": 16390}, {"loss": 0.7247, "grad_norm": 0.7967953085899353, "learning_rate": 0.0002, "epoch": 1.177737881508079, "step": 16400}, {"loss": 0.7294, "grad_norm": 0.575165867805481, "learning_rate": 0.0002, "epoch": 1.178456014362657, "step": 16410}, {"loss": 0.8045, "grad_norm": 0.6803409457206726, "learning_rate": 0.0002, "epoch": 1.1791741472172352, "step": 16420}, {"loss": 0.7594, "grad_norm": 0.7661909461021423, "learning_rate": 0.0002, "epoch": 1.1798922800718132, "step": 16430}, {"loss": 0.7387, "grad_norm": 0.7907630205154419, "learning_rate": 0.0002, "epoch": 1.1806104129263915, "step": 16440}, {"loss": 0.6954, "grad_norm": 0.7215338945388794, "learning_rate": 0.0002, "epoch": 1.1813285457809695, "step": 16450}, {"loss": 0.7503, "grad_norm": 0.6824054718017578, "learning_rate": 0.0002, "epoch": 1.1820466786355475, "step": 16460}, {"loss": 0.7548, "grad_norm": 0.8057665228843689, "learning_rate": 0.0002, "epoch": 1.1827648114901257, "step": 16470}, {"loss": 0.7572, "grad_norm": 0.7487542033195496, "learning_rate": 0.0002, "epoch": 1.1834829443447037, "step": 16480}, {"loss": 0.7267, "grad_norm": 0.7254953384399414, "learning_rate": 0.0002, "epoch": 1.184201077199282, "step": 16490}, {"loss": 0.6906, "grad_norm": 0.6986604332923889, "learning_rate": 0.0002, "epoch": 1.18491921005386, "step": 16500}, {"loss": 0.6979, "grad_norm": 0.7889591455459595, "learning_rate": 0.0002, "epoch": 1.1856373429084381, "step": 16510}, {"loss": 0.7455, "grad_norm": 0.6029604077339172, "learning_rate": 0.0002, "epoch": 1.1863554757630161, "step": 16520}, {"loss": 0.7673, "grad_norm": 0.680322527885437, "learning_rate": 0.0002, "epoch": 1.1870736086175944, "step": 16530}, {"loss": 0.708, "grad_norm": 0.8588826060295105, "learning_rate": 0.0002, "epoch": 1.1877917414721724, "step": 16540}, {"loss": 0.7291, "grad_norm": 0.7614806890487671, "learning_rate": 0.0002, "epoch": 1.1885098743267504, "step": 16550}, {"loss": 0.7021, "grad_norm": 0.7523183226585388, "learning_rate": 0.0002, "epoch": 1.1892280071813286, "step": 16560}, {"loss": 0.7452, "grad_norm": 0.8299532532691956, "learning_rate": 0.0002, "epoch": 1.1899461400359066, "step": 16570}, {"loss": 0.7409, "grad_norm": 0.6709241271018982, "learning_rate": 0.0002, "epoch": 1.1906642728904848, "step": 16580}, {"loss": 0.7322, "grad_norm": 0.665414035320282, "learning_rate": 0.0002, "epoch": 1.1913824057450628, "step": 16590}, {"loss": 0.7699, "grad_norm": 0.7582152485847473, "learning_rate": 0.0002, "epoch": 1.1921005385996408, "step": 16600}, {"loss": 0.7069, "grad_norm": 0.5856947302818298, "learning_rate": 0.0002, "epoch": 1.192818671454219, "step": 16610}, {"loss": 0.7444, "grad_norm": 0.6972885727882385, "learning_rate": 0.0002, "epoch": 1.193536804308797, "step": 16620}, {"loss": 0.7265, "grad_norm": 0.6884734630584717, "learning_rate": 0.0002, "epoch": 1.1942549371633753, "step": 16630}, {"loss": 0.6881, "grad_norm": 0.7380475401878357, "learning_rate": 0.0002, "epoch": 1.1949730700179533, "step": 16640}, {"loss": 0.7297, "grad_norm": 0.7976197600364685, "learning_rate": 0.0002, "epoch": 1.1956912028725315, "step": 16650}, {"loss": 0.7328, "grad_norm": 0.819256067276001, "learning_rate": 0.0002, "epoch": 1.1964093357271095, "step": 16660}, {"loss": 0.771, "grad_norm": 0.587867796421051, "learning_rate": 0.0002, "epoch": 1.1971274685816877, "step": 16670}, {"loss": 0.7357, "grad_norm": 0.9162678122520447, "learning_rate": 0.0002, "epoch": 1.1978456014362657, "step": 16680}, {"loss": 0.7472, "grad_norm": 0.7452084422111511, "learning_rate": 0.0002, "epoch": 1.1985637342908437, "step": 16690}, {"loss": 0.7257, "grad_norm": 0.7966971397399902, "learning_rate": 0.0002, "epoch": 1.199281867145422, "step": 16700}, {"loss": 0.8051, "grad_norm": 0.6605724692344666, "learning_rate": 0.0002, "epoch": 1.2, "step": 16710}, {"loss": 0.729, "grad_norm": 0.6499220728874207, "learning_rate": 0.0002, "epoch": 1.2007181328545782, "step": 16720}, {"loss": 0.7107, "grad_norm": 0.7422114610671997, "learning_rate": 0.0002, "epoch": 1.2014362657091562, "step": 16730}, {"loss": 0.6712, "grad_norm": 0.6652370095252991, "learning_rate": 0.0002, "epoch": 1.2021543985637342, "step": 16740}, {"loss": 0.7804, "grad_norm": 0.8761070370674133, "learning_rate": 0.0002, "epoch": 1.2028725314183124, "step": 16750}, {"loss": 0.737, "grad_norm": 0.7294463515281677, "learning_rate": 0.0002, "epoch": 1.2035906642728904, "step": 16760}, {"loss": 0.7638, "grad_norm": 0.7725599408149719, "learning_rate": 0.0002, "epoch": 1.2043087971274686, "step": 16770}, {"loss": 0.6857, "grad_norm": 0.5630005598068237, "learning_rate": 0.0002, "epoch": 1.2050269299820466, "step": 16780}, {"loss": 0.7344, "grad_norm": 0.7601404786109924, "learning_rate": 0.0002, "epoch": 1.2057450628366249, "step": 16790}, {"loss": 0.729, "grad_norm": 0.6859985589981079, "learning_rate": 0.0002, "epoch": 1.2064631956912029, "step": 16800}, {"loss": 0.7203, "grad_norm": 0.7040054798126221, "learning_rate": 0.0002, "epoch": 1.207181328545781, "step": 16810}, {"loss": 0.7727, "grad_norm": 0.7058989405632019, "learning_rate": 0.0002, "epoch": 1.207899461400359, "step": 16820}, {"loss": 0.7247, "grad_norm": 0.7646133899688721, "learning_rate": 0.0002, "epoch": 1.208617594254937, "step": 16830}, {"loss": 0.7903, "grad_norm": 0.669550359249115, "learning_rate": 0.0002, "epoch": 1.2093357271095153, "step": 16840}, {"loss": 0.7313, "grad_norm": 0.6613401174545288, "learning_rate": 0.0002, "epoch": 1.2100538599640933, "step": 16850}, {"loss": 0.7181, "grad_norm": 0.8636519312858582, "learning_rate": 0.0002, "epoch": 1.2107719928186715, "step": 16860}, {"loss": 0.7111, "grad_norm": 0.6077507138252258, "learning_rate": 0.0002, "epoch": 1.2114901256732495, "step": 16870}, {"loss": 0.7706, "grad_norm": 0.7892228364944458, "learning_rate": 0.0002, "epoch": 1.2122082585278275, "step": 16880}, {"loss": 0.685, "grad_norm": 0.7424154877662659, "learning_rate": 0.0002, "epoch": 1.2129263913824058, "step": 16890}, {"loss": 0.6707, "grad_norm": 0.6525408029556274, "learning_rate": 0.0002, "epoch": 1.2136445242369838, "step": 16900}, {"loss": 0.7721, "grad_norm": 0.6178015470504761, "learning_rate": 0.0002, "epoch": 1.214362657091562, "step": 16910}, {"loss": 0.6971, "grad_norm": 0.7319437861442566, "learning_rate": 0.0002, "epoch": 1.21508078994614, "step": 16920}, {"loss": 0.7261, "grad_norm": 0.6823344826698303, "learning_rate": 0.0002, "epoch": 1.2157989228007182, "step": 16930}, {"loss": 0.7048, "grad_norm": 0.5681257843971252, "learning_rate": 0.0002, "epoch": 1.2165170556552962, "step": 16940}, {"loss": 0.7398, "grad_norm": 0.7939814925193787, "learning_rate": 0.0002, "epoch": 1.2172351885098744, "step": 16950}, {"loss": 0.7192, "grad_norm": 0.7031611800193787, "learning_rate": 0.0002, "epoch": 1.2179533213644524, "step": 16960}, {"loss": 0.7212, "grad_norm": 0.7610133290290833, "learning_rate": 0.0002, "epoch": 1.2186714542190304, "step": 16970}, {"loss": 0.7599, "grad_norm": 0.8707142472267151, "learning_rate": 0.0002, "epoch": 1.2193895870736087, "step": 16980}, {"loss": 0.7121, "grad_norm": 0.6603384017944336, "learning_rate": 0.0002, "epoch": 1.2201077199281867, "step": 16990}, {"loss": 0.7315, "grad_norm": 0.7218315005302429, "learning_rate": 0.0002, "epoch": 1.220825852782765, "step": 17000}, {"loss": 0.7513, "grad_norm": 0.8043148517608643, "learning_rate": 0.0002, "epoch": 1.221543985637343, "step": 17010}, {"loss": 0.6749, "grad_norm": 0.7232559323310852, "learning_rate": 0.0002, "epoch": 1.222262118491921, "step": 17020}, {"loss": 0.7681, "grad_norm": 0.690376341342926, "learning_rate": 0.0002, "epoch": 1.2229802513464991, "step": 17030}, {"loss": 0.7042, "grad_norm": 0.602436363697052, "learning_rate": 0.0002, "epoch": 1.2236983842010771, "step": 17040}, {"loss": 0.7129, "grad_norm": 0.7610493898391724, "learning_rate": 0.0002, "epoch": 1.2244165170556554, "step": 17050}, {"loss": 0.758, "grad_norm": 0.7504690885543823, "learning_rate": 0.0002, "epoch": 1.2251346499102334, "step": 17060}, {"loss": 0.6908, "grad_norm": 0.8080246448516846, "learning_rate": 0.0002, "epoch": 1.2258527827648116, "step": 17070}, {"loss": 0.7519, "grad_norm": 1.0240572690963745, "learning_rate": 0.0002, "epoch": 1.2265709156193896, "step": 17080}, {"loss": 0.7193, "grad_norm": 0.6874111294746399, "learning_rate": 0.0002, "epoch": 1.2272890484739678, "step": 17090}, {"loss": 0.79, "grad_norm": 0.800069272518158, "learning_rate": 0.0002, "epoch": 1.2280071813285458, "step": 17100}, {"loss": 0.742, "grad_norm": 0.8628103137016296, "learning_rate": 0.0002, "epoch": 1.2287253141831238, "step": 17110}, {"loss": 0.7022, "grad_norm": 0.7408499121665955, "learning_rate": 0.0002, "epoch": 1.229443447037702, "step": 17120}, {"loss": 0.6774, "grad_norm": 0.6494335532188416, "learning_rate": 0.0002, "epoch": 1.23016157989228, "step": 17130}, {"loss": 0.7025, "grad_norm": 0.6493549942970276, "learning_rate": 0.0002, "epoch": 1.2308797127468583, "step": 17140}, {"loss": 0.7448, "grad_norm": 0.6972658038139343, "learning_rate": 0.0002, "epoch": 1.2315978456014363, "step": 17150}, {"loss": 0.7219, "grad_norm": 0.6877315044403076, "learning_rate": 0.0002, "epoch": 1.2323159784560143, "step": 17160}, {"loss": 0.7945, "grad_norm": 0.7569024562835693, "learning_rate": 0.0002, "epoch": 1.2330341113105925, "step": 17170}, {"loss": 0.7467, "grad_norm": 0.696260392665863, "learning_rate": 0.0002, "epoch": 1.2337522441651705, "step": 17180}, {"loss": 0.6716, "grad_norm": 0.6150345802307129, "learning_rate": 0.0002, "epoch": 1.2344703770197487, "step": 17190}, {"loss": 0.7416, "grad_norm": 0.69009929895401, "learning_rate": 0.0002, "epoch": 1.2351885098743267, "step": 17200}, {"loss": 0.787, "grad_norm": 0.7035185098648071, "learning_rate": 0.0002, "epoch": 1.235906642728905, "step": 17210}, {"loss": 0.6896, "grad_norm": 0.6792506575584412, "learning_rate": 0.0002, "epoch": 1.236624775583483, "step": 17220}, {"loss": 0.6953, "grad_norm": 0.6310356855392456, "learning_rate": 0.0002, "epoch": 1.2373429084380612, "step": 17230}, {"loss": 0.7531, "grad_norm": 0.647026538848877, "learning_rate": 0.0002, "epoch": 1.2380610412926392, "step": 17240}, {"loss": 0.8014, "grad_norm": 0.7609930038452148, "learning_rate": 0.0002, "epoch": 1.2387791741472172, "step": 17250}, {"loss": 0.8045, "grad_norm": 0.791890561580658, "learning_rate": 0.0002, "epoch": 1.2394973070017954, "step": 17260}, {"loss": 0.7445, "grad_norm": 0.7126715183258057, "learning_rate": 0.0002, "epoch": 1.2402154398563734, "step": 17270}, {"loss": 0.6561, "grad_norm": 0.7850401401519775, "learning_rate": 0.0002, "epoch": 1.2409335727109516, "step": 17280}, {"loss": 0.7454, "grad_norm": 0.6694281697273254, "learning_rate": 0.0002, "epoch": 1.2416517055655296, "step": 17290}, {"loss": 0.6711, "grad_norm": 0.6418080925941467, "learning_rate": 0.0002, "epoch": 1.2423698384201076, "step": 17300}, {"loss": 0.7504, "grad_norm": 0.7308132648468018, "learning_rate": 0.0002, "epoch": 1.2430879712746858, "step": 17310}, {"loss": 0.6896, "grad_norm": 0.8322312235832214, "learning_rate": 0.0002, "epoch": 1.2438061041292638, "step": 17320}, {"loss": 0.7341, "grad_norm": 0.6959006190299988, "learning_rate": 0.0002, "epoch": 1.244524236983842, "step": 17330}, {"loss": 0.7025, "grad_norm": 0.7110121846199036, "learning_rate": 0.0002, "epoch": 1.24524236983842, "step": 17340}, {"loss": 0.7858, "grad_norm": 0.6496296525001526, "learning_rate": 0.0002, "epoch": 1.2459605026929983, "step": 17350}, {"loss": 0.7061, "grad_norm": 0.7649076581001282, "learning_rate": 0.0002, "epoch": 1.2466786355475763, "step": 17360}, {"loss": 0.7155, "grad_norm": 0.7139049172401428, "learning_rate": 0.0002, "epoch": 1.2473967684021545, "step": 17370}, {"loss": 0.6932, "grad_norm": 0.7709113955497742, "learning_rate": 0.0002, "epoch": 1.2481149012567325, "step": 17380}, {"loss": 0.731, "grad_norm": 0.7160373330116272, "learning_rate": 0.0002, "epoch": 1.2488330341113105, "step": 17390}, {"loss": 0.7146, "grad_norm": 0.5608301162719727, "learning_rate": 0.0002, "epoch": 1.2495511669658887, "step": 17400}, {"loss": 0.7368, "grad_norm": 0.6913180351257324, "learning_rate": 0.0002, "epoch": 1.2502692998204668, "step": 17410}, {"loss": 0.7167, "grad_norm": 0.6980322599411011, "learning_rate": 0.0002, "epoch": 1.250987432675045, "step": 17420}, {"loss": 0.7096, "grad_norm": 0.8155394792556763, "learning_rate": 0.0002, "epoch": 1.251705565529623, "step": 17430}, {"loss": 0.7477, "grad_norm": 0.8015886545181274, "learning_rate": 0.0002, "epoch": 1.252423698384201, "step": 17440}, {"loss": 0.7006, "grad_norm": 0.5985556244850159, "learning_rate": 0.0002, "epoch": 1.2531418312387792, "step": 17450}, {"loss": 0.7171, "grad_norm": 0.70317143201828, "learning_rate": 0.0002, "epoch": 1.2538599640933572, "step": 17460}, {"loss": 0.7006, "grad_norm": 0.612501323223114, "learning_rate": 0.0002, "epoch": 1.2545780969479354, "step": 17470}, {"loss": 0.7639, "grad_norm": 0.7347102165222168, "learning_rate": 0.0002, "epoch": 1.2552962298025134, "step": 17480}, {"loss": 0.7303, "grad_norm": 0.9189441800117493, "learning_rate": 0.0002, "epoch": 1.2560143626570914, "step": 17490}, {"loss": 0.7547, "grad_norm": 0.7727932929992676, "learning_rate": 0.0002, "epoch": 1.2567324955116697, "step": 17500}, {"loss": 0.6979, "grad_norm": 0.6782869696617126, "learning_rate": 0.0002, "epoch": 1.2574506283662479, "step": 17510}, {"loss": 0.7146, "grad_norm": 0.5710638761520386, "learning_rate": 0.0002, "epoch": 1.2581687612208259, "step": 17520}, {"loss": 0.6999, "grad_norm": 0.6856266856193542, "learning_rate": 0.0002, "epoch": 1.2588868940754039, "step": 17530}, {"loss": 0.7229, "grad_norm": 0.7257347702980042, "learning_rate": 0.0002, "epoch": 1.259605026929982, "step": 17540}, {"loss": 0.7475, "grad_norm": 0.6343092918395996, "learning_rate": 0.0002, "epoch": 1.26032315978456, "step": 17550}, {"loss": 0.7863, "grad_norm": 0.6482594013214111, "learning_rate": 0.0002, "epoch": 1.2610412926391383, "step": 17560}, {"loss": 0.716, "grad_norm": 0.6542837619781494, "learning_rate": 0.0002, "epoch": 1.2617594254937163, "step": 17570}, {"loss": 0.7871, "grad_norm": 0.7106123566627502, "learning_rate": 0.0002, "epoch": 1.2624775583482943, "step": 17580}, {"loss": 0.7446, "grad_norm": 0.9081960320472717, "learning_rate": 0.0002, "epoch": 1.2631956912028726, "step": 17590}, {"loss": 0.7591, "grad_norm": 0.7010290026664734, "learning_rate": 0.0002, "epoch": 1.2639138240574506, "step": 17600}, {"loss": 0.7391, "grad_norm": 0.9973132610321045, "learning_rate": 0.0002, "epoch": 1.2646319569120288, "step": 17610}, {"loss": 0.725, "grad_norm": 0.8003297448158264, "learning_rate": 0.0002, "epoch": 1.2653500897666068, "step": 17620}, {"loss": 0.697, "grad_norm": 0.7383468151092529, "learning_rate": 0.0002, "epoch": 1.2660682226211848, "step": 17630}, {"loss": 0.785, "grad_norm": 0.6337200999259949, "learning_rate": 0.0002, "epoch": 1.266786355475763, "step": 17640}, {"loss": 0.7469, "grad_norm": 0.6371761560440063, "learning_rate": 0.0002, "epoch": 1.2675044883303412, "step": 17650}, {"loss": 0.7348, "grad_norm": 0.7283522486686707, "learning_rate": 0.0002, "epoch": 1.2682226211849192, "step": 17660}, {"loss": 0.7251, "grad_norm": 0.8191015720367432, "learning_rate": 0.0002, "epoch": 1.2689407540394972, "step": 17670}, {"loss": 0.7558, "grad_norm": 0.6210351586341858, "learning_rate": 0.0002, "epoch": 1.2696588868940755, "step": 17680}, {"loss": 0.7733, "grad_norm": 0.6563277840614319, "learning_rate": 0.0002, "epoch": 1.2703770197486535, "step": 17690}, {"loss": 0.7065, "grad_norm": 0.7111260294914246, "learning_rate": 0.0002, "epoch": 1.2710951526032317, "step": 17700}, {"loss": 0.7079, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002, "epoch": 1.2718132854578097, "step": 17710}, {"loss": 0.7612, "grad_norm": 0.7657744884490967, "learning_rate": 0.0002, "epoch": 1.2725314183123877, "step": 17720}, {"loss": 0.7513, "grad_norm": 0.6952996850013733, "learning_rate": 0.0002, "epoch": 1.273249551166966, "step": 17730}, {"loss": 0.7402, "grad_norm": 0.5678043961524963, "learning_rate": 0.0002, "epoch": 1.273967684021544, "step": 17740}, {"loss": 0.7357, "grad_norm": 0.8608036041259766, "learning_rate": 0.0002, "epoch": 1.2746858168761221, "step": 17750}, {"loss": 0.7482, "grad_norm": 0.7184045910835266, "learning_rate": 0.0002, "epoch": 1.2754039497307001, "step": 17760}, {"loss": 0.7277, "grad_norm": 0.6647557616233826, "learning_rate": 0.0002, "epoch": 1.2761220825852782, "step": 17770}, {"loss": 0.6866, "grad_norm": 0.6899349093437195, "learning_rate": 0.0002, "epoch": 1.2768402154398564, "step": 17780}, {"loss": 0.721, "grad_norm": 0.7073346972465515, "learning_rate": 0.0002, "epoch": 1.2775583482944346, "step": 17790}, {"loss": 0.7432, "grad_norm": 0.8896707892417908, "learning_rate": 0.0002, "epoch": 1.2782764811490126, "step": 17800}, {"loss": 0.7318, "grad_norm": 0.5072778463363647, "learning_rate": 0.0002, "epoch": 1.2789946140035906, "step": 17810}, {"loss": 0.7648, "grad_norm": 0.8889711499214172, "learning_rate": 0.0002, "epoch": 1.2797127468581688, "step": 17820}, {"loss": 0.6894, "grad_norm": 0.5583778619766235, "learning_rate": 0.0002, "epoch": 1.2804308797127468, "step": 17830}, {"loss": 0.7488, "grad_norm": 0.6526148915290833, "learning_rate": 0.0002, "epoch": 1.281149012567325, "step": 17840}, {"loss": 0.7462, "grad_norm": 0.7658175826072693, "learning_rate": 0.0002, "epoch": 1.281867145421903, "step": 17850}, {"loss": 0.7298, "grad_norm": 0.5547847151756287, "learning_rate": 0.0002, "epoch": 1.282585278276481, "step": 17860}, {"loss": 0.705, "grad_norm": 0.6153780817985535, "learning_rate": 0.0002, "epoch": 1.2833034111310593, "step": 17870}, {"loss": 0.7173, "grad_norm": 0.8474061489105225, "learning_rate": 0.0002, "epoch": 1.2840215439856373, "step": 17880}, {"loss": 0.7597, "grad_norm": 0.859260618686676, "learning_rate": 0.0002, "epoch": 1.2847396768402155, "step": 17890}, {"loss": 0.7237, "grad_norm": 0.7270520329475403, "learning_rate": 0.0002, "epoch": 1.2854578096947935, "step": 17900}, {"loss": 0.701, "grad_norm": 0.8166249394416809, "learning_rate": 0.0002, "epoch": 1.2861759425493715, "step": 17910}, {"loss": 0.686, "grad_norm": 0.9158982038497925, "learning_rate": 0.0002, "epoch": 1.2868940754039497, "step": 17920}, {"loss": 0.7243, "grad_norm": 0.8132565021514893, "learning_rate": 0.0002, "epoch": 1.287612208258528, "step": 17930}, {"loss": 0.6909, "grad_norm": 0.7914409637451172, "learning_rate": 0.0002, "epoch": 1.288330341113106, "step": 17940}, {"loss": 0.7034, "grad_norm": 0.6256071329116821, "learning_rate": 0.0002, "epoch": 1.289048473967684, "step": 17950}, {"loss": 0.7279, "grad_norm": 0.6463542580604553, "learning_rate": 0.0002, "epoch": 1.2897666068222622, "step": 17960}, {"loss": 0.7601, "grad_norm": 0.6702672839164734, "learning_rate": 0.0002, "epoch": 1.2904847396768402, "step": 17970}, {"loss": 0.7355, "grad_norm": 0.8666605949401855, "learning_rate": 0.0002, "epoch": 1.2912028725314184, "step": 17980}, {"loss": 0.6838, "grad_norm": 0.8055952787399292, "learning_rate": 0.0002, "epoch": 1.2919210053859964, "step": 17990}, {"loss": 0.7361, "grad_norm": 0.6909741163253784, "learning_rate": 0.0002, "epoch": 1.2926391382405744, "step": 18000}, {"loss": 0.7766, "grad_norm": 0.663702130317688, "learning_rate": 0.0002, "epoch": 1.2933572710951526, "step": 18010}, {"loss": 0.7071, "grad_norm": 0.6952448487281799, "learning_rate": 0.0002, "epoch": 1.2940754039497306, "step": 18020}, {"loss": 0.7359, "grad_norm": 0.5722854137420654, "learning_rate": 0.0002, "epoch": 1.2947935368043089, "step": 18030}, {"loss": 0.764, "grad_norm": 0.7987681031227112, "learning_rate": 0.0002, "epoch": 1.2955116696588869, "step": 18040}, {"loss": 0.743, "grad_norm": 0.661133348941803, "learning_rate": 0.0002, "epoch": 1.2962298025134649, "step": 18050}, {"loss": 0.7627, "grad_norm": 0.6025064587593079, "learning_rate": 0.0002, "epoch": 1.296947935368043, "step": 18060}, {"loss": 0.7242, "grad_norm": 0.7569907903671265, "learning_rate": 0.0002, "epoch": 1.2976660682226213, "step": 18070}, {"loss": 0.7234, "grad_norm": 0.7222012281417847, "learning_rate": 0.0002, "epoch": 1.2983842010771993, "step": 18080}, {"loss": 0.7133, "grad_norm": 0.5291963815689087, "learning_rate": 0.0002, "epoch": 1.2991023339317773, "step": 18090}, {"loss": 0.7215, "grad_norm": 0.6808363199234009, "learning_rate": 0.0002, "epoch": 1.2998204667863555, "step": 18100}, {"loss": 0.7621, "grad_norm": 0.6797927618026733, "learning_rate": 0.0002, "epoch": 1.3005385996409335, "step": 18110}, {"loss": 0.7474, "grad_norm": 0.7775542140007019, "learning_rate": 0.0002, "epoch": 1.3012567324955118, "step": 18120}, {"loss": 0.7376, "grad_norm": 0.7369466423988342, "learning_rate": 0.0002, "epoch": 1.3019748653500898, "step": 18130}, {"loss": 0.7098, "grad_norm": 0.6822494864463806, "learning_rate": 0.0002, "epoch": 1.3026929982046678, "step": 18140}, {"loss": 0.7675, "grad_norm": 0.9222138524055481, "learning_rate": 0.0002, "epoch": 1.303411131059246, "step": 18150}, {"loss": 0.7593, "grad_norm": 0.7485767006874084, "learning_rate": 0.0002, "epoch": 1.304129263913824, "step": 18160}, {"loss": 0.7293, "grad_norm": 0.6383684277534485, "learning_rate": 0.0002, "epoch": 1.3048473967684022, "step": 18170}, {"loss": 0.7929, "grad_norm": 0.5934187173843384, "learning_rate": 0.0002, "epoch": 1.3055655296229802, "step": 18180}, {"loss": 0.7576, "grad_norm": 0.7265770435333252, "learning_rate": 0.0002, "epoch": 1.3062836624775582, "step": 18190}, {"loss": 0.7126, "grad_norm": 0.8149140477180481, "learning_rate": 0.0002, "epoch": 1.3070017953321365, "step": 18200}, {"loss": 0.7529, "grad_norm": 0.8067880272865295, "learning_rate": 0.0002, "epoch": 1.3077199281867147, "step": 18210}, {"loss": 0.7173, "grad_norm": 0.6109178066253662, "learning_rate": 0.0002, "epoch": 1.3084380610412927, "step": 18220}, {"loss": 0.7452, "grad_norm": 0.7194176316261292, "learning_rate": 0.0002, "epoch": 1.3091561938958707, "step": 18230}, {"loss": 0.732, "grad_norm": 0.6452242136001587, "learning_rate": 0.0002, "epoch": 1.309874326750449, "step": 18240}, {"loss": 0.7772, "grad_norm": 0.680550217628479, "learning_rate": 0.0002, "epoch": 1.310592459605027, "step": 18250}, {"loss": 0.7334, "grad_norm": 0.7005740404129028, "learning_rate": 0.0002, "epoch": 1.3113105924596051, "step": 18260}, {"loss": 0.7537, "grad_norm": 0.7217825055122375, "learning_rate": 0.0002, "epoch": 1.3120287253141831, "step": 18270}, {"loss": 0.7797, "grad_norm": 0.7730209231376648, "learning_rate": 0.0002, "epoch": 1.3127468581687611, "step": 18280}, {"loss": 0.7257, "grad_norm": 0.8291956186294556, "learning_rate": 0.0002, "epoch": 1.3134649910233394, "step": 18290}, {"loss": 0.7234, "grad_norm": 0.758528470993042, "learning_rate": 0.0002, "epoch": 1.3141831238779174, "step": 18300}, {"loss": 0.6915, "grad_norm": 0.9682782292366028, "learning_rate": 0.0002, "epoch": 1.3149012567324956, "step": 18310}, {"loss": 0.686, "grad_norm": 0.5784780979156494, "learning_rate": 0.0002, "epoch": 1.3156193895870736, "step": 18320}, {"loss": 0.7277, "grad_norm": 0.5870532393455505, "learning_rate": 0.0002, "epoch": 1.3163375224416516, "step": 18330}, {"loss": 0.7594, "grad_norm": 0.5950172543525696, "learning_rate": 0.0002, "epoch": 1.3170556552962298, "step": 18340}, {"loss": 0.7086, "grad_norm": 0.7625961899757385, "learning_rate": 0.0002, "epoch": 1.317773788150808, "step": 18350}, {"loss": 0.7075, "grad_norm": 0.8027397394180298, "learning_rate": 0.0002, "epoch": 1.318491921005386, "step": 18360}, {"loss": 0.7249, "grad_norm": 0.8424779772758484, "learning_rate": 0.0002, "epoch": 1.319210053859964, "step": 18370}, {"loss": 0.7349, "grad_norm": 0.5741737484931946, "learning_rate": 0.0002, "epoch": 1.3199281867145423, "step": 18380}, {"loss": 0.7421, "grad_norm": 0.7363710999488831, "learning_rate": 0.0002, "epoch": 1.3206463195691203, "step": 18390}, {"loss": 0.7208, "grad_norm": 0.7900536060333252, "learning_rate": 0.0002, "epoch": 1.3213644524236985, "step": 18400}, {"loss": 0.6836, "grad_norm": 0.6273105144500732, "learning_rate": 0.0002, "epoch": 1.3220825852782765, "step": 18410}, {"loss": 0.7365, "grad_norm": 0.7612496018409729, "learning_rate": 0.0002, "epoch": 1.3228007181328545, "step": 18420}, {"loss": 0.7521, "grad_norm": 0.729653537273407, "learning_rate": 0.0002, "epoch": 1.3235188509874327, "step": 18430}, {"loss": 0.7153, "grad_norm": 0.6599212288856506, "learning_rate": 0.0002, "epoch": 1.3242369838420107, "step": 18440}, {"loss": 0.7315, "grad_norm": 0.762320876121521, "learning_rate": 0.0002, "epoch": 1.324955116696589, "step": 18450}, {"loss": 0.6986, "grad_norm": 0.7468838095664978, "learning_rate": 0.0002, "epoch": 1.325673249551167, "step": 18460}, {"loss": 0.7527, "grad_norm": 0.6376237273216248, "learning_rate": 0.0002, "epoch": 1.326391382405745, "step": 18470}, {"loss": 0.7173, "grad_norm": 0.6722603440284729, "learning_rate": 0.0002, "epoch": 1.3271095152603232, "step": 18480}, {"loss": 0.6821, "grad_norm": 0.7011231780052185, "learning_rate": 0.0002, "epoch": 1.3278276481149014, "step": 18490}, {"loss": 0.7942, "grad_norm": 0.5325027108192444, "learning_rate": 0.0002, "epoch": 1.3285457809694794, "step": 18500}, {"loss": 0.6709, "grad_norm": 0.6916731595993042, "learning_rate": 0.0002, "epoch": 1.3292639138240574, "step": 18510}, {"loss": 0.7204, "grad_norm": 0.6529106497764587, "learning_rate": 0.0002, "epoch": 1.3299820466786356, "step": 18520}, {"loss": 0.7289, "grad_norm": 0.7708640694618225, "learning_rate": 0.0002, "epoch": 1.3307001795332136, "step": 18530}, {"loss": 0.7688, "grad_norm": 0.7125861048698425, "learning_rate": 0.0002, "epoch": 1.3314183123877918, "step": 18540}, {"loss": 0.723, "grad_norm": 0.7663969993591309, "learning_rate": 0.0002, "epoch": 1.3321364452423698, "step": 18550}, {"loss": 0.6993, "grad_norm": 0.601141631603241, "learning_rate": 0.0002, "epoch": 1.3328545780969479, "step": 18560}, {"loss": 0.734, "grad_norm": 0.6185581088066101, "learning_rate": 0.0002, "epoch": 1.333572710951526, "step": 18570}, {"loss": 0.6938, "grad_norm": 0.6136596202850342, "learning_rate": 0.0002, "epoch": 1.334290843806104, "step": 18580}, {"loss": 0.6963, "grad_norm": 0.8377187252044678, "learning_rate": 0.0002, "epoch": 1.3350089766606823, "step": 18590}, {"loss": 0.7399, "grad_norm": 0.7649989724159241, "learning_rate": 0.0002, "epoch": 1.3357271095152603, "step": 18600}, {"loss": 0.7565, "grad_norm": 0.7944515347480774, "learning_rate": 0.0002, "epoch": 1.3364452423698383, "step": 18610}, {"loss": 0.7894, "grad_norm": 0.619024395942688, "learning_rate": 0.0002, "epoch": 1.3371633752244165, "step": 18620}, {"loss": 0.7497, "grad_norm": 0.7849082946777344, "learning_rate": 0.0002, "epoch": 1.3378815080789948, "step": 18630}, {"loss": 0.7123, "grad_norm": 0.5740780830383301, "learning_rate": 0.0002, "epoch": 1.3385996409335728, "step": 18640}, {"loss": 0.7211, "grad_norm": 0.6897456645965576, "learning_rate": 0.0002, "epoch": 1.3393177737881508, "step": 18650}, {"loss": 0.7174, "grad_norm": 0.6263600587844849, "learning_rate": 0.0002, "epoch": 1.340035906642729, "step": 18660}, {"loss": 0.7048, "grad_norm": 0.5744550824165344, "learning_rate": 0.0002, "epoch": 1.340754039497307, "step": 18670}, {"loss": 0.7773, "grad_norm": 0.7785728573799133, "learning_rate": 0.0002, "epoch": 1.3414721723518852, "step": 18680}, {"loss": 0.7697, "grad_norm": 0.6944230198860168, "learning_rate": 0.0002, "epoch": 1.3421903052064632, "step": 18690}, {"loss": 0.7387, "grad_norm": 0.7388073801994324, "learning_rate": 0.0002, "epoch": 1.3429084380610412, "step": 18700}, {"loss": 0.7776, "grad_norm": 0.9555586576461792, "learning_rate": 0.0002, "epoch": 1.3436265709156194, "step": 18710}, {"loss": 0.7308, "grad_norm": 0.8510582447052002, "learning_rate": 0.0002, "epoch": 1.3443447037701974, "step": 18720}, {"loss": 0.7131, "grad_norm": 0.6093049645423889, "learning_rate": 0.0002, "epoch": 1.3450628366247757, "step": 18730}, {"loss": 0.7194, "grad_norm": 0.9159273505210876, "learning_rate": 0.0002, "epoch": 1.3457809694793537, "step": 18740}, {"loss": 0.7626, "grad_norm": 0.7188084721565247, "learning_rate": 0.0002, "epoch": 1.3464991023339317, "step": 18750}, {"loss": 0.7212, "grad_norm": 0.7228650450706482, "learning_rate": 0.0002, "epoch": 1.3472172351885099, "step": 18760}, {"loss": 0.7213, "grad_norm": 0.8160615563392639, "learning_rate": 0.0002, "epoch": 1.347935368043088, "step": 18770}, {"loss": 0.7093, "grad_norm": 0.6485389471054077, "learning_rate": 0.0002, "epoch": 1.3486535008976661, "step": 18780}, {"loss": 0.7044, "grad_norm": 0.6755139827728271, "learning_rate": 0.0002, "epoch": 1.3493716337522441, "step": 18790}, {"loss": 0.7413, "grad_norm": 0.6923297643661499, "learning_rate": 0.0002, "epoch": 1.3500897666068223, "step": 18800}, {"loss": 0.7184, "grad_norm": 0.6954510807991028, "learning_rate": 0.0002, "epoch": 1.3508078994614003, "step": 18810}, {"loss": 0.6987, "grad_norm": 0.9948558807373047, "learning_rate": 0.0002, "epoch": 1.3515260323159786, "step": 18820}, {"loss": 0.7315, "grad_norm": 0.708381175994873, "learning_rate": 0.0002, "epoch": 1.3522441651705566, "step": 18830}, {"loss": 0.7135, "grad_norm": 0.6409999132156372, "learning_rate": 0.0002, "epoch": 1.3529622980251346, "step": 18840}, {"loss": 0.7204, "grad_norm": 0.6365936994552612, "learning_rate": 0.0002, "epoch": 1.3536804308797128, "step": 18850}, {"loss": 0.691, "grad_norm": 0.7620742917060852, "learning_rate": 0.0002, "epoch": 1.3543985637342908, "step": 18860}, {"loss": 0.7458, "grad_norm": 0.6849071383476257, "learning_rate": 0.0002, "epoch": 1.355116696588869, "step": 18870}, {"loss": 0.7221, "grad_norm": 0.5776316523551941, "learning_rate": 0.0002, "epoch": 1.355834829443447, "step": 18880}, {"loss": 0.7412, "grad_norm": 0.597236156463623, "learning_rate": 0.0002, "epoch": 1.356552962298025, "step": 18890}, {"loss": 0.7065, "grad_norm": 0.6569282412528992, "learning_rate": 0.0002, "epoch": 1.3572710951526032, "step": 18900}, {"loss": 0.6995, "grad_norm": 0.6384802460670471, "learning_rate": 0.0002, "epoch": 1.3579892280071812, "step": 18910}, {"loss": 0.7592, "grad_norm": 0.6623879671096802, "learning_rate": 0.0002, "epoch": 1.3587073608617595, "step": 18920}, {"loss": 0.7288, "grad_norm": 0.6149632334709167, "learning_rate": 0.0002, "epoch": 1.3594254937163375, "step": 18930}, {"loss": 0.7392, "grad_norm": 0.6978002190589905, "learning_rate": 0.0002, "epoch": 1.3601436265709157, "step": 18940}, {"loss": 0.7405, "grad_norm": 0.7579124569892883, "learning_rate": 0.0002, "epoch": 1.3608617594254937, "step": 18950}, {"loss": 0.7589, "grad_norm": 0.7138084173202515, "learning_rate": 0.0002, "epoch": 1.361579892280072, "step": 18960}, {"loss": 0.7257, "grad_norm": 0.678322434425354, "learning_rate": 0.0002, "epoch": 1.36229802513465, "step": 18970}, {"loss": 0.7221, "grad_norm": 0.694346010684967, "learning_rate": 0.0002, "epoch": 1.363016157989228, "step": 18980}, {"loss": 0.6986, "grad_norm": 0.682262659072876, "learning_rate": 0.0002, "epoch": 1.3637342908438062, "step": 18990}, {"loss": 0.7297, "grad_norm": 0.9068194627761841, "learning_rate": 0.0002, "epoch": 1.3644524236983842, "step": 19000}, {"loss": 0.756, "grad_norm": 0.6691566705703735, "learning_rate": 0.0002, "epoch": 1.3651705565529624, "step": 19010}, {"loss": 0.7158, "grad_norm": 0.7791378498077393, "learning_rate": 0.0002, "epoch": 1.3658886894075404, "step": 19020}, {"loss": 0.6904, "grad_norm": 0.717107355594635, "learning_rate": 0.0002, "epoch": 1.3666068222621184, "step": 19030}, {"loss": 0.7308, "grad_norm": 0.7897566556930542, "learning_rate": 0.0002, "epoch": 1.3673249551166966, "step": 19040}, {"loss": 0.7278, "grad_norm": 0.8823844790458679, "learning_rate": 0.0002, "epoch": 1.3680430879712746, "step": 19050}, {"loss": 0.7252, "grad_norm": 0.6512053608894348, "learning_rate": 0.0002, "epoch": 1.3687612208258528, "step": 19060}, {"loss": 0.6861, "grad_norm": 0.6871389150619507, "learning_rate": 0.0002, "epoch": 1.3694793536804308, "step": 19070}, {"loss": 0.7311, "grad_norm": 0.6795603036880493, "learning_rate": 0.0002, "epoch": 1.370197486535009, "step": 19080}, {"loss": 0.7351, "grad_norm": 0.6569121479988098, "learning_rate": 0.0002, "epoch": 1.370915619389587, "step": 19090}, {"loss": 0.7743, "grad_norm": 0.6769960522651672, "learning_rate": 0.0002, "epoch": 1.3716337522441653, "step": 19100}, {"loss": 0.7275, "grad_norm": 0.726613461971283, "learning_rate": 0.0002, "epoch": 1.3723518850987433, "step": 19110}, {"loss": 0.7484, "grad_norm": 0.7287817001342773, "learning_rate": 0.0002, "epoch": 1.3730700179533213, "step": 19120}, {"loss": 0.7305, "grad_norm": 0.6169242858886719, "learning_rate": 0.0002, "epoch": 1.3737881508078995, "step": 19130}, {"loss": 0.7195, "grad_norm": 0.6537347435951233, "learning_rate": 0.0002, "epoch": 1.3745062836624775, "step": 19140}, {"loss": 0.7402, "grad_norm": 0.6113879680633545, "learning_rate": 0.0002, "epoch": 1.3752244165170557, "step": 19150}, {"loss": 0.7012, "grad_norm": 0.6415297985076904, "learning_rate": 0.0002, "epoch": 1.3759425493716337, "step": 19160}, {"loss": 0.7367, "grad_norm": 0.6812838315963745, "learning_rate": 0.0002, "epoch": 1.3766606822262117, "step": 19170}, {"loss": 0.7117, "grad_norm": 0.7331814169883728, "learning_rate": 0.0002, "epoch": 1.37737881508079, "step": 19180}, {"loss": 0.7496, "grad_norm": 0.7265108823776245, "learning_rate": 0.0002, "epoch": 1.378096947935368, "step": 19190}, {"loss": 0.699, "grad_norm": 0.6233167052268982, "learning_rate": 0.0002, "epoch": 1.3788150807899462, "step": 19200}, {"loss": 0.6978, "grad_norm": 0.6841492652893066, "learning_rate": 0.0002, "epoch": 1.3795332136445242, "step": 19210}, {"loss": 0.6934, "grad_norm": 0.822853684425354, "learning_rate": 0.0002, "epoch": 1.3802513464991024, "step": 19220}, {"loss": 0.7574, "grad_norm": 0.8078812956809998, "learning_rate": 0.0002, "epoch": 1.3809694793536804, "step": 19230}, {"loss": 0.7429, "grad_norm": 0.7269898056983948, "learning_rate": 0.0002, "epoch": 1.3816876122082586, "step": 19240}, {"loss": 0.7552, "grad_norm": 0.6297033429145813, "learning_rate": 0.0002, "epoch": 1.3824057450628366, "step": 19250}, {"loss": 0.7396, "grad_norm": 0.8097442388534546, "learning_rate": 0.0002, "epoch": 1.3831238779174146, "step": 19260}, {"loss": 0.7281, "grad_norm": 0.6442803740501404, "learning_rate": 0.0002, "epoch": 1.3838420107719929, "step": 19270}, {"loss": 0.7598, "grad_norm": 0.659866213798523, "learning_rate": 0.0002, "epoch": 1.3845601436265709, "step": 19280}, {"loss": 0.7262, "grad_norm": 0.7537921667098999, "learning_rate": 0.0002, "epoch": 1.385278276481149, "step": 19290}, {"loss": 0.7215, "grad_norm": 0.8441828489303589, "learning_rate": 0.0002, "epoch": 1.385996409335727, "step": 19300}, {"loss": 0.725, "grad_norm": 0.8506057262420654, "learning_rate": 0.0002, "epoch": 1.386714542190305, "step": 19310}, {"loss": 0.7747, "grad_norm": 0.6747094392776489, "learning_rate": 0.0002, "epoch": 1.3874326750448833, "step": 19320}, {"loss": 0.7785, "grad_norm": 0.7906509041786194, "learning_rate": 0.0002, "epoch": 1.3881508078994613, "step": 19330}, {"loss": 0.8147, "grad_norm": 0.6784867644309998, "learning_rate": 0.0002, "epoch": 1.3888689407540395, "step": 19340}, {"loss": 0.7861, "grad_norm": 0.6371709108352661, "learning_rate": 0.0002, "epoch": 1.3895870736086176, "step": 19350}, {"loss": 0.7434, "grad_norm": 0.7858285307884216, "learning_rate": 0.0002, "epoch": 1.3903052064631956, "step": 19360}, {"loss": 0.7638, "grad_norm": 0.711395263671875, "learning_rate": 0.0002, "epoch": 1.3910233393177738, "step": 19370}, {"loss": 0.725, "grad_norm": 0.7023257613182068, "learning_rate": 0.0002, "epoch": 1.391741472172352, "step": 19380}, {"loss": 0.7612, "grad_norm": 0.7036022543907166, "learning_rate": 0.0002, "epoch": 1.39245960502693, "step": 19390}, {"loss": 0.7354, "grad_norm": 0.6418436169624329, "learning_rate": 0.0002, "epoch": 1.393177737881508, "step": 19400}, {"loss": 0.7444, "grad_norm": 0.7108847498893738, "learning_rate": 0.0002, "epoch": 1.3938958707360862, "step": 19410}, {"loss": 0.771, "grad_norm": 0.6940230131149292, "learning_rate": 0.0002, "epoch": 1.3946140035906642, "step": 19420}, {"loss": 0.6791, "grad_norm": 0.6750220656394958, "learning_rate": 0.0002, "epoch": 1.3953321364452425, "step": 19430}, {"loss": 0.7466, "grad_norm": 0.7479177713394165, "learning_rate": 0.0002, "epoch": 1.3960502692998205, "step": 19440}, {"loss": 0.7259, "grad_norm": 0.626124918460846, "learning_rate": 0.0002, "epoch": 1.3967684021543985, "step": 19450}, {"loss": 0.7108, "grad_norm": 0.8908559083938599, "learning_rate": 0.0002, "epoch": 1.3974865350089767, "step": 19460}, {"loss": 0.7451, "grad_norm": 0.6163712739944458, "learning_rate": 0.0002, "epoch": 1.3982046678635547, "step": 19470}, {"loss": 0.7437, "grad_norm": 0.6993312239646912, "learning_rate": 0.0002, "epoch": 1.398922800718133, "step": 19480}, {"loss": 0.7035, "grad_norm": 0.6162890791893005, "learning_rate": 0.0002, "epoch": 1.399640933572711, "step": 19490}, {"loss": 0.7455, "grad_norm": 0.7797643542289734, "learning_rate": 0.0002, "epoch": 1.400359066427289, "step": 19500}, {"loss": 0.7497, "grad_norm": 0.7038744688034058, "learning_rate": 0.0002, "epoch": 1.4010771992818671, "step": 19510}, {"loss": 0.7084, "grad_norm": 0.6902393698692322, "learning_rate": 0.0002, "epoch": 1.4017953321364454, "step": 19520}, {"loss": 0.7136, "grad_norm": 0.5436386466026306, "learning_rate": 0.0002, "epoch": 1.4025134649910234, "step": 19530}, {"loss": 0.7457, "grad_norm": 0.6537990570068359, "learning_rate": 0.0002, "epoch": 1.4032315978456014, "step": 19540}, {"loss": 0.727, "grad_norm": 0.739691972732544, "learning_rate": 0.0002, "epoch": 1.4039497307001796, "step": 19550}, {"loss": 0.7537, "grad_norm": 0.7287635803222656, "learning_rate": 0.0002, "epoch": 1.4046678635547576, "step": 19560}, {"loss": 0.707, "grad_norm": 0.6809501051902771, "learning_rate": 0.0002, "epoch": 1.4053859964093358, "step": 19570}, {"loss": 0.7336, "grad_norm": 0.8302195072174072, "learning_rate": 0.0002, "epoch": 1.4061041292639138, "step": 19580}, {"loss": 0.7201, "grad_norm": 0.6613629460334778, "learning_rate": 0.0002, "epoch": 1.4068222621184918, "step": 19590}, {"loss": 0.7415, "grad_norm": 0.7897207736968994, "learning_rate": 0.0002, "epoch": 1.40754039497307, "step": 19600}, {"loss": 0.7483, "grad_norm": 0.8368293642997742, "learning_rate": 0.0002, "epoch": 1.408258527827648, "step": 19610}, {"loss": 0.7412, "grad_norm": 0.665109395980835, "learning_rate": 0.0002, "epoch": 1.4089766606822263, "step": 19620}, {"loss": 0.7339, "grad_norm": 0.7359302639961243, "learning_rate": 0.0002, "epoch": 1.4096947935368043, "step": 19630}, {"loss": 0.7775, "grad_norm": 0.8048052787780762, "learning_rate": 0.0002, "epoch": 1.4104129263913823, "step": 19640}, {"loss": 0.7668, "grad_norm": 0.7414906620979309, "learning_rate": 0.0002, "epoch": 1.4111310592459605, "step": 19650}, {"loss": 0.7386, "grad_norm": 0.7894161343574524, "learning_rate": 0.0002, "epoch": 1.4118491921005387, "step": 19660}, {"loss": 0.7371, "grad_norm": 0.6724628210067749, "learning_rate": 0.0002, "epoch": 1.4125673249551167, "step": 19670}, {"loss": 0.7243, "grad_norm": 0.9397756457328796, "learning_rate": 0.0002, "epoch": 1.4132854578096947, "step": 19680}, {"loss": 0.7109, "grad_norm": 0.6684842109680176, "learning_rate": 0.0002, "epoch": 1.414003590664273, "step": 19690}, {"loss": 0.7693, "grad_norm": 0.7753993272781372, "learning_rate": 0.0002, "epoch": 1.414721723518851, "step": 19700}, {"loss": 0.7653, "grad_norm": 0.6934253573417664, "learning_rate": 0.0002, "epoch": 1.4154398563734292, "step": 19710}, {"loss": 0.7393, "grad_norm": 0.8567284941673279, "learning_rate": 0.0002, "epoch": 1.4161579892280072, "step": 19720}, {"loss": 0.6907, "grad_norm": 0.9471787214279175, "learning_rate": 0.0002, "epoch": 1.4168761220825852, "step": 19730}, {"loss": 0.709, "grad_norm": 0.6664855480194092, "learning_rate": 0.0002, "epoch": 1.4175942549371634, "step": 19740}, {"loss": 0.7149, "grad_norm": 0.6713361740112305, "learning_rate": 0.0002, "epoch": 1.4183123877917414, "step": 19750}, {"loss": 0.7302, "grad_norm": 0.6488258838653564, "learning_rate": 0.0002, "epoch": 1.4190305206463196, "step": 19760}, {"loss": 0.7612, "grad_norm": 0.7089938521385193, "learning_rate": 0.0002, "epoch": 1.4197486535008976, "step": 19770}, {"loss": 0.7245, "grad_norm": 0.6433218717575073, "learning_rate": 0.0002, "epoch": 1.4204667863554756, "step": 19780}, {"loss": 0.7105, "grad_norm": 0.7025160193443298, "learning_rate": 0.0002, "epoch": 1.4211849192100539, "step": 19790}, {"loss": 0.7948, "grad_norm": 0.7030544877052307, "learning_rate": 0.0002, "epoch": 1.421903052064632, "step": 19800}, {"loss": 0.7333, "grad_norm": 0.6515552401542664, "learning_rate": 0.0002, "epoch": 1.42262118491921, "step": 19810}, {"loss": 0.7342, "grad_norm": 0.6463841795921326, "learning_rate": 0.0002, "epoch": 1.423339317773788, "step": 19820}, {"loss": 0.7457, "grad_norm": 0.6654344201087952, "learning_rate": 0.0002, "epoch": 1.4240574506283663, "step": 19830}, {"loss": 0.7289, "grad_norm": 0.7223384380340576, "learning_rate": 0.0002, "epoch": 1.4247755834829443, "step": 19840}, {"loss": 0.7471, "grad_norm": 0.6575722694396973, "learning_rate": 0.0002, "epoch": 1.4254937163375225, "step": 19850}, {"loss": 0.7559, "grad_norm": 0.6216059327125549, "learning_rate": 0.0002, "epoch": 1.4262118491921005, "step": 19860}, {"loss": 0.7638, "grad_norm": 0.7451487183570862, "learning_rate": 0.0002, "epoch": 1.4269299820466785, "step": 19870}, {"loss": 0.7083, "grad_norm": 0.6563336253166199, "learning_rate": 0.0002, "epoch": 1.4276481149012568, "step": 19880}, {"loss": 0.7122, "grad_norm": 0.8021975159645081, "learning_rate": 0.0002, "epoch": 1.4283662477558348, "step": 19890}, {"loss": 0.7389, "grad_norm": 0.7474712133407593, "learning_rate": 0.0002, "epoch": 1.429084380610413, "step": 19900}, {"loss": 0.7839, "grad_norm": 0.7316377758979797, "learning_rate": 0.0002, "epoch": 1.429802513464991, "step": 19910}, {"loss": 0.7588, "grad_norm": 0.646892786026001, "learning_rate": 0.0002, "epoch": 1.430520646319569, "step": 19920}, {"loss": 0.7175, "grad_norm": 0.6268765926361084, "learning_rate": 0.0002, "epoch": 1.4312387791741472, "step": 19930}, {"loss": 0.7502, "grad_norm": 0.7104699611663818, "learning_rate": 0.0002, "epoch": 1.4319569120287254, "step": 19940}, {"loss": 0.7006, "grad_norm": 0.6742063760757446, "learning_rate": 0.0002, "epoch": 1.4326750448833034, "step": 19950}, {"loss": 0.7394, "grad_norm": 0.6973381638526917, "learning_rate": 0.0002, "epoch": 1.4333931777378814, "step": 19960}, {"loss": 0.7428, "grad_norm": 0.5819381475448608, "learning_rate": 0.0002, "epoch": 1.4341113105924597, "step": 19970}, {"loss": 0.7836, "grad_norm": 0.680623471736908, "learning_rate": 0.0002, "epoch": 1.4348294434470377, "step": 19980}, {"loss": 0.7063, "grad_norm": 0.5899890661239624, "learning_rate": 0.0002, "epoch": 1.435547576301616, "step": 19990}, {"loss": 0.7438, "grad_norm": 0.6225098371505737, "learning_rate": 0.0002, "epoch": 1.436265709156194, "step": 20000}, {"loss": 0.7065, "grad_norm": 0.6314228773117065, "learning_rate": 0.0002, "epoch": 1.436983842010772, "step": 20010}, {"loss": 0.677, "grad_norm": 0.8690667152404785, "learning_rate": 0.0002, "epoch": 1.4377019748653501, "step": 20020}, {"loss": 0.7491, "grad_norm": 0.7166543006896973, "learning_rate": 0.0002, "epoch": 1.4384201077199281, "step": 20030}, {"loss": 0.7686, "grad_norm": 0.7051591873168945, "learning_rate": 0.0002, "epoch": 1.4391382405745063, "step": 20040}, {"loss": 0.6669, "grad_norm": 0.7606652975082397, "learning_rate": 0.0002, "epoch": 1.4398563734290843, "step": 20050}, {"loss": 0.7427, "grad_norm": 0.6343185305595398, "learning_rate": 0.0002, "epoch": 1.4405745062836623, "step": 20060}, {"loss": 0.6956, "grad_norm": 0.5625789761543274, "learning_rate": 0.0002, "epoch": 1.4412926391382406, "step": 20070}, {"loss": 0.7421, "grad_norm": 0.6081897020339966, "learning_rate": 0.0002, "epoch": 1.4420107719928188, "step": 20080}, {"loss": 0.7646, "grad_norm": 0.9571536779403687, "learning_rate": 0.0002, "epoch": 1.4427289048473968, "step": 20090}, {"loss": 0.6939, "grad_norm": 0.869531512260437, "learning_rate": 0.0002, "epoch": 1.4434470377019748, "step": 20100}, {"loss": 0.7684, "grad_norm": 0.6865507960319519, "learning_rate": 0.0002, "epoch": 1.444165170556553, "step": 20110}, {"loss": 0.6835, "grad_norm": 0.7572755813598633, "learning_rate": 0.0002, "epoch": 1.444883303411131, "step": 20120}, {"loss": 0.7392, "grad_norm": 0.79011070728302, "learning_rate": 0.0002, "epoch": 1.4456014362657092, "step": 20130}, {"loss": 0.7624, "grad_norm": 0.8297342658042908, "learning_rate": 0.0002, "epoch": 1.4463195691202873, "step": 20140}, {"loss": 0.696, "grad_norm": 0.6593490839004517, "learning_rate": 0.0002, "epoch": 1.4470377019748653, "step": 20150}, {"loss": 0.7062, "grad_norm": 1.0264687538146973, "learning_rate": 0.0002, "epoch": 1.4477558348294435, "step": 20160}, {"loss": 0.7804, "grad_norm": 0.7032888531684875, "learning_rate": 0.0002, "epoch": 1.4484739676840215, "step": 20170}, {"loss": 0.7692, "grad_norm": 0.6438494920730591, "learning_rate": 0.0002, "epoch": 1.4491921005385997, "step": 20180}, {"loss": 0.7189, "grad_norm": 0.7448790669441223, "learning_rate": 0.0002, "epoch": 1.4499102333931777, "step": 20190}, {"loss": 0.7389, "grad_norm": 0.7551555037498474, "learning_rate": 0.0002, "epoch": 1.4506283662477557, "step": 20200}, {"loss": 0.7636, "grad_norm": 0.6677857041358948, "learning_rate": 0.0002, "epoch": 1.451346499102334, "step": 20210}, {"loss": 0.7261, "grad_norm": 0.7888486385345459, "learning_rate": 0.0002, "epoch": 1.4520646319569122, "step": 20220}, {"loss": 0.7349, "grad_norm": 0.6658565402030945, "learning_rate": 0.0002, "epoch": 1.4527827648114902, "step": 20230}, {"loss": 0.7862, "grad_norm": 0.6800249814987183, "learning_rate": 0.0002, "epoch": 1.4535008976660682, "step": 20240}, {"loss": 0.7464, "grad_norm": 0.7419682741165161, "learning_rate": 0.0002, "epoch": 1.4542190305206464, "step": 20250}, {"loss": 0.7118, "grad_norm": 0.8848792910575867, "learning_rate": 0.0002, "epoch": 1.4549371633752244, "step": 20260}, {"loss": 0.729, "grad_norm": 0.6513857245445251, "learning_rate": 0.0002, "epoch": 1.4556552962298026, "step": 20270}, {"loss": 0.7325, "grad_norm": 0.5605742335319519, "learning_rate": 0.0002, "epoch": 1.4563734290843806, "step": 20280}, {"loss": 0.7078, "grad_norm": 0.6737141013145447, "learning_rate": 0.0002, "epoch": 1.4570915619389586, "step": 20290}, {"loss": 0.6971, "grad_norm": 0.6663289666175842, "learning_rate": 0.0002, "epoch": 1.4578096947935368, "step": 20300}, {"loss": 0.7161, "grad_norm": 0.7157106995582581, "learning_rate": 0.0002, "epoch": 1.4585278276481148, "step": 20310}, {"loss": 0.7024, "grad_norm": 0.7713354825973511, "learning_rate": 0.0002, "epoch": 1.459245960502693, "step": 20320}, {"loss": 0.7043, "grad_norm": 0.8334044218063354, "learning_rate": 0.0002, "epoch": 1.459964093357271, "step": 20330}, {"loss": 0.7151, "grad_norm": 0.7268327474594116, "learning_rate": 0.0002, "epoch": 1.460682226211849, "step": 20340}, {"loss": 0.7415, "grad_norm": 0.6791431903839111, "learning_rate": 0.0002, "epoch": 1.4614003590664273, "step": 20350}, {"loss": 0.7738, "grad_norm": 0.8177870512008667, "learning_rate": 0.0002, "epoch": 1.4621184919210055, "step": 20360}, {"loss": 0.7212, "grad_norm": 0.8064364790916443, "learning_rate": 0.0002, "epoch": 1.4628366247755835, "step": 20370}, {"loss": 0.7285, "grad_norm": 0.6547006964683533, "learning_rate": 0.0002, "epoch": 1.4635547576301615, "step": 20380}, {"loss": 0.7444, "grad_norm": 0.6381436586380005, "learning_rate": 0.0002, "epoch": 1.4642728904847397, "step": 20390}, {"loss": 0.7593, "grad_norm": 0.7351248264312744, "learning_rate": 0.0002, "epoch": 1.4649910233393177, "step": 20400}, {"loss": 0.7385, "grad_norm": 0.7037558555603027, "learning_rate": 0.0002, "epoch": 1.465709156193896, "step": 20410}, {"loss": 0.7815, "grad_norm": 0.6294074654579163, "learning_rate": 0.0002, "epoch": 1.466427289048474, "step": 20420}, {"loss": 0.6665, "grad_norm": 0.9722632765769958, "learning_rate": 0.0002, "epoch": 1.467145421903052, "step": 20430}, {"loss": 0.7363, "grad_norm": 0.753065824508667, "learning_rate": 0.0002, "epoch": 1.4678635547576302, "step": 20440}, {"loss": 0.7568, "grad_norm": 0.7317194938659668, "learning_rate": 0.0002, "epoch": 1.4685816876122082, "step": 20450}, {"loss": 0.6948, "grad_norm": 0.6862193942070007, "learning_rate": 0.0002, "epoch": 1.4692998204667864, "step": 20460}, {"loss": 0.7552, "grad_norm": 0.7643225193023682, "learning_rate": 0.0002, "epoch": 1.4700179533213644, "step": 20470}, {"loss": 0.6757, "grad_norm": 0.5904353260993958, "learning_rate": 0.0002, "epoch": 1.4707360861759424, "step": 20480}, {"loss": 0.7779, "grad_norm": 0.5812238454818726, "learning_rate": 0.0002, "epoch": 1.4714542190305206, "step": 20490}, {"loss": 0.7252, "grad_norm": 0.7478151321411133, "learning_rate": 0.0002, "epoch": 1.4721723518850989, "step": 20500}, {"loss": 0.7165, "grad_norm": 0.7625645399093628, "learning_rate": 0.0002, "epoch": 1.4728904847396769, "step": 20510}, {"loss": 0.7383, "grad_norm": 0.6354498267173767, "learning_rate": 0.0002, "epoch": 1.4736086175942549, "step": 20520}, {"loss": 0.7095, "grad_norm": 0.8731162548065186, "learning_rate": 0.0002, "epoch": 1.474326750448833, "step": 20530}, {"loss": 0.7535, "grad_norm": 0.7346670627593994, "learning_rate": 0.0002, "epoch": 1.475044883303411, "step": 20540}, {"loss": 0.78, "grad_norm": 1.038447618484497, "learning_rate": 0.0002, "epoch": 1.4757630161579893, "step": 20550}, {"loss": 0.7026, "grad_norm": 0.7032809257507324, "learning_rate": 0.0002, "epoch": 1.4764811490125673, "step": 20560}, {"loss": 0.6776, "grad_norm": 0.8008337020874023, "learning_rate": 0.0002, "epoch": 1.4771992818671453, "step": 20570}, {"loss": 0.776, "grad_norm": 0.6735056638717651, "learning_rate": 0.0002, "epoch": 1.4779174147217236, "step": 20580}, {"loss": 0.7632, "grad_norm": 0.622056245803833, "learning_rate": 0.0002, "epoch": 1.4786355475763016, "step": 20590}, {"loss": 0.7467, "grad_norm": 0.6580422520637512, "learning_rate": 0.0002, "epoch": 1.4793536804308798, "step": 20600}, {"loss": 0.7161, "grad_norm": 0.8401153087615967, "learning_rate": 0.0002, "epoch": 1.4800718132854578, "step": 20610}, {"loss": 0.7581, "grad_norm": 0.7564560770988464, "learning_rate": 0.0002, "epoch": 1.4807899461400358, "step": 20620}, {"loss": 0.7507, "grad_norm": 0.8319511413574219, "learning_rate": 0.0002, "epoch": 1.481508078994614, "step": 20630}, {"loss": 0.7379, "grad_norm": 0.7430182695388794, "learning_rate": 0.0002, "epoch": 1.4822262118491922, "step": 20640}, {"loss": 0.7273, "grad_norm": 0.7996522784233093, "learning_rate": 0.0002, "epoch": 1.4829443447037702, "step": 20650}, {"loss": 0.7223, "grad_norm": 0.6993277072906494, "learning_rate": 0.0002, "epoch": 1.4836624775583482, "step": 20660}, {"loss": 0.7328, "grad_norm": 0.8621185421943665, "learning_rate": 0.0002, "epoch": 1.4843806104129265, "step": 20670}, {"loss": 0.7327, "grad_norm": 0.7709757685661316, "learning_rate": 0.0002, "epoch": 1.4850987432675045, "step": 20680}, {"loss": 0.7053, "grad_norm": 0.743760347366333, "learning_rate": 0.0002, "epoch": 1.4858168761220827, "step": 20690}, {"loss": 0.6763, "grad_norm": 0.8353745341300964, "learning_rate": 0.0002, "epoch": 1.4865350089766607, "step": 20700}, {"loss": 0.6933, "grad_norm": 0.8510433435440063, "learning_rate": 0.0002, "epoch": 1.4872531418312387, "step": 20710}, {"loss": 0.7486, "grad_norm": 0.7065894603729248, "learning_rate": 0.0002, "epoch": 1.487971274685817, "step": 20720}, {"loss": 0.736, "grad_norm": 0.6878955960273743, "learning_rate": 0.0002, "epoch": 1.488689407540395, "step": 20730}, {"loss": 0.6958, "grad_norm": 0.7861111760139465, "learning_rate": 0.0002, "epoch": 1.4894075403949731, "step": 20740}, {"loss": 0.7568, "grad_norm": 0.4810725152492523, "learning_rate": 0.0002, "epoch": 1.4901256732495511, "step": 20750}, {"loss": 0.8147, "grad_norm": 0.7246082425117493, "learning_rate": 0.0002, "epoch": 1.4908438061041291, "step": 20760}, {"loss": 0.7312, "grad_norm": 0.7101936340332031, "learning_rate": 0.0002, "epoch": 1.4915619389587074, "step": 20770}, {"loss": 0.7393, "grad_norm": 0.7508591413497925, "learning_rate": 0.0002, "epoch": 1.4922800718132856, "step": 20780}, {"loss": 0.7635, "grad_norm": 0.8872039914131165, "learning_rate": 0.0002, "epoch": 1.4929982046678636, "step": 20790}, {"loss": 0.7352, "grad_norm": 0.7257922887802124, "learning_rate": 0.0002, "epoch": 1.4937163375224416, "step": 20800}, {"loss": 0.7497, "grad_norm": 0.7886278629302979, "learning_rate": 0.0002, "epoch": 1.4944344703770198, "step": 20810}, {"loss": 0.7247, "grad_norm": 0.6746290922164917, "learning_rate": 0.0002, "epoch": 1.4951526032315978, "step": 20820}, {"loss": 0.7836, "grad_norm": 0.8118207454681396, "learning_rate": 0.0002, "epoch": 1.495870736086176, "step": 20830}, {"loss": 0.7323, "grad_norm": 0.7337301969528198, "learning_rate": 0.0002, "epoch": 1.496588868940754, "step": 20840}, {"loss": 0.7105, "grad_norm": 0.5451242327690125, "learning_rate": 0.0002, "epoch": 1.497307001795332, "step": 20850}, {"loss": 0.7255, "grad_norm": 0.8398377299308777, "learning_rate": 0.0002, "epoch": 1.4980251346499103, "step": 20860}, {"loss": 0.7217, "grad_norm": 0.7196659445762634, "learning_rate": 0.0002, "epoch": 1.4987432675044883, "step": 20870}, {"loss": 0.6843, "grad_norm": 0.6659539937973022, "learning_rate": 0.0002, "epoch": 1.4994614003590665, "step": 20880}, {"loss": 0.7337, "grad_norm": 0.6071978807449341, "learning_rate": 0.0002, "epoch": 1.5001795332136445, "step": 20890}, {"loss": 0.7221, "grad_norm": 0.6704870462417603, "learning_rate": 0.0002, "epoch": 1.5008976660682225, "step": 20900}, {"loss": 0.6946, "grad_norm": 0.7216639518737793, "learning_rate": 0.0002, "epoch": 1.5016157989228007, "step": 20910}, {"loss": 0.7282, "grad_norm": 0.6050528287887573, "learning_rate": 0.0002, "epoch": 1.502333931777379, "step": 20920}, {"loss": 0.7142, "grad_norm": 0.7422218918800354, "learning_rate": 0.0002, "epoch": 1.503052064631957, "step": 20930}, {"loss": 0.7779, "grad_norm": 0.7157148122787476, "learning_rate": 0.0002, "epoch": 1.503770197486535, "step": 20940}, {"loss": 0.7179, "grad_norm": 0.6704899668693542, "learning_rate": 0.0002, "epoch": 1.504488330341113, "step": 20950}, {"loss": 0.7124, "grad_norm": 0.7573544979095459, "learning_rate": 0.0002, "epoch": 1.5052064631956912, "step": 20960}, {"loss": 0.7831, "grad_norm": 0.6710506677627563, "learning_rate": 0.0002, "epoch": 1.5059245960502694, "step": 20970}, {"loss": 0.7123, "grad_norm": 0.7559793591499329, "learning_rate": 0.0002, "epoch": 1.5066427289048474, "step": 20980}, {"loss": 0.7442, "grad_norm": 0.6705940961837769, "learning_rate": 0.0002, "epoch": 1.5073608617594254, "step": 20990}, {"loss": 0.7387, "grad_norm": 0.8016680479049683, "learning_rate": 0.0002, "epoch": 1.5080789946140036, "step": 21000}, {"loss": 0.7101, "grad_norm": 0.8154481649398804, "learning_rate": 0.0002, "epoch": 1.5087971274685816, "step": 21010}, {"loss": 0.7223, "grad_norm": 0.5830582976341248, "learning_rate": 0.0002, "epoch": 1.5095152603231599, "step": 21020}, {"loss": 0.753, "grad_norm": 0.7088601589202881, "learning_rate": 0.0002, "epoch": 1.5102333931777379, "step": 21030}, {"loss": 0.7278, "grad_norm": 0.7499658465385437, "learning_rate": 0.0002, "epoch": 1.5109515260323159, "step": 21040}, {"loss": 0.7441, "grad_norm": 0.7684667706489563, "learning_rate": 0.0002, "epoch": 1.511669658886894, "step": 21050}, {"loss": 0.7665, "grad_norm": 0.7183627486228943, "learning_rate": 0.0002, "epoch": 1.5123877917414723, "step": 21060}, {"loss": 0.7777, "grad_norm": 0.8201524615287781, "learning_rate": 0.0002, "epoch": 1.5131059245960503, "step": 21070}, {"loss": 0.7005, "grad_norm": 0.6359647512435913, "learning_rate": 0.0002, "epoch": 1.5138240574506283, "step": 21080}, {"loss": 0.7231, "grad_norm": 0.7419124245643616, "learning_rate": 0.0002, "epoch": 1.5145421903052063, "step": 21090}, {"loss": 0.724, "grad_norm": 0.6145808696746826, "learning_rate": 0.0002, "epoch": 1.5152603231597845, "step": 21100}, {"loss": 0.7563, "grad_norm": 0.7116656303405762, "learning_rate": 0.0002, "epoch": 1.5159784560143628, "step": 21110}, {"loss": 0.7221, "grad_norm": 0.8927125334739685, "learning_rate": 0.0002, "epoch": 1.5166965888689408, "step": 21120}, {"loss": 0.7159, "grad_norm": 0.7527788877487183, "learning_rate": 0.0002, "epoch": 1.5174147217235188, "step": 21130}, {"loss": 0.7147, "grad_norm": 0.7537266612052917, "learning_rate": 0.0002, "epoch": 1.518132854578097, "step": 21140}, {"loss": 0.7451, "grad_norm": 0.9051724672317505, "learning_rate": 0.0002, "epoch": 1.518850987432675, "step": 21150}, {"loss": 0.7362, "grad_norm": 0.7258086800575256, "learning_rate": 0.0002, "epoch": 1.5195691202872532, "step": 21160}, {"loss": 0.7096, "grad_norm": 0.60377436876297, "learning_rate": 0.0002, "epoch": 1.5202872531418312, "step": 21170}, {"loss": 0.7141, "grad_norm": 0.613362729549408, "learning_rate": 0.0002, "epoch": 1.5210053859964092, "step": 21180}, {"loss": 0.7018, "grad_norm": 0.6311782002449036, "learning_rate": 0.0002, "epoch": 1.5217235188509874, "step": 21190}, {"loss": 0.8144, "grad_norm": 0.7814380526542664, "learning_rate": 0.0002, "epoch": 1.5224416517055657, "step": 21200}, {"loss": 0.7505, "grad_norm": 0.8482790589332581, "learning_rate": 0.0002, "epoch": 1.5231597845601437, "step": 21210}, {"loss": 0.7387, "grad_norm": 0.6767336130142212, "learning_rate": 0.0002, "epoch": 1.5238779174147217, "step": 21220}, {"loss": 0.7556, "grad_norm": 0.7000219821929932, "learning_rate": 0.0002, "epoch": 1.5245960502692997, "step": 21230}, {"loss": 0.7628, "grad_norm": 0.8848617076873779, "learning_rate": 0.0002, "epoch": 1.525314183123878, "step": 21240}, {"loss": 0.7226, "grad_norm": 0.692258894443512, "learning_rate": 0.0002, "epoch": 1.5260323159784561, "step": 21250}, {"loss": 0.7535, "grad_norm": 0.7701950073242188, "learning_rate": 0.0002, "epoch": 1.5267504488330341, "step": 21260}, {"loss": 0.7531, "grad_norm": 0.7454132437705994, "learning_rate": 0.0002, "epoch": 1.5274685816876121, "step": 21270}, {"loss": 0.7663, "grad_norm": 0.7299574613571167, "learning_rate": 0.0002, "epoch": 1.5281867145421903, "step": 21280}, {"loss": 0.6993, "grad_norm": 0.6693950891494751, "learning_rate": 0.0002, "epoch": 1.5289048473967684, "step": 21290}, {"loss": 0.7567, "grad_norm": 0.8323785066604614, "learning_rate": 0.0002, "epoch": 1.5296229802513466, "step": 21300}, {"loss": 0.7205, "grad_norm": 0.8998763561248779, "learning_rate": 0.0002, "epoch": 1.5303411131059246, "step": 21310}, {"loss": 0.7779, "grad_norm": 0.8118193745613098, "learning_rate": 0.0002, "epoch": 1.5310592459605026, "step": 21320}, {"loss": 0.7642, "grad_norm": 0.8966332077980042, "learning_rate": 0.0002, "epoch": 1.5317773788150808, "step": 21330}, {"loss": 0.7626, "grad_norm": 0.7849827408790588, "learning_rate": 0.0002, "epoch": 1.532495511669659, "step": 21340}, {"loss": 0.7501, "grad_norm": 0.897583544254303, "learning_rate": 0.0002, "epoch": 1.533213644524237, "step": 21350}, {"loss": 0.7812, "grad_norm": 0.7998009324073792, "learning_rate": 0.0002, "epoch": 1.533931777378815, "step": 21360}, {"loss": 0.7217, "grad_norm": 0.5890361070632935, "learning_rate": 0.0002, "epoch": 1.534649910233393, "step": 21370}, {"loss": 0.7283, "grad_norm": 0.7321302890777588, "learning_rate": 0.0002, "epoch": 1.5353680430879713, "step": 21380}, {"loss": 0.7238, "grad_norm": 0.7746050357818604, "learning_rate": 0.0002, "epoch": 1.5360861759425495, "step": 21390}, {"loss": 0.7146, "grad_norm": 0.7033910155296326, "learning_rate": 0.0002, "epoch": 1.5368043087971275, "step": 21400}, {"loss": 0.6783, "grad_norm": 0.7229148149490356, "learning_rate": 0.0002, "epoch": 1.5375224416517055, "step": 21410}, {"loss": 0.7347, "grad_norm": 0.8055810928344727, "learning_rate": 0.0002, "epoch": 1.5382405745062837, "step": 21420}, {"loss": 0.7382, "grad_norm": 0.9411654472351074, "learning_rate": 0.0002, "epoch": 1.5389587073608617, "step": 21430}, {"loss": 0.6916, "grad_norm": 0.7297126650810242, "learning_rate": 0.0002, "epoch": 1.53967684021544, "step": 21440}, {"loss": 0.6977, "grad_norm": 0.7316457629203796, "learning_rate": 0.0002, "epoch": 1.540394973070018, "step": 21450}, {"loss": 0.713, "grad_norm": 0.8568798303604126, "learning_rate": 0.0002, "epoch": 1.541113105924596, "step": 21460}, {"loss": 0.6916, "grad_norm": 0.7829580307006836, "learning_rate": 0.0002, "epoch": 1.5418312387791742, "step": 21470}, {"loss": 0.712, "grad_norm": 0.6679823398590088, "learning_rate": 0.0002, "epoch": 1.5425493716337524, "step": 21480}, {"loss": 0.6978, "grad_norm": 0.5680868029594421, "learning_rate": 0.0002, "epoch": 1.5432675044883304, "step": 21490}, {"loss": 0.7638, "grad_norm": 0.6878862380981445, "learning_rate": 0.0002, "epoch": 1.5439856373429084, "step": 21500}, {"loss": 0.7634, "grad_norm": 0.7391727566719055, "learning_rate": 0.0002, "epoch": 1.5447037701974864, "step": 21510}, {"loss": 0.7781, "grad_norm": 0.844994843006134, "learning_rate": 0.0002, "epoch": 1.5454219030520646, "step": 21520}, {"loss": 0.7052, "grad_norm": 0.7852550148963928, "learning_rate": 0.0002, "epoch": 1.5461400359066428, "step": 21530}, {"loss": 0.7364, "grad_norm": 0.8370407223701477, "learning_rate": 0.0002, "epoch": 1.5468581687612208, "step": 21540}, {"loss": 0.7266, "grad_norm": 0.7138169407844543, "learning_rate": 0.0002, "epoch": 1.5475763016157988, "step": 21550}, {"loss": 0.7078, "grad_norm": 0.7660839557647705, "learning_rate": 0.0002, "epoch": 1.548294434470377, "step": 21560}, {"loss": 0.7056, "grad_norm": 0.6628666520118713, "learning_rate": 0.0002, "epoch": 1.549012567324955, "step": 21570}, {"loss": 0.7384, "grad_norm": 0.602262020111084, "learning_rate": 0.0002, "epoch": 1.5497307001795333, "step": 21580}, {"loss": 0.7258, "grad_norm": 0.6120333671569824, "learning_rate": 0.0002, "epoch": 1.5504488330341113, "step": 21590}, {"loss": 0.8094, "grad_norm": 0.6742582321166992, "learning_rate": 0.0002, "epoch": 1.5511669658886893, "step": 21600}, {"loss": 0.6807, "grad_norm": 0.6788192391395569, "learning_rate": 0.0002, "epoch": 1.5518850987432675, "step": 21610}, {"loss": 0.6969, "grad_norm": 0.7124713659286499, "learning_rate": 0.0002, "epoch": 1.5526032315978457, "step": 21620}, {"loss": 0.7296, "grad_norm": 0.6297248005867004, "learning_rate": 0.0002, "epoch": 1.5533213644524237, "step": 21630}, {"loss": 0.7466, "grad_norm": 0.8977078199386597, "learning_rate": 0.0002, "epoch": 1.5540394973070017, "step": 21640}, {"loss": 0.7376, "grad_norm": 0.7543209791183472, "learning_rate": 0.0002, "epoch": 1.5547576301615798, "step": 21650}, {"loss": 0.749, "grad_norm": 0.8704302310943604, "learning_rate": 0.0002, "epoch": 1.555475763016158, "step": 21660}, {"loss": 0.7801, "grad_norm": 0.7848012447357178, "learning_rate": 0.0002, "epoch": 1.5561938958707362, "step": 21670}, {"loss": 0.7062, "grad_norm": 0.7496278285980225, "learning_rate": 0.0002, "epoch": 1.5569120287253142, "step": 21680}, {"loss": 0.7503, "grad_norm": 0.7305200099945068, "learning_rate": 0.0002, "epoch": 1.5576301615798922, "step": 21690}, {"loss": 0.7429, "grad_norm": 0.6671105623245239, "learning_rate": 0.0002, "epoch": 1.5583482944344704, "step": 21700}, {"loss": 0.7293, "grad_norm": 0.8536111116409302, "learning_rate": 0.0002, "epoch": 1.5590664272890484, "step": 21710}, {"loss": 0.7169, "grad_norm": 0.7360461354255676, "learning_rate": 0.0002, "epoch": 1.5597845601436267, "step": 21720}, {"loss": 0.7314, "grad_norm": 0.6665109395980835, "learning_rate": 0.0002, "epoch": 1.5605026929982047, "step": 21730}, {"loss": 0.7262, "grad_norm": 0.5879628658294678, "learning_rate": 0.0002, "epoch": 1.5612208258527827, "step": 21740}, {"loss": 0.7099, "grad_norm": 0.6937240958213806, "learning_rate": 0.0002, "epoch": 1.5619389587073609, "step": 21750}, {"loss": 0.7669, "grad_norm": 0.7118659019470215, "learning_rate": 0.0002, "epoch": 1.562657091561939, "step": 21760}, {"loss": 0.7196, "grad_norm": 0.7858866453170776, "learning_rate": 0.0002, "epoch": 1.563375224416517, "step": 21770}, {"loss": 0.7552, "grad_norm": 0.8691372871398926, "learning_rate": 0.0002, "epoch": 1.564093357271095, "step": 21780}, {"loss": 0.7684, "grad_norm": 0.8884942531585693, "learning_rate": 0.0002, "epoch": 1.564811490125673, "step": 21790}, {"loss": 0.7128, "grad_norm": 0.6335656046867371, "learning_rate": 0.0002, "epoch": 1.5655296229802513, "step": 21800}, {"loss": 0.7233, "grad_norm": 0.8666166067123413, "learning_rate": 0.0002, "epoch": 1.5662477558348296, "step": 21810}, {"loss": 0.6771, "grad_norm": 0.7961624264717102, "learning_rate": 0.0002, "epoch": 1.5669658886894076, "step": 21820}, {"loss": 0.7286, "grad_norm": 0.6331174373626709, "learning_rate": 0.0002, "epoch": 1.5676840215439856, "step": 21830}, {"loss": 0.7273, "grad_norm": 0.6476998925209045, "learning_rate": 0.0002, "epoch": 1.5684021543985638, "step": 21840}, {"loss": 0.7507, "grad_norm": 0.8279129266738892, "learning_rate": 0.0002, "epoch": 1.5691202872531418, "step": 21850}, {"loss": 0.7219, "grad_norm": 0.6997109651565552, "learning_rate": 0.0002, "epoch": 1.56983842010772, "step": 21860}, {"loss": 0.7424, "grad_norm": 0.6992211937904358, "learning_rate": 0.0002, "epoch": 1.570556552962298, "step": 21870}, {"loss": 0.7275, "grad_norm": 0.7766915559768677, "learning_rate": 0.0002, "epoch": 1.571274685816876, "step": 21880}, {"loss": 0.7651, "grad_norm": 0.6845845580101013, "learning_rate": 0.0002, "epoch": 1.5719928186714542, "step": 21890}, {"loss": 0.706, "grad_norm": 0.7247874140739441, "learning_rate": 0.0002, "epoch": 1.5727109515260325, "step": 21900}, {"loss": 0.7812, "grad_norm": 0.802342414855957, "learning_rate": 0.0002, "epoch": 1.5734290843806105, "step": 21910}, {"loss": 0.7028, "grad_norm": 0.7797709107398987, "learning_rate": 0.0002, "epoch": 1.5741472172351885, "step": 21920}, {"loss": 0.7466, "grad_norm": 0.6534958481788635, "learning_rate": 0.0002, "epoch": 1.5748653500897665, "step": 21930}, {"loss": 0.7148, "grad_norm": 0.6003528237342834, "learning_rate": 0.0002, "epoch": 1.5755834829443447, "step": 21940}, {"loss": 0.7282, "grad_norm": 0.6920075416564941, "learning_rate": 0.0002, "epoch": 1.576301615798923, "step": 21950}, {"loss": 0.6533, "grad_norm": 0.7213456034660339, "learning_rate": 0.0002, "epoch": 1.577019748653501, "step": 21960}, {"loss": 0.6875, "grad_norm": 0.7101914286613464, "learning_rate": 0.0002, "epoch": 1.577737881508079, "step": 21970}, {"loss": 0.7421, "grad_norm": 0.9531592130661011, "learning_rate": 0.0002, "epoch": 1.5784560143626571, "step": 21980}, {"loss": 0.7454, "grad_norm": 0.7690590023994446, "learning_rate": 0.0002, "epoch": 1.5791741472172351, "step": 21990}, {"loss": 0.7135, "grad_norm": 0.8226363062858582, "learning_rate": 0.0002, "epoch": 1.5798922800718134, "step": 22000}, {"loss": 0.7518, "grad_norm": 0.6128851175308228, "learning_rate": 0.0002, "epoch": 1.5806104129263914, "step": 22010}, {"loss": 0.7253, "grad_norm": 0.827008068561554, "learning_rate": 0.0002, "epoch": 1.5813285457809694, "step": 22020}, {"loss": 0.7176, "grad_norm": 0.6729007363319397, "learning_rate": 0.0002, "epoch": 1.5820466786355476, "step": 22030}, {"loss": 0.7503, "grad_norm": 0.6397014260292053, "learning_rate": 0.0002, "epoch": 1.5827648114901258, "step": 22040}, {"loss": 0.7531, "grad_norm": 0.6927793622016907, "learning_rate": 0.0002, "epoch": 1.5834829443447038, "step": 22050}, {"loss": 0.7499, "grad_norm": 0.7527112364768982, "learning_rate": 0.0002, "epoch": 1.5842010771992818, "step": 22060}, {"loss": 0.739, "grad_norm": 0.6418012380599976, "learning_rate": 0.0002, "epoch": 1.5849192100538598, "step": 22070}, {"loss": 0.727, "grad_norm": 0.7627281546592712, "learning_rate": 0.0002, "epoch": 1.585637342908438, "step": 22080}, {"loss": 0.7115, "grad_norm": 0.753851592540741, "learning_rate": 0.0002, "epoch": 1.5863554757630163, "step": 22090}, {"loss": 0.7677, "grad_norm": 0.6049349904060364, "learning_rate": 0.0002, "epoch": 1.5870736086175943, "step": 22100}, {"loss": 0.7494, "grad_norm": 0.6677758693695068, "learning_rate": 0.0002, "epoch": 1.5877917414721723, "step": 22110}, {"loss": 0.7259, "grad_norm": 0.913489818572998, "learning_rate": 0.0002, "epoch": 1.5885098743267505, "step": 22120}, {"loss": 0.7823, "grad_norm": 0.6779162883758545, "learning_rate": 0.0002, "epoch": 1.5892280071813285, "step": 22130}, {"loss": 0.7674, "grad_norm": 0.910076916217804, "learning_rate": 0.0002, "epoch": 1.5899461400359067, "step": 22140}, {"loss": 0.7162, "grad_norm": 0.9506068229675293, "learning_rate": 0.0002, "epoch": 1.5906642728904847, "step": 22150}, {"loss": 0.7343, "grad_norm": 0.6552460789680481, "learning_rate": 0.0002, "epoch": 1.5913824057450627, "step": 22160}, {"loss": 0.7488, "grad_norm": 0.6855819821357727, "learning_rate": 0.0002, "epoch": 1.592100538599641, "step": 22170}, {"loss": 0.6785, "grad_norm": 0.6713384985923767, "learning_rate": 0.0002, "epoch": 1.5928186714542192, "step": 22180}, {"loss": 0.7287, "grad_norm": 0.7168547511100769, "learning_rate": 0.0002, "epoch": 1.5935368043087972, "step": 22190}, {"loss": 0.7259, "grad_norm": 0.8395482897758484, "learning_rate": 0.0002, "epoch": 1.5942549371633752, "step": 22200}, {"loss": 0.6995, "grad_norm": 0.6676998138427734, "learning_rate": 0.0002, "epoch": 1.5949730700179532, "step": 22210}, {"loss": 0.7152, "grad_norm": 0.5837140083312988, "learning_rate": 0.0002, "epoch": 1.5956912028725314, "step": 22220}, {"loss": 0.7464, "grad_norm": 0.8399306535720825, "learning_rate": 0.0002, "epoch": 1.5964093357271096, "step": 22230}, {"loss": 0.7053, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 1.5971274685816876, "step": 22240}, {"loss": 0.784, "grad_norm": 0.768604040145874, "learning_rate": 0.0002, "epoch": 1.5978456014362656, "step": 22250}, {"loss": 0.6946, "grad_norm": 0.6382646560668945, "learning_rate": 0.0002, "epoch": 1.5985637342908436, "step": 22260}, {"loss": 0.7035, "grad_norm": 0.7244897484779358, "learning_rate": 0.0002, "epoch": 1.5992818671454219, "step": 22270}, {"loss": 0.7168, "grad_norm": 0.6250987648963928, "learning_rate": 0.0002, "epoch": 1.6, "step": 22280}, {"loss": 0.7182, "grad_norm": 0.8731992244720459, "learning_rate": 0.0002, "epoch": 1.600718132854578, "step": 22290}, {"loss": 0.6866, "grad_norm": 0.5861822962760925, "learning_rate": 0.0002, "epoch": 1.601436265709156, "step": 22300}, {"loss": 0.6909, "grad_norm": 0.716805100440979, "learning_rate": 0.0002, "epoch": 1.6021543985637343, "step": 22310}, {"loss": 0.7377, "grad_norm": 0.6650034189224243, "learning_rate": 0.0002, "epoch": 1.6028725314183125, "step": 22320}, {"loss": 0.7107, "grad_norm": 0.6944432854652405, "learning_rate": 0.0002, "epoch": 1.6035906642728905, "step": 22330}, {"loss": 0.682, "grad_norm": 0.7411999106407166, "learning_rate": 0.0002, "epoch": 1.6043087971274685, "step": 22340}, {"loss": 0.7294, "grad_norm": 0.831828773021698, "learning_rate": 0.0002, "epoch": 1.6050269299820465, "step": 22350}, {"loss": 0.7305, "grad_norm": 0.6252152919769287, "learning_rate": 0.0002, "epoch": 1.6057450628366248, "step": 22360}, {"loss": 0.7479, "grad_norm": 0.8643325567245483, "learning_rate": 0.0002, "epoch": 1.606463195691203, "step": 22370}, {"loss": 0.7417, "grad_norm": 0.7330279350280762, "learning_rate": 0.0002, "epoch": 1.607181328545781, "step": 22380}, {"loss": 0.7198, "grad_norm": 0.7235422730445862, "learning_rate": 0.0002, "epoch": 1.607899461400359, "step": 22390}, {"loss": 0.7638, "grad_norm": 0.6940887570381165, "learning_rate": 0.0002, "epoch": 1.608617594254937, "step": 22400}, {"loss": 0.714, "grad_norm": 0.7907325625419617, "learning_rate": 0.0002, "epoch": 1.6093357271095152, "step": 22410}, {"loss": 0.7824, "grad_norm": 0.6899075508117676, "learning_rate": 0.0002, "epoch": 1.6100538599640934, "step": 22420}, {"loss": 0.7502, "grad_norm": 0.7057487368583679, "learning_rate": 0.0002, "epoch": 1.6107719928186714, "step": 22430}, {"loss": 0.7437, "grad_norm": 0.9235003590583801, "learning_rate": 0.0002, "epoch": 1.6114901256732495, "step": 22440}, {"loss": 0.7115, "grad_norm": 0.7238173484802246, "learning_rate": 0.0002, "epoch": 1.6122082585278277, "step": 22450}, {"loss": 0.7628, "grad_norm": 0.5931997299194336, "learning_rate": 0.0002, "epoch": 1.612926391382406, "step": 22460}, {"loss": 0.6663, "grad_norm": 0.6705866456031799, "learning_rate": 0.0002, "epoch": 1.613644524236984, "step": 22470}, {"loss": 0.749, "grad_norm": 0.7392773032188416, "learning_rate": 0.0002, "epoch": 1.614362657091562, "step": 22480}, {"loss": 0.7292, "grad_norm": 0.6286543607711792, "learning_rate": 0.0002, "epoch": 1.61508078994614, "step": 22490}, {"loss": 0.7264, "grad_norm": 0.7467446327209473, "learning_rate": 0.0002, "epoch": 1.6157989228007181, "step": 22500}, {"loss": 0.732, "grad_norm": 0.8353021740913391, "learning_rate": 0.0002, "epoch": 1.6165170556552964, "step": 22510}, {"loss": 0.7626, "grad_norm": 0.7333045601844788, "learning_rate": 0.0002, "epoch": 1.6172351885098744, "step": 22520}, {"loss": 0.7567, "grad_norm": 0.6203709244728088, "learning_rate": 0.0002, "epoch": 1.6179533213644524, "step": 22530}, {"loss": 0.7478, "grad_norm": 0.5585690140724182, "learning_rate": 0.0002, "epoch": 1.6186714542190304, "step": 22540}, {"loss": 0.669, "grad_norm": 0.7157222032546997, "learning_rate": 0.0002, "epoch": 1.6193895870736086, "step": 22550}, {"loss": 0.7224, "grad_norm": 0.8129993677139282, "learning_rate": 0.0002, "epoch": 1.6201077199281868, "step": 22560}, {"loss": 0.7374, "grad_norm": 0.6745335459709167, "learning_rate": 0.0002, "epoch": 1.6208258527827648, "step": 22570}, {"loss": 0.7276, "grad_norm": 0.7684996724128723, "learning_rate": 0.0002, "epoch": 1.6215439856373428, "step": 22580}, {"loss": 0.7479, "grad_norm": 0.6735436916351318, "learning_rate": 0.0002, "epoch": 1.622262118491921, "step": 22590}, {"loss": 0.6596, "grad_norm": 0.7394272089004517, "learning_rate": 0.0002, "epoch": 1.6229802513464993, "step": 22600}, {"loss": 0.7382, "grad_norm": 0.7268046140670776, "learning_rate": 0.0002, "epoch": 1.6236983842010773, "step": 22610}, {"loss": 0.7619, "grad_norm": 0.8338810205459595, "learning_rate": 0.0002, "epoch": 1.6244165170556553, "step": 22620}, {"loss": 0.7247, "grad_norm": 0.9293080568313599, "learning_rate": 0.0002, "epoch": 1.6251346499102333, "step": 22630}, {"loss": 0.7601, "grad_norm": 0.8084996938705444, "learning_rate": 0.0002, "epoch": 1.6258527827648115, "step": 22640}, {"loss": 0.7053, "grad_norm": 0.6605180501937866, "learning_rate": 0.0002, "epoch": 1.6265709156193897, "step": 22650}, {"loss": 0.7489, "grad_norm": 0.8402717113494873, "learning_rate": 0.0002, "epoch": 1.6272890484739677, "step": 22660}, {"loss": 0.7468, "grad_norm": 0.653055727481842, "learning_rate": 0.0002, "epoch": 1.6280071813285457, "step": 22670}, {"loss": 0.7179, "grad_norm": 0.6477823257446289, "learning_rate": 0.0002, "epoch": 1.6287253141831237, "step": 22680}, {"loss": 0.7216, "grad_norm": 0.9053590893745422, "learning_rate": 0.0002, "epoch": 1.629443447037702, "step": 22690}, {"loss": 0.7257, "grad_norm": 0.90384441614151, "learning_rate": 0.0002, "epoch": 1.6301615798922802, "step": 22700}, {"loss": 0.7703, "grad_norm": 0.6789469122886658, "learning_rate": 0.0002, "epoch": 1.6308797127468582, "step": 22710}, {"loss": 0.7706, "grad_norm": 0.7221854329109192, "learning_rate": 0.0002, "epoch": 1.6315978456014362, "step": 22720}, {"loss": 0.7457, "grad_norm": 0.7724022269248962, "learning_rate": 0.0002, "epoch": 1.6323159784560144, "step": 22730}, {"loss": 0.7864, "grad_norm": 0.8213715553283691, "learning_rate": 0.0002, "epoch": 1.6330341113105926, "step": 22740}, {"loss": 0.7356, "grad_norm": 0.7102876305580139, "learning_rate": 0.0002, "epoch": 1.6337522441651706, "step": 22750}, {"loss": 0.7208, "grad_norm": 0.8817880749702454, "learning_rate": 0.0002, "epoch": 1.6344703770197486, "step": 22760}, {"loss": 0.7722, "grad_norm": 0.8446506857872009, "learning_rate": 0.0002, "epoch": 1.6351885098743266, "step": 22770}, {"loss": 0.7341, "grad_norm": 0.6749029755592346, "learning_rate": 0.0002, "epoch": 1.6359066427289048, "step": 22780}, {"loss": 0.7599, "grad_norm": 0.7013556957244873, "learning_rate": 0.0002, "epoch": 1.636624775583483, "step": 22790}, {"loss": 0.7488, "grad_norm": 0.7767965793609619, "learning_rate": 0.0002, "epoch": 1.637342908438061, "step": 22800}, {"loss": 0.7387, "grad_norm": 0.7354073524475098, "learning_rate": 0.0002, "epoch": 1.638061041292639, "step": 22810}, {"loss": 0.7816, "grad_norm": 0.8871088027954102, "learning_rate": 0.0002, "epoch": 1.638779174147217, "step": 22820}, {"loss": 0.7243, "grad_norm": 0.6573871374130249, "learning_rate": 0.0002, "epoch": 1.6394973070017953, "step": 22830}, {"loss": 0.7812, "grad_norm": 0.5679349303245544, "learning_rate": 0.0002, "epoch": 1.6402154398563735, "step": 22840}, {"loss": 0.7402, "grad_norm": 0.7072559595108032, "learning_rate": 0.0002, "epoch": 1.6409335727109515, "step": 22850}, {"loss": 0.751, "grad_norm": 0.7639257311820984, "learning_rate": 0.0002, "epoch": 1.6416517055655295, "step": 22860}, {"loss": 0.7357, "grad_norm": 0.6699341535568237, "learning_rate": 0.0002, "epoch": 1.6423698384201078, "step": 22870}, {"loss": 0.7295, "grad_norm": 0.8285767436027527, "learning_rate": 0.0002, "epoch": 1.643087971274686, "step": 22880}, {"loss": 0.7267, "grad_norm": 0.7328150272369385, "learning_rate": 0.0002, "epoch": 1.643806104129264, "step": 22890}, {"loss": 0.6904, "grad_norm": 0.8122354745864868, "learning_rate": 0.0002, "epoch": 1.644524236983842, "step": 22900}, {"loss": 0.7853, "grad_norm": 0.7322969436645508, "learning_rate": 0.0002, "epoch": 1.64524236983842, "step": 22910}, {"loss": 0.7629, "grad_norm": 0.7269576191902161, "learning_rate": 0.0002, "epoch": 1.6459605026929982, "step": 22920}, {"loss": 0.728, "grad_norm": 0.7037042379379272, "learning_rate": 0.0002, "epoch": 1.6466786355475764, "step": 22930}, {"loss": 0.752, "grad_norm": 0.6960355639457703, "learning_rate": 0.0002, "epoch": 1.6473967684021544, "step": 22940}, {"loss": 0.7484, "grad_norm": 0.7446839213371277, "learning_rate": 0.0002, "epoch": 1.6481149012567324, "step": 22950}, {"loss": 0.7528, "grad_norm": 0.7201664447784424, "learning_rate": 0.0002, "epoch": 1.6488330341113104, "step": 22960}, {"loss": 0.7183, "grad_norm": 0.7062349319458008, "learning_rate": 0.0002, "epoch": 1.6495511669658887, "step": 22970}, {"loss": 0.6999, "grad_norm": 0.7666636109352112, "learning_rate": 0.0002, "epoch": 1.6502692998204669, "step": 22980}, {"loss": 0.7103, "grad_norm": 0.7872112393379211, "learning_rate": 0.0002, "epoch": 1.6509874326750449, "step": 22990}, {"loss": 0.7307, "grad_norm": 0.7428551316261292, "learning_rate": 0.0002, "epoch": 1.6517055655296229, "step": 23000}, {"loss": 0.7573, "grad_norm": 0.6087952852249146, "learning_rate": 0.0002, "epoch": 1.6524236983842011, "step": 23010}, {"loss": 0.8045, "grad_norm": 0.7191354036331177, "learning_rate": 0.0002, "epoch": 1.6531418312387793, "step": 23020}, {"loss": 0.7517, "grad_norm": 0.8679710626602173, "learning_rate": 0.0002, "epoch": 1.6538599640933573, "step": 23030}, {"loss": 0.7084, "grad_norm": 0.7232310175895691, "learning_rate": 0.0002, "epoch": 1.6545780969479353, "step": 23040}, {"loss": 0.7007, "grad_norm": 0.5695104002952576, "learning_rate": 0.0002, "epoch": 1.6552962298025133, "step": 23050}, {"loss": 0.7115, "grad_norm": 0.6363076567649841, "learning_rate": 0.0002, "epoch": 1.6560143626570916, "step": 23060}, {"loss": 0.7639, "grad_norm": 0.8168749809265137, "learning_rate": 0.0002, "epoch": 1.6567324955116698, "step": 23070}, {"loss": 0.6768, "grad_norm": 0.7664111852645874, "learning_rate": 0.0002, "epoch": 1.6574506283662478, "step": 23080}, {"loss": 0.7492, "grad_norm": 0.6748140454292297, "learning_rate": 0.0002, "epoch": 1.6581687612208258, "step": 23090}, {"loss": 0.7213, "grad_norm": 0.6258183121681213, "learning_rate": 0.0002, "epoch": 1.6588868940754038, "step": 23100}, {"loss": 0.783, "grad_norm": 0.8669735193252563, "learning_rate": 0.0002, "epoch": 1.659605026929982, "step": 23110}, {"loss": 0.6847, "grad_norm": 0.5606119632720947, "learning_rate": 0.0002, "epoch": 1.6603231597845602, "step": 23120}, {"loss": 0.6889, "grad_norm": 0.6602507829666138, "learning_rate": 0.0002, "epoch": 1.6610412926391382, "step": 23130}, {"loss": 0.7605, "grad_norm": 0.7237988710403442, "learning_rate": 0.0002, "epoch": 1.6617594254937162, "step": 23140}, {"loss": 0.7663, "grad_norm": 0.9054415225982666, "learning_rate": 0.0002, "epoch": 1.6624775583482945, "step": 23150}, {"loss": 0.7603, "grad_norm": 0.5186660289764404, "learning_rate": 0.0002, "epoch": 1.6631956912028727, "step": 23160}, {"loss": 0.7442, "grad_norm": 0.719584584236145, "learning_rate": 0.0002, "epoch": 1.6639138240574507, "step": 23170}, {"loss": 0.7715, "grad_norm": 0.7583617568016052, "learning_rate": 0.0002, "epoch": 1.6646319569120287, "step": 23180}, {"loss": 0.7402, "grad_norm": 0.7985982298851013, "learning_rate": 0.0002, "epoch": 1.6653500897666067, "step": 23190}, {"loss": 0.7515, "grad_norm": 0.6952691674232483, "learning_rate": 0.0002, "epoch": 1.666068222621185, "step": 23200}, {"loss": 0.7491, "grad_norm": 0.7184221744537354, "learning_rate": 0.0002, "epoch": 1.6667863554757631, "step": 23210}, {"loss": 0.7608, "grad_norm": 0.8256361484527588, "learning_rate": 0.0002, "epoch": 1.6675044883303412, "step": 23220}, {"loss": 0.7331, "grad_norm": 0.7534128427505493, "learning_rate": 0.0002, "epoch": 1.6682226211849192, "step": 23230}, {"loss": 0.7196, "grad_norm": 0.7711095213890076, "learning_rate": 0.0002, "epoch": 1.6689407540394972, "step": 23240}, {"loss": 0.7871, "grad_norm": 0.6326615810394287, "learning_rate": 0.0002, "epoch": 1.6696588868940754, "step": 23250}, {"loss": 0.7244, "grad_norm": 0.8345766663551331, "learning_rate": 0.0002, "epoch": 1.6703770197486536, "step": 23260}, {"loss": 0.7819, "grad_norm": 0.9079837203025818, "learning_rate": 0.0002, "epoch": 1.6710951526032316, "step": 23270}, {"loss": 0.7259, "grad_norm": 0.7310197353363037, "learning_rate": 0.0002, "epoch": 1.6718132854578096, "step": 23280}, {"loss": 0.7253, "grad_norm": 0.7573344707489014, "learning_rate": 0.0002, "epoch": 1.6725314183123878, "step": 23290}, {"loss": 0.6817, "grad_norm": 0.7708047032356262, "learning_rate": 0.0002, "epoch": 1.673249551166966, "step": 23300}, {"loss": 0.7247, "grad_norm": 0.7665812969207764, "learning_rate": 0.0002, "epoch": 1.673967684021544, "step": 23310}, {"loss": 0.7048, "grad_norm": 0.7988788485527039, "learning_rate": 0.0002, "epoch": 1.674685816876122, "step": 23320}, {"loss": 0.7396, "grad_norm": 0.755042552947998, "learning_rate": 0.0002, "epoch": 1.6754039497307, "step": 23330}, {"loss": 0.7392, "grad_norm": 0.6605848670005798, "learning_rate": 0.0002, "epoch": 1.6761220825852783, "step": 23340}, {"loss": 0.7394, "grad_norm": 0.8762016296386719, "learning_rate": 0.0002, "epoch": 1.6768402154398565, "step": 23350}, {"loss": 0.7661, "grad_norm": 0.604742169380188, "learning_rate": 0.0002, "epoch": 1.6775583482944345, "step": 23360}, {"loss": 0.7422, "grad_norm": 0.7479172945022583, "learning_rate": 0.0002, "epoch": 1.6782764811490125, "step": 23370}, {"loss": 0.7248, "grad_norm": 0.6418702602386475, "learning_rate": 0.0002, "epoch": 1.6789946140035905, "step": 23380}, {"loss": 0.7717, "grad_norm": 0.6783933639526367, "learning_rate": 0.0002, "epoch": 1.6797127468581687, "step": 23390}, {"loss": 0.7099, "grad_norm": 0.7036024928092957, "learning_rate": 0.0002, "epoch": 1.680430879712747, "step": 23400}, {"loss": 0.7439, "grad_norm": 0.6833266615867615, "learning_rate": 0.0002, "epoch": 1.681149012567325, "step": 23410}, {"loss": 0.753, "grad_norm": 0.8867062330245972, "learning_rate": 0.0002, "epoch": 1.681867145421903, "step": 23420}, {"loss": 0.7694, "grad_norm": 0.7825753092765808, "learning_rate": 0.0002, "epoch": 1.6825852782764812, "step": 23430}, {"loss": 0.7127, "grad_norm": 0.6396880745887756, "learning_rate": 0.0002, "epoch": 1.6833034111310592, "step": 23440}, {"loss": 0.7465, "grad_norm": 0.5723230242729187, "learning_rate": 0.0002, "epoch": 1.6840215439856374, "step": 23450}, {"loss": 0.7102, "grad_norm": 0.6949231624603271, "learning_rate": 0.0002, "epoch": 1.6847396768402154, "step": 23460}, {"loss": 0.7421, "grad_norm": 0.8290650248527527, "learning_rate": 0.0002, "epoch": 1.6854578096947934, "step": 23470}, {"loss": 0.7774, "grad_norm": 0.7765078544616699, "learning_rate": 0.0002, "epoch": 1.6861759425493716, "step": 23480}, {"loss": 0.7271, "grad_norm": 0.7084149718284607, "learning_rate": 0.0002, "epoch": 1.6868940754039499, "step": 23490}, {"loss": 0.8188, "grad_norm": 0.6916654109954834, "learning_rate": 0.0002, "epoch": 1.6876122082585279, "step": 23500}, {"loss": 0.7235, "grad_norm": 0.5615179538726807, "learning_rate": 0.0002, "epoch": 1.6883303411131059, "step": 23510}, {"loss": 0.7203, "grad_norm": 0.7996105551719666, "learning_rate": 0.0002, "epoch": 1.6890484739676839, "step": 23520}, {"loss": 0.7145, "grad_norm": 0.7010168433189392, "learning_rate": 0.0002, "epoch": 1.689766606822262, "step": 23530}, {"loss": 0.7696, "grad_norm": 0.7876442074775696, "learning_rate": 0.0002, "epoch": 1.6904847396768403, "step": 23540}, {"loss": 0.6966, "grad_norm": 0.7508043646812439, "learning_rate": 0.0002, "epoch": 1.6912028725314183, "step": 23550}, {"loss": 0.729, "grad_norm": 0.8125874400138855, "learning_rate": 0.0002, "epoch": 1.6919210053859963, "step": 23560}, {"loss": 0.774, "grad_norm": 0.711840808391571, "learning_rate": 0.0002, "epoch": 1.6926391382405745, "step": 23570}, {"loss": 0.7165, "grad_norm": 0.6540026068687439, "learning_rate": 0.0002, "epoch": 1.6933572710951525, "step": 23580}, {"loss": 0.7578, "grad_norm": 0.8376550078392029, "learning_rate": 0.0002, "epoch": 1.6940754039497308, "step": 23590}, {"loss": 0.7746, "grad_norm": 0.7075366973876953, "learning_rate": 0.0002, "epoch": 1.6947935368043088, "step": 23600}, {"loss": 0.7639, "grad_norm": 0.7522266507148743, "learning_rate": 0.0002, "epoch": 1.6955116696588868, "step": 23610}, {"loss": 0.7386, "grad_norm": 0.7572667002677917, "learning_rate": 0.0002, "epoch": 1.696229802513465, "step": 23620}, {"loss": 0.6896, "grad_norm": 0.6126907467842102, "learning_rate": 0.0002, "epoch": 1.6969479353680432, "step": 23630}, {"loss": 0.7182, "grad_norm": 0.7473152875900269, "learning_rate": 0.0002, "epoch": 1.6976660682226212, "step": 23640}, {"loss": 0.7272, "grad_norm": 0.6630390286445618, "learning_rate": 0.0002, "epoch": 1.6983842010771992, "step": 23650}, {"loss": 0.7232, "grad_norm": 0.5848073363304138, "learning_rate": 0.0002, "epoch": 1.6991023339317772, "step": 23660}, {"loss": 0.6923, "grad_norm": 0.5901942849159241, "learning_rate": 0.0002, "epoch": 1.6998204667863555, "step": 23670}, {"loss": 0.79, "grad_norm": 0.7896918058395386, "learning_rate": 0.0002, "epoch": 1.7005385996409337, "step": 23680}, {"loss": 0.77, "grad_norm": 0.705362856388092, "learning_rate": 0.0002, "epoch": 1.7012567324955117, "step": 23690}, {"loss": 0.751, "grad_norm": 0.9917470812797546, "learning_rate": 0.0002, "epoch": 1.7019748653500897, "step": 23700}, {"loss": 0.7403, "grad_norm": 0.7550538778305054, "learning_rate": 0.0002, "epoch": 1.702692998204668, "step": 23710}, {"loss": 0.7398, "grad_norm": 0.8348238468170166, "learning_rate": 0.0002, "epoch": 1.703411131059246, "step": 23720}, {"loss": 0.7799, "grad_norm": 0.5979694128036499, "learning_rate": 0.0002, "epoch": 1.7041292639138241, "step": 23730}, {"loss": 0.7035, "grad_norm": 0.7451775670051575, "learning_rate": 0.0002, "epoch": 1.7048473967684021, "step": 23740}, {"loss": 0.7237, "grad_norm": 0.7614818215370178, "learning_rate": 0.0002, "epoch": 1.7055655296229801, "step": 23750}, {"loss": 0.7636, "grad_norm": 0.5590742826461792, "learning_rate": 0.0002, "epoch": 1.7062836624775584, "step": 23760}, {"loss": 0.701, "grad_norm": 0.7039094567298889, "learning_rate": 0.0002, "epoch": 1.7070017953321366, "step": 23770}, {"loss": 0.7145, "grad_norm": 0.7963233590126038, "learning_rate": 0.0002, "epoch": 1.7077199281867146, "step": 23780}, {"loss": 0.7702, "grad_norm": 0.7214934825897217, "learning_rate": 0.0002, "epoch": 1.7084380610412926, "step": 23790}, {"loss": 0.7515, "grad_norm": 0.7310500741004944, "learning_rate": 0.0002, "epoch": 1.7091561938958706, "step": 23800}, {"loss": 0.7038, "grad_norm": 0.6653284430503845, "learning_rate": 0.0002, "epoch": 1.7098743267504488, "step": 23810}, {"loss": 0.698, "grad_norm": 0.6632702946662903, "learning_rate": 0.0002, "epoch": 1.710592459605027, "step": 23820}, {"loss": 0.7338, "grad_norm": 0.6314955949783325, "learning_rate": 0.0002, "epoch": 1.711310592459605, "step": 23830}, {"loss": 0.7511, "grad_norm": 0.73652583360672, "learning_rate": 0.0002, "epoch": 1.712028725314183, "step": 23840}, {"loss": 0.6999, "grad_norm": 0.5685144662857056, "learning_rate": 0.0002, "epoch": 1.7127468581687613, "step": 23850}, {"loss": 0.7295, "grad_norm": 0.7010223865509033, "learning_rate": 0.0002, "epoch": 1.7134649910233393, "step": 23860}, {"loss": 0.7488, "grad_norm": 0.7643879652023315, "learning_rate": 0.0002, "epoch": 1.7141831238779175, "step": 23870}, {"loss": 0.7449, "grad_norm": 0.7543165683746338, "learning_rate": 0.0002, "epoch": 1.7149012567324955, "step": 23880}, {"loss": 0.6946, "grad_norm": 0.8816508054733276, "learning_rate": 0.0002, "epoch": 1.7156193895870735, "step": 23890}, {"loss": 0.7398, "grad_norm": 0.7979614734649658, "learning_rate": 0.0002, "epoch": 1.7163375224416517, "step": 23900}, {"loss": 0.7844, "grad_norm": 0.7631057500839233, "learning_rate": 0.0002, "epoch": 1.71705565529623, "step": 23910}, {"loss": 0.7409, "grad_norm": 0.6349977254867554, "learning_rate": 0.0002, "epoch": 1.717773788150808, "step": 23920}, {"loss": 0.74, "grad_norm": 0.7464412450790405, "learning_rate": 0.0002, "epoch": 1.718491921005386, "step": 23930}, {"loss": 0.7164, "grad_norm": 0.6985567212104797, "learning_rate": 0.0002, "epoch": 1.719210053859964, "step": 23940}, {"loss": 0.7256, "grad_norm": 0.6641302704811096, "learning_rate": 0.0002, "epoch": 1.7199281867145422, "step": 23950}, {"loss": 0.7154, "grad_norm": 0.7299597263336182, "learning_rate": 0.0002, "epoch": 1.7206463195691204, "step": 23960}, {"loss": 0.7535, "grad_norm": 0.7812355756759644, "learning_rate": 0.0002, "epoch": 1.7213644524236984, "step": 23970}, {"loss": 0.7363, "grad_norm": 0.667571485042572, "learning_rate": 0.0002, "epoch": 1.7220825852782764, "step": 23980}, {"loss": 0.7427, "grad_norm": 0.8244081735610962, "learning_rate": 0.0002, "epoch": 1.7228007181328546, "step": 23990}, {"loss": 0.7191, "grad_norm": 0.6684445738792419, "learning_rate": 0.0002, "epoch": 1.7235188509874326, "step": 24000}, {"loss": 0.8042, "grad_norm": 0.7002949118614197, "learning_rate": 0.0002, "epoch": 1.7242369838420109, "step": 24010}, {"loss": 0.7134, "grad_norm": 0.6249772906303406, "learning_rate": 0.0002, "epoch": 1.7249551166965889, "step": 24020}, {"loss": 0.721, "grad_norm": 0.7279905080795288, "learning_rate": 0.0002, "epoch": 1.7256732495511669, "step": 24030}, {"loss": 0.7374, "grad_norm": 0.631148636341095, "learning_rate": 0.0002, "epoch": 1.726391382405745, "step": 24040}, {"loss": 0.697, "grad_norm": 0.7486464977264404, "learning_rate": 0.0002, "epoch": 1.7271095152603233, "step": 24050}, {"loss": 0.715, "grad_norm": 0.7494347095489502, "learning_rate": 0.0002, "epoch": 1.7278276481149013, "step": 24060}, {"loss": 0.7609, "grad_norm": 0.7821264863014221, "learning_rate": 0.0002, "epoch": 1.7285457809694793, "step": 24070}, {"loss": 0.6925, "grad_norm": 0.7211608290672302, "learning_rate": 0.0002, "epoch": 1.7292639138240573, "step": 24080}, {"loss": 0.7444, "grad_norm": 0.7028553485870361, "learning_rate": 0.0002, "epoch": 1.7299820466786355, "step": 24090}, {"loss": 0.8065, "grad_norm": 0.6189247369766235, "learning_rate": 0.0002, "epoch": 1.7307001795332138, "step": 24100}, {"loss": 0.7011, "grad_norm": 0.7339756488800049, "learning_rate": 0.0002, "epoch": 1.7314183123877918, "step": 24110}, {"loss": 0.8071, "grad_norm": 0.6700502038002014, "learning_rate": 0.0002, "epoch": 1.7321364452423698, "step": 24120}, {"loss": 0.7608, "grad_norm": 0.6139533519744873, "learning_rate": 0.0002, "epoch": 1.732854578096948, "step": 24130}, {"loss": 0.7251, "grad_norm": 0.7249825596809387, "learning_rate": 0.0002, "epoch": 1.733572710951526, "step": 24140}, {"loss": 0.6954, "grad_norm": 0.6531777381896973, "learning_rate": 0.0002, "epoch": 1.7342908438061042, "step": 24150}, {"loss": 0.7214, "grad_norm": 0.8443833589553833, "learning_rate": 0.0002, "epoch": 1.7350089766606822, "step": 24160}, {"loss": 0.75, "grad_norm": 0.7040373086929321, "learning_rate": 0.0002, "epoch": 1.7357271095152602, "step": 24170}, {"loss": 0.701, "grad_norm": 0.8647749423980713, "learning_rate": 0.0002, "epoch": 1.7364452423698384, "step": 24180}, {"loss": 0.7033, "grad_norm": 0.7297305464744568, "learning_rate": 0.0002, "epoch": 1.7371633752244167, "step": 24190}, {"loss": 0.7187, "grad_norm": 0.8191218376159668, "learning_rate": 0.0002, "epoch": 1.7378815080789947, "step": 24200}, {"loss": 0.7665, "grad_norm": 0.7315607666969299, "learning_rate": 0.0002, "epoch": 1.7385996409335727, "step": 24210}, {"loss": 0.7467, "grad_norm": 0.694486677646637, "learning_rate": 0.0002, "epoch": 1.7393177737881507, "step": 24220}, {"loss": 0.7476, "grad_norm": 0.8115953207015991, "learning_rate": 0.0002, "epoch": 1.740035906642729, "step": 24230}, {"loss": 0.7792, "grad_norm": 0.7379186153411865, "learning_rate": 0.0002, "epoch": 1.7407540394973071, "step": 24240}, {"loss": 0.7224, "grad_norm": 0.6820309162139893, "learning_rate": 0.0002, "epoch": 1.7414721723518851, "step": 24250}, {"loss": 0.7558, "grad_norm": 0.8210766911506653, "learning_rate": 0.0002, "epoch": 1.7421903052064631, "step": 24260}, {"loss": 0.7098, "grad_norm": 0.724466860294342, "learning_rate": 0.0002, "epoch": 1.7429084380610413, "step": 24270}, {"loss": 0.7343, "grad_norm": 0.8768740296363831, "learning_rate": 0.0002, "epoch": 1.7436265709156193, "step": 24280}, {"loss": 0.7041, "grad_norm": 0.6691206097602844, "learning_rate": 0.0002, "epoch": 1.7443447037701976, "step": 24290}, {"loss": 0.7526, "grad_norm": 0.6529893279075623, "learning_rate": 0.0002, "epoch": 1.7450628366247756, "step": 24300}, {"loss": 0.7638, "grad_norm": 0.904729962348938, "learning_rate": 0.0002, "epoch": 1.7457809694793536, "step": 24310}, {"loss": 0.7463, "grad_norm": 0.655235230922699, "learning_rate": 0.0002, "epoch": 1.7464991023339318, "step": 24320}, {"loss": 0.7625, "grad_norm": 0.9476361274719238, "learning_rate": 0.0002, "epoch": 1.74721723518851, "step": 24330}, {"loss": 0.688, "grad_norm": 0.55366051197052, "learning_rate": 0.0002, "epoch": 1.747935368043088, "step": 24340}, {"loss": 0.7664, "grad_norm": 0.7192568182945251, "learning_rate": 0.0002, "epoch": 1.748653500897666, "step": 24350}, {"loss": 0.7423, "grad_norm": 0.7193983793258667, "learning_rate": 0.0002, "epoch": 1.749371633752244, "step": 24360}, {"loss": 0.7463, "grad_norm": 0.753998339176178, "learning_rate": 0.0002, "epoch": 1.7500897666068223, "step": 24370}, {"loss": 0.7415, "grad_norm": 1.1058299541473389, "learning_rate": 0.0002, "epoch": 1.7508078994614005, "step": 24380}, {"loss": 0.7373, "grad_norm": 0.7213007211685181, "learning_rate": 0.0002, "epoch": 1.7515260323159785, "step": 24390}, {"loss": 0.7395, "grad_norm": 0.972494900226593, "learning_rate": 0.0002, "epoch": 1.7522441651705565, "step": 24400}, {"loss": 0.7689, "grad_norm": 0.8045306205749512, "learning_rate": 0.0002, "epoch": 1.7529622980251347, "step": 24410}, {"loss": 0.7463, "grad_norm": 0.82415372133255, "learning_rate": 0.0002, "epoch": 1.7536804308797127, "step": 24420}, {"loss": 0.7384, "grad_norm": 0.72683185338974, "learning_rate": 0.0002, "epoch": 1.754398563734291, "step": 24430}, {"loss": 0.7512, "grad_norm": 0.687907338142395, "learning_rate": 0.0002, "epoch": 1.755116696588869, "step": 24440}, {"loss": 0.7627, "grad_norm": 0.6616531610488892, "learning_rate": 0.0002, "epoch": 1.755834829443447, "step": 24450}, {"loss": 0.7425, "grad_norm": 0.7225571870803833, "learning_rate": 0.0002, "epoch": 1.7565529622980252, "step": 24460}, {"loss": 0.7584, "grad_norm": 0.7597603797912598, "learning_rate": 0.0002, "epoch": 1.7572710951526034, "step": 24470}, {"loss": 0.7076, "grad_norm": 0.7850660681724548, "learning_rate": 0.0002, "epoch": 1.7579892280071814, "step": 24480}, {"loss": 0.7294, "grad_norm": 0.9843530058860779, "learning_rate": 0.0002, "epoch": 1.7587073608617594, "step": 24490}, {"loss": 0.7237, "grad_norm": 0.7010256052017212, "learning_rate": 0.0002, "epoch": 1.7594254937163374, "step": 24500}, {"loss": 0.7143, "grad_norm": 0.5669383406639099, "learning_rate": 0.0002, "epoch": 1.7601436265709156, "step": 24510}, {"loss": 0.7511, "grad_norm": 0.7043302655220032, "learning_rate": 0.0002, "epoch": 1.7608617594254938, "step": 24520}, {"loss": 0.73, "grad_norm": 0.8000741600990295, "learning_rate": 0.0002, "epoch": 1.7615798922800718, "step": 24530}, {"loss": 0.6994, "grad_norm": 0.7084416747093201, "learning_rate": 0.0002, "epoch": 1.7622980251346498, "step": 24540}, {"loss": 0.7337, "grad_norm": 0.7290608882904053, "learning_rate": 0.0002, "epoch": 1.763016157989228, "step": 24550}, {"loss": 0.6968, "grad_norm": 0.8710007071495056, "learning_rate": 0.0002, "epoch": 1.763734290843806, "step": 24560}, {"loss": 0.7023, "grad_norm": 0.6346535682678223, "learning_rate": 0.0002, "epoch": 1.7644524236983843, "step": 24570}, {"loss": 0.684, "grad_norm": 0.8990599513053894, "learning_rate": 0.0002, "epoch": 1.7651705565529623, "step": 24580}, {"loss": 0.7222, "grad_norm": 0.7823857665061951, "learning_rate": 0.0002, "epoch": 1.7658886894075403, "step": 24590}, {"loss": 0.7392, "grad_norm": 0.6250144839286804, "learning_rate": 0.0002, "epoch": 1.7666068222621185, "step": 24600}, {"loss": 0.7159, "grad_norm": 0.715657114982605, "learning_rate": 0.0002, "epoch": 1.7673249551166967, "step": 24610}, {"loss": 0.7245, "grad_norm": 0.6254874467849731, "learning_rate": 0.0002, "epoch": 1.7680430879712747, "step": 24620}, {"loss": 0.7258, "grad_norm": 0.6873717904090881, "learning_rate": 0.0002, "epoch": 1.7687612208258527, "step": 24630}, {"loss": 0.7951, "grad_norm": 0.7273038625717163, "learning_rate": 0.0002, "epoch": 1.7694793536804307, "step": 24640}, {"loss": 0.7417, "grad_norm": 0.9079981446266174, "learning_rate": 0.0002, "epoch": 1.770197486535009, "step": 24650}, {"loss": 0.7138, "grad_norm": 0.6262510418891907, "learning_rate": 0.0002, "epoch": 1.7709156193895872, "step": 24660}, {"loss": 0.6995, "grad_norm": 0.7326231002807617, "learning_rate": 0.0002, "epoch": 1.7716337522441652, "step": 24670}, {"loss": 0.7483, "grad_norm": 0.7828301787376404, "learning_rate": 0.0002, "epoch": 1.7723518850987432, "step": 24680}, {"loss": 0.689, "grad_norm": 0.5881586670875549, "learning_rate": 0.0002, "epoch": 1.7730700179533212, "step": 24690}, {"loss": 0.744, "grad_norm": 0.7101683020591736, "learning_rate": 0.0002, "epoch": 1.7737881508078994, "step": 24700}, {"loss": 0.7145, "grad_norm": 0.8466469049453735, "learning_rate": 0.0002, "epoch": 1.7745062836624776, "step": 24710}, {"loss": 0.7428, "grad_norm": 0.7770822644233704, "learning_rate": 0.0002, "epoch": 1.7752244165170556, "step": 24720}, {"loss": 0.7299, "grad_norm": 0.7259120345115662, "learning_rate": 0.0002, "epoch": 1.7759425493716336, "step": 24730}, {"loss": 0.6909, "grad_norm": 0.7696824669837952, "learning_rate": 0.0002, "epoch": 1.7766606822262119, "step": 24740}, {"loss": 0.7659, "grad_norm": 0.7603837847709656, "learning_rate": 0.0002, "epoch": 1.77737881508079, "step": 24750}, {"loss": 0.6966, "grad_norm": 0.6166595220565796, "learning_rate": 0.0002, "epoch": 1.778096947935368, "step": 24760}, {"loss": 0.6987, "grad_norm": 0.7493758797645569, "learning_rate": 0.0002, "epoch": 1.778815080789946, "step": 24770}, {"loss": 0.6808, "grad_norm": 0.7177459597587585, "learning_rate": 0.0002, "epoch": 1.779533213644524, "step": 24780}, {"loss": 0.7411, "grad_norm": 0.6666781306266785, "learning_rate": 0.0002, "epoch": 1.7802513464991023, "step": 24790}, {"loss": 0.6867, "grad_norm": 0.6556468605995178, "learning_rate": 0.0002, "epoch": 1.7809694793536806, "step": 24800}, {"loss": 0.7375, "grad_norm": 0.6119393706321716, "learning_rate": 0.0002, "epoch": 1.7816876122082586, "step": 24810}, {"loss": 0.7059, "grad_norm": 0.8573325276374817, "learning_rate": 0.0002, "epoch": 1.7824057450628366, "step": 24820}, {"loss": 0.7708, "grad_norm": 0.8017005920410156, "learning_rate": 0.0002, "epoch": 1.7831238779174146, "step": 24830}, {"loss": 0.7041, "grad_norm": 0.7337947487831116, "learning_rate": 0.0002, "epoch": 1.7838420107719928, "step": 24840}, {"loss": 0.7325, "grad_norm": 0.6717178225517273, "learning_rate": 0.0002, "epoch": 1.784560143626571, "step": 24850}, {"loss": 0.7285, "grad_norm": 0.8243708610534668, "learning_rate": 0.0002, "epoch": 1.785278276481149, "step": 24860}, {"loss": 0.701, "grad_norm": 0.8111547827720642, "learning_rate": 0.0002, "epoch": 1.785996409335727, "step": 24870}, {"loss": 0.7105, "grad_norm": 0.8577823042869568, "learning_rate": 0.0002, "epoch": 1.7867145421903052, "step": 24880}, {"loss": 0.7419, "grad_norm": 0.6488644480705261, "learning_rate": 0.0002, "epoch": 1.7874326750448835, "step": 24890}, {"loss": 0.7112, "grad_norm": 0.6446744799613953, "learning_rate": 0.0002, "epoch": 1.7881508078994615, "step": 24900}, {"loss": 0.7531, "grad_norm": 0.6400182247161865, "learning_rate": 0.0002, "epoch": 1.7888689407540395, "step": 24910}, {"loss": 0.711, "grad_norm": 0.8059108853340149, "learning_rate": 0.0002, "epoch": 1.7895870736086175, "step": 24920}, {"loss": 0.7678, "grad_norm": 0.7101734280586243, "learning_rate": 0.0002, "epoch": 1.7903052064631957, "step": 24930}, {"loss": 0.7648, "grad_norm": 1.0397762060165405, "learning_rate": 0.0002, "epoch": 1.791023339317774, "step": 24940}, {"loss": 0.7079, "grad_norm": 0.6231128573417664, "learning_rate": 0.0002, "epoch": 1.791741472172352, "step": 24950}, {"loss": 0.7525, "grad_norm": 5.905253887176514, "learning_rate": 0.0002, "epoch": 1.79245960502693, "step": 24960}, {"loss": 0.7286, "grad_norm": 0.8003911375999451, "learning_rate": 0.0002, "epoch": 1.793177737881508, "step": 24970}, {"loss": 0.7002, "grad_norm": 0.6340393424034119, "learning_rate": 0.0002, "epoch": 1.7938958707360861, "step": 24980}, {"loss": 0.7056, "grad_norm": 0.8701013922691345, "learning_rate": 0.0002, "epoch": 1.7946140035906644, "step": 24990}, {"loss": 0.7192, "grad_norm": 0.9085575342178345, "learning_rate": 0.0002, "epoch": 1.7953321364452424, "step": 25000}, {"loss": 0.7367, "grad_norm": 0.6306625604629517, "learning_rate": 0.0002, "epoch": 1.7960502692998204, "step": 25010}, {"loss": 0.7122, "grad_norm": 0.6985056400299072, "learning_rate": 0.0002, "epoch": 1.7967684021543986, "step": 25020}, {"loss": 0.7005, "grad_norm": 0.7309113144874573, "learning_rate": 0.0002, "epoch": 1.7974865350089768, "step": 25030}, {"loss": 0.7414, "grad_norm": 0.6795042157173157, "learning_rate": 0.0002, "epoch": 1.7982046678635548, "step": 25040}, {"loss": 0.7606, "grad_norm": 0.6920178532600403, "learning_rate": 0.0002, "epoch": 1.7989228007181328, "step": 25050}, {"loss": 0.7094, "grad_norm": 0.6578564047813416, "learning_rate": 0.0002, "epoch": 1.7996409335727108, "step": 25060}, {"loss": 0.7471, "grad_norm": 0.6718358993530273, "learning_rate": 0.0002, "epoch": 1.800359066427289, "step": 25070}, {"loss": 0.7271, "grad_norm": 0.9086750149726868, "learning_rate": 0.0002, "epoch": 1.8010771992818673, "step": 25080}, {"loss": 0.7653, "grad_norm": 0.6102437973022461, "learning_rate": 0.0002, "epoch": 1.8017953321364453, "step": 25090}, {"loss": 0.7538, "grad_norm": 0.6391313076019287, "learning_rate": 0.0002, "epoch": 1.8025134649910233, "step": 25100}, {"loss": 0.766, "grad_norm": 0.7150128483772278, "learning_rate": 0.0002, "epoch": 1.8032315978456013, "step": 25110}, {"loss": 0.7036, "grad_norm": 0.9833421111106873, "learning_rate": 0.0002, "epoch": 1.8039497307001795, "step": 25120}, {"loss": 0.7122, "grad_norm": 0.774002194404602, "learning_rate": 0.0002, "epoch": 1.8046678635547577, "step": 25130}, {"loss": 0.7329, "grad_norm": 0.644443154335022, "learning_rate": 0.0002, "epoch": 1.8053859964093357, "step": 25140}, {"loss": 0.7039, "grad_norm": 0.6996100544929504, "learning_rate": 0.0002, "epoch": 1.8061041292639137, "step": 25150}, {"loss": 0.6962, "grad_norm": 0.7545985579490662, "learning_rate": 0.0002, "epoch": 1.806822262118492, "step": 25160}, {"loss": 0.7432, "grad_norm": 0.7505226731300354, "learning_rate": 0.0002, "epoch": 1.8075403949730702, "step": 25170}, {"loss": 0.7189, "grad_norm": 0.800681471824646, "learning_rate": 0.0002, "epoch": 1.8082585278276482, "step": 25180}, {"loss": 0.7131, "grad_norm": 0.8268337845802307, "learning_rate": 0.0002, "epoch": 1.8089766606822262, "step": 25190}, {"loss": 0.7933, "grad_norm": 0.6436594128608704, "learning_rate": 0.0002, "epoch": 1.8096947935368042, "step": 25200}, {"loss": 0.7478, "grad_norm": 0.6961014270782471, "learning_rate": 0.0002, "epoch": 1.8104129263913824, "step": 25210}, {"loss": 0.7519, "grad_norm": 0.6649489998817444, "learning_rate": 0.0002, "epoch": 1.8111310592459606, "step": 25220}, {"loss": 0.7307, "grad_norm": 0.7071637511253357, "learning_rate": 0.0002, "epoch": 1.8118491921005386, "step": 25230}, {"loss": 0.7074, "grad_norm": 0.9082241654396057, "learning_rate": 0.0002, "epoch": 1.8125673249551166, "step": 25240}, {"loss": 0.7406, "grad_norm": 0.6318159103393555, "learning_rate": 0.0002, "epoch": 1.8132854578096946, "step": 25250}, {"loss": 0.7081, "grad_norm": 0.8006597757339478, "learning_rate": 0.0002, "epoch": 1.8140035906642729, "step": 25260}, {"loss": 0.7593, "grad_norm": 0.7950259447097778, "learning_rate": 0.0002, "epoch": 1.814721723518851, "step": 25270}, {"loss": 0.6897, "grad_norm": 0.8376588821411133, "learning_rate": 0.0002, "epoch": 1.815439856373429, "step": 25280}, {"loss": 0.747, "grad_norm": 0.8343217968940735, "learning_rate": 0.0002, "epoch": 1.816157989228007, "step": 25290}, {"loss": 0.7611, "grad_norm": 0.6240017414093018, "learning_rate": 0.0002, "epoch": 1.8168761220825853, "step": 25300}, {"loss": 0.7458, "grad_norm": 0.7079808712005615, "learning_rate": 0.0002, "epoch": 1.8175942549371635, "step": 25310}, {"loss": 0.7254, "grad_norm": 0.5930073261260986, "learning_rate": 0.0002, "epoch": 1.8183123877917415, "step": 25320}, {"loss": 0.7647, "grad_norm": 0.6994491815567017, "learning_rate": 0.0002, "epoch": 1.8190305206463195, "step": 25330}, {"loss": 0.726, "grad_norm": 0.8285305500030518, "learning_rate": 0.0002, "epoch": 1.8197486535008975, "step": 25340}, {"loss": 0.7215, "grad_norm": 0.6880194544792175, "learning_rate": 0.0002, "epoch": 1.8204667863554758, "step": 25350}, {"loss": 0.7365, "grad_norm": 0.7301307916641235, "learning_rate": 0.0002, "epoch": 1.821184919210054, "step": 25360}, {"loss": 0.7308, "grad_norm": 0.8117532730102539, "learning_rate": 0.0002, "epoch": 1.821903052064632, "step": 25370}, {"loss": 0.7395, "grad_norm": 0.8098701238632202, "learning_rate": 0.0002, "epoch": 1.82262118491921, "step": 25380}, {"loss": 0.7082, "grad_norm": 0.6899038553237915, "learning_rate": 0.0002, "epoch": 1.823339317773788, "step": 25390}, {"loss": 0.697, "grad_norm": 0.7350431084632874, "learning_rate": 0.0002, "epoch": 1.8240574506283662, "step": 25400}, {"loss": 0.7389, "grad_norm": 0.8723382949829102, "learning_rate": 0.0002, "epoch": 1.8247755834829444, "step": 25410}, {"loss": 0.7375, "grad_norm": 0.7448108196258545, "learning_rate": 0.0002, "epoch": 1.8254937163375224, "step": 25420}, {"loss": 0.7279, "grad_norm": 0.7525040507316589, "learning_rate": 0.0002, "epoch": 1.8262118491921004, "step": 25430}, {"loss": 0.7164, "grad_norm": 0.7148599028587341, "learning_rate": 0.0002, "epoch": 1.8269299820466787, "step": 25440}, {"loss": 0.7955, "grad_norm": 1.1802153587341309, "learning_rate": 0.0002, "epoch": 1.827648114901257, "step": 25450}, {"loss": 0.7094, "grad_norm": 0.619945764541626, "learning_rate": 0.0002, "epoch": 1.828366247755835, "step": 25460}, {"loss": 0.8234, "grad_norm": 0.7065792679786682, "learning_rate": 0.0002, "epoch": 1.829084380610413, "step": 25470}, {"loss": 0.796, "grad_norm": 0.6626001596450806, "learning_rate": 0.0002, "epoch": 1.829802513464991, "step": 25480}, {"loss": 0.7402, "grad_norm": 0.8368920087814331, "learning_rate": 0.0002, "epoch": 1.8305206463195691, "step": 25490}, {"loss": 0.6513, "grad_norm": 0.7528934478759766, "learning_rate": 0.0002, "epoch": 1.8312387791741473, "step": 25500}, {"loss": 0.7272, "grad_norm": 0.6472136378288269, "learning_rate": 0.0002, "epoch": 1.8319569120287253, "step": 25510}, {"loss": 0.7221, "grad_norm": 0.7818671464920044, "learning_rate": 0.0002, "epoch": 1.8326750448833034, "step": 25520}, {"loss": 0.7582, "grad_norm": 0.8280798196792603, "learning_rate": 0.0002, "epoch": 1.8333931777378814, "step": 25530}, {"loss": 0.7079, "grad_norm": 0.7038599252700806, "learning_rate": 0.0002, "epoch": 1.8341113105924596, "step": 25540}, {"loss": 0.711, "grad_norm": 0.6345962882041931, "learning_rate": 0.0002, "epoch": 1.8348294434470378, "step": 25550}, {"loss": 0.7553, "grad_norm": 0.6891741752624512, "learning_rate": 0.0002, "epoch": 1.8355475763016158, "step": 25560}, {"loss": 0.754, "grad_norm": 0.7753492593765259, "learning_rate": 0.0002, "epoch": 1.8362657091561938, "step": 25570}, {"loss": 0.7149, "grad_norm": 0.6907210946083069, "learning_rate": 0.0002, "epoch": 1.836983842010772, "step": 25580}, {"loss": 0.705, "grad_norm": 0.7483090162277222, "learning_rate": 0.0002, "epoch": 1.8377019748653503, "step": 25590}, {"loss": 0.7716, "grad_norm": 0.8749029636383057, "learning_rate": 0.0002, "epoch": 1.8384201077199283, "step": 25600}, {"loss": 0.7745, "grad_norm": 0.6936851143836975, "learning_rate": 0.0002, "epoch": 1.8391382405745063, "step": 25610}, {"loss": 0.7297, "grad_norm": 0.7273763418197632, "learning_rate": 0.0002, "epoch": 1.8398563734290843, "step": 25620}, {"loss": 0.724, "grad_norm": 0.7655298113822937, "learning_rate": 0.0002, "epoch": 1.8405745062836625, "step": 25630}, {"loss": 0.7566, "grad_norm": 0.7207344770431519, "learning_rate": 0.0002, "epoch": 1.8412926391382407, "step": 25640}, {"loss": 0.7092, "grad_norm": 0.6970131397247314, "learning_rate": 0.0002, "epoch": 1.8420107719928187, "step": 25650}, {"loss": 0.7164, "grad_norm": 0.7777560353279114, "learning_rate": 0.0002, "epoch": 1.8427289048473967, "step": 25660}, {"loss": 0.7594, "grad_norm": 0.7070116400718689, "learning_rate": 0.0002, "epoch": 1.8434470377019747, "step": 25670}, {"loss": 0.7603, "grad_norm": 0.6980257630348206, "learning_rate": 0.0002, "epoch": 1.844165170556553, "step": 25680}, {"loss": 0.7782, "grad_norm": 0.906563401222229, "learning_rate": 0.0002, "epoch": 1.8448833034111312, "step": 25690}, {"loss": 0.7377, "grad_norm": 0.567991316318512, "learning_rate": 0.0002, "epoch": 1.8456014362657092, "step": 25700}, {"loss": 0.7236, "grad_norm": 0.5954506993293762, "learning_rate": 0.0002, "epoch": 1.8463195691202872, "step": 25710}, {"loss": 0.7287, "grad_norm": 0.8073318600654602, "learning_rate": 0.0002, "epoch": 1.8470377019748654, "step": 25720}, {"loss": 0.7627, "grad_norm": 0.7439551949501038, "learning_rate": 0.0002, "epoch": 1.8477558348294436, "step": 25730}, {"loss": 0.7719, "grad_norm": 0.8091771602630615, "learning_rate": 0.0002, "epoch": 1.8484739676840216, "step": 25740}, {"loss": 0.7477, "grad_norm": 0.6584576964378357, "learning_rate": 0.0002, "epoch": 1.8491921005385996, "step": 25750}, {"loss": 0.6988, "grad_norm": 0.8161963224411011, "learning_rate": 0.0002, "epoch": 1.8499102333931776, "step": 25760}, {"loss": 0.7607, "grad_norm": 0.7337122559547424, "learning_rate": 0.0002, "epoch": 1.8506283662477558, "step": 25770}, {"loss": 0.7279, "grad_norm": 0.8968114256858826, "learning_rate": 0.0002, "epoch": 1.851346499102334, "step": 25780}, {"loss": 0.7162, "grad_norm": 0.8647686839103699, "learning_rate": 0.0002, "epoch": 1.852064631956912, "step": 25790}, {"loss": 0.7315, "grad_norm": 0.7775349020957947, "learning_rate": 0.0002, "epoch": 1.85278276481149, "step": 25800}, {"loss": 0.7739, "grad_norm": 0.686072587966919, "learning_rate": 0.0002, "epoch": 1.853500897666068, "step": 25810}, {"loss": 0.7138, "grad_norm": 0.7053380012512207, "learning_rate": 0.0002, "epoch": 1.8542190305206463, "step": 25820}, {"loss": 0.7583, "grad_norm": 0.7899979948997498, "learning_rate": 0.0002, "epoch": 1.8549371633752245, "step": 25830}, {"loss": 0.7633, "grad_norm": 0.6970776915550232, "learning_rate": 0.0002, "epoch": 1.8556552962298025, "step": 25840}, {"loss": 0.7704, "grad_norm": 0.7210841774940491, "learning_rate": 0.0002, "epoch": 1.8563734290843805, "step": 25850}, {"loss": 0.7422, "grad_norm": 0.7297208905220032, "learning_rate": 0.0002, "epoch": 1.8570915619389587, "step": 25860}, {"loss": 0.698, "grad_norm": 0.7782729268074036, "learning_rate": 0.0002, "epoch": 1.857809694793537, "step": 25870}, {"loss": 0.7791, "grad_norm": 0.7227505445480347, "learning_rate": 0.0002, "epoch": 1.858527827648115, "step": 25880}, {"loss": 0.7899, "grad_norm": 0.7489684224128723, "learning_rate": 0.0002, "epoch": 1.859245960502693, "step": 25890}, {"loss": 0.7875, "grad_norm": 0.7447289824485779, "learning_rate": 0.0002, "epoch": 1.859964093357271, "step": 25900}, {"loss": 0.7151, "grad_norm": 0.8516317009925842, "learning_rate": 0.0002, "epoch": 1.8606822262118492, "step": 25910}, {"loss": 0.6947, "grad_norm": 0.6864543557167053, "learning_rate": 0.0002, "epoch": 1.8614003590664274, "step": 25920}, {"loss": 0.7516, "grad_norm": 0.6753451824188232, "learning_rate": 0.0002, "epoch": 1.8621184919210054, "step": 25930}, {"loss": 0.7606, "grad_norm": 0.631679117679596, "learning_rate": 0.0002, "epoch": 1.8628366247755834, "step": 25940}, {"loss": 0.7663, "grad_norm": 0.7715049982070923, "learning_rate": 0.0002, "epoch": 1.8635547576301614, "step": 25950}, {"loss": 0.6967, "grad_norm": 0.7354850769042969, "learning_rate": 0.0002, "epoch": 1.8642728904847397, "step": 25960}, {"loss": 0.7331, "grad_norm": 0.7443442940711975, "learning_rate": 0.0002, "epoch": 1.8649910233393179, "step": 25970}, {"loss": 0.7558, "grad_norm": 0.6880337595939636, "learning_rate": 0.0002, "epoch": 1.8657091561938959, "step": 25980}, {"loss": 0.752, "grad_norm": 0.843941867351532, "learning_rate": 0.0002, "epoch": 1.8664272890484739, "step": 25990}, {"loss": 0.6941, "grad_norm": 0.6904318928718567, "learning_rate": 0.0002, "epoch": 1.867145421903052, "step": 26000}, {"loss": 0.6995, "grad_norm": 0.9041751623153687, "learning_rate": 0.0002, "epoch": 1.86786355475763, "step": 26010}, {"loss": 0.7503, "grad_norm": 0.7470057010650635, "learning_rate": 0.0002, "epoch": 1.8685816876122083, "step": 26020}, {"loss": 0.775, "grad_norm": 0.6921331882476807, "learning_rate": 0.0002, "epoch": 1.8692998204667863, "step": 26030}, {"loss": 0.7376, "grad_norm": 0.7627376914024353, "learning_rate": 0.0002, "epoch": 1.8700179533213643, "step": 26040}, {"loss": 0.7459, "grad_norm": 0.7784932851791382, "learning_rate": 0.0002, "epoch": 1.8707360861759426, "step": 26050}, {"loss": 0.7479, "grad_norm": 0.6399524807929993, "learning_rate": 0.0002, "epoch": 1.8714542190305208, "step": 26060}, {"loss": 0.7128, "grad_norm": 0.6478492617607117, "learning_rate": 0.0002, "epoch": 1.8721723518850988, "step": 26070}, {"loss": 0.6901, "grad_norm": 0.6376804113388062, "learning_rate": 0.0002, "epoch": 1.8728904847396768, "step": 26080}, {"loss": 0.7037, "grad_norm": 0.6976892352104187, "learning_rate": 0.0002, "epoch": 1.8736086175942548, "step": 26090}, {"loss": 0.7071, "grad_norm": 0.7997903227806091, "learning_rate": 0.0002, "epoch": 1.874326750448833, "step": 26100}, {"loss": 0.7152, "grad_norm": 0.6984273791313171, "learning_rate": 0.0002, "epoch": 1.8750448833034112, "step": 26110}, {"loss": 0.7768, "grad_norm": 0.7020659446716309, "learning_rate": 0.0002, "epoch": 1.8757630161579892, "step": 26120}, {"loss": 0.7518, "grad_norm": 0.784986138343811, "learning_rate": 0.0002, "epoch": 1.8764811490125672, "step": 26130}, {"loss": 0.7224, "grad_norm": 0.7369210124015808, "learning_rate": 0.0002, "epoch": 1.8771992818671455, "step": 26140}, {"loss": 0.7935, "grad_norm": 0.7730622291564941, "learning_rate": 0.0002, "epoch": 1.8779174147217235, "step": 26150}, {"loss": 0.697, "grad_norm": 0.7253434658050537, "learning_rate": 0.0002, "epoch": 1.8786355475763017, "step": 26160}, {"loss": 0.6866, "grad_norm": 0.8019800186157227, "learning_rate": 0.0002, "epoch": 1.8793536804308797, "step": 26170}, {"loss": 0.7341, "grad_norm": 0.7337628602981567, "learning_rate": 0.0002, "epoch": 1.8800718132854577, "step": 26180}, {"loss": 0.752, "grad_norm": 0.7049200534820557, "learning_rate": 0.0002, "epoch": 1.880789946140036, "step": 26190}, {"loss": 0.73, "grad_norm": 0.6451525092124939, "learning_rate": 0.0002, "epoch": 1.8815080789946141, "step": 26200}, {"loss": 0.749, "grad_norm": 0.7660874724388123, "learning_rate": 0.0002, "epoch": 1.8822262118491921, "step": 26210}, {"loss": 0.7377, "grad_norm": 0.8464223146438599, "learning_rate": 0.0002, "epoch": 1.8829443447037701, "step": 26220}, {"loss": 0.7402, "grad_norm": 0.859503984451294, "learning_rate": 0.0002, "epoch": 1.8836624775583481, "step": 26230}, {"loss": 0.7057, "grad_norm": 0.6969478726387024, "learning_rate": 0.0002, "epoch": 1.8843806104129264, "step": 26240}, {"loss": 0.7338, "grad_norm": 0.6860285997390747, "learning_rate": 0.0002, "epoch": 1.8850987432675046, "step": 26250}, {"loss": 0.7397, "grad_norm": 0.5873110294342041, "learning_rate": 0.0002, "epoch": 1.8858168761220826, "step": 26260}, {"loss": 0.7208, "grad_norm": 0.6959530115127563, "learning_rate": 0.0002, "epoch": 1.8865350089766606, "step": 26270}, {"loss": 0.7156, "grad_norm": 0.8734689950942993, "learning_rate": 0.0002, "epoch": 1.8872531418312388, "step": 26280}, {"loss": 0.689, "grad_norm": 0.7385509014129639, "learning_rate": 0.0002, "epoch": 1.8879712746858168, "step": 26290}, {"loss": 0.7355, "grad_norm": 0.6702063083648682, "learning_rate": 0.0002, "epoch": 1.888689407540395, "step": 26300}, {"loss": 0.7247, "grad_norm": 0.8177255988121033, "learning_rate": 0.0002, "epoch": 1.889407540394973, "step": 26310}, {"loss": 0.7451, "grad_norm": 0.6638466715812683, "learning_rate": 0.0002, "epoch": 1.890125673249551, "step": 26320}, {"loss": 0.7176, "grad_norm": 0.8584128618240356, "learning_rate": 0.0002, "epoch": 1.8908438061041293, "step": 26330}, {"loss": 0.7216, "grad_norm": 0.677561342716217, "learning_rate": 0.0002, "epoch": 1.8915619389587075, "step": 26340}, {"loss": 0.7502, "grad_norm": 0.6931864619255066, "learning_rate": 0.0002, "epoch": 1.8922800718132855, "step": 26350}, {"loss": 0.7548, "grad_norm": 0.6583828330039978, "learning_rate": 0.0002, "epoch": 1.8929982046678635, "step": 26360}, {"loss": 0.7544, "grad_norm": 0.6708519458770752, "learning_rate": 0.0002, "epoch": 1.8937163375224415, "step": 26370}, {"loss": 0.7034, "grad_norm": 0.7684788107872009, "learning_rate": 0.0002, "epoch": 1.8944344703770197, "step": 26380}, {"loss": 0.7243, "grad_norm": 0.703217625617981, "learning_rate": 0.0002, "epoch": 1.895152603231598, "step": 26390}, {"loss": 0.7768, "grad_norm": 0.6686710119247437, "learning_rate": 0.0002, "epoch": 1.895870736086176, "step": 26400}, {"loss": 0.7999, "grad_norm": 0.7429705262184143, "learning_rate": 0.0002, "epoch": 1.896588868940754, "step": 26410}, {"loss": 0.7695, "grad_norm": 0.7835305333137512, "learning_rate": 0.0002, "epoch": 1.8973070017953322, "step": 26420}, {"loss": 0.722, "grad_norm": 0.7793689370155334, "learning_rate": 0.0002, "epoch": 1.8980251346499102, "step": 26430}, {"loss": 0.7872, "grad_norm": 0.7337237000465393, "learning_rate": 0.0002, "epoch": 1.8987432675044884, "step": 26440}, {"loss": 0.7092, "grad_norm": 0.5734546780586243, "learning_rate": 0.0002, "epoch": 1.8994614003590664, "step": 26450}, {"loss": 0.7738, "grad_norm": 0.655937135219574, "learning_rate": 0.0002, "epoch": 1.9001795332136444, "step": 26460}, {"loss": 0.7302, "grad_norm": 1.0200905799865723, "learning_rate": 0.0002, "epoch": 1.9008976660682226, "step": 26470}, {"loss": 0.733, "grad_norm": 0.6118829250335693, "learning_rate": 0.0002, "epoch": 1.9016157989228009, "step": 26480}, {"loss": 0.7255, "grad_norm": 0.7459297776222229, "learning_rate": 0.0002, "epoch": 1.9023339317773789, "step": 26490}, {"loss": 0.7257, "grad_norm": 0.9451959729194641, "learning_rate": 0.0002, "epoch": 1.9030520646319569, "step": 26500}, {"loss": 0.7911, "grad_norm": 0.9694880247116089, "learning_rate": 0.0002, "epoch": 1.9037701974865349, "step": 26510}, {"loss": 0.7913, "grad_norm": 0.806532084941864, "learning_rate": 0.0002, "epoch": 1.904488330341113, "step": 26520}, {"loss": 0.7375, "grad_norm": 0.7016968727111816, "learning_rate": 0.0002, "epoch": 1.9052064631956913, "step": 26530}, {"loss": 0.7128, "grad_norm": 0.7707533836364746, "learning_rate": 0.0002, "epoch": 1.9059245960502693, "step": 26540}, {"loss": 0.7225, "grad_norm": 0.716044545173645, "learning_rate": 0.0002, "epoch": 1.9066427289048473, "step": 26550}, {"loss": 0.7569, "grad_norm": 0.7904782295227051, "learning_rate": 0.0002, "epoch": 1.9073608617594255, "step": 26560}, {"loss": 0.7112, "grad_norm": 0.8557461500167847, "learning_rate": 0.0002, "epoch": 1.9080789946140035, "step": 26570}, {"loss": 0.7377, "grad_norm": 0.6807048916816711, "learning_rate": 0.0002, "epoch": 1.9087971274685818, "step": 26580}, {"loss": 0.7066, "grad_norm": 0.8374032974243164, "learning_rate": 0.0002, "epoch": 1.9095152603231598, "step": 26590}, {"loss": 0.7282, "grad_norm": 0.7936834692955017, "learning_rate": 0.0002, "epoch": 1.9102333931777378, "step": 26600}, {"loss": 0.741, "grad_norm": 0.6342210173606873, "learning_rate": 0.0002, "epoch": 1.910951526032316, "step": 26610}, {"loss": 0.7117, "grad_norm": 0.8222208023071289, "learning_rate": 0.0002, "epoch": 1.9116696588868942, "step": 26620}, {"loss": 0.6965, "grad_norm": 0.7890012860298157, "learning_rate": 0.0002, "epoch": 1.9123877917414722, "step": 26630}, {"loss": 0.7141, "grad_norm": 0.6415254473686218, "learning_rate": 0.0002, "epoch": 1.9131059245960502, "step": 26640}, {"loss": 0.7232, "grad_norm": 0.7936763763427734, "learning_rate": 0.0002, "epoch": 1.9138240574506282, "step": 26650}, {"loss": 0.7411, "grad_norm": 0.7174334526062012, "learning_rate": 0.0002, "epoch": 1.9145421903052064, "step": 26660}, {"loss": 0.715, "grad_norm": 0.6503710746765137, "learning_rate": 0.0002, "epoch": 1.9152603231597847, "step": 26670}, {"loss": 0.7629, "grad_norm": 0.7618577480316162, "learning_rate": 0.0002, "epoch": 1.9159784560143627, "step": 26680}, {"loss": 0.7581, "grad_norm": 0.7984131574630737, "learning_rate": 0.0002, "epoch": 1.9166965888689407, "step": 26690}, {"loss": 0.7126, "grad_norm": 0.6863887906074524, "learning_rate": 0.0002, "epoch": 1.917414721723519, "step": 26700}, {"loss": 0.738, "grad_norm": 0.7621138691902161, "learning_rate": 0.0002, "epoch": 1.918132854578097, "step": 26710}, {"loss": 0.7095, "grad_norm": 0.7855543494224548, "learning_rate": 0.0002, "epoch": 1.9188509874326751, "step": 26720}, {"loss": 0.7354, "grad_norm": 0.7045016288757324, "learning_rate": 0.0002, "epoch": 1.9195691202872531, "step": 26730}, {"loss": 0.7188, "grad_norm": 0.7799559235572815, "learning_rate": 0.0002, "epoch": 1.9202872531418311, "step": 26740}, {"loss": 0.7714, "grad_norm": 0.7999796271324158, "learning_rate": 0.0002, "epoch": 1.9210053859964094, "step": 26750}, {"loss": 0.6856, "grad_norm": 0.5479980111122131, "learning_rate": 0.0002, "epoch": 1.9217235188509876, "step": 26760}, {"loss": 0.7153, "grad_norm": 0.7192868590354919, "learning_rate": 0.0002, "epoch": 1.9224416517055656, "step": 26770}, {"loss": 0.7272, "grad_norm": 0.7642375826835632, "learning_rate": 0.0002, "epoch": 1.9231597845601436, "step": 26780}, {"loss": 0.6923, "grad_norm": 0.7015959620475769, "learning_rate": 0.0002, "epoch": 1.9238779174147216, "step": 26790}, {"loss": 0.8291, "grad_norm": 0.6685634851455688, "learning_rate": 0.0002, "epoch": 1.9245960502692998, "step": 26800}, {"loss": 0.7404, "grad_norm": 0.674363911151886, "learning_rate": 0.0002, "epoch": 1.925314183123878, "step": 26810}, {"loss": 0.7145, "grad_norm": 0.769318163394928, "learning_rate": 0.0002, "epoch": 1.926032315978456, "step": 26820}, {"loss": 0.7323, "grad_norm": 0.7397989630699158, "learning_rate": 0.0002, "epoch": 1.926750448833034, "step": 26830}, {"loss": 0.7399, "grad_norm": 0.7603814601898193, "learning_rate": 0.0002, "epoch": 1.9274685816876123, "step": 26840}, {"loss": 0.7147, "grad_norm": 0.5960564613342285, "learning_rate": 0.0002, "epoch": 1.9281867145421903, "step": 26850}, {"loss": 0.7292, "grad_norm": 0.8158858418464661, "learning_rate": 0.0002, "epoch": 1.9289048473967685, "step": 26860}, {"loss": 0.7609, "grad_norm": 0.7022058367729187, "learning_rate": 0.0002, "epoch": 1.9296229802513465, "step": 26870}, {"loss": 0.809, "grad_norm": 0.7249060273170471, "learning_rate": 0.0002, "epoch": 1.9303411131059245, "step": 26880}, {"loss": 0.7437, "grad_norm": 0.7613264322280884, "learning_rate": 0.0002, "epoch": 1.9310592459605027, "step": 26890}, {"loss": 0.7238, "grad_norm": 0.6857499480247498, "learning_rate": 0.0002, "epoch": 1.931777378815081, "step": 26900}, {"loss": 0.7651, "grad_norm": 0.6968346834182739, "learning_rate": 0.0002, "epoch": 1.932495511669659, "step": 26910}, {"loss": 0.6837, "grad_norm": 0.7079267501831055, "learning_rate": 0.0002, "epoch": 1.933213644524237, "step": 26920}, {"loss": 0.7482, "grad_norm": 0.6571618914604187, "learning_rate": 0.0002, "epoch": 1.933931777378815, "step": 26930}, {"loss": 0.7344, "grad_norm": 0.7460548281669617, "learning_rate": 0.0002, "epoch": 1.9346499102333932, "step": 26940}, {"loss": 0.7038, "grad_norm": 0.7954307794570923, "learning_rate": 0.0002, "epoch": 1.9353680430879714, "step": 26950}, {"loss": 0.6847, "grad_norm": 0.8696223497390747, "learning_rate": 0.0002, "epoch": 1.9360861759425494, "step": 26960}, {"loss": 0.7657, "grad_norm": 0.726004421710968, "learning_rate": 0.0002, "epoch": 1.9368043087971274, "step": 26970}, {"loss": 0.771, "grad_norm": 0.8760337829589844, "learning_rate": 0.0002, "epoch": 1.9375224416517056, "step": 26980}, {"loss": 0.6917, "grad_norm": 0.7308675646781921, "learning_rate": 0.0002, "epoch": 1.9382405745062836, "step": 26990}, {"loss": 0.7155, "grad_norm": 0.5900304317474365, "learning_rate": 0.0002, "epoch": 1.9389587073608618, "step": 27000}, {"loss": 0.6917, "grad_norm": 0.8839457631111145, "learning_rate": 0.0002, "epoch": 1.9396768402154398, "step": 27010}, {"loss": 0.7443, "grad_norm": 0.7239173650741577, "learning_rate": 0.0002, "epoch": 1.9403949730700178, "step": 27020}, {"loss": 0.7081, "grad_norm": 0.8972901701927185, "learning_rate": 0.0002, "epoch": 1.941113105924596, "step": 27030}, {"loss": 0.7422, "grad_norm": 0.7140652537345886, "learning_rate": 0.0002, "epoch": 1.9418312387791743, "step": 27040}, {"loss": 0.7679, "grad_norm": 0.7502743005752563, "learning_rate": 0.0002, "epoch": 1.9425493716337523, "step": 27050}, {"loss": 0.7311, "grad_norm": 0.6420751810073853, "learning_rate": 0.0002, "epoch": 1.9432675044883303, "step": 27060}, {"loss": 0.7403, "grad_norm": 0.6671820282936096, "learning_rate": 0.0002, "epoch": 1.9439856373429083, "step": 27070}, {"loss": 0.6919, "grad_norm": 0.6268796324729919, "learning_rate": 0.0002, "epoch": 1.9447037701974865, "step": 27080}, {"loss": 0.8154, "grad_norm": 0.6850021481513977, "learning_rate": 0.0002, "epoch": 1.9454219030520647, "step": 27090}, {"loss": 0.7179, "grad_norm": 0.6380038261413574, "learning_rate": 0.0002, "epoch": 1.9461400359066428, "step": 27100}, {"loss": 0.7638, "grad_norm": 0.5806204080581665, "learning_rate": 0.0002, "epoch": 1.9468581687612208, "step": 27110}, {"loss": 0.7032, "grad_norm": 0.8236927390098572, "learning_rate": 0.0002, "epoch": 1.947576301615799, "step": 27120}, {"loss": 0.7398, "grad_norm": 0.7915826439857483, "learning_rate": 0.0002, "epoch": 1.948294434470377, "step": 27130}, {"loss": 0.729, "grad_norm": 0.7467429041862488, "learning_rate": 0.0002, "epoch": 1.9490125673249552, "step": 27140}, {"loss": 0.7297, "grad_norm": 0.6278707981109619, "learning_rate": 0.0002, "epoch": 1.9497307001795332, "step": 27150}, {"loss": 0.7272, "grad_norm": 0.7353739142417908, "learning_rate": 0.0002, "epoch": 1.9504488330341112, "step": 27160}, {"loss": 0.6877, "grad_norm": 0.6443645358085632, "learning_rate": 0.0002, "epoch": 1.9511669658886894, "step": 27170}, {"loss": 0.7479, "grad_norm": 0.770800769329071, "learning_rate": 0.0002, "epoch": 1.9518850987432677, "step": 27180}, {"loss": 0.713, "grad_norm": 0.8982598781585693, "learning_rate": 0.0002, "epoch": 1.9526032315978457, "step": 27190}, {"loss": 0.7447, "grad_norm": 0.775017499923706, "learning_rate": 0.0002, "epoch": 1.9533213644524237, "step": 27200}, {"loss": 0.76, "grad_norm": 0.8271628618240356, "learning_rate": 0.0002, "epoch": 1.9540394973070017, "step": 27210}, {"loss": 0.7321, "grad_norm": 0.7460184693336487, "learning_rate": 0.0002, "epoch": 1.9547576301615799, "step": 27220}, {"loss": 0.6999, "grad_norm": 0.7732188105583191, "learning_rate": 0.0002, "epoch": 1.955475763016158, "step": 27230}, {"loss": 0.7135, "grad_norm": 0.7398577332496643, "learning_rate": 0.0002, "epoch": 1.956193895870736, "step": 27240}, {"loss": 0.7347, "grad_norm": 0.7132339477539062, "learning_rate": 0.0002, "epoch": 1.9569120287253141, "step": 27250}, {"loss": 0.7731, "grad_norm": 0.6718965768814087, "learning_rate": 0.0002, "epoch": 1.9576301615798921, "step": 27260}, {"loss": 0.7088, "grad_norm": 0.7914422154426575, "learning_rate": 0.0002, "epoch": 1.9583482944344703, "step": 27270}, {"loss": 0.6998, "grad_norm": 0.8314110636711121, "learning_rate": 0.0002, "epoch": 1.9590664272890486, "step": 27280}, {"loss": 0.7662, "grad_norm": 0.7810674905776978, "learning_rate": 0.0002, "epoch": 1.9597845601436266, "step": 27290}, {"loss": 0.7278, "grad_norm": 0.7691007256507874, "learning_rate": 0.0002, "epoch": 1.9605026929982046, "step": 27300}, {"loss": 0.7152, "grad_norm": 0.6753138899803162, "learning_rate": 0.0002, "epoch": 1.9612208258527828, "step": 27310}, {"loss": 0.7519, "grad_norm": 0.5881175994873047, "learning_rate": 0.0002, "epoch": 1.961938958707361, "step": 27320}, {"loss": 0.6877, "grad_norm": 0.8414133191108704, "learning_rate": 0.0002, "epoch": 1.962657091561939, "step": 27330}, {"loss": 0.7241, "grad_norm": 0.7363715171813965, "learning_rate": 0.0002, "epoch": 1.963375224416517, "step": 27340}, {"loss": 0.7153, "grad_norm": 0.6526232361793518, "learning_rate": 0.0002, "epoch": 1.964093357271095, "step": 27350}, {"loss": 0.8178, "grad_norm": 0.6821389198303223, "learning_rate": 0.0002, "epoch": 1.9648114901256732, "step": 27360}, {"loss": 0.7134, "grad_norm": 0.7306062579154968, "learning_rate": 0.0002, "epoch": 1.9655296229802515, "step": 27370}, {"loss": 0.7139, "grad_norm": 0.6458130478858948, "learning_rate": 0.0002, "epoch": 1.9662477558348295, "step": 27380}, {"loss": 0.7397, "grad_norm": 0.7243196368217468, "learning_rate": 0.0002, "epoch": 1.9669658886894075, "step": 27390}, {"loss": 0.6729, "grad_norm": 0.8062235713005066, "learning_rate": 0.0002, "epoch": 1.9676840215439855, "step": 27400}, {"loss": 0.7972, "grad_norm": 0.68441241979599, "learning_rate": 0.0002, "epoch": 1.9684021543985637, "step": 27410}, {"loss": 0.7235, "grad_norm": 0.7504498958587646, "learning_rate": 0.0002, "epoch": 1.969120287253142, "step": 27420}, {"loss": 0.7192, "grad_norm": 0.7469466328620911, "learning_rate": 0.0002, "epoch": 1.96983842010772, "step": 27430}, {"loss": 0.7556, "grad_norm": 0.7109853625297546, "learning_rate": 0.0002, "epoch": 1.970556552962298, "step": 27440}, {"loss": 0.7977, "grad_norm": 0.6964903473854065, "learning_rate": 0.0002, "epoch": 1.9712746858168761, "step": 27450}, {"loss": 0.7692, "grad_norm": 0.8224200010299683, "learning_rate": 0.0002, "epoch": 1.9719928186714544, "step": 27460}, {"loss": 0.7318, "grad_norm": 0.6195617318153381, "learning_rate": 0.0002, "epoch": 1.9727109515260324, "step": 27470}, {"loss": 0.7843, "grad_norm": 0.691511332988739, "learning_rate": 0.0002, "epoch": 1.9734290843806104, "step": 27480}, {"loss": 0.7324, "grad_norm": 0.7437900304794312, "learning_rate": 0.0002, "epoch": 1.9741472172351884, "step": 27490}, {"loss": 0.6736, "grad_norm": 0.7987960577011108, "learning_rate": 0.0002, "epoch": 1.9748653500897666, "step": 27500}, {"loss": 0.7005, "grad_norm": 0.7117776274681091, "learning_rate": 0.0002, "epoch": 1.9755834829443448, "step": 27510}, {"loss": 0.7201, "grad_norm": 0.8473866581916809, "learning_rate": 0.0002, "epoch": 1.9763016157989228, "step": 27520}, {"loss": 0.7528, "grad_norm": 0.7178242802619934, "learning_rate": 0.0002, "epoch": 1.9770197486535008, "step": 27530}, {"loss": 0.7112, "grad_norm": 0.760145902633667, "learning_rate": 0.0002, "epoch": 1.9777378815080788, "step": 27540}, {"loss": 0.8118, "grad_norm": 0.764436662197113, "learning_rate": 0.0002, "epoch": 1.978456014362657, "step": 27550}, {"loss": 0.7542, "grad_norm": 0.7245904803276062, "learning_rate": 0.0002, "epoch": 1.9791741472172353, "step": 27560}, {"loss": 0.7316, "grad_norm": 0.6317000389099121, "learning_rate": 0.0002, "epoch": 1.9798922800718133, "step": 27570}, {"loss": 0.7504, "grad_norm": 0.8764704465866089, "learning_rate": 0.0002, "epoch": 1.9806104129263913, "step": 27580}, {"loss": 0.7845, "grad_norm": 0.6111825108528137, "learning_rate": 0.0002, "epoch": 1.9813285457809695, "step": 27590}, {"loss": 0.7101, "grad_norm": 0.6797714233398438, "learning_rate": 0.0002, "epoch": 1.9820466786355477, "step": 27600}, {"loss": 0.8037, "grad_norm": 0.7754142880439758, "learning_rate": 0.0002, "epoch": 1.9827648114901257, "step": 27610}, {"loss": 0.7288, "grad_norm": 0.7243061661720276, "learning_rate": 0.0002, "epoch": 1.9834829443447037, "step": 27620}, {"loss": 0.6626, "grad_norm": 0.6194812655448914, "learning_rate": 0.0002, "epoch": 1.9842010771992817, "step": 27630}, {"loss": 0.7162, "grad_norm": 0.6399638056755066, "learning_rate": 0.0002, "epoch": 1.98491921005386, "step": 27640}, {"loss": 0.764, "grad_norm": 0.7637218832969666, "learning_rate": 0.0002, "epoch": 1.9856373429084382, "step": 27650}, {"loss": 0.7386, "grad_norm": 0.9099404811859131, "learning_rate": 0.0002, "epoch": 1.9863554757630162, "step": 27660}, {"loss": 0.7579, "grad_norm": 0.6892596483230591, "learning_rate": 0.0002, "epoch": 1.9870736086175942, "step": 27670}, {"loss": 0.7802, "grad_norm": 0.5962418913841248, "learning_rate": 0.0002, "epoch": 1.9877917414721722, "step": 27680}, {"loss": 0.7162, "grad_norm": 0.5750163197517395, "learning_rate": 0.0002, "epoch": 1.9885098743267504, "step": 27690}, {"loss": 0.7553, "grad_norm": 0.6740097403526306, "learning_rate": 0.0002, "epoch": 1.9892280071813286, "step": 27700}, {"loss": 0.7444, "grad_norm": 0.6968644857406616, "learning_rate": 0.0002, "epoch": 1.9899461400359066, "step": 27710}, {"loss": 0.7383, "grad_norm": 0.6788132190704346, "learning_rate": 0.0002, "epoch": 1.9906642728904846, "step": 27720}, {"loss": 0.7249, "grad_norm": 0.8600544929504395, "learning_rate": 0.0002, "epoch": 1.9913824057450629, "step": 27730}, {"loss": 0.7133, "grad_norm": 0.6227671504020691, "learning_rate": 0.0002, "epoch": 1.992100538599641, "step": 27740}, {"loss": 0.7815, "grad_norm": 0.6611875295639038, "learning_rate": 0.0002, "epoch": 1.992818671454219, "step": 27750}, {"loss": 0.7423, "grad_norm": 0.714568018913269, "learning_rate": 0.0002, "epoch": 1.993536804308797, "step": 27760}, {"loss": 0.7297, "grad_norm": 0.6328669190406799, "learning_rate": 0.0002, "epoch": 1.994254937163375, "step": 27770}, {"loss": 0.7398, "grad_norm": 0.8673429489135742, "learning_rate": 0.0002, "epoch": 1.9949730700179533, "step": 27780}, {"loss": 0.7301, "grad_norm": 0.820620059967041, "learning_rate": 0.0002, "epoch": 1.9956912028725315, "step": 27790}, {"loss": 0.7828, "grad_norm": 0.8748094439506531, "learning_rate": 0.0002, "epoch": 1.9964093357271095, "step": 27800}, {"loss": 0.6945, "grad_norm": 0.8118113875389099, "learning_rate": 0.0002, "epoch": 1.9971274685816875, "step": 27810}, {"loss": 0.742, "grad_norm": 0.6886725425720215, "learning_rate": 0.0002, "epoch": 1.9978456014362656, "step": 27820}, {"loss": 0.7293, "grad_norm": 0.7101268768310547, "learning_rate": 0.0002, "epoch": 1.9985637342908438, "step": 27830}, {"loss": 0.7317, "grad_norm": 0.7823781967163086, "learning_rate": 0.0002, "epoch": 1.999281867145422, "step": 27840}, {"loss": 0.7711, "grad_norm": 0.8491085767745972, "learning_rate": 0.0002, "epoch": 2.0, "step": 27850}, {"eval_loss": 1.0868422985076904, "eval_runtime": 55.1699, "eval_samples_per_second": 13.286, "eval_steps_per_second": 1.668, "epoch": 2.0, "step": 27850}, {"loss": 0.6808, "grad_norm": 0.9003389477729797, "learning_rate": 0.0002, "epoch": 2.000718132854578, "step": 27860}, {"loss": 0.6379, "grad_norm": 0.8898349404335022, "learning_rate": 0.0002, "epoch": 2.001436265709156, "step": 27870}, {"loss": 0.7157, "grad_norm": 0.7525973320007324, "learning_rate": 0.0002, "epoch": 2.0021543985637344, "step": 27880}, {"loss": 0.6681, "grad_norm": 0.7821497321128845, "learning_rate": 0.0002, "epoch": 2.0028725314183125, "step": 27890}, {"loss": 0.6781, "grad_norm": 0.6334691047668457, "learning_rate": 0.0002, "epoch": 2.0035906642728905, "step": 27900}, {"loss": 0.6349, "grad_norm": 0.732991099357605, "learning_rate": 0.0002, "epoch": 2.0043087971274685, "step": 27910}, {"loss": 0.6776, "grad_norm": 0.949942946434021, "learning_rate": 0.0002, "epoch": 2.0050269299820465, "step": 27920}, {"loss": 0.735, "grad_norm": 0.657267689704895, "learning_rate": 0.0002, "epoch": 2.005745062836625, "step": 27930}, {"loss": 0.7123, "grad_norm": 0.8329252004623413, "learning_rate": 0.0002, "epoch": 2.006463195691203, "step": 27940}, {"loss": 0.6826, "grad_norm": 0.7816959023475647, "learning_rate": 0.0002, "epoch": 2.007181328545781, "step": 27950}, {"loss": 0.6511, "grad_norm": 0.7546323537826538, "learning_rate": 0.0002, "epoch": 2.007899461400359, "step": 27960}, {"loss": 0.6222, "grad_norm": 0.9519657492637634, "learning_rate": 0.0002, "epoch": 2.0086175942549374, "step": 27970}, {"loss": 0.6642, "grad_norm": 0.7934315800666809, "learning_rate": 0.0002, "epoch": 2.0093357271095154, "step": 27980}, {"loss": 0.666, "grad_norm": 0.9579764604568481, "learning_rate": 0.0002, "epoch": 2.0100538599640934, "step": 27990}, {"loss": 0.6376, "grad_norm": 0.764167070388794, "learning_rate": 0.0002, "epoch": 2.0107719928186714, "step": 28000}, {"loss": 0.6512, "grad_norm": 0.7380000948905945, "learning_rate": 0.0002, "epoch": 2.0114901256732494, "step": 28010}, {"loss": 0.6893, "grad_norm": 0.7220044732093811, "learning_rate": 0.0002, "epoch": 2.012208258527828, "step": 28020}, {"loss": 0.6168, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 2.012926391382406, "step": 28030}, {"loss": 0.6595, "grad_norm": 0.7507190704345703, "learning_rate": 0.0002, "epoch": 2.013644524236984, "step": 28040}, {"loss": 0.6974, "grad_norm": 0.9488387703895569, "learning_rate": 0.0002, "epoch": 2.014362657091562, "step": 28050}, {"loss": 0.6489, "grad_norm": 0.9092940092086792, "learning_rate": 0.0002, "epoch": 2.01508078994614, "step": 28060}, {"loss": 0.6545, "grad_norm": 0.7859629392623901, "learning_rate": 0.0002, "epoch": 2.0157989228007183, "step": 28070}, {"loss": 0.6552, "grad_norm": 0.7636393904685974, "learning_rate": 0.0002, "epoch": 2.0165170556552963, "step": 28080}, {"loss": 0.696, "grad_norm": 0.8860714435577393, "learning_rate": 0.0002, "epoch": 2.0172351885098743, "step": 28090}, {"loss": 0.6368, "grad_norm": 0.6837195158004761, "learning_rate": 0.0002, "epoch": 2.0179533213644523, "step": 28100}, {"loss": 0.6405, "grad_norm": 0.7778242826461792, "learning_rate": 0.0002, "epoch": 2.0186714542190307, "step": 28110}, {"loss": 0.6417, "grad_norm": 0.7164766788482666, "learning_rate": 0.0002, "epoch": 2.0193895870736087, "step": 28120}, {"loss": 0.6684, "grad_norm": 0.8965572118759155, "learning_rate": 0.0002, "epoch": 2.0201077199281867, "step": 28130}, {"loss": 0.6213, "grad_norm": 0.8074374794960022, "learning_rate": 0.0002, "epoch": 2.0208258527827647, "step": 28140}, {"loss": 0.6563, "grad_norm": 0.8307222127914429, "learning_rate": 0.0002, "epoch": 2.0215439856373427, "step": 28150}, {"loss": 0.6617, "grad_norm": 0.9600032567977905, "learning_rate": 0.0002, "epoch": 2.022262118491921, "step": 28160}, {"loss": 0.6722, "grad_norm": 0.8541040420532227, "learning_rate": 0.0002, "epoch": 2.022980251346499, "step": 28170}, {"loss": 0.6803, "grad_norm": 0.8864985704421997, "learning_rate": 0.0002, "epoch": 2.023698384201077, "step": 28180}, {"loss": 0.6516, "grad_norm": 0.7926326990127563, "learning_rate": 0.0002, "epoch": 2.024416517055655, "step": 28190}, {"loss": 0.6595, "grad_norm": 1.0548077821731567, "learning_rate": 0.0002, "epoch": 2.025134649910233, "step": 28200}, {"loss": 0.6859, "grad_norm": 0.7468827366828918, "learning_rate": 0.0002, "epoch": 2.0258527827648116, "step": 28210}, {"loss": 0.6605, "grad_norm": 0.7683286070823669, "learning_rate": 0.0002, "epoch": 2.0265709156193896, "step": 28220}, {"loss": 0.6656, "grad_norm": 0.7307319641113281, "learning_rate": 0.0002, "epoch": 2.0272890484739676, "step": 28230}, {"loss": 0.7148, "grad_norm": 0.7813416719436646, "learning_rate": 0.0002, "epoch": 2.0280071813285456, "step": 28240}, {"loss": 0.6882, "grad_norm": 0.7954556941986084, "learning_rate": 0.0002, "epoch": 2.028725314183124, "step": 28250}, {"loss": 0.6192, "grad_norm": 0.8836418986320496, "learning_rate": 0.0002, "epoch": 2.029443447037702, "step": 28260}, {"loss": 0.6275, "grad_norm": 0.7092728614807129, "learning_rate": 0.0002, "epoch": 2.03016157989228, "step": 28270}, {"loss": 0.6735, "grad_norm": 0.8512285351753235, "learning_rate": 0.0002, "epoch": 2.030879712746858, "step": 28280}, {"loss": 0.6586, "grad_norm": 0.8005346059799194, "learning_rate": 0.0002, "epoch": 2.031597845601436, "step": 28290}, {"loss": 0.6129, "grad_norm": 0.8872515559196472, "learning_rate": 0.0002, "epoch": 2.0323159784560145, "step": 28300}, {"loss": 0.6935, "grad_norm": 0.7948436737060547, "learning_rate": 0.0002, "epoch": 2.0330341113105925, "step": 28310}, {"loss": 0.6831, "grad_norm": 0.7418082356452942, "learning_rate": 0.0002, "epoch": 2.0337522441651705, "step": 28320}, {"loss": 0.6922, "grad_norm": 0.9600949287414551, "learning_rate": 0.0002, "epoch": 2.0344703770197485, "step": 28330}, {"loss": 0.6015, "grad_norm": 0.9767434597015381, "learning_rate": 0.0002, "epoch": 2.0351885098743265, "step": 28340}, {"loss": 0.6637, "grad_norm": 0.7435336709022522, "learning_rate": 0.0002, "epoch": 2.035906642728905, "step": 28350}, {"loss": 0.649, "grad_norm": 0.997978925704956, "learning_rate": 0.0002, "epoch": 2.036624775583483, "step": 28360}, {"loss": 0.6957, "grad_norm": 0.9072412252426147, "learning_rate": 0.0002, "epoch": 2.037342908438061, "step": 28370}, {"loss": 0.6816, "grad_norm": 0.8396701812744141, "learning_rate": 0.0002, "epoch": 2.038061041292639, "step": 28380}, {"loss": 0.6487, "grad_norm": 1.0449832677841187, "learning_rate": 0.0002, "epoch": 2.0387791741472174, "step": 28390}, {"loss": 0.6826, "grad_norm": 0.6471025943756104, "learning_rate": 0.0002, "epoch": 2.0394973070017954, "step": 28400}, {"loss": 0.6597, "grad_norm": 0.8147950768470764, "learning_rate": 0.0002, "epoch": 2.0402154398563734, "step": 28410}, {"loss": 0.6502, "grad_norm": 0.902508020401001, "learning_rate": 0.0002, "epoch": 2.0409335727109514, "step": 28420}, {"loss": 0.6303, "grad_norm": 0.6426262855529785, "learning_rate": 0.0002, "epoch": 2.0416517055655294, "step": 28430}, {"loss": 0.6812, "grad_norm": 0.8016643524169922, "learning_rate": 0.0002, "epoch": 2.042369838420108, "step": 28440}, {"loss": 0.6535, "grad_norm": 0.6841614246368408, "learning_rate": 0.0002, "epoch": 2.043087971274686, "step": 28450}, {"loss": 0.638, "grad_norm": 0.7713631987571716, "learning_rate": 0.0002, "epoch": 2.043806104129264, "step": 28460}, {"loss": 0.6456, "grad_norm": 0.8795675039291382, "learning_rate": 0.0002, "epoch": 2.044524236983842, "step": 28470}, {"loss": 0.6858, "grad_norm": 0.725447416305542, "learning_rate": 0.0002, "epoch": 2.04524236983842, "step": 28480}, {"loss": 0.6289, "grad_norm": 0.806861162185669, "learning_rate": 0.0002, "epoch": 2.0459605026929983, "step": 28490}, {"loss": 0.6269, "grad_norm": 0.752953827381134, "learning_rate": 0.0002, "epoch": 2.0466786355475763, "step": 28500}, {"loss": 0.6818, "grad_norm": 0.7143173813819885, "learning_rate": 0.0002, "epoch": 2.0473967684021543, "step": 28510}, {"loss": 0.6606, "grad_norm": 0.9316226243972778, "learning_rate": 0.0002, "epoch": 2.0481149012567323, "step": 28520}, {"loss": 0.6284, "grad_norm": 0.7292338609695435, "learning_rate": 0.0002, "epoch": 2.048833034111311, "step": 28530}, {"loss": 0.6528, "grad_norm": 0.7392885088920593, "learning_rate": 0.0002, "epoch": 2.049551166965889, "step": 28540}, {"loss": 0.7007, "grad_norm": 0.7288873195648193, "learning_rate": 0.0002, "epoch": 2.050269299820467, "step": 28550}, {"loss": 0.6239, "grad_norm": 0.7791221141815186, "learning_rate": 0.0002, "epoch": 2.050987432675045, "step": 28560}, {"loss": 0.684, "grad_norm": 0.821983814239502, "learning_rate": 0.0002, "epoch": 2.051705565529623, "step": 28570}, {"loss": 0.6545, "grad_norm": 0.8925826549530029, "learning_rate": 0.0002, "epoch": 2.0524236983842012, "step": 28580}, {"loss": 0.719, "grad_norm": 0.7181646227836609, "learning_rate": 0.0002, "epoch": 2.0531418312387792, "step": 28590}, {"loss": 0.686, "grad_norm": 0.6387725472450256, "learning_rate": 0.0002, "epoch": 2.0538599640933572, "step": 28600}, {"loss": 0.6662, "grad_norm": 0.8398096561431885, "learning_rate": 0.0002, "epoch": 2.0545780969479353, "step": 28610}, {"loss": 0.69, "grad_norm": 1.0458195209503174, "learning_rate": 0.0002, "epoch": 2.0552962298025133, "step": 28620}, {"loss": 0.655, "grad_norm": 0.7032150626182556, "learning_rate": 0.0002, "epoch": 2.0560143626570917, "step": 28630}, {"loss": 0.6551, "grad_norm": 0.8850845098495483, "learning_rate": 0.0002, "epoch": 2.0567324955116697, "step": 28640}, {"loss": 0.6767, "grad_norm": 0.8587120175361633, "learning_rate": 0.0002, "epoch": 2.0574506283662477, "step": 28650}, {"loss": 0.6721, "grad_norm": 0.7462602853775024, "learning_rate": 0.0002, "epoch": 2.0581687612208257, "step": 28660}, {"loss": 0.6639, "grad_norm": 0.7355574369430542, "learning_rate": 0.0002, "epoch": 2.058886894075404, "step": 28670}, {"loss": 0.6216, "grad_norm": 0.9229736328125, "learning_rate": 0.0002, "epoch": 2.059605026929982, "step": 28680}, {"loss": 0.6692, "grad_norm": 0.7685085535049438, "learning_rate": 0.0002, "epoch": 2.06032315978456, "step": 28690}, {"loss": 0.6801, "grad_norm": 0.6749364137649536, "learning_rate": 0.0002, "epoch": 2.061041292639138, "step": 28700}, {"loss": 0.6721, "grad_norm": 0.7608520984649658, "learning_rate": 0.0002, "epoch": 2.061759425493716, "step": 28710}, {"loss": 0.6721, "grad_norm": 0.9451281428337097, "learning_rate": 0.0002, "epoch": 2.0624775583482946, "step": 28720}, {"loss": 0.671, "grad_norm": 0.7869735360145569, "learning_rate": 0.0002, "epoch": 2.0631956912028726, "step": 28730}, {"loss": 0.6409, "grad_norm": 0.8422008156776428, "learning_rate": 0.0002, "epoch": 2.0639138240574506, "step": 28740}, {"loss": 0.6686, "grad_norm": 0.7486162781715393, "learning_rate": 0.0002, "epoch": 2.0646319569120286, "step": 28750}, {"loss": 0.6641, "grad_norm": 0.9374173879623413, "learning_rate": 0.0002, "epoch": 2.0653500897666066, "step": 28760}, {"loss": 0.6737, "grad_norm": 0.8749295473098755, "learning_rate": 0.0002, "epoch": 2.066068222621185, "step": 28770}, {"loss": 0.636, "grad_norm": 0.8265942931175232, "learning_rate": 0.0002, "epoch": 2.066786355475763, "step": 28780}, {"loss": 0.6819, "grad_norm": 0.8541982769966125, "learning_rate": 0.0002, "epoch": 2.067504488330341, "step": 28790}, {"loss": 0.661, "grad_norm": 0.8220006227493286, "learning_rate": 0.0002, "epoch": 2.068222621184919, "step": 28800}, {"loss": 0.6942, "grad_norm": 0.7302022576332092, "learning_rate": 0.0002, "epoch": 2.0689407540394975, "step": 28810}, {"loss": 0.68, "grad_norm": 0.7073875069618225, "learning_rate": 0.0002, "epoch": 2.0696588868940755, "step": 28820}, {"loss": 0.6275, "grad_norm": 0.7792919874191284, "learning_rate": 0.0002, "epoch": 2.0703770197486535, "step": 28830}, {"loss": 0.6941, "grad_norm": 0.8268185257911682, "learning_rate": 0.0002, "epoch": 2.0710951526032315, "step": 28840}, {"loss": 0.6776, "grad_norm": 0.7576423287391663, "learning_rate": 0.0002, "epoch": 2.0718132854578095, "step": 28850}, {"loss": 0.6298, "grad_norm": 0.8255910873413086, "learning_rate": 0.0002, "epoch": 2.072531418312388, "step": 28860}, {"loss": 0.6695, "grad_norm": 0.7900934815406799, "learning_rate": 0.0002, "epoch": 2.073249551166966, "step": 28870}, {"loss": 0.6532, "grad_norm": 0.846665620803833, "learning_rate": 0.0002, "epoch": 2.073967684021544, "step": 28880}, {"loss": 0.6598, "grad_norm": 0.8159831166267395, "learning_rate": 0.0002, "epoch": 2.074685816876122, "step": 28890}, {"loss": 0.6341, "grad_norm": 0.7395941615104675, "learning_rate": 0.0002, "epoch": 2.0754039497307, "step": 28900}, {"loss": 0.6513, "grad_norm": 0.9765046238899231, "learning_rate": 0.0002, "epoch": 2.0761220825852784, "step": 28910}, {"loss": 0.6785, "grad_norm": 0.8358173966407776, "learning_rate": 0.0002, "epoch": 2.0768402154398564, "step": 28920}, {"loss": 0.6973, "grad_norm": 0.6848723292350769, "learning_rate": 0.0002, "epoch": 2.0775583482944344, "step": 28930}, {"loss": 0.6381, "grad_norm": 0.7965065836906433, "learning_rate": 0.0002, "epoch": 2.0782764811490124, "step": 28940}, {"loss": 0.667, "grad_norm": 0.7618608474731445, "learning_rate": 0.0002, "epoch": 2.078994614003591, "step": 28950}, {"loss": 0.6683, "grad_norm": 0.890615701675415, "learning_rate": 0.0002, "epoch": 2.079712746858169, "step": 28960}, {"loss": 0.6641, "grad_norm": 0.7310431003570557, "learning_rate": 0.0002, "epoch": 2.080430879712747, "step": 28970}, {"loss": 0.6511, "grad_norm": 0.8228268027305603, "learning_rate": 0.0002, "epoch": 2.081149012567325, "step": 28980}, {"loss": 0.655, "grad_norm": 0.883577287197113, "learning_rate": 0.0002, "epoch": 2.081867145421903, "step": 28990}, {"loss": 0.7232, "grad_norm": 0.8359243869781494, "learning_rate": 0.0002, "epoch": 2.0825852782764813, "step": 29000}, {"loss": 0.6744, "grad_norm": 0.8285391330718994, "learning_rate": 0.0002, "epoch": 2.0833034111310593, "step": 29010}, {"loss": 0.6951, "grad_norm": 0.8991064429283142, "learning_rate": 0.0002, "epoch": 2.0840215439856373, "step": 29020}, {"loss": 0.6444, "grad_norm": 0.6911244988441467, "learning_rate": 0.0002, "epoch": 2.0847396768402153, "step": 29030}, {"loss": 0.7098, "grad_norm": 0.8462249636650085, "learning_rate": 0.0002, "epoch": 2.0854578096947933, "step": 29040}, {"loss": 0.6813, "grad_norm": 0.9149548411369324, "learning_rate": 0.0002, "epoch": 2.0861759425493718, "step": 29050}, {"loss": 0.6948, "grad_norm": 0.7365630269050598, "learning_rate": 0.0002, "epoch": 2.0868940754039498, "step": 29060}, {"loss": 0.6391, "grad_norm": 0.8439079523086548, "learning_rate": 0.0002, "epoch": 2.087612208258528, "step": 29070}, {"loss": 0.6566, "grad_norm": 0.7123780846595764, "learning_rate": 0.0002, "epoch": 2.088330341113106, "step": 29080}, {"loss": 0.6305, "grad_norm": 0.6854261755943298, "learning_rate": 0.0002, "epoch": 2.0890484739676842, "step": 29090}, {"loss": 0.667, "grad_norm": 0.83026123046875, "learning_rate": 0.0002, "epoch": 2.0897666068222622, "step": 29100}, {"loss": 0.661, "grad_norm": 0.8413158059120178, "learning_rate": 0.0002, "epoch": 2.0904847396768402, "step": 29110}, {"loss": 0.7194, "grad_norm": 0.9646758437156677, "learning_rate": 0.0002, "epoch": 2.0912028725314182, "step": 29120}, {"loss": 0.7101, "grad_norm": 0.8421565890312195, "learning_rate": 0.0002, "epoch": 2.0919210053859962, "step": 29130}, {"loss": 0.6685, "grad_norm": 0.7748899459838867, "learning_rate": 0.0002, "epoch": 2.0926391382405747, "step": 29140}, {"loss": 0.6596, "grad_norm": 0.5973830819129944, "learning_rate": 0.0002, "epoch": 2.0933572710951527, "step": 29150}, {"loss": 0.6437, "grad_norm": 0.8440837860107422, "learning_rate": 0.0002, "epoch": 2.0940754039497307, "step": 29160}, {"loss": 0.6373, "grad_norm": 0.7392688989639282, "learning_rate": 0.0002, "epoch": 2.0947935368043087, "step": 29170}, {"loss": 0.6907, "grad_norm": 1.0522996187210083, "learning_rate": 0.0002, "epoch": 2.0955116696588867, "step": 29180}, {"loss": 0.6733, "grad_norm": 0.7330273389816284, "learning_rate": 0.0002, "epoch": 2.096229802513465, "step": 29190}, {"loss": 0.7219, "grad_norm": 1.11064875125885, "learning_rate": 0.0002, "epoch": 2.096947935368043, "step": 29200}, {"loss": 0.6125, "grad_norm": 0.795446515083313, "learning_rate": 0.0002, "epoch": 2.097666068222621, "step": 29210}, {"loss": 0.6466, "grad_norm": 0.5552594661712646, "learning_rate": 0.0002, "epoch": 2.098384201077199, "step": 29220}, {"loss": 0.6601, "grad_norm": 0.7327710390090942, "learning_rate": 0.0002, "epoch": 2.0991023339317776, "step": 29230}, {"loss": 0.656, "grad_norm": 0.7474247217178345, "learning_rate": 0.0002, "epoch": 2.0998204667863556, "step": 29240}, {"loss": 0.6707, "grad_norm": 0.7775853276252747, "learning_rate": 0.0002, "epoch": 2.1005385996409336, "step": 29250}, {"loss": 0.6623, "grad_norm": 0.769527018070221, "learning_rate": 0.0002, "epoch": 2.1012567324955116, "step": 29260}, {"loss": 0.6183, "grad_norm": 0.8350797891616821, "learning_rate": 0.0002, "epoch": 2.1019748653500896, "step": 29270}, {"loss": 0.6623, "grad_norm": 0.8749061822891235, "learning_rate": 0.0002, "epoch": 2.102692998204668, "step": 29280}, {"loss": 0.6292, "grad_norm": 0.7838778495788574, "learning_rate": 0.0002, "epoch": 2.103411131059246, "step": 29290}, {"loss": 0.699, "grad_norm": 0.8144710063934326, "learning_rate": 0.0002, "epoch": 2.104129263913824, "step": 29300}, {"loss": 0.6291, "grad_norm": 0.7965250015258789, "learning_rate": 0.0002, "epoch": 2.104847396768402, "step": 29310}, {"loss": 0.6387, "grad_norm": 0.7075945138931274, "learning_rate": 0.0002, "epoch": 2.10556552962298, "step": 29320}, {"loss": 0.6846, "grad_norm": 0.9449555277824402, "learning_rate": 0.0002, "epoch": 2.1062836624775585, "step": 29330}, {"loss": 0.6571, "grad_norm": 0.9114580750465393, "learning_rate": 0.0002, "epoch": 2.1070017953321365, "step": 29340}, {"loss": 0.6652, "grad_norm": 0.8768125176429749, "learning_rate": 0.0002, "epoch": 2.1077199281867145, "step": 29350}, {"loss": 0.7134, "grad_norm": 0.8586908578872681, "learning_rate": 0.0002, "epoch": 2.1084380610412925, "step": 29360}, {"loss": 0.6471, "grad_norm": 0.8351234793663025, "learning_rate": 0.0002, "epoch": 2.109156193895871, "step": 29370}, {"loss": 0.671, "grad_norm": 0.686488687992096, "learning_rate": 0.0002, "epoch": 2.109874326750449, "step": 29380}, {"loss": 0.6706, "grad_norm": 0.7910184264183044, "learning_rate": 0.0002, "epoch": 2.110592459605027, "step": 29390}, {"loss": 0.7367, "grad_norm": 0.7649612426757812, "learning_rate": 0.0002, "epoch": 2.111310592459605, "step": 29400}, {"loss": 0.6386, "grad_norm": 0.7790259122848511, "learning_rate": 0.0002, "epoch": 2.112028725314183, "step": 29410}, {"loss": 0.6983, "grad_norm": 0.8386351466178894, "learning_rate": 0.0002, "epoch": 2.1127468581687614, "step": 29420}, {"loss": 0.6519, "grad_norm": 0.8605695366859436, "learning_rate": 0.0002, "epoch": 2.1134649910233394, "step": 29430}, {"loss": 0.6686, "grad_norm": 0.6808947920799255, "learning_rate": 0.0002, "epoch": 2.1141831238779174, "step": 29440}, {"loss": 0.6743, "grad_norm": 0.8310001492500305, "learning_rate": 0.0002, "epoch": 2.1149012567324954, "step": 29450}, {"loss": 0.6669, "grad_norm": 1.289986252784729, "learning_rate": 0.0002, "epoch": 2.1156193895870734, "step": 29460}, {"loss": 0.6947, "grad_norm": 0.8679313659667969, "learning_rate": 0.0002, "epoch": 2.116337522441652, "step": 29470}, {"loss": 0.6954, "grad_norm": 0.9149175882339478, "learning_rate": 0.0002, "epoch": 2.11705565529623, "step": 29480}, {"loss": 0.6908, "grad_norm": 0.8405622839927673, "learning_rate": 0.0002, "epoch": 2.117773788150808, "step": 29490}, {"loss": 0.7436, "grad_norm": 0.9174691438674927, "learning_rate": 0.0002, "epoch": 2.118491921005386, "step": 29500}, {"loss": 0.6804, "grad_norm": 0.8865614533424377, "learning_rate": 0.0002, "epoch": 2.1192100538599643, "step": 29510}, {"loss": 0.6535, "grad_norm": 0.645301342010498, "learning_rate": 0.0002, "epoch": 2.1199281867145423, "step": 29520}, {"loss": 0.6879, "grad_norm": 0.7612960338592529, "learning_rate": 0.0002, "epoch": 2.1206463195691203, "step": 29530}, {"loss": 0.6874, "grad_norm": 0.7575576305389404, "learning_rate": 0.0002, "epoch": 2.1213644524236983, "step": 29540}, {"loss": 0.6924, "grad_norm": 0.8746156096458435, "learning_rate": 0.0002, "epoch": 2.1220825852782763, "step": 29550}, {"loss": 0.6659, "grad_norm": 0.8488934636116028, "learning_rate": 0.0002, "epoch": 2.1228007181328548, "step": 29560}, {"loss": 0.6568, "grad_norm": 0.8064972162246704, "learning_rate": 0.0002, "epoch": 2.1235188509874328, "step": 29570}, {"loss": 0.713, "grad_norm": 0.7410933971405029, "learning_rate": 0.0002, "epoch": 2.1242369838420108, "step": 29580}, {"loss": 0.649, "grad_norm": 0.7023535966873169, "learning_rate": 0.0002, "epoch": 2.1249551166965888, "step": 29590}, {"loss": 0.6574, "grad_norm": 0.8591743111610413, "learning_rate": 0.0002, "epoch": 2.1256732495511668, "step": 29600}, {"loss": 0.673, "grad_norm": 0.7270186543464661, "learning_rate": 0.0002, "epoch": 2.126391382405745, "step": 29610}, {"loss": 0.6262, "grad_norm": 0.9639726281166077, "learning_rate": 0.0002, "epoch": 2.127109515260323, "step": 29620}, {"loss": 0.6434, "grad_norm": 0.8519027829170227, "learning_rate": 0.0002, "epoch": 2.127827648114901, "step": 29630}, {"loss": 0.6843, "grad_norm": 0.8786447048187256, "learning_rate": 0.0002, "epoch": 2.128545780969479, "step": 29640}, {"loss": 0.6386, "grad_norm": 0.7452822923660278, "learning_rate": 0.0002, "epoch": 2.129263913824057, "step": 29650}, {"loss": 0.6577, "grad_norm": 0.9385744333267212, "learning_rate": 0.0002, "epoch": 2.1299820466786357, "step": 29660}, {"loss": 0.7088, "grad_norm": 0.7650160193443298, "learning_rate": 0.0002, "epoch": 2.1307001795332137, "step": 29670}, {"loss": 0.6742, "grad_norm": 0.7581976652145386, "learning_rate": 0.0002, "epoch": 2.1314183123877917, "step": 29680}, {"loss": 0.6358, "grad_norm": 0.8455183506011963, "learning_rate": 0.0002, "epoch": 2.1321364452423697, "step": 29690}, {"loss": 0.6288, "grad_norm": 0.7200509905815125, "learning_rate": 0.0002, "epoch": 2.132854578096948, "step": 29700}, {"loss": 0.695, "grad_norm": 0.7071877121925354, "learning_rate": 0.0002, "epoch": 2.133572710951526, "step": 29710}, {"loss": 0.6852, "grad_norm": 0.9197220802307129, "learning_rate": 0.0002, "epoch": 2.134290843806104, "step": 29720}, {"loss": 0.6578, "grad_norm": 0.6787277460098267, "learning_rate": 0.0002, "epoch": 2.135008976660682, "step": 29730}, {"loss": 0.666, "grad_norm": 0.8183788061141968, "learning_rate": 0.0002, "epoch": 2.13572710951526, "step": 29740}, {"loss": 0.6754, "grad_norm": 0.7958994507789612, "learning_rate": 0.0002, "epoch": 2.1364452423698386, "step": 29750}, {"loss": 0.6761, "grad_norm": 0.8803889155387878, "learning_rate": 0.0002, "epoch": 2.1371633752244166, "step": 29760}, {"loss": 0.686, "grad_norm": 0.6682677268981934, "learning_rate": 0.0002, "epoch": 2.1378815080789946, "step": 29770}, {"loss": 0.6878, "grad_norm": 1.0198085308074951, "learning_rate": 0.0002, "epoch": 2.1385996409335726, "step": 29780}, {"loss": 0.6576, "grad_norm": 1.0258227586746216, "learning_rate": 0.0002, "epoch": 2.139317773788151, "step": 29790}, {"loss": 0.6454, "grad_norm": 0.8920917510986328, "learning_rate": 0.0002, "epoch": 2.140035906642729, "step": 29800}, {"loss": 0.6926, "grad_norm": 0.8352635502815247, "learning_rate": 0.0002, "epoch": 2.140754039497307, "step": 29810}, {"loss": 0.692, "grad_norm": 0.8422067165374756, "learning_rate": 0.0002, "epoch": 2.141472172351885, "step": 29820}, {"loss": 0.72, "grad_norm": 0.8845202326774597, "learning_rate": 0.0002, "epoch": 2.142190305206463, "step": 29830}, {"loss": 0.688, "grad_norm": 0.659397542476654, "learning_rate": 0.0002, "epoch": 2.1429084380610415, "step": 29840}, {"loss": 0.6354, "grad_norm": 0.6233306527137756, "learning_rate": 0.0002, "epoch": 2.1436265709156195, "step": 29850}, {"loss": 0.6946, "grad_norm": 0.8951199054718018, "learning_rate": 0.0002, "epoch": 2.1443447037701975, "step": 29860}, {"loss": 0.6417, "grad_norm": 0.6980211734771729, "learning_rate": 0.0002, "epoch": 2.1450628366247755, "step": 29870}, {"loss": 0.6754, "grad_norm": 0.8463385105133057, "learning_rate": 0.0002, "epoch": 2.1457809694793535, "step": 29880}, {"loss": 0.6636, "grad_norm": 0.682183027267456, "learning_rate": 0.0002, "epoch": 2.146499102333932, "step": 29890}, {"loss": 0.6605, "grad_norm": 0.8491033911705017, "learning_rate": 0.0002, "epoch": 2.14721723518851, "step": 29900}, {"loss": 0.6851, "grad_norm": 0.8112631440162659, "learning_rate": 0.0002, "epoch": 2.147935368043088, "step": 29910}, {"loss": 0.6804, "grad_norm": 1.0186359882354736, "learning_rate": 0.0002, "epoch": 2.148653500897666, "step": 29920}, {"loss": 0.6709, "grad_norm": 0.7904929518699646, "learning_rate": 0.0002, "epoch": 2.149371633752244, "step": 29930}, {"loss": 0.6535, "grad_norm": 0.8381312489509583, "learning_rate": 0.0002, "epoch": 2.1500897666068224, "step": 29940}, {"loss": 0.6896, "grad_norm": 0.7596192359924316, "learning_rate": 0.0002, "epoch": 2.1508078994614004, "step": 29950}, {"loss": 0.6473, "grad_norm": 0.7532448768615723, "learning_rate": 0.0002, "epoch": 2.1515260323159784, "step": 29960}, {"loss": 0.7051, "grad_norm": 0.7877430319786072, "learning_rate": 0.0002, "epoch": 2.1522441651705564, "step": 29970}, {"loss": 0.6657, "grad_norm": 0.6870610117912292, "learning_rate": 0.0002, "epoch": 2.152962298025135, "step": 29980}, {"loss": 0.6518, "grad_norm": 0.7154987454414368, "learning_rate": 0.0002, "epoch": 2.153680430879713, "step": 29990}, {"loss": 0.6418, "grad_norm": 0.7692370414733887, "learning_rate": 0.0002, "epoch": 2.154398563734291, "step": 30000}, {"loss": 0.6557, "grad_norm": 0.7745859026908875, "learning_rate": 0.0002, "epoch": 2.155116696588869, "step": 30010}, {"loss": 0.61, "grad_norm": 0.718207061290741, "learning_rate": 0.0002, "epoch": 2.155834829443447, "step": 30020}, {"loss": 0.6348, "grad_norm": 0.8851615786552429, "learning_rate": 0.0002, "epoch": 2.1565529622980253, "step": 30030}, {"loss": 0.7108, "grad_norm": 0.736194372177124, "learning_rate": 0.0002, "epoch": 2.1572710951526033, "step": 30040}, {"loss": 0.6682, "grad_norm": 0.9908117055892944, "learning_rate": 0.0002, "epoch": 2.1579892280071813, "step": 30050}, {"loss": 0.6348, "grad_norm": 0.6772316694259644, "learning_rate": 0.0002, "epoch": 2.1587073608617593, "step": 30060}, {"loss": 0.6952, "grad_norm": 0.7474411725997925, "learning_rate": 0.0002, "epoch": 2.1594254937163377, "step": 30070}, {"loss": 0.6698, "grad_norm": 0.8140033483505249, "learning_rate": 0.0002, "epoch": 2.1601436265709157, "step": 30080}, {"loss": 0.6516, "grad_norm": 0.912555992603302, "learning_rate": 0.0002, "epoch": 2.1608617594254937, "step": 30090}, {"loss": 0.6818, "grad_norm": 0.8189636468887329, "learning_rate": 0.0002, "epoch": 2.1615798922800717, "step": 30100}, {"loss": 0.6662, "grad_norm": 0.7520000338554382, "learning_rate": 0.0002, "epoch": 2.1622980251346497, "step": 30110}, {"loss": 0.678, "grad_norm": 0.9635465741157532, "learning_rate": 0.0002, "epoch": 2.163016157989228, "step": 30120}, {"loss": 0.6641, "grad_norm": 0.9139830470085144, "learning_rate": 0.0002, "epoch": 2.163734290843806, "step": 30130}, {"loss": 0.6685, "grad_norm": 0.844384491443634, "learning_rate": 0.0002, "epoch": 2.164452423698384, "step": 30140}, {"loss": 0.708, "grad_norm": 0.8296793103218079, "learning_rate": 0.0002, "epoch": 2.165170556552962, "step": 30150}, {"loss": 0.668, "grad_norm": 0.7929309606552124, "learning_rate": 0.0002, "epoch": 2.16588868940754, "step": 30160}, {"loss": 0.6221, "grad_norm": 0.8046507239341736, "learning_rate": 0.0002, "epoch": 2.1666068222621186, "step": 30170}, {"loss": 0.6788, "grad_norm": 0.8161377310752869, "learning_rate": 0.0002, "epoch": 2.1673249551166966, "step": 30180}, {"loss": 0.6578, "grad_norm": 0.6984363794326782, "learning_rate": 0.0002, "epoch": 2.1680430879712747, "step": 30190}, {"loss": 0.6774, "grad_norm": 0.8578489422798157, "learning_rate": 0.0002, "epoch": 2.1687612208258527, "step": 30200}, {"loss": 0.668, "grad_norm": 0.8051524758338928, "learning_rate": 0.0002, "epoch": 2.1694793536804307, "step": 30210}, {"loss": 0.6212, "grad_norm": 0.6775792241096497, "learning_rate": 0.0002, "epoch": 2.170197486535009, "step": 30220}, {"loss": 0.705, "grad_norm": 0.7102242708206177, "learning_rate": 0.0002, "epoch": 2.170915619389587, "step": 30230}, {"loss": 0.6814, "grad_norm": 0.9038975238800049, "learning_rate": 0.0002, "epoch": 2.171633752244165, "step": 30240}, {"loss": 0.6919, "grad_norm": 0.8509918451309204, "learning_rate": 0.0002, "epoch": 2.172351885098743, "step": 30250}, {"loss": 0.6904, "grad_norm": 0.8816375732421875, "learning_rate": 0.0002, "epoch": 2.1730700179533216, "step": 30260}, {"loss": 0.7211, "grad_norm": 0.7907037138938904, "learning_rate": 0.0002, "epoch": 2.1737881508078996, "step": 30270}, {"loss": 0.6542, "grad_norm": 0.7104434967041016, "learning_rate": 0.0002, "epoch": 2.1745062836624776, "step": 30280}, {"loss": 0.6863, "grad_norm": 1.028658151626587, "learning_rate": 0.0002, "epoch": 2.1752244165170556, "step": 30290}, {"loss": 0.6789, "grad_norm": 0.8542430400848389, "learning_rate": 0.0002, "epoch": 2.1759425493716336, "step": 30300}, {"loss": 0.6783, "grad_norm": 0.7438064813613892, "learning_rate": 0.0002, "epoch": 2.176660682226212, "step": 30310}, {"loss": 0.63, "grad_norm": 0.8384708762168884, "learning_rate": 0.0002, "epoch": 2.17737881508079, "step": 30320}, {"loss": 0.6861, "grad_norm": 0.9034163355827332, "learning_rate": 0.0002, "epoch": 2.178096947935368, "step": 30330}, {"loss": 0.666, "grad_norm": 0.9659526944160461, "learning_rate": 0.0002, "epoch": 2.178815080789946, "step": 30340}, {"loss": 0.6819, "grad_norm": 0.6685642600059509, "learning_rate": 0.0002, "epoch": 2.1795332136445245, "step": 30350}, {"loss": 0.6759, "grad_norm": 0.9180589318275452, "learning_rate": 0.0002, "epoch": 2.1802513464991025, "step": 30360}, {"loss": 0.6575, "grad_norm": 0.9550795555114746, "learning_rate": 0.0002, "epoch": 2.1809694793536805, "step": 30370}, {"loss": 0.7014, "grad_norm": 0.8517686724662781, "learning_rate": 0.0002, "epoch": 2.1816876122082585, "step": 30380}, {"loss": 0.7069, "grad_norm": 0.7351927161216736, "learning_rate": 0.0002, "epoch": 2.1824057450628365, "step": 30390}, {"loss": 0.6555, "grad_norm": 0.8439408540725708, "learning_rate": 0.0002, "epoch": 2.183123877917415, "step": 30400}, {"loss": 0.69, "grad_norm": 0.8322570323944092, "learning_rate": 0.0002, "epoch": 2.183842010771993, "step": 30410}, {"loss": 0.6801, "grad_norm": 0.6735888123512268, "learning_rate": 0.0002, "epoch": 2.184560143626571, "step": 30420}, {"loss": 0.6844, "grad_norm": 0.7273133397102356, "learning_rate": 0.0002, "epoch": 2.185278276481149, "step": 30430}, {"loss": 0.7119, "grad_norm": 0.7841959595680237, "learning_rate": 0.0002, "epoch": 2.185996409335727, "step": 30440}, {"loss": 0.6717, "grad_norm": 0.67259281873703, "learning_rate": 0.0002, "epoch": 2.1867145421903054, "step": 30450}, {"loss": 0.6857, "grad_norm": 0.7646223306655884, "learning_rate": 0.0002, "epoch": 2.1874326750448834, "step": 30460}, {"loss": 0.6803, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 2.1881508078994614, "step": 30470}, {"loss": 0.6512, "grad_norm": 0.8818342685699463, "learning_rate": 0.0002, "epoch": 2.1888689407540394, "step": 30480}, {"loss": 0.6778, "grad_norm": 0.7421377897262573, "learning_rate": 0.0002, "epoch": 2.1895870736086174, "step": 30490}, {"loss": 0.6783, "grad_norm": 0.8180080652236938, "learning_rate": 0.0002, "epoch": 2.190305206463196, "step": 30500}, {"loss": 0.6774, "grad_norm": 0.8003571033477783, "learning_rate": 0.0002, "epoch": 2.191023339317774, "step": 30510}, {"loss": 0.7, "grad_norm": 0.8200605511665344, "learning_rate": 0.0002, "epoch": 2.191741472172352, "step": 30520}, {"loss": 0.7113, "grad_norm": 0.8878887295722961, "learning_rate": 0.0002, "epoch": 2.19245960502693, "step": 30530}, {"loss": 0.6364, "grad_norm": 0.8518163561820984, "learning_rate": 0.0002, "epoch": 2.1931777378815083, "step": 30540}, {"loss": 0.7039, "grad_norm": 0.8182454705238342, "learning_rate": 0.0002, "epoch": 2.1938958707360863, "step": 30550}, {"loss": 0.6966, "grad_norm": 0.9395919442176819, "learning_rate": 0.0002, "epoch": 2.1946140035906643, "step": 30560}, {"loss": 0.6617, "grad_norm": 0.7916256189346313, "learning_rate": 0.0002, "epoch": 2.1953321364452423, "step": 30570}, {"loss": 0.6869, "grad_norm": 0.7303445339202881, "learning_rate": 0.0002, "epoch": 2.1960502692998203, "step": 30580}, {"loss": 0.6485, "grad_norm": 0.7407387495040894, "learning_rate": 0.0002, "epoch": 2.1967684021543987, "step": 30590}, {"loss": 0.6704, "grad_norm": 0.7410500645637512, "learning_rate": 0.0002, "epoch": 2.1974865350089767, "step": 30600}, {"loss": 0.7013, "grad_norm": 0.9176440834999084, "learning_rate": 0.0002, "epoch": 2.1982046678635547, "step": 30610}, {"loss": 0.706, "grad_norm": 0.8823038935661316, "learning_rate": 0.0002, "epoch": 2.1989228007181327, "step": 30620}, {"loss": 0.7418, "grad_norm": 0.9263436198234558, "learning_rate": 0.0002, "epoch": 2.199640933572711, "step": 30630}, {"loss": 0.6019, "grad_norm": 0.6753571033477783, "learning_rate": 0.0002, "epoch": 2.200359066427289, "step": 30640}, {"loss": 0.6808, "grad_norm": 0.841160774230957, "learning_rate": 0.0002, "epoch": 2.201077199281867, "step": 30650}, {"loss": 0.6917, "grad_norm": 0.8786441683769226, "learning_rate": 0.0002, "epoch": 2.201795332136445, "step": 30660}, {"loss": 0.6878, "grad_norm": 0.8833681344985962, "learning_rate": 0.0002, "epoch": 2.202513464991023, "step": 30670}, {"loss": 0.7061, "grad_norm": 0.6609824299812317, "learning_rate": 0.0002, "epoch": 2.2032315978456016, "step": 30680}, {"loss": 0.6572, "grad_norm": 0.7308626174926758, "learning_rate": 0.0002, "epoch": 2.2039497307001796, "step": 30690}, {"loss": 0.7127, "grad_norm": 0.8854711055755615, "learning_rate": 0.0002, "epoch": 2.2046678635547576, "step": 30700}, {"loss": 0.6836, "grad_norm": 0.839043140411377, "learning_rate": 0.0002, "epoch": 2.2053859964093356, "step": 30710}, {"loss": 0.6577, "grad_norm": 0.9030174016952515, "learning_rate": 0.0002, "epoch": 2.2061041292639136, "step": 30720}, {"loss": 0.663, "grad_norm": 0.6856667399406433, "learning_rate": 0.0002, "epoch": 2.206822262118492, "step": 30730}, {"loss": 0.6672, "grad_norm": 0.8823501467704773, "learning_rate": 0.0002, "epoch": 2.20754039497307, "step": 30740}, {"loss": 0.6809, "grad_norm": 0.8501278162002563, "learning_rate": 0.0002, "epoch": 2.208258527827648, "step": 30750}, {"loss": 0.7402, "grad_norm": 0.8099446892738342, "learning_rate": 0.0002, "epoch": 2.208976660682226, "step": 30760}, {"loss": 0.6996, "grad_norm": 0.7203072905540466, "learning_rate": 0.0002, "epoch": 2.209694793536804, "step": 30770}, {"loss": 0.7494, "grad_norm": 1.0898563861846924, "learning_rate": 0.0002, "epoch": 2.2104129263913825, "step": 30780}, {"loss": 0.6432, "grad_norm": 0.8157216906547546, "learning_rate": 0.0002, "epoch": 2.2111310592459605, "step": 30790}, {"loss": 0.634, "grad_norm": 0.7617478966712952, "learning_rate": 0.0002, "epoch": 2.2118491921005385, "step": 30800}, {"loss": 0.7155, "grad_norm": 0.790503978729248, "learning_rate": 0.0002, "epoch": 2.2125673249551165, "step": 30810}, {"loss": 0.6301, "grad_norm": 0.9289199113845825, "learning_rate": 0.0002, "epoch": 2.213285457809695, "step": 30820}, {"loss": 0.6867, "grad_norm": 0.9267001748085022, "learning_rate": 0.0002, "epoch": 2.214003590664273, "step": 30830}, {"loss": 0.7012, "grad_norm": 0.716023862361908, "learning_rate": 0.0002, "epoch": 2.214721723518851, "step": 30840}, {"loss": 0.6755, "grad_norm": 0.8733863234519958, "learning_rate": 0.0002, "epoch": 2.215439856373429, "step": 30850}, {"loss": 0.6713, "grad_norm": 0.7743660807609558, "learning_rate": 0.0002, "epoch": 2.216157989228007, "step": 30860}, {"loss": 0.665, "grad_norm": 0.7974567413330078, "learning_rate": 0.0002, "epoch": 2.2168761220825854, "step": 30870}, {"loss": 0.6624, "grad_norm": 0.6617984771728516, "learning_rate": 0.0002, "epoch": 2.2175942549371634, "step": 30880}, {"loss": 0.6332, "grad_norm": 0.6925143003463745, "learning_rate": 0.0002, "epoch": 2.2183123877917414, "step": 30890}, {"loss": 0.6986, "grad_norm": 0.6853532195091248, "learning_rate": 0.0002, "epoch": 2.2190305206463194, "step": 30900}, {"loss": 0.6881, "grad_norm": 0.7964699268341064, "learning_rate": 0.0002, "epoch": 2.219748653500898, "step": 30910}, {"loss": 0.6879, "grad_norm": 0.8116228580474854, "learning_rate": 0.0002, "epoch": 2.220466786355476, "step": 30920}, {"loss": 0.6599, "grad_norm": 1.0121010541915894, "learning_rate": 0.0002, "epoch": 2.221184919210054, "step": 30930}, {"loss": 0.6873, "grad_norm": 0.7348445653915405, "learning_rate": 0.0002, "epoch": 2.221903052064632, "step": 30940}, {"loss": 0.6711, "grad_norm": 0.8998047709465027, "learning_rate": 0.0002, "epoch": 2.22262118491921, "step": 30950}, {"loss": 0.692, "grad_norm": 0.6108106970787048, "learning_rate": 0.0002, "epoch": 2.2233393177737883, "step": 30960}, {"loss": 0.6515, "grad_norm": 1.287834882736206, "learning_rate": 0.0002, "epoch": 2.2240574506283664, "step": 30970}, {"loss": 0.6513, "grad_norm": 0.8584468960762024, "learning_rate": 0.0002, "epoch": 2.2247755834829444, "step": 30980}, {"loss": 0.6907, "grad_norm": 0.865276038646698, "learning_rate": 0.0002, "epoch": 2.2254937163375224, "step": 30990}, {"loss": 0.7516, "grad_norm": 0.8713302612304688, "learning_rate": 0.0002, "epoch": 2.2262118491921004, "step": 31000}, {"loss": 0.7127, "grad_norm": 0.9210535883903503, "learning_rate": 0.0002, "epoch": 2.226929982046679, "step": 31010}, {"loss": 0.6543, "grad_norm": 0.8578430414199829, "learning_rate": 0.0002, "epoch": 2.227648114901257, "step": 31020}, {"loss": 0.6964, "grad_norm": 0.7128387093544006, "learning_rate": 0.0002, "epoch": 2.228366247755835, "step": 31030}, {"loss": 0.6949, "grad_norm": 0.8059941530227661, "learning_rate": 0.0002, "epoch": 2.229084380610413, "step": 31040}, {"loss": 0.6422, "grad_norm": 0.8043261170387268, "learning_rate": 0.0002, "epoch": 2.229802513464991, "step": 31050}, {"loss": 0.691, "grad_norm": 0.9260253310203552, "learning_rate": 0.0002, "epoch": 2.2305206463195693, "step": 31060}, {"loss": 0.6601, "grad_norm": 0.7908085584640503, "learning_rate": 0.0002, "epoch": 2.2312387791741473, "step": 31070}, {"loss": 0.6312, "grad_norm": 0.7860442996025085, "learning_rate": 0.0002, "epoch": 2.2319569120287253, "step": 31080}, {"loss": 0.715, "grad_norm": 0.8388702273368835, "learning_rate": 0.0002, "epoch": 2.2326750448833033, "step": 31090}, {"loss": 0.7015, "grad_norm": 0.835686206817627, "learning_rate": 0.0002, "epoch": 2.2333931777378817, "step": 31100}, {"loss": 0.6796, "grad_norm": 0.8148298859596252, "learning_rate": 0.0002, "epoch": 2.2341113105924597, "step": 31110}, {"loss": 0.6318, "grad_norm": 0.8501878976821899, "learning_rate": 0.0002, "epoch": 2.2348294434470377, "step": 31120}, {"loss": 0.7262, "grad_norm": 0.793323278427124, "learning_rate": 0.0002, "epoch": 2.2355475763016157, "step": 31130}, {"loss": 0.722, "grad_norm": 0.8234742879867554, "learning_rate": 0.0002, "epoch": 2.2362657091561937, "step": 31140}, {"loss": 0.6746, "grad_norm": 0.8691303133964539, "learning_rate": 0.0002, "epoch": 2.236983842010772, "step": 31150}, {"loss": 0.6191, "grad_norm": 0.8707090020179749, "learning_rate": 0.0002, "epoch": 2.23770197486535, "step": 31160}, {"loss": 0.6988, "grad_norm": 0.8468940854072571, "learning_rate": 0.0002, "epoch": 2.238420107719928, "step": 31170}, {"loss": 0.6429, "grad_norm": 0.7275772094726562, "learning_rate": 0.0002, "epoch": 2.239138240574506, "step": 31180}, {"loss": 0.7057, "grad_norm": 0.8765808939933777, "learning_rate": 0.0002, "epoch": 2.2398563734290846, "step": 31190}, {"loss": 0.7273, "grad_norm": 1.02803635597229, "learning_rate": 0.0002, "epoch": 2.2405745062836626, "step": 31200}, {"loss": 0.7303, "grad_norm": 0.7999185919761658, "learning_rate": 0.0002, "epoch": 2.2412926391382406, "step": 31210}, {"loss": 0.658, "grad_norm": 0.5711870789527893, "learning_rate": 0.0002, "epoch": 2.2420107719928186, "step": 31220}, {"loss": 0.6527, "grad_norm": 0.7183604836463928, "learning_rate": 0.0002, "epoch": 2.2427289048473966, "step": 31230}, {"loss": 0.6817, "grad_norm": 0.8819206357002258, "learning_rate": 0.0002, "epoch": 2.243447037701975, "step": 31240}, {"loss": 0.6805, "grad_norm": 0.9078969955444336, "learning_rate": 0.0002, "epoch": 2.244165170556553, "step": 31250}, {"loss": 0.6937, "grad_norm": 1.184506893157959, "learning_rate": 0.0002, "epoch": 2.244883303411131, "step": 31260}, {"loss": 0.7682, "grad_norm": 0.8660752177238464, "learning_rate": 0.0002, "epoch": 2.245601436265709, "step": 31270}, {"loss": 0.6461, "grad_norm": 1.011796236038208, "learning_rate": 0.0002, "epoch": 2.246319569120287, "step": 31280}, {"loss": 0.677, "grad_norm": 0.9168157577514648, "learning_rate": 0.0002, "epoch": 2.2470377019748655, "step": 31290}, {"loss": 0.6844, "grad_norm": 0.7798577547073364, "learning_rate": 0.0002, "epoch": 2.2477558348294435, "step": 31300}, {"loss": 0.6622, "grad_norm": 0.6609913110733032, "learning_rate": 0.0002, "epoch": 2.2484739676840215, "step": 31310}, {"loss": 0.6616, "grad_norm": 0.64737868309021, "learning_rate": 0.0002, "epoch": 2.2491921005385995, "step": 31320}, {"loss": 0.665, "grad_norm": 1.0700385570526123, "learning_rate": 0.0002, "epoch": 2.2499102333931775, "step": 31330}, {"loss": 0.6539, "grad_norm": 0.7838551998138428, "learning_rate": 0.0002, "epoch": 2.250628366247756, "step": 31340}, {"loss": 0.7002, "grad_norm": 0.9225728511810303, "learning_rate": 0.0002, "epoch": 2.251346499102334, "step": 31350}, {"loss": 0.6758, "grad_norm": 0.7956384420394897, "learning_rate": 0.0002, "epoch": 2.252064631956912, "step": 31360}, {"loss": 0.7039, "grad_norm": 0.7645466923713684, "learning_rate": 0.0002, "epoch": 2.25278276481149, "step": 31370}, {"loss": 0.6816, "grad_norm": 0.9595549702644348, "learning_rate": 0.0002, "epoch": 2.2535008976660684, "step": 31380}, {"loss": 0.6419, "grad_norm": 0.6124163866043091, "learning_rate": 0.0002, "epoch": 2.2542190305206464, "step": 31390}, {"loss": 0.6573, "grad_norm": 0.7531530261039734, "learning_rate": 0.0002, "epoch": 2.2549371633752244, "step": 31400}, {"loss": 0.6223, "grad_norm": 0.6904721856117249, "learning_rate": 0.0002, "epoch": 2.2556552962298024, "step": 31410}, {"loss": 0.6661, "grad_norm": 0.7644204497337341, "learning_rate": 0.0002, "epoch": 2.2563734290843804, "step": 31420}, {"loss": 0.7122, "grad_norm": 0.7879737019538879, "learning_rate": 0.0002, "epoch": 2.257091561938959, "step": 31430}, {"loss": 0.6407, "grad_norm": 0.796450138092041, "learning_rate": 0.0002, "epoch": 2.257809694793537, "step": 31440}, {"loss": 0.722, "grad_norm": 0.7536656856536865, "learning_rate": 0.0002, "epoch": 2.258527827648115, "step": 31450}, {"loss": 0.681, "grad_norm": 0.6797451376914978, "learning_rate": 0.0002, "epoch": 2.259245960502693, "step": 31460}, {"loss": 0.6916, "grad_norm": 0.7833347320556641, "learning_rate": 0.0002, "epoch": 2.2599640933572713, "step": 31470}, {"loss": 0.702, "grad_norm": 0.7571428418159485, "learning_rate": 0.0002, "epoch": 2.2606822262118493, "step": 31480}, {"loss": 0.6878, "grad_norm": 0.7028690576553345, "learning_rate": 0.0002, "epoch": 2.2614003590664273, "step": 31490}, {"loss": 0.6863, "grad_norm": 0.7854651212692261, "learning_rate": 0.0002, "epoch": 2.2621184919210053, "step": 31500}, {"loss": 0.6895, "grad_norm": 1.1924974918365479, "learning_rate": 0.0002, "epoch": 2.2628366247755833, "step": 31510}, {"loss": 0.7174, "grad_norm": 0.8087588548660278, "learning_rate": 0.0002, "epoch": 2.2635547576301613, "step": 31520}, {"loss": 0.6398, "grad_norm": 0.8521981835365295, "learning_rate": 0.0002, "epoch": 2.26427289048474, "step": 31530}, {"loss": 0.6654, "grad_norm": 0.754585862159729, "learning_rate": 0.0002, "epoch": 2.264991023339318, "step": 31540}, {"loss": 0.6854, "grad_norm": 0.8403395414352417, "learning_rate": 0.0002, "epoch": 2.265709156193896, "step": 31550}, {"loss": 0.6873, "grad_norm": 0.9724786877632141, "learning_rate": 0.0002, "epoch": 2.266427289048474, "step": 31560}, {"loss": 0.6876, "grad_norm": 0.7568767070770264, "learning_rate": 0.0002, "epoch": 2.2671454219030522, "step": 31570}, {"loss": 0.6161, "grad_norm": 0.712009608745575, "learning_rate": 0.0002, "epoch": 2.2678635547576302, "step": 31580}, {"loss": 0.6568, "grad_norm": 0.7649937868118286, "learning_rate": 0.0002, "epoch": 2.2685816876122082, "step": 31590}, {"loss": 0.6195, "grad_norm": 0.7319537997245789, "learning_rate": 0.0002, "epoch": 2.2692998204667862, "step": 31600}, {"loss": 0.6434, "grad_norm": 0.9597942233085632, "learning_rate": 0.0002, "epoch": 2.2700179533213642, "step": 31610}, {"loss": 0.6273, "grad_norm": 0.7403358817100525, "learning_rate": 0.0002, "epoch": 2.2707360861759427, "step": 31620}, {"loss": 0.7185, "grad_norm": 0.7395114898681641, "learning_rate": 0.0002, "epoch": 2.2714542190305207, "step": 31630}, {"loss": 0.6357, "grad_norm": 0.8835344314575195, "learning_rate": 0.0002, "epoch": 2.2721723518850987, "step": 31640}, {"loss": 0.7442, "grad_norm": 0.76587975025177, "learning_rate": 0.0002, "epoch": 2.2728904847396767, "step": 31650}, {"loss": 0.6491, "grad_norm": 0.6472584009170532, "learning_rate": 0.0002, "epoch": 2.273608617594255, "step": 31660}, {"loss": 0.7026, "grad_norm": 1.0170460939407349, "learning_rate": 0.0002, "epoch": 2.274326750448833, "step": 31670}, {"loss": 0.6839, "grad_norm": 0.8170912265777588, "learning_rate": 0.0002, "epoch": 2.275044883303411, "step": 31680}, {"loss": 0.6599, "grad_norm": 0.6821279525756836, "learning_rate": 0.0002, "epoch": 2.275763016157989, "step": 31690}, {"loss": 0.6346, "grad_norm": 0.8150709867477417, "learning_rate": 0.0002, "epoch": 2.276481149012567, "step": 31700}, {"loss": 0.6639, "grad_norm": 0.6786386370658875, "learning_rate": 0.0002, "epoch": 2.2771992818671456, "step": 31710}, {"loss": 0.6753, "grad_norm": 0.8871912360191345, "learning_rate": 0.0002, "epoch": 2.2779174147217236, "step": 31720}, {"loss": 0.6826, "grad_norm": 0.7710220813751221, "learning_rate": 0.0002, "epoch": 2.2786355475763016, "step": 31730}, {"loss": 0.7118, "grad_norm": 0.8073079586029053, "learning_rate": 0.0002, "epoch": 2.2793536804308796, "step": 31740}, {"loss": 0.6614, "grad_norm": 0.8228550553321838, "learning_rate": 0.0002, "epoch": 2.280071813285458, "step": 31750}, {"loss": 0.7162, "grad_norm": 0.7987996339797974, "learning_rate": 0.0002, "epoch": 2.280789946140036, "step": 31760}, {"loss": 0.6953, "grad_norm": 0.744326651096344, "learning_rate": 0.0002, "epoch": 2.281508078994614, "step": 31770}, {"loss": 0.7089, "grad_norm": 0.7672302722930908, "learning_rate": 0.0002, "epoch": 2.282226211849192, "step": 31780}, {"loss": 0.6926, "grad_norm": 0.8079774975776672, "learning_rate": 0.0002, "epoch": 2.28294434470377, "step": 31790}, {"loss": 0.6361, "grad_norm": 0.7383643984794617, "learning_rate": 0.0002, "epoch": 2.283662477558348, "step": 31800}, {"loss": 0.6924, "grad_norm": 0.8542332649230957, "learning_rate": 0.0002, "epoch": 2.2843806104129265, "step": 31810}, {"loss": 0.7156, "grad_norm": 0.7657321691513062, "learning_rate": 0.0002, "epoch": 2.2850987432675045, "step": 31820}, {"loss": 0.6545, "grad_norm": 0.7485944628715515, "learning_rate": 0.0002, "epoch": 2.2858168761220825, "step": 31830}, {"loss": 0.6452, "grad_norm": 0.7817596793174744, "learning_rate": 0.0002, "epoch": 2.2865350089766605, "step": 31840}, {"loss": 0.6398, "grad_norm": 0.840421736240387, "learning_rate": 0.0002, "epoch": 2.287253141831239, "step": 31850}, {"loss": 0.7245, "grad_norm": 0.8190447688102722, "learning_rate": 0.0002, "epoch": 2.287971274685817, "step": 31860}, {"loss": 0.7343, "grad_norm": 0.9582287669181824, "learning_rate": 0.0002, "epoch": 2.288689407540395, "step": 31870}, {"loss": 0.683, "grad_norm": 1.0939116477966309, "learning_rate": 0.0002, "epoch": 2.289407540394973, "step": 31880}, {"loss": 0.7176, "grad_norm": 1.0901678800582886, "learning_rate": 0.0002, "epoch": 2.290125673249551, "step": 31890}, {"loss": 0.6711, "grad_norm": 0.8025168776512146, "learning_rate": 0.0002, "epoch": 2.2908438061041294, "step": 31900}, {"loss": 0.6901, "grad_norm": 0.8157371878623962, "learning_rate": 0.0002, "epoch": 2.2915619389587074, "step": 31910}, {"loss": 0.6643, "grad_norm": 0.7735328078269958, "learning_rate": 0.0002, "epoch": 2.2922800718132854, "step": 31920}, {"loss": 0.689, "grad_norm": 0.7501550316810608, "learning_rate": 0.0002, "epoch": 2.2929982046678634, "step": 31930}, {"loss": 0.6605, "grad_norm": 0.76664799451828, "learning_rate": 0.0002, "epoch": 2.293716337522442, "step": 31940}, {"loss": 0.6818, "grad_norm": 1.0044599771499634, "learning_rate": 0.0002, "epoch": 2.29443447037702, "step": 31950}, {"loss": 0.6566, "grad_norm": 0.7773551344871521, "learning_rate": 0.0002, "epoch": 2.295152603231598, "step": 31960}, {"loss": 0.6834, "grad_norm": 0.9021226763725281, "learning_rate": 0.0002, "epoch": 2.295870736086176, "step": 31970}, {"loss": 0.6757, "grad_norm": 0.9075915813446045, "learning_rate": 0.0002, "epoch": 2.296588868940754, "step": 31980}, {"loss": 0.6584, "grad_norm": 0.9109290242195129, "learning_rate": 0.0002, "epoch": 2.2973070017953323, "step": 31990}, {"loss": 0.6792, "grad_norm": 0.7742900252342224, "learning_rate": 0.0002, "epoch": 2.2980251346499103, "step": 32000}, {"loss": 0.7137, "grad_norm": 0.633260190486908, "learning_rate": 0.0002, "epoch": 2.2987432675044883, "step": 32010}, {"loss": 0.6644, "grad_norm": 0.8593834042549133, "learning_rate": 0.0002, "epoch": 2.2994614003590663, "step": 32020}, {"loss": 0.6961, "grad_norm": 0.88165283203125, "learning_rate": 0.0002, "epoch": 2.3001795332136448, "step": 32030}, {"loss": 0.7779, "grad_norm": 0.7840633988380432, "learning_rate": 0.0002, "epoch": 2.3008976660682228, "step": 32040}, {"loss": 0.7045, "grad_norm": 0.8150764107704163, "learning_rate": 0.0002, "epoch": 2.3016157989228008, "step": 32050}, {"loss": 0.6556, "grad_norm": 0.7683324813842773, "learning_rate": 0.0002, "epoch": 2.3023339317773788, "step": 32060}, {"loss": 0.6657, "grad_norm": 0.7581049799919128, "learning_rate": 0.0002, "epoch": 2.3030520646319568, "step": 32070}, {"loss": 0.6683, "grad_norm": 0.911687970161438, "learning_rate": 0.0002, "epoch": 2.3037701974865348, "step": 32080}, {"loss": 0.7029, "grad_norm": 1.0596355199813843, "learning_rate": 0.0002, "epoch": 2.3044883303411132, "step": 32090}, {"loss": 0.6955, "grad_norm": 0.7329661846160889, "learning_rate": 0.0002, "epoch": 2.3052064631956912, "step": 32100}, {"loss": 0.6798, "grad_norm": 0.8251074552536011, "learning_rate": 0.0002, "epoch": 2.3059245960502692, "step": 32110}, {"loss": 0.692, "grad_norm": 0.7765523195266724, "learning_rate": 0.0002, "epoch": 2.3066427289048472, "step": 32120}, {"loss": 0.6375, "grad_norm": 0.8246980905532837, "learning_rate": 0.0002, "epoch": 2.3073608617594257, "step": 32130}, {"loss": 0.6815, "grad_norm": 0.833387017250061, "learning_rate": 0.0002, "epoch": 2.3080789946140037, "step": 32140}, {"loss": 0.6261, "grad_norm": 0.9558065533638, "learning_rate": 0.0002, "epoch": 2.3087971274685817, "step": 32150}, {"loss": 0.6723, "grad_norm": 0.788151204586029, "learning_rate": 0.0002, "epoch": 2.3095152603231597, "step": 32160}, {"loss": 0.6398, "grad_norm": 0.8662320971488953, "learning_rate": 0.0002, "epoch": 2.3102333931777377, "step": 32170}, {"loss": 0.7014, "grad_norm": 0.7079060673713684, "learning_rate": 0.0002, "epoch": 2.310951526032316, "step": 32180}, {"loss": 0.6479, "grad_norm": 0.8477022647857666, "learning_rate": 0.0002, "epoch": 2.311669658886894, "step": 32190}, {"loss": 0.6872, "grad_norm": 0.6549711227416992, "learning_rate": 0.0002, "epoch": 2.312387791741472, "step": 32200}, {"loss": 0.6668, "grad_norm": 0.8274375796318054, "learning_rate": 0.0002, "epoch": 2.31310592459605, "step": 32210}, {"loss": 0.6731, "grad_norm": 0.6305822730064392, "learning_rate": 0.0002, "epoch": 2.3138240574506286, "step": 32220}, {"loss": 0.6908, "grad_norm": 0.8105725049972534, "learning_rate": 0.0002, "epoch": 2.3145421903052066, "step": 32230}, {"loss": 0.7028, "grad_norm": 0.7317119240760803, "learning_rate": 0.0002, "epoch": 2.3152603231597846, "step": 32240}, {"loss": 0.6444, "grad_norm": 0.7729924917221069, "learning_rate": 0.0002, "epoch": 2.3159784560143626, "step": 32250}, {"loss": 0.6945, "grad_norm": 0.8092145919799805, "learning_rate": 0.0002, "epoch": 2.3166965888689406, "step": 32260}, {"loss": 0.663, "grad_norm": 0.8723762035369873, "learning_rate": 0.0002, "epoch": 2.317414721723519, "step": 32270}, {"loss": 0.6992, "grad_norm": 0.9699533581733704, "learning_rate": 0.0002, "epoch": 2.318132854578097, "step": 32280}, {"loss": 0.7488, "grad_norm": 1.2972444295883179, "learning_rate": 0.0002, "epoch": 2.318850987432675, "step": 32290}, {"loss": 0.6969, "grad_norm": 0.7888450622558594, "learning_rate": 0.0002, "epoch": 2.319569120287253, "step": 32300}, {"loss": 0.6876, "grad_norm": 0.7457000017166138, "learning_rate": 0.0002, "epoch": 2.3202872531418315, "step": 32310}, {"loss": 0.6891, "grad_norm": 0.7270606756210327, "learning_rate": 0.0002, "epoch": 2.3210053859964095, "step": 32320}, {"loss": 0.6607, "grad_norm": 0.7930711507797241, "learning_rate": 0.0002, "epoch": 2.3217235188509875, "step": 32330}, {"loss": 0.7222, "grad_norm": 0.9015030264854431, "learning_rate": 0.0002, "epoch": 2.3224416517055655, "step": 32340}, {"loss": 0.6544, "grad_norm": 0.9385523796081543, "learning_rate": 0.0002, "epoch": 2.3231597845601435, "step": 32350}, {"loss": 0.6779, "grad_norm": 0.7293606400489807, "learning_rate": 0.0002, "epoch": 2.3238779174147215, "step": 32360}, {"loss": 0.6556, "grad_norm": 0.797618567943573, "learning_rate": 0.0002, "epoch": 2.3245960502693, "step": 32370}, {"loss": 0.6743, "grad_norm": 0.8588258028030396, "learning_rate": 0.0002, "epoch": 2.325314183123878, "step": 32380}, {"loss": 0.659, "grad_norm": 0.7490078210830688, "learning_rate": 0.0002, "epoch": 2.326032315978456, "step": 32390}, {"loss": 0.7365, "grad_norm": 0.7569956183433533, "learning_rate": 0.0002, "epoch": 2.326750448833034, "step": 32400}, {"loss": 0.7048, "grad_norm": 0.8754122853279114, "learning_rate": 0.0002, "epoch": 2.3274685816876124, "step": 32410}, {"loss": 0.6845, "grad_norm": 0.9410699605941772, "learning_rate": 0.0002, "epoch": 2.3281867145421904, "step": 32420}, {"loss": 0.6611, "grad_norm": 1.1309062242507935, "learning_rate": 0.0002, "epoch": 2.3289048473967684, "step": 32430}, {"loss": 0.6609, "grad_norm": 0.7923168540000916, "learning_rate": 0.0002, "epoch": 2.3296229802513464, "step": 32440}, {"loss": 0.6728, "grad_norm": 0.830387532711029, "learning_rate": 0.0002, "epoch": 2.3303411131059244, "step": 32450}, {"loss": 0.673, "grad_norm": 0.9087454080581665, "learning_rate": 0.0002, "epoch": 2.331059245960503, "step": 32460}, {"loss": 0.6749, "grad_norm": 0.8892660737037659, "learning_rate": 0.0002, "epoch": 2.331777378815081, "step": 32470}, {"loss": 0.7101, "grad_norm": 0.84930819272995, "learning_rate": 0.0002, "epoch": 2.332495511669659, "step": 32480}, {"loss": 0.6465, "grad_norm": 0.7736781239509583, "learning_rate": 0.0002, "epoch": 2.333213644524237, "step": 32490}, {"loss": 0.6976, "grad_norm": 0.7396222352981567, "learning_rate": 0.0002, "epoch": 2.3339317773788153, "step": 32500}, {"loss": 0.6484, "grad_norm": 0.7710241079330444, "learning_rate": 0.0002, "epoch": 2.3346499102333933, "step": 32510}, {"loss": 0.6591, "grad_norm": 0.7297301888465881, "learning_rate": 0.0002, "epoch": 2.3353680430879713, "step": 32520}, {"loss": 0.7375, "grad_norm": 0.9084094166755676, "learning_rate": 0.0002, "epoch": 2.3360861759425493, "step": 32530}, {"loss": 0.6775, "grad_norm": 0.6425859332084656, "learning_rate": 0.0002, "epoch": 2.3368043087971273, "step": 32540}, {"loss": 0.7249, "grad_norm": 0.8646581172943115, "learning_rate": 0.0002, "epoch": 2.3375224416517058, "step": 32550}, {"loss": 0.6862, "grad_norm": 0.91925048828125, "learning_rate": 0.0002, "epoch": 2.3382405745062838, "step": 32560}, {"loss": 0.6805, "grad_norm": 0.8687716722488403, "learning_rate": 0.0002, "epoch": 2.3389587073608618, "step": 32570}, {"loss": 0.6377, "grad_norm": 0.9769517183303833, "learning_rate": 0.0002, "epoch": 2.3396768402154398, "step": 32580}, {"loss": 0.6459, "grad_norm": 0.7240557074546814, "learning_rate": 0.0002, "epoch": 2.340394973070018, "step": 32590}, {"loss": 0.7029, "grad_norm": 0.6631549000740051, "learning_rate": 0.0002, "epoch": 2.341113105924596, "step": 32600}, {"loss": 0.6524, "grad_norm": 0.9103635549545288, "learning_rate": 0.0002, "epoch": 2.341831238779174, "step": 32610}, {"loss": 0.6695, "grad_norm": 0.8718403577804565, "learning_rate": 0.0002, "epoch": 2.342549371633752, "step": 32620}, {"loss": 0.7006, "grad_norm": 0.8020271062850952, "learning_rate": 0.0002, "epoch": 2.34326750448833, "step": 32630}, {"loss": 0.6853, "grad_norm": 0.7834265232086182, "learning_rate": 0.0002, "epoch": 2.343985637342908, "step": 32640}, {"loss": 0.6447, "grad_norm": 0.8909988403320312, "learning_rate": 0.0002, "epoch": 2.3447037701974867, "step": 32650}, {"loss": 0.6762, "grad_norm": 0.6915582418441772, "learning_rate": 0.0002, "epoch": 2.3454219030520647, "step": 32660}, {"loss": 0.6993, "grad_norm": 0.8829401135444641, "learning_rate": 0.0002, "epoch": 2.3461400359066427, "step": 32670}, {"loss": 0.6035, "grad_norm": 0.8869150876998901, "learning_rate": 0.0002, "epoch": 2.3468581687612207, "step": 32680}, {"loss": 0.6404, "grad_norm": 0.8348933458328247, "learning_rate": 0.0002, "epoch": 2.347576301615799, "step": 32690}, {"loss": 0.6961, "grad_norm": 0.7591108679771423, "learning_rate": 0.0002, "epoch": 2.348294434470377, "step": 32700}, {"loss": 0.7155, "grad_norm": 0.8343638181686401, "learning_rate": 0.0002, "epoch": 2.349012567324955, "step": 32710}, {"loss": 0.6949, "grad_norm": 0.8537896275520325, "learning_rate": 0.0002, "epoch": 2.349730700179533, "step": 32720}, {"loss": 0.6545, "grad_norm": 0.7750797867774963, "learning_rate": 0.0002, "epoch": 2.350448833034111, "step": 32730}, {"loss": 0.7226, "grad_norm": 0.7553941607475281, "learning_rate": 0.0002, "epoch": 2.3511669658886896, "step": 32740}, {"loss": 0.6985, "grad_norm": 0.8083372712135315, "learning_rate": 0.0002, "epoch": 2.3518850987432676, "step": 32750}, {"loss": 0.6345, "grad_norm": 0.8016324043273926, "learning_rate": 0.0002, "epoch": 2.3526032315978456, "step": 32760}, {"loss": 0.6348, "grad_norm": 0.7524061799049377, "learning_rate": 0.0002, "epoch": 2.3533213644524236, "step": 32770}, {"loss": 0.6782, "grad_norm": 0.9046763777732849, "learning_rate": 0.0002, "epoch": 2.354039497307002, "step": 32780}, {"loss": 0.6745, "grad_norm": 0.9704324007034302, "learning_rate": 0.0002, "epoch": 2.35475763016158, "step": 32790}, {"loss": 0.7095, "grad_norm": 0.8756019473075867, "learning_rate": 0.0002, "epoch": 2.355475763016158, "step": 32800}, {"loss": 0.6989, "grad_norm": 0.7345646023750305, "learning_rate": 0.0002, "epoch": 2.356193895870736, "step": 32810}, {"loss": 0.6659, "grad_norm": 0.8022899031639099, "learning_rate": 0.0002, "epoch": 2.356912028725314, "step": 32820}, {"loss": 0.6997, "grad_norm": 0.7663353085517883, "learning_rate": 0.0002, "epoch": 2.3576301615798925, "step": 32830}, {"loss": 0.6683, "grad_norm": 0.7802956104278564, "learning_rate": 0.0002, "epoch": 2.3583482944344705, "step": 32840}, {"loss": 0.679, "grad_norm": 0.8130960464477539, "learning_rate": 0.0002, "epoch": 2.3590664272890485, "step": 32850}, {"loss": 0.6792, "grad_norm": 0.9671252369880676, "learning_rate": 0.0002, "epoch": 2.3597845601436265, "step": 32860}, {"loss": 0.6989, "grad_norm": 0.8806724548339844, "learning_rate": 0.0002, "epoch": 2.3605026929982045, "step": 32870}, {"loss": 0.6674, "grad_norm": 0.9378283619880676, "learning_rate": 0.0002, "epoch": 2.361220825852783, "step": 32880}, {"loss": 0.6607, "grad_norm": 0.8638162612915039, "learning_rate": 0.0002, "epoch": 2.361938958707361, "step": 32890}, {"loss": 0.6866, "grad_norm": 0.7321885228157043, "learning_rate": 0.0002, "epoch": 2.362657091561939, "step": 32900}, {"loss": 0.6682, "grad_norm": 0.8445415496826172, "learning_rate": 0.0002, "epoch": 2.363375224416517, "step": 32910}, {"loss": 0.6863, "grad_norm": 0.915715754032135, "learning_rate": 0.0002, "epoch": 2.364093357271095, "step": 32920}, {"loss": 0.6671, "grad_norm": 0.8674854040145874, "learning_rate": 0.0002, "epoch": 2.3648114901256734, "step": 32930}, {"loss": 0.7124, "grad_norm": 0.7577189207077026, "learning_rate": 0.0002, "epoch": 2.3655296229802514, "step": 32940}, {"loss": 0.6879, "grad_norm": 0.8649988174438477, "learning_rate": 0.0002, "epoch": 2.3662477558348294, "step": 32950}, {"loss": 0.6571, "grad_norm": 0.9760734438896179, "learning_rate": 0.0002, "epoch": 2.3669658886894074, "step": 32960}, {"loss": 0.7002, "grad_norm": 0.8909491300582886, "learning_rate": 0.0002, "epoch": 2.367684021543986, "step": 32970}, {"loss": 0.6961, "grad_norm": 0.6970168948173523, "learning_rate": 0.0002, "epoch": 2.368402154398564, "step": 32980}, {"loss": 0.6153, "grad_norm": 0.8208426237106323, "learning_rate": 0.0002, "epoch": 2.369120287253142, "step": 32990}, {"loss": 0.626, "grad_norm": 0.8477405309677124, "learning_rate": 0.0002, "epoch": 2.36983842010772, "step": 33000}, {"loss": 0.6588, "grad_norm": 0.7771625518798828, "learning_rate": 0.0002, "epoch": 2.370556552962298, "step": 33010}, {"loss": 0.673, "grad_norm": 0.7811821103096008, "learning_rate": 0.0002, "epoch": 2.3712746858168763, "step": 33020}, {"loss": 0.6792, "grad_norm": 0.6280415654182434, "learning_rate": 0.0002, "epoch": 2.3719928186714543, "step": 33030}, {"loss": 0.6567, "grad_norm": 0.8733929395675659, "learning_rate": 0.0002, "epoch": 2.3727109515260323, "step": 33040}, {"loss": 0.6844, "grad_norm": 0.6169558167457581, "learning_rate": 0.0002, "epoch": 2.3734290843806103, "step": 33050}, {"loss": 0.6675, "grad_norm": 0.7414724826812744, "learning_rate": 0.0002, "epoch": 2.3741472172351887, "step": 33060}, {"loss": 0.6905, "grad_norm": 0.7484683990478516, "learning_rate": 0.0002, "epoch": 2.3748653500897667, "step": 33070}, {"loss": 0.6676, "grad_norm": 0.8495098948478699, "learning_rate": 0.0002, "epoch": 2.3755834829443447, "step": 33080}, {"loss": 0.687, "grad_norm": 0.9057353734970093, "learning_rate": 0.0002, "epoch": 2.3763016157989227, "step": 33090}, {"loss": 0.6911, "grad_norm": 0.8028274178504944, "learning_rate": 0.0002, "epoch": 2.3770197486535007, "step": 33100}, {"loss": 0.6851, "grad_norm": 1.2398128509521484, "learning_rate": 0.0002, "epoch": 2.377737881508079, "step": 33110}, {"loss": 0.6753, "grad_norm": 0.7894110679626465, "learning_rate": 0.0002, "epoch": 2.378456014362657, "step": 33120}, {"loss": 0.6625, "grad_norm": 0.8530096411705017, "learning_rate": 0.0002, "epoch": 2.379174147217235, "step": 33130}, {"loss": 0.7061, "grad_norm": 0.892613410949707, "learning_rate": 0.0002, "epoch": 2.379892280071813, "step": 33140}, {"loss": 0.6719, "grad_norm": 0.868606448173523, "learning_rate": 0.0002, "epoch": 2.380610412926391, "step": 33150}, {"loss": 0.6423, "grad_norm": 0.6801115870475769, "learning_rate": 0.0002, "epoch": 2.3813285457809696, "step": 33160}, {"loss": 0.6723, "grad_norm": 0.9517148733139038, "learning_rate": 0.0002, "epoch": 2.3820466786355476, "step": 33170}, {"loss": 0.6957, "grad_norm": 0.8986499309539795, "learning_rate": 0.0002, "epoch": 2.3827648114901256, "step": 33180}, {"loss": 0.6767, "grad_norm": 0.8467642068862915, "learning_rate": 0.0002, "epoch": 2.3834829443447036, "step": 33190}, {"loss": 0.7228, "grad_norm": 0.8400940299034119, "learning_rate": 0.0002, "epoch": 2.3842010771992816, "step": 33200}, {"loss": 0.7048, "grad_norm": 0.86443030834198, "learning_rate": 0.0002, "epoch": 2.38491921005386, "step": 33210}, {"loss": 0.6227, "grad_norm": 0.8599014282226562, "learning_rate": 0.0002, "epoch": 2.385637342908438, "step": 33220}, {"loss": 0.673, "grad_norm": 0.868735134601593, "learning_rate": 0.0002, "epoch": 2.386355475763016, "step": 33230}, {"loss": 0.6612, "grad_norm": 0.941734790802002, "learning_rate": 0.0002, "epoch": 2.387073608617594, "step": 33240}, {"loss": 0.6951, "grad_norm": 0.9342881441116333, "learning_rate": 0.0002, "epoch": 2.3877917414721725, "step": 33250}, {"loss": 0.7255, "grad_norm": 1.012920618057251, "learning_rate": 0.0002, "epoch": 2.3885098743267505, "step": 33260}, {"loss": 0.6399, "grad_norm": 0.6949151754379272, "learning_rate": 0.0002, "epoch": 2.3892280071813286, "step": 33270}, {"loss": 0.7137, "grad_norm": 0.8283912539482117, "learning_rate": 0.0002, "epoch": 2.3899461400359066, "step": 33280}, {"loss": 0.7324, "grad_norm": 0.807273805141449, "learning_rate": 0.0002, "epoch": 2.3906642728904846, "step": 33290}, {"loss": 0.7353, "grad_norm": 0.8109124302864075, "learning_rate": 0.0002, "epoch": 2.391382405745063, "step": 33300}, {"loss": 0.689, "grad_norm": 0.7477563619613647, "learning_rate": 0.0002, "epoch": 2.392100538599641, "step": 33310}, {"loss": 0.6585, "grad_norm": 0.6961637735366821, "learning_rate": 0.0002, "epoch": 2.392818671454219, "step": 33320}, {"loss": 0.6919, "grad_norm": 0.9424173831939697, "learning_rate": 0.0002, "epoch": 2.393536804308797, "step": 33330}, {"loss": 0.6965, "grad_norm": 0.8289623856544495, "learning_rate": 0.0002, "epoch": 2.3942549371633755, "step": 33340}, {"loss": 0.6761, "grad_norm": 0.8106551170349121, "learning_rate": 0.0002, "epoch": 2.3949730700179535, "step": 33350}, {"loss": 0.6675, "grad_norm": 0.8800507187843323, "learning_rate": 0.0002, "epoch": 2.3956912028725315, "step": 33360}, {"loss": 0.6636, "grad_norm": 0.7662274241447449, "learning_rate": 0.0002, "epoch": 2.3964093357271095, "step": 33370}, {"loss": 0.6824, "grad_norm": 0.889204740524292, "learning_rate": 0.0002, "epoch": 2.3971274685816875, "step": 33380}, {"loss": 0.6539, "grad_norm": 0.7991349697113037, "learning_rate": 0.0002, "epoch": 2.3978456014362655, "step": 33390}, {"loss": 0.6818, "grad_norm": 0.8210278749465942, "learning_rate": 0.0002, "epoch": 2.398563734290844, "step": 33400}, {"loss": 0.7118, "grad_norm": 0.91801917552948, "learning_rate": 0.0002, "epoch": 2.399281867145422, "step": 33410}, {"loss": 0.726, "grad_norm": 0.8086220622062683, "learning_rate": 0.0002, "epoch": 2.4, "step": 33420}, {"loss": 0.7418, "grad_norm": 0.901613175868988, "learning_rate": 0.0002, "epoch": 2.400718132854578, "step": 33430}, {"loss": 0.6904, "grad_norm": 0.9865965247154236, "learning_rate": 0.0002, "epoch": 2.4014362657091564, "step": 33440}, {"loss": 0.7543, "grad_norm": 0.8160675168037415, "learning_rate": 0.0002, "epoch": 2.4021543985637344, "step": 33450}, {"loss": 0.6598, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 2.4028725314183124, "step": 33460}, {"loss": 0.6784, "grad_norm": 0.8490013480186462, "learning_rate": 0.0002, "epoch": 2.4035906642728904, "step": 33470}, {"loss": 0.6844, "grad_norm": 0.6947163939476013, "learning_rate": 0.0002, "epoch": 2.4043087971274684, "step": 33480}, {"loss": 0.6606, "grad_norm": 0.7984827756881714, "learning_rate": 0.0002, "epoch": 2.405026929982047, "step": 33490}, {"loss": 0.7032, "grad_norm": 0.7826083302497864, "learning_rate": 0.0002, "epoch": 2.405745062836625, "step": 33500}, {"loss": 0.6914, "grad_norm": 0.8213959336280823, "learning_rate": 0.0002, "epoch": 2.406463195691203, "step": 33510}, {"loss": 0.6855, "grad_norm": 0.8790069818496704, "learning_rate": 0.0002, "epoch": 2.407181328545781, "step": 33520}, {"loss": 0.6278, "grad_norm": 0.9093378782272339, "learning_rate": 0.0002, "epoch": 2.4078994614003593, "step": 33530}, {"loss": 0.6724, "grad_norm": 0.8085389137268066, "learning_rate": 0.0002, "epoch": 2.4086175942549373, "step": 33540}, {"loss": 0.6456, "grad_norm": 0.7952343225479126, "learning_rate": 0.0002, "epoch": 2.4093357271095153, "step": 33550}, {"loss": 0.7357, "grad_norm": 0.9576563835144043, "learning_rate": 0.0002, "epoch": 2.4100538599640933, "step": 33560}, {"loss": 0.7123, "grad_norm": 0.7722929120063782, "learning_rate": 0.0002, "epoch": 2.4107719928186713, "step": 33570}, {"loss": 0.6647, "grad_norm": 0.8634604215621948, "learning_rate": 0.0002, "epoch": 2.4114901256732497, "step": 33580}, {"loss": 0.6677, "grad_norm": 0.7805271148681641, "learning_rate": 0.0002, "epoch": 2.4122082585278277, "step": 33590}, {"loss": 0.6629, "grad_norm": 0.8274481296539307, "learning_rate": 0.0002, "epoch": 2.4129263913824057, "step": 33600}, {"loss": 0.6396, "grad_norm": 0.9265141487121582, "learning_rate": 0.0002, "epoch": 2.4136445242369837, "step": 33610}, {"loss": 0.6727, "grad_norm": 0.7497374415397644, "learning_rate": 0.0002, "epoch": 2.414362657091562, "step": 33620}, {"loss": 0.6543, "grad_norm": 0.7048972249031067, "learning_rate": 0.0002, "epoch": 2.41508078994614, "step": 33630}, {"loss": 0.6863, "grad_norm": 0.8449550271034241, "learning_rate": 0.0002, "epoch": 2.415798922800718, "step": 33640}, {"loss": 0.6891, "grad_norm": 0.7581984400749207, "learning_rate": 0.0002, "epoch": 2.416517055655296, "step": 33650}, {"loss": 0.6845, "grad_norm": 0.7744191288948059, "learning_rate": 0.0002, "epoch": 2.417235188509874, "step": 33660}, {"loss": 0.6412, "grad_norm": 0.6736614108085632, "learning_rate": 0.0002, "epoch": 2.417953321364452, "step": 33670}, {"loss": 0.6792, "grad_norm": 0.985431432723999, "learning_rate": 0.0002, "epoch": 2.4186714542190306, "step": 33680}, {"loss": 0.6675, "grad_norm": 0.8027978539466858, "learning_rate": 0.0002, "epoch": 2.4193895870736086, "step": 33690}, {"loss": 0.7107, "grad_norm": 0.6809377074241638, "learning_rate": 0.0002, "epoch": 2.4201077199281866, "step": 33700}, {"loss": 0.7332, "grad_norm": 0.8305349946022034, "learning_rate": 0.0002, "epoch": 2.4208258527827646, "step": 33710}, {"loss": 0.642, "grad_norm": 0.7632496356964111, "learning_rate": 0.0002, "epoch": 2.421543985637343, "step": 33720}, {"loss": 0.6614, "grad_norm": 0.7241050601005554, "learning_rate": 0.0002, "epoch": 2.422262118491921, "step": 33730}, {"loss": 0.6668, "grad_norm": 0.6729857325553894, "learning_rate": 0.0002, "epoch": 2.422980251346499, "step": 33740}, {"loss": 0.7289, "grad_norm": 0.7741881012916565, "learning_rate": 0.0002, "epoch": 2.423698384201077, "step": 33750}, {"loss": 0.6895, "grad_norm": 0.7844415903091431, "learning_rate": 0.0002, "epoch": 2.424416517055655, "step": 33760}, {"loss": 0.7073, "grad_norm": 0.7960098385810852, "learning_rate": 0.0002, "epoch": 2.4251346499102335, "step": 33770}, {"loss": 0.702, "grad_norm": 0.8267978429794312, "learning_rate": 0.0002, "epoch": 2.4258527827648115, "step": 33780}, {"loss": 0.6379, "grad_norm": 0.7498974204063416, "learning_rate": 0.0002, "epoch": 2.4265709156193895, "step": 33790}, {"loss": 0.6749, "grad_norm": 0.8357859253883362, "learning_rate": 0.0002, "epoch": 2.4272890484739675, "step": 33800}, {"loss": 0.6617, "grad_norm": 0.8056104779243469, "learning_rate": 0.0002, "epoch": 2.428007181328546, "step": 33810}, {"loss": 0.701, "grad_norm": 0.806897759437561, "learning_rate": 0.0002, "epoch": 2.428725314183124, "step": 33820}, {"loss": 0.6771, "grad_norm": 0.7770048975944519, "learning_rate": 0.0002, "epoch": 2.429443447037702, "step": 33830}, {"loss": 0.7096, "grad_norm": 0.8311458230018616, "learning_rate": 0.0002, "epoch": 2.43016157989228, "step": 33840}, {"loss": 0.7127, "grad_norm": 0.9201730489730835, "learning_rate": 0.0002, "epoch": 2.430879712746858, "step": 33850}, {"loss": 0.6722, "grad_norm": 0.83509761095047, "learning_rate": 0.0002, "epoch": 2.4315978456014364, "step": 33860}, {"loss": 0.6477, "grad_norm": 0.7680139541625977, "learning_rate": 0.0002, "epoch": 2.4323159784560144, "step": 33870}, {"loss": 0.7229, "grad_norm": 0.8956670165061951, "learning_rate": 0.0002, "epoch": 2.4330341113105924, "step": 33880}, {"loss": 0.6598, "grad_norm": 0.717941164970398, "learning_rate": 0.0002, "epoch": 2.4337522441651704, "step": 33890}, {"loss": 0.6546, "grad_norm": 0.777206540107727, "learning_rate": 0.0002, "epoch": 2.434470377019749, "step": 33900}, {"loss": 0.7442, "grad_norm": 0.90232914686203, "learning_rate": 0.0002, "epoch": 2.435188509874327, "step": 33910}, {"loss": 0.6763, "grad_norm": 1.0817158222198486, "learning_rate": 0.0002, "epoch": 2.435906642728905, "step": 33920}, {"loss": 0.6995, "grad_norm": 0.7890931367874146, "learning_rate": 0.0002, "epoch": 2.436624775583483, "step": 33930}, {"loss": 0.6438, "grad_norm": 0.9279449582099915, "learning_rate": 0.0002, "epoch": 2.437342908438061, "step": 33940}, {"loss": 0.6694, "grad_norm": 0.8313823342323303, "learning_rate": 0.0002, "epoch": 2.438061041292639, "step": 33950}, {"loss": 0.6841, "grad_norm": 1.0510340929031372, "learning_rate": 0.0002, "epoch": 2.4387791741472173, "step": 33960}, {"loss": 0.7203, "grad_norm": 0.8002574443817139, "learning_rate": 0.0002, "epoch": 2.4394973070017953, "step": 33970}, {"loss": 0.6767, "grad_norm": 0.7822834253311157, "learning_rate": 0.0002, "epoch": 2.4402154398563733, "step": 33980}, {"loss": 0.6289, "grad_norm": 0.9050403237342834, "learning_rate": 0.0002, "epoch": 2.4409335727109513, "step": 33990}, {"loss": 0.6798, "grad_norm": 0.7569652199745178, "learning_rate": 0.0002, "epoch": 2.44165170556553, "step": 34000}, {"loss": 0.648, "grad_norm": 0.6609470844268799, "learning_rate": 0.0002, "epoch": 2.442369838420108, "step": 34010}, {"loss": 0.6734, "grad_norm": 0.8090947866439819, "learning_rate": 0.0002, "epoch": 2.443087971274686, "step": 34020}, {"loss": 0.6621, "grad_norm": 0.647814929485321, "learning_rate": 0.0002, "epoch": 2.443806104129264, "step": 34030}, {"loss": 0.7227, "grad_norm": 0.9308601021766663, "learning_rate": 0.0002, "epoch": 2.444524236983842, "step": 34040}, {"loss": 0.6937, "grad_norm": 0.8259239792823792, "learning_rate": 0.0002, "epoch": 2.4452423698384202, "step": 34050}, {"loss": 0.6813, "grad_norm": 0.9410025477409363, "learning_rate": 0.0002, "epoch": 2.4459605026929983, "step": 34060}, {"loss": 0.7112, "grad_norm": 0.7446974515914917, "learning_rate": 0.0002, "epoch": 2.4466786355475763, "step": 34070}, {"loss": 0.6608, "grad_norm": 0.7093849182128906, "learning_rate": 0.0002, "epoch": 2.4473967684021543, "step": 34080}, {"loss": 0.6801, "grad_norm": 0.8726152181625366, "learning_rate": 0.0002, "epoch": 2.4481149012567327, "step": 34090}, {"loss": 0.7164, "grad_norm": 0.808300793170929, "learning_rate": 0.0002, "epoch": 2.4488330341113107, "step": 34100}, {"loss": 0.658, "grad_norm": 0.6884859800338745, "learning_rate": 0.0002, "epoch": 2.4495511669658887, "step": 34110}, {"loss": 0.6444, "grad_norm": 0.7151864767074585, "learning_rate": 0.0002, "epoch": 2.4502692998204667, "step": 34120}, {"loss": 0.6685, "grad_norm": 0.9261866807937622, "learning_rate": 0.0002, "epoch": 2.4509874326750447, "step": 34130}, {"loss": 0.6717, "grad_norm": 0.8069018125534058, "learning_rate": 0.0002, "epoch": 2.451705565529623, "step": 34140}, {"loss": 0.7436, "grad_norm": 0.8001297116279602, "learning_rate": 0.0002, "epoch": 2.452423698384201, "step": 34150}, {"loss": 0.7032, "grad_norm": 0.8547799587249756, "learning_rate": 0.0002, "epoch": 2.453141831238779, "step": 34160}, {"loss": 0.7226, "grad_norm": 0.6693823337554932, "learning_rate": 0.0002, "epoch": 2.453859964093357, "step": 34170}, {"loss": 0.6644, "grad_norm": 0.6646198630332947, "learning_rate": 0.0002, "epoch": 2.4545780969479356, "step": 34180}, {"loss": 0.6891, "grad_norm": 0.9330950975418091, "learning_rate": 0.0002, "epoch": 2.4552962298025136, "step": 34190}, {"loss": 0.6728, "grad_norm": 0.7738645672798157, "learning_rate": 0.0002, "epoch": 2.4560143626570916, "step": 34200}, {"loss": 0.7162, "grad_norm": 0.7929846048355103, "learning_rate": 0.0002, "epoch": 2.4567324955116696, "step": 34210}, {"loss": 0.6793, "grad_norm": 0.8936280012130737, "learning_rate": 0.0002, "epoch": 2.4574506283662476, "step": 34220}, {"loss": 0.6758, "grad_norm": 0.9099360108375549, "learning_rate": 0.0002, "epoch": 2.4581687612208256, "step": 34230}, {"loss": 0.666, "grad_norm": 0.7941291928291321, "learning_rate": 0.0002, "epoch": 2.458886894075404, "step": 34240}, {"loss": 0.6689, "grad_norm": 0.7169737219810486, "learning_rate": 0.0002, "epoch": 2.459605026929982, "step": 34250}, {"loss": 0.7417, "grad_norm": 0.8994171023368835, "learning_rate": 0.0002, "epoch": 2.46032315978456, "step": 34260}, {"loss": 0.6807, "grad_norm": 0.8087331056594849, "learning_rate": 0.0002, "epoch": 2.461041292639138, "step": 34270}, {"loss": 0.7152, "grad_norm": 0.935502827167511, "learning_rate": 0.0002, "epoch": 2.4617594254937165, "step": 34280}, {"loss": 0.7448, "grad_norm": 0.8957464694976807, "learning_rate": 0.0002, "epoch": 2.4624775583482945, "step": 34290}, {"loss": 0.6501, "grad_norm": 0.9017183780670166, "learning_rate": 0.0002, "epoch": 2.4631956912028725, "step": 34300}, {"loss": 0.6985, "grad_norm": 0.7778640389442444, "learning_rate": 0.0002, "epoch": 2.4639138240574505, "step": 34310}, {"loss": 0.7041, "grad_norm": 0.8870323896408081, "learning_rate": 0.0002, "epoch": 2.4646319569120285, "step": 34320}, {"loss": 0.6796, "grad_norm": 0.7660176753997803, "learning_rate": 0.0002, "epoch": 2.465350089766607, "step": 34330}, {"loss": 0.6705, "grad_norm": 0.8442226648330688, "learning_rate": 0.0002, "epoch": 2.466068222621185, "step": 34340}, {"loss": 0.7019, "grad_norm": 0.7522561550140381, "learning_rate": 0.0002, "epoch": 2.466786355475763, "step": 34350}, {"loss": 0.7331, "grad_norm": 0.9355213046073914, "learning_rate": 0.0002, "epoch": 2.467504488330341, "step": 34360}, {"loss": 0.688, "grad_norm": 0.8487382531166077, "learning_rate": 0.0002, "epoch": 2.4682226211849194, "step": 34370}, {"loss": 0.7068, "grad_norm": 0.7869813442230225, "learning_rate": 0.0002, "epoch": 2.4689407540394974, "step": 34380}, {"loss": 0.6809, "grad_norm": 0.7562848329544067, "learning_rate": 0.0002, "epoch": 2.4696588868940754, "step": 34390}, {"loss": 0.653, "grad_norm": 0.740829586982727, "learning_rate": 0.0002, "epoch": 2.4703770197486534, "step": 34400}, {"loss": 0.656, "grad_norm": 1.0862116813659668, "learning_rate": 0.0002, "epoch": 2.4710951526032314, "step": 34410}, {"loss": 0.6429, "grad_norm": 0.9633645415306091, "learning_rate": 0.0002, "epoch": 2.47181328545781, "step": 34420}, {"loss": 0.7126, "grad_norm": 0.8467186093330383, "learning_rate": 0.0002, "epoch": 2.472531418312388, "step": 34430}, {"loss": 0.6783, "grad_norm": 0.9972147941589355, "learning_rate": 0.0002, "epoch": 2.473249551166966, "step": 34440}, {"loss": 0.701, "grad_norm": 0.8086632490158081, "learning_rate": 0.0002, "epoch": 2.473967684021544, "step": 34450}, {"loss": 0.7127, "grad_norm": 0.9043704271316528, "learning_rate": 0.0002, "epoch": 2.4746858168761223, "step": 34460}, {"loss": 0.6861, "grad_norm": 0.8275330662727356, "learning_rate": 0.0002, "epoch": 2.4754039497307003, "step": 34470}, {"loss": 0.6443, "grad_norm": 0.8142464756965637, "learning_rate": 0.0002, "epoch": 2.4761220825852783, "step": 34480}, {"loss": 0.637, "grad_norm": 0.7116754651069641, "learning_rate": 0.0002, "epoch": 2.4768402154398563, "step": 34490}, {"loss": 0.6572, "grad_norm": 0.8742281198501587, "learning_rate": 0.0002, "epoch": 2.4775583482944343, "step": 34500}, {"loss": 0.6615, "grad_norm": 0.7545657157897949, "learning_rate": 0.0002, "epoch": 2.4782764811490123, "step": 34510}, {"loss": 0.6715, "grad_norm": 0.7586482167243958, "learning_rate": 0.0002, "epoch": 2.478994614003591, "step": 34520}, {"loss": 0.71, "grad_norm": 0.9212547540664673, "learning_rate": 0.0002, "epoch": 2.479712746858169, "step": 34530}, {"loss": 0.6742, "grad_norm": 0.9391530752182007, "learning_rate": 0.0002, "epoch": 2.480430879712747, "step": 34540}, {"loss": 0.6565, "grad_norm": 1.119698166847229, "learning_rate": 0.0002, "epoch": 2.481149012567325, "step": 34550}, {"loss": 0.6734, "grad_norm": 0.8499019145965576, "learning_rate": 0.0002, "epoch": 2.4818671454219032, "step": 34560}, {"loss": 0.7043, "grad_norm": 0.7629778385162354, "learning_rate": 0.0002, "epoch": 2.4825852782764812, "step": 34570}, {"loss": 0.671, "grad_norm": 0.7667021155357361, "learning_rate": 0.0002, "epoch": 2.4833034111310592, "step": 34580}, {"loss": 0.6202, "grad_norm": 0.6711493730545044, "learning_rate": 0.0002, "epoch": 2.4840215439856372, "step": 34590}, {"loss": 0.6644, "grad_norm": 0.7354223728179932, "learning_rate": 0.0002, "epoch": 2.4847396768402152, "step": 34600}, {"loss": 0.622, "grad_norm": 0.875295102596283, "learning_rate": 0.0002, "epoch": 2.4854578096947937, "step": 34610}, {"loss": 0.6946, "grad_norm": 0.7341493964195251, "learning_rate": 0.0002, "epoch": 2.4861759425493717, "step": 34620}, {"loss": 0.6674, "grad_norm": 0.9049216508865356, "learning_rate": 0.0002, "epoch": 2.4868940754039497, "step": 34630}, {"loss": 0.7017, "grad_norm": 0.7214788198471069, "learning_rate": 0.0002, "epoch": 2.4876122082585277, "step": 34640}, {"loss": 0.6571, "grad_norm": 0.7514070868492126, "learning_rate": 0.0002, "epoch": 2.488330341113106, "step": 34650}, {"loss": 0.6623, "grad_norm": 0.6929763555526733, "learning_rate": 0.0002, "epoch": 2.489048473967684, "step": 34660}, {"loss": 0.7118, "grad_norm": 1.11346435546875, "learning_rate": 0.0002, "epoch": 2.489766606822262, "step": 34670}, {"loss": 0.6664, "grad_norm": 0.9285556674003601, "learning_rate": 0.0002, "epoch": 2.49048473967684, "step": 34680}, {"loss": 0.7094, "grad_norm": 0.7699695825576782, "learning_rate": 0.0002, "epoch": 2.491202872531418, "step": 34690}, {"loss": 0.6575, "grad_norm": 0.872349739074707, "learning_rate": 0.0002, "epoch": 2.4919210053859966, "step": 34700}, {"loss": 0.6886, "grad_norm": 0.8692147135734558, "learning_rate": 0.0002, "epoch": 2.4926391382405746, "step": 34710}, {"loss": 0.711, "grad_norm": 0.799740195274353, "learning_rate": 0.0002, "epoch": 2.4933572710951526, "step": 34720}, {"loss": 0.6849, "grad_norm": 0.7320986986160278, "learning_rate": 0.0002, "epoch": 2.4940754039497306, "step": 34730}, {"loss": 0.7138, "grad_norm": 0.8233383893966675, "learning_rate": 0.0002, "epoch": 2.494793536804309, "step": 34740}, {"loss": 0.6937, "grad_norm": 0.9605086445808411, "learning_rate": 0.0002, "epoch": 2.495511669658887, "step": 34750}, {"loss": 0.6511, "grad_norm": 0.8597773909568787, "learning_rate": 0.0002, "epoch": 2.496229802513465, "step": 34760}, {"loss": 0.6793, "grad_norm": 0.7459201812744141, "learning_rate": 0.0002, "epoch": 2.496947935368043, "step": 34770}, {"loss": 0.7098, "grad_norm": 0.778457522392273, "learning_rate": 0.0002, "epoch": 2.497666068222621, "step": 34780}, {"loss": 0.6727, "grad_norm": 0.8591375946998596, "learning_rate": 0.0002, "epoch": 2.498384201077199, "step": 34790}, {"loss": 0.6439, "grad_norm": 0.9689867496490479, "learning_rate": 0.0002, "epoch": 2.4991023339317775, "step": 34800}, {"loss": 0.6365, "grad_norm": 0.7430615425109863, "learning_rate": 0.0002, "epoch": 2.4998204667863555, "step": 34810}, {"loss": 0.7207, "grad_norm": 0.8545114994049072, "learning_rate": 0.0002, "epoch": 2.5005385996409335, "step": 34820}, {"loss": 0.7318, "grad_norm": 0.7115356922149658, "learning_rate": 0.0002, "epoch": 2.5012567324955115, "step": 34830}, {"loss": 0.6985, "grad_norm": 0.7616795301437378, "learning_rate": 0.0002, "epoch": 2.50197486535009, "step": 34840}, {"loss": 0.7153, "grad_norm": 0.8097891211509705, "learning_rate": 0.0002, "epoch": 2.502692998204668, "step": 34850}, {"loss": 0.7131, "grad_norm": 0.7397396564483643, "learning_rate": 0.0002, "epoch": 2.503411131059246, "step": 34860}, {"loss": 0.7213, "grad_norm": 0.7531594038009644, "learning_rate": 0.0002, "epoch": 2.504129263913824, "step": 34870}, {"loss": 0.678, "grad_norm": 0.8050091862678528, "learning_rate": 0.0002, "epoch": 2.504847396768402, "step": 34880}, {"loss": 0.6765, "grad_norm": 0.7550507187843323, "learning_rate": 0.0002, "epoch": 2.5055655296229804, "step": 34890}, {"loss": 0.6861, "grad_norm": 1.0131759643554688, "learning_rate": 0.0002, "epoch": 2.5062836624775584, "step": 34900}, {"loss": 0.6755, "grad_norm": 0.9275356531143188, "learning_rate": 0.0002, "epoch": 2.5070017953321364, "step": 34910}, {"loss": 0.7108, "grad_norm": 0.6655791997909546, "learning_rate": 0.0002, "epoch": 2.5077199281867144, "step": 34920}, {"loss": 0.7154, "grad_norm": 0.79361891746521, "learning_rate": 0.0002, "epoch": 2.508438061041293, "step": 34930}, {"loss": 0.6506, "grad_norm": 0.8223658800125122, "learning_rate": 0.0002, "epoch": 2.509156193895871, "step": 34940}, {"loss": 0.6869, "grad_norm": 1.0070416927337646, "learning_rate": 0.0002, "epoch": 2.509874326750449, "step": 34950}, {"loss": 0.6819, "grad_norm": 0.8408986330032349, "learning_rate": 0.0002, "epoch": 2.510592459605027, "step": 34960}, {"loss": 0.7195, "grad_norm": 0.8178259134292603, "learning_rate": 0.0002, "epoch": 2.511310592459605, "step": 34970}, {"loss": 0.6738, "grad_norm": 0.747876763343811, "learning_rate": 0.0002, "epoch": 2.512028725314183, "step": 34980}, {"loss": 0.6706, "grad_norm": 0.8551825881004333, "learning_rate": 0.0002, "epoch": 2.5127468581687613, "step": 34990}, {"loss": 0.653, "grad_norm": 0.8366564512252808, "learning_rate": 0.0002, "epoch": 2.5134649910233393, "step": 35000}, {"loss": 0.6427, "grad_norm": 0.8491294384002686, "learning_rate": 0.0002, "epoch": 2.5141831238779173, "step": 35010}, {"loss": 0.6714, "grad_norm": 0.8854562640190125, "learning_rate": 0.0002, "epoch": 2.5149012567324958, "step": 35020}, {"loss": 0.6606, "grad_norm": 0.8652133345603943, "learning_rate": 0.0002, "epoch": 2.5156193895870738, "step": 35030}, {"loss": 0.658, "grad_norm": 0.8734033107757568, "learning_rate": 0.0002, "epoch": 2.5163375224416518, "step": 35040}, {"loss": 0.6528, "grad_norm": 0.8613446950912476, "learning_rate": 0.0002, "epoch": 2.5170556552962298, "step": 35050}, {"loss": 0.6943, "grad_norm": 0.762395441532135, "learning_rate": 0.0002, "epoch": 2.5177737881508078, "step": 35060}, {"loss": 0.66, "grad_norm": 0.806220293045044, "learning_rate": 0.0002, "epoch": 2.5184919210053858, "step": 35070}, {"loss": 0.6867, "grad_norm": 0.7781713008880615, "learning_rate": 0.0002, "epoch": 2.519210053859964, "step": 35080}, {"loss": 0.6927, "grad_norm": 0.8639848828315735, "learning_rate": 0.0002, "epoch": 2.519928186714542, "step": 35090}, {"loss": 0.6397, "grad_norm": 0.7331740260124207, "learning_rate": 0.0002, "epoch": 2.52064631956912, "step": 35100}, {"loss": 0.6916, "grad_norm": 0.8148137927055359, "learning_rate": 0.0002, "epoch": 2.521364452423698, "step": 35110}, {"loss": 0.6877, "grad_norm": 0.6939297914505005, "learning_rate": 0.0002, "epoch": 2.5220825852782767, "step": 35120}, {"loss": 0.6669, "grad_norm": 0.8151076436042786, "learning_rate": 0.0002, "epoch": 2.5228007181328547, "step": 35130}, {"loss": 0.6761, "grad_norm": 0.9193238019943237, "learning_rate": 0.0002, "epoch": 2.5235188509874327, "step": 35140}, {"loss": 0.7136, "grad_norm": 0.8230985403060913, "learning_rate": 0.0002, "epoch": 2.5242369838420107, "step": 35150}, {"loss": 0.7127, "grad_norm": 0.865492582321167, "learning_rate": 0.0002, "epoch": 2.5249551166965887, "step": 35160}, {"loss": 0.6591, "grad_norm": 0.7673570513725281, "learning_rate": 0.0002, "epoch": 2.525673249551167, "step": 35170}, {"loss": 0.6703, "grad_norm": 0.8296313881874084, "learning_rate": 0.0002, "epoch": 2.526391382405745, "step": 35180}, {"loss": 0.6588, "grad_norm": 0.6531317234039307, "learning_rate": 0.0002, "epoch": 2.527109515260323, "step": 35190}, {"loss": 0.7129, "grad_norm": 0.9865642189979553, "learning_rate": 0.0002, "epoch": 2.527827648114901, "step": 35200}, {"loss": 0.6728, "grad_norm": 0.8001098036766052, "learning_rate": 0.0002, "epoch": 2.5285457809694796, "step": 35210}, {"loss": 0.6737, "grad_norm": 0.7523218393325806, "learning_rate": 0.0002, "epoch": 2.5292639138240576, "step": 35220}, {"loss": 0.6426, "grad_norm": 1.061640977859497, "learning_rate": 0.0002, "epoch": 2.5299820466786356, "step": 35230}, {"loss": 0.6974, "grad_norm": 0.9668078422546387, "learning_rate": 0.0002, "epoch": 2.5307001795332136, "step": 35240}, {"loss": 0.7189, "grad_norm": 0.9554983973503113, "learning_rate": 0.0002, "epoch": 2.5314183123877916, "step": 35250}, {"loss": 0.648, "grad_norm": 0.8343066573143005, "learning_rate": 0.0002, "epoch": 2.5321364452423696, "step": 35260}, {"loss": 0.639, "grad_norm": 0.8408095240592957, "learning_rate": 0.0002, "epoch": 2.532854578096948, "step": 35270}, {"loss": 0.6412, "grad_norm": 0.8593984842300415, "learning_rate": 0.0002, "epoch": 2.533572710951526, "step": 35280}, {"loss": 0.6689, "grad_norm": 0.7593855261802673, "learning_rate": 0.0002, "epoch": 2.534290843806104, "step": 35290}, {"loss": 0.6731, "grad_norm": 0.9179701209068298, "learning_rate": 0.0002, "epoch": 2.5350089766606825, "step": 35300}, {"loss": 0.7194, "grad_norm": 0.749022901058197, "learning_rate": 0.0002, "epoch": 2.5357271095152605, "step": 35310}, {"loss": 0.6488, "grad_norm": 0.7172152400016785, "learning_rate": 0.0002, "epoch": 2.5364452423698385, "step": 35320}, {"loss": 0.6934, "grad_norm": 0.8228873610496521, "learning_rate": 0.0002, "epoch": 2.5371633752244165, "step": 35330}, {"loss": 0.7245, "grad_norm": 0.9663547277450562, "learning_rate": 0.0002, "epoch": 2.5378815080789945, "step": 35340}, {"loss": 0.6974, "grad_norm": 0.8446536660194397, "learning_rate": 0.0002, "epoch": 2.5385996409335725, "step": 35350}, {"loss": 0.6942, "grad_norm": 0.9751029014587402, "learning_rate": 0.0002, "epoch": 2.539317773788151, "step": 35360}, {"loss": 0.7001, "grad_norm": 0.7460315823554993, "learning_rate": 0.0002, "epoch": 2.540035906642729, "step": 35370}, {"loss": 0.6928, "grad_norm": 0.8269246816635132, "learning_rate": 0.0002, "epoch": 2.540754039497307, "step": 35380}, {"loss": 0.6559, "grad_norm": 0.7200030088424683, "learning_rate": 0.0002, "epoch": 2.541472172351885, "step": 35390}, {"loss": 0.6736, "grad_norm": 0.9586671590805054, "learning_rate": 0.0002, "epoch": 2.5421903052064634, "step": 35400}, {"loss": 0.6653, "grad_norm": 0.7872378826141357, "learning_rate": 0.0002, "epoch": 2.5429084380610414, "step": 35410}, {"loss": 0.7002, "grad_norm": 0.8257358074188232, "learning_rate": 0.0002, "epoch": 2.5436265709156194, "step": 35420}, {"loss": 0.6888, "grad_norm": 0.6924505829811096, "learning_rate": 0.0002, "epoch": 2.5443447037701974, "step": 35430}, {"loss": 0.6536, "grad_norm": 1.1171481609344482, "learning_rate": 0.0002, "epoch": 2.5450628366247754, "step": 35440}, {"loss": 0.7087, "grad_norm": 0.9635605216026306, "learning_rate": 0.0002, "epoch": 2.545780969479354, "step": 35450}, {"loss": 0.6545, "grad_norm": 0.9760567545890808, "learning_rate": 0.0002, "epoch": 2.546499102333932, "step": 35460}, {"loss": 0.6858, "grad_norm": 0.8523460030555725, "learning_rate": 0.0002, "epoch": 2.54721723518851, "step": 35470}, {"loss": 0.6702, "grad_norm": 0.9316970109939575, "learning_rate": 0.0002, "epoch": 2.547935368043088, "step": 35480}, {"loss": 0.7028, "grad_norm": 0.7401485443115234, "learning_rate": 0.0002, "epoch": 2.5486535008976663, "step": 35490}, {"loss": 0.6991, "grad_norm": 1.0627065896987915, "learning_rate": 0.0002, "epoch": 2.5493716337522443, "step": 35500}, {"loss": 0.6401, "grad_norm": 0.7463156580924988, "learning_rate": 0.0002, "epoch": 2.5500897666068223, "step": 35510}, {"loss": 0.6978, "grad_norm": 0.9935570359230042, "learning_rate": 0.0002, "epoch": 2.5508078994614003, "step": 35520}, {"loss": 0.7531, "grad_norm": 0.8824051022529602, "learning_rate": 0.0002, "epoch": 2.5515260323159783, "step": 35530}, {"loss": 0.7078, "grad_norm": 0.8018375635147095, "learning_rate": 0.0002, "epoch": 2.5522441651705563, "step": 35540}, {"loss": 0.6757, "grad_norm": 0.7523182034492493, "learning_rate": 0.0002, "epoch": 2.5529622980251347, "step": 35550}, {"loss": 0.6631, "grad_norm": 0.6771712303161621, "learning_rate": 0.0002, "epoch": 2.5536804308797127, "step": 35560}, {"loss": 0.6679, "grad_norm": 0.7903336882591248, "learning_rate": 0.0002, "epoch": 2.5543985637342908, "step": 35570}, {"loss": 0.7069, "grad_norm": 0.7973808646202087, "learning_rate": 0.0002, "epoch": 2.555116696588869, "step": 35580}, {"loss": 0.6388, "grad_norm": 0.9082772731781006, "learning_rate": 0.0002, "epoch": 2.555834829443447, "step": 35590}, {"loss": 0.6926, "grad_norm": 0.779671311378479, "learning_rate": 0.0002, "epoch": 2.556552962298025, "step": 35600}, {"loss": 0.6966, "grad_norm": 0.710058331489563, "learning_rate": 0.0002, "epoch": 2.557271095152603, "step": 35610}, {"loss": 0.701, "grad_norm": 0.8217873573303223, "learning_rate": 0.0002, "epoch": 2.557989228007181, "step": 35620}, {"loss": 0.6773, "grad_norm": 0.8017855286598206, "learning_rate": 0.0002, "epoch": 2.558707360861759, "step": 35630}, {"loss": 0.6764, "grad_norm": 0.6671402454376221, "learning_rate": 0.0002, "epoch": 2.5594254937163377, "step": 35640}, {"loss": 0.6946, "grad_norm": 0.9357045292854309, "learning_rate": 0.0002, "epoch": 2.5601436265709157, "step": 35650}, {"loss": 0.695, "grad_norm": 0.7676312327384949, "learning_rate": 0.0002, "epoch": 2.5608617594254937, "step": 35660}, {"loss": 0.7086, "grad_norm": 0.7602545619010925, "learning_rate": 0.0002, "epoch": 2.5615798922800717, "step": 35670}, {"loss": 0.695, "grad_norm": 0.8112275004386902, "learning_rate": 0.0002, "epoch": 2.56229802513465, "step": 35680}, {"loss": 0.7492, "grad_norm": 0.73296719789505, "learning_rate": 0.0002, "epoch": 2.563016157989228, "step": 35690}, {"loss": 0.6935, "grad_norm": 0.9007818102836609, "learning_rate": 0.0002, "epoch": 2.563734290843806, "step": 35700}, {"loss": 0.7287, "grad_norm": 0.7526060938835144, "learning_rate": 0.0002, "epoch": 2.564452423698384, "step": 35710}, {"loss": 0.6762, "grad_norm": 0.813875675201416, "learning_rate": 0.0002, "epoch": 2.565170556552962, "step": 35720}, {"loss": 0.666, "grad_norm": 0.7767695784568787, "learning_rate": 0.0002, "epoch": 2.5658886894075406, "step": 35730}, {"loss": 0.6591, "grad_norm": 0.7840573787689209, "learning_rate": 0.0002, "epoch": 2.5666068222621186, "step": 35740}, {"loss": 0.7131, "grad_norm": 0.7400487661361694, "learning_rate": 0.0002, "epoch": 2.5673249551166966, "step": 35750}, {"loss": 0.6571, "grad_norm": 0.7424315810203552, "learning_rate": 0.0002, "epoch": 2.5680430879712746, "step": 35760}, {"loss": 0.6861, "grad_norm": 0.7812185883522034, "learning_rate": 0.0002, "epoch": 2.568761220825853, "step": 35770}, {"loss": 0.7034, "grad_norm": 0.8397669196128845, "learning_rate": 0.0002, "epoch": 2.569479353680431, "step": 35780}, {"loss": 0.6734, "grad_norm": 0.7543849945068359, "learning_rate": 0.0002, "epoch": 2.570197486535009, "step": 35790}, {"loss": 0.7393, "grad_norm": 0.903634786605835, "learning_rate": 0.0002, "epoch": 2.570915619389587, "step": 35800}, {"loss": 0.6884, "grad_norm": 0.853335976600647, "learning_rate": 0.0002, "epoch": 2.571633752244165, "step": 35810}, {"loss": 0.6843, "grad_norm": 0.8441029787063599, "learning_rate": 0.0002, "epoch": 2.572351885098743, "step": 35820}, {"loss": 0.6874, "grad_norm": 0.9072228670120239, "learning_rate": 0.0002, "epoch": 2.5730700179533215, "step": 35830}, {"loss": 0.6866, "grad_norm": 0.7720168828964233, "learning_rate": 0.0002, "epoch": 2.5737881508078995, "step": 35840}, {"loss": 0.695, "grad_norm": 0.8719366788864136, "learning_rate": 0.0002, "epoch": 2.5745062836624775, "step": 35850}, {"loss": 0.7842, "grad_norm": 0.766209065914154, "learning_rate": 0.0002, "epoch": 2.575224416517056, "step": 35860}, {"loss": 0.6688, "grad_norm": 0.7814549207687378, "learning_rate": 0.0002, "epoch": 2.575942549371634, "step": 35870}, {"loss": 0.7309, "grad_norm": 0.8068482875823975, "learning_rate": 0.0002, "epoch": 2.576660682226212, "step": 35880}, {"loss": 0.703, "grad_norm": 0.8321225643157959, "learning_rate": 0.0002, "epoch": 2.57737881508079, "step": 35890}, {"loss": 0.6885, "grad_norm": 0.9787611961364746, "learning_rate": 0.0002, "epoch": 2.578096947935368, "step": 35900}, {"loss": 0.7246, "grad_norm": 0.6955108642578125, "learning_rate": 0.0002, "epoch": 2.578815080789946, "step": 35910}, {"loss": 0.6972, "grad_norm": 0.8309195637702942, "learning_rate": 0.0002, "epoch": 2.5795332136445244, "step": 35920}, {"loss": 0.6735, "grad_norm": 0.9309390783309937, "learning_rate": 0.0002, "epoch": 2.5802513464991024, "step": 35930}, {"loss": 0.7376, "grad_norm": 0.903537392616272, "learning_rate": 0.0002, "epoch": 2.5809694793536804, "step": 35940}, {"loss": 0.6578, "grad_norm": 0.9530633091926575, "learning_rate": 0.0002, "epoch": 2.5816876122082584, "step": 35950}, {"loss": 0.6707, "grad_norm": 1.0140212774276733, "learning_rate": 0.0002, "epoch": 2.582405745062837, "step": 35960}, {"loss": 0.6859, "grad_norm": 0.8224637508392334, "learning_rate": 0.0002, "epoch": 2.583123877917415, "step": 35970}, {"loss": 0.7158, "grad_norm": 0.7952998280525208, "learning_rate": 0.0002, "epoch": 2.583842010771993, "step": 35980}, {"loss": 0.65, "grad_norm": 0.6057878136634827, "learning_rate": 0.0002, "epoch": 2.584560143626571, "step": 35990}, {"loss": 0.6566, "grad_norm": 0.9172457456588745, "learning_rate": 0.0002, "epoch": 2.585278276481149, "step": 36000}, {"loss": 0.6863, "grad_norm": 1.0061585903167725, "learning_rate": 0.0002, "epoch": 2.5859964093357273, "step": 36010}, {"loss": 0.6831, "grad_norm": 0.8555058240890503, "learning_rate": 0.0002, "epoch": 2.5867145421903053, "step": 36020}, {"loss": 0.7181, "grad_norm": 0.7732099890708923, "learning_rate": 0.0002, "epoch": 2.5874326750448833, "step": 36030}, {"loss": 0.7383, "grad_norm": 0.9026121497154236, "learning_rate": 0.0002, "epoch": 2.5881508078994613, "step": 36040}, {"loss": 0.6221, "grad_norm": 0.7477090954780579, "learning_rate": 0.0002, "epoch": 2.5888689407540397, "step": 36050}, {"loss": 0.6852, "grad_norm": 0.8835780024528503, "learning_rate": 0.0002, "epoch": 2.5895870736086177, "step": 36060}, {"loss": 0.6786, "grad_norm": 0.7555899024009705, "learning_rate": 0.0002, "epoch": 2.5903052064631957, "step": 36070}, {"loss": 0.6723, "grad_norm": 0.7983574867248535, "learning_rate": 0.0002, "epoch": 2.5910233393177737, "step": 36080}, {"loss": 0.64, "grad_norm": 0.9261698722839355, "learning_rate": 0.0002, "epoch": 2.5917414721723517, "step": 36090}, {"loss": 0.6363, "grad_norm": 0.6834031343460083, "learning_rate": 0.0002, "epoch": 2.5924596050269297, "step": 36100}, {"loss": 0.702, "grad_norm": 0.9528526067733765, "learning_rate": 0.0002, "epoch": 2.593177737881508, "step": 36110}, {"loss": 0.7271, "grad_norm": 0.7469993233680725, "learning_rate": 0.0002, "epoch": 2.593895870736086, "step": 36120}, {"loss": 0.6967, "grad_norm": 0.6750355362892151, "learning_rate": 0.0002, "epoch": 2.594614003590664, "step": 36130}, {"loss": 0.6893, "grad_norm": 0.8591015338897705, "learning_rate": 0.0002, "epoch": 2.5953321364452426, "step": 36140}, {"loss": 0.7015, "grad_norm": 0.7359472513198853, "learning_rate": 0.0002, "epoch": 2.5960502692998206, "step": 36150}, {"loss": 0.6697, "grad_norm": 0.8450608253479004, "learning_rate": 0.0002, "epoch": 2.5967684021543986, "step": 36160}, {"loss": 0.7034, "grad_norm": 0.9069468975067139, "learning_rate": 0.0002, "epoch": 2.5974865350089766, "step": 36170}, {"loss": 0.6814, "grad_norm": 0.9261118173599243, "learning_rate": 0.0002, "epoch": 2.5982046678635546, "step": 36180}, {"loss": 0.6575, "grad_norm": 0.7164715528488159, "learning_rate": 0.0002, "epoch": 2.5989228007181326, "step": 36190}, {"loss": 0.7044, "grad_norm": 0.8809511661529541, "learning_rate": 0.0002, "epoch": 2.599640933572711, "step": 36200}, {"loss": 0.6333, "grad_norm": 0.9872701168060303, "learning_rate": 0.0002, "epoch": 2.600359066427289, "step": 36210}, {"loss": 0.689, "grad_norm": 0.7544043064117432, "learning_rate": 0.0002, "epoch": 2.601077199281867, "step": 36220}, {"loss": 0.658, "grad_norm": 0.9890767335891724, "learning_rate": 0.0002, "epoch": 2.601795332136445, "step": 36230}, {"loss": 0.6981, "grad_norm": 0.907865047454834, "learning_rate": 0.0002, "epoch": 2.6025134649910235, "step": 36240}, {"loss": 0.7131, "grad_norm": 0.7724096179008484, "learning_rate": 0.0002, "epoch": 2.6032315978456015, "step": 36250}, {"loss": 0.7034, "grad_norm": 0.7996655106544495, "learning_rate": 0.0002, "epoch": 2.6039497307001795, "step": 36260}, {"loss": 0.6744, "grad_norm": 0.7184412479400635, "learning_rate": 0.0002, "epoch": 2.6046678635547575, "step": 36270}, {"loss": 0.7133, "grad_norm": 0.7781601548194885, "learning_rate": 0.0002, "epoch": 2.6053859964093355, "step": 36280}, {"loss": 0.6975, "grad_norm": 0.8972102403640747, "learning_rate": 0.0002, "epoch": 2.6061041292639135, "step": 36290}, {"loss": 0.6757, "grad_norm": 0.6831884980201721, "learning_rate": 0.0002, "epoch": 2.606822262118492, "step": 36300}, {"loss": 0.6633, "grad_norm": 0.9049789905548096, "learning_rate": 0.0002, "epoch": 2.60754039497307, "step": 36310}, {"loss": 0.7048, "grad_norm": 0.8062970042228699, "learning_rate": 0.0002, "epoch": 2.608258527827648, "step": 36320}, {"loss": 0.6695, "grad_norm": 0.94797682762146, "learning_rate": 0.0002, "epoch": 2.6089766606822264, "step": 36330}, {"loss": 0.6934, "grad_norm": 0.7907559275627136, "learning_rate": 0.0002, "epoch": 2.6096947935368044, "step": 36340}, {"loss": 0.6299, "grad_norm": 0.6720156073570251, "learning_rate": 0.0002, "epoch": 2.6104129263913824, "step": 36350}, {"loss": 0.644, "grad_norm": 0.729228138923645, "learning_rate": 0.0002, "epoch": 2.6111310592459605, "step": 36360}, {"loss": 0.6651, "grad_norm": 0.9072836637496948, "learning_rate": 0.0002, "epoch": 2.6118491921005385, "step": 36370}, {"loss": 0.6821, "grad_norm": 0.8022173643112183, "learning_rate": 0.0002, "epoch": 2.6125673249551165, "step": 36380}, {"loss": 0.6587, "grad_norm": 0.7475612163543701, "learning_rate": 0.0002, "epoch": 2.613285457809695, "step": 36390}, {"loss": 0.6454, "grad_norm": 0.7976534366607666, "learning_rate": 0.0002, "epoch": 2.614003590664273, "step": 36400}, {"loss": 0.7173, "grad_norm": 0.7118260860443115, "learning_rate": 0.0002, "epoch": 2.614721723518851, "step": 36410}, {"loss": 0.7173, "grad_norm": 0.666500985622406, "learning_rate": 0.0002, "epoch": 2.6154398563734294, "step": 36420}, {"loss": 0.719, "grad_norm": 0.8776089549064636, "learning_rate": 0.0002, "epoch": 2.6161579892280074, "step": 36430}, {"loss": 0.6928, "grad_norm": 0.9375919699668884, "learning_rate": 0.0002, "epoch": 2.6168761220825854, "step": 36440}, {"loss": 0.6627, "grad_norm": 0.8162244558334351, "learning_rate": 0.0002, "epoch": 2.6175942549371634, "step": 36450}, {"loss": 0.6586, "grad_norm": 0.8459304571151733, "learning_rate": 0.0002, "epoch": 2.6183123877917414, "step": 36460}, {"loss": 0.6777, "grad_norm": 0.7731037735939026, "learning_rate": 0.0002, "epoch": 2.6190305206463194, "step": 36470}, {"loss": 0.7288, "grad_norm": 0.7857680320739746, "learning_rate": 0.0002, "epoch": 2.619748653500898, "step": 36480}, {"loss": 0.664, "grad_norm": 0.8415161371231079, "learning_rate": 0.0002, "epoch": 2.620466786355476, "step": 36490}, {"loss": 0.703, "grad_norm": 0.8103558421134949, "learning_rate": 0.0002, "epoch": 2.621184919210054, "step": 36500}, {"loss": 0.6693, "grad_norm": 0.7876150608062744, "learning_rate": 0.0002, "epoch": 2.621903052064632, "step": 36510}, {"loss": 0.6562, "grad_norm": 0.7316484451293945, "learning_rate": 0.0002, "epoch": 2.6226211849192103, "step": 36520}, {"loss": 0.6263, "grad_norm": 0.7209784984588623, "learning_rate": 0.0002, "epoch": 2.6233393177737883, "step": 36530}, {"loss": 0.6767, "grad_norm": 0.8933016657829285, "learning_rate": 0.0002, "epoch": 2.6240574506283663, "step": 36540}, {"loss": 0.7217, "grad_norm": 0.8078171610832214, "learning_rate": 0.0002, "epoch": 2.6247755834829443, "step": 36550}, {"loss": 0.7106, "grad_norm": 0.9134724736213684, "learning_rate": 0.0002, "epoch": 2.6254937163375223, "step": 36560}, {"loss": 0.6909, "grad_norm": 0.8691368699073792, "learning_rate": 0.0002, "epoch": 2.6262118491921003, "step": 36570}, {"loss": 0.6769, "grad_norm": 0.706479012966156, "learning_rate": 0.0002, "epoch": 2.6269299820466787, "step": 36580}, {"loss": 0.6864, "grad_norm": 0.9333644509315491, "learning_rate": 0.0002, "epoch": 2.6276481149012567, "step": 36590}, {"loss": 0.6704, "grad_norm": 0.8156154155731201, "learning_rate": 0.0002, "epoch": 2.6283662477558347, "step": 36600}, {"loss": 0.7128, "grad_norm": 0.812745213508606, "learning_rate": 0.0002, "epoch": 2.629084380610413, "step": 36610}, {"loss": 0.6901, "grad_norm": 0.8898148536682129, "learning_rate": 0.0002, "epoch": 2.629802513464991, "step": 36620}, {"loss": 0.6821, "grad_norm": 0.8083946108818054, "learning_rate": 0.0002, "epoch": 2.630520646319569, "step": 36630}, {"loss": 0.7285, "grad_norm": 0.7050122618675232, "learning_rate": 0.0002, "epoch": 2.631238779174147, "step": 36640}, {"loss": 0.6751, "grad_norm": 0.8155789971351624, "learning_rate": 0.0002, "epoch": 2.631956912028725, "step": 36650}, {"loss": 0.7258, "grad_norm": 0.9102175235748291, "learning_rate": 0.0002, "epoch": 2.632675044883303, "step": 36660}, {"loss": 0.6697, "grad_norm": 0.6621248126029968, "learning_rate": 0.0002, "epoch": 2.6333931777378816, "step": 36670}, {"loss": 0.6405, "grad_norm": 0.7338519096374512, "learning_rate": 0.0002, "epoch": 2.6341113105924596, "step": 36680}, {"loss": 0.6784, "grad_norm": 0.7536506652832031, "learning_rate": 0.0002, "epoch": 2.6348294434470376, "step": 36690}, {"loss": 0.6974, "grad_norm": 0.9357436299324036, "learning_rate": 0.0002, "epoch": 2.635547576301616, "step": 36700}, {"loss": 0.7729, "grad_norm": 0.7732111215591431, "learning_rate": 0.0002, "epoch": 2.636265709156194, "step": 36710}, {"loss": 0.6905, "grad_norm": 0.6863537430763245, "learning_rate": 0.0002, "epoch": 2.636983842010772, "step": 36720}, {"loss": 0.7058, "grad_norm": 0.8014764785766602, "learning_rate": 0.0002, "epoch": 2.63770197486535, "step": 36730}, {"loss": 0.697, "grad_norm": 0.8103911280632019, "learning_rate": 0.0002, "epoch": 2.638420107719928, "step": 36740}, {"loss": 0.7164, "grad_norm": 0.882652997970581, "learning_rate": 0.0002, "epoch": 2.639138240574506, "step": 36750}, {"loss": 0.6689, "grad_norm": 0.8705278038978577, "learning_rate": 0.0002, "epoch": 2.6398563734290845, "step": 36760}, {"loss": 0.6863, "grad_norm": 0.80764240026474, "learning_rate": 0.0002, "epoch": 2.6405745062836625, "step": 36770}, {"loss": 0.6761, "grad_norm": 0.9668620824813843, "learning_rate": 0.0002, "epoch": 2.6412926391382405, "step": 36780}, {"loss": 0.6576, "grad_norm": 0.7477577328681946, "learning_rate": 0.0002, "epoch": 2.6420107719928185, "step": 36790}, {"loss": 0.6558, "grad_norm": 0.8344516754150391, "learning_rate": 0.0002, "epoch": 2.642728904847397, "step": 36800}, {"loss": 0.6949, "grad_norm": 0.9520720839500427, "learning_rate": 0.0002, "epoch": 2.643447037701975, "step": 36810}, {"loss": 0.6731, "grad_norm": 0.5942372679710388, "learning_rate": 0.0002, "epoch": 2.644165170556553, "step": 36820}, {"loss": 0.6509, "grad_norm": 0.7411555051803589, "learning_rate": 0.0002, "epoch": 2.644883303411131, "step": 36830}, {"loss": 0.6948, "grad_norm": 0.6597771048545837, "learning_rate": 0.0002, "epoch": 2.645601436265709, "step": 36840}, {"loss": 0.6379, "grad_norm": 0.8636548519134521, "learning_rate": 0.0002, "epoch": 2.646319569120287, "step": 36850}, {"loss": 0.6965, "grad_norm": 0.8557497262954712, "learning_rate": 0.0002, "epoch": 2.6470377019748654, "step": 36860}, {"loss": 0.7061, "grad_norm": 0.8535996675491333, "learning_rate": 0.0002, "epoch": 2.6477558348294434, "step": 36870}, {"loss": 0.7087, "grad_norm": 0.7996463775634766, "learning_rate": 0.0002, "epoch": 2.6484739676840214, "step": 36880}, {"loss": 0.7174, "grad_norm": 0.6462067365646362, "learning_rate": 0.0002, "epoch": 2.6491921005386, "step": 36890}, {"loss": 0.6905, "grad_norm": 0.8849772214889526, "learning_rate": 0.0002, "epoch": 2.649910233393178, "step": 36900}, {"loss": 0.6973, "grad_norm": 0.999173641204834, "learning_rate": 0.0002, "epoch": 2.650628366247756, "step": 36910}, {"loss": 0.628, "grad_norm": 0.7221724987030029, "learning_rate": 0.0002, "epoch": 2.651346499102334, "step": 36920}, {"loss": 0.6698, "grad_norm": 0.8122989535331726, "learning_rate": 0.0002, "epoch": 2.652064631956912, "step": 36930}, {"loss": 0.6758, "grad_norm": 0.724267840385437, "learning_rate": 0.0002, "epoch": 2.65278276481149, "step": 36940}, {"loss": 0.6542, "grad_norm": 0.8250583410263062, "learning_rate": 0.0002, "epoch": 2.6535008976660683, "step": 36950}, {"loss": 0.6914, "grad_norm": 0.7623526453971863, "learning_rate": 0.0002, "epoch": 2.6542190305206463, "step": 36960}, {"loss": 0.6699, "grad_norm": 0.6474025845527649, "learning_rate": 0.0002, "epoch": 2.6549371633752243, "step": 36970}, {"loss": 0.7235, "grad_norm": 0.9751694202423096, "learning_rate": 0.0002, "epoch": 2.655655296229803, "step": 36980}, {"loss": 0.7423, "grad_norm": 0.8338939547538757, "learning_rate": 0.0002, "epoch": 2.656373429084381, "step": 36990}, {"loss": 0.6641, "grad_norm": 0.8877421021461487, "learning_rate": 0.0002, "epoch": 2.657091561938959, "step": 37000}, {"loss": 0.6639, "grad_norm": 0.9590298533439636, "learning_rate": 0.0002, "epoch": 2.657809694793537, "step": 37010}, {"loss": 0.6617, "grad_norm": 0.8224121928215027, "learning_rate": 0.0002, "epoch": 2.658527827648115, "step": 37020}, {"loss": 0.6359, "grad_norm": 0.9871236681938171, "learning_rate": 0.0002, "epoch": 2.659245960502693, "step": 37030}, {"loss": 0.65, "grad_norm": 0.8729037046432495, "learning_rate": 0.0002, "epoch": 2.6599640933572712, "step": 37040}, {"loss": 0.6561, "grad_norm": 0.6279319524765015, "learning_rate": 0.0002, "epoch": 2.6606822262118492, "step": 37050}, {"loss": 0.7031, "grad_norm": 1.0278962850570679, "learning_rate": 0.0002, "epoch": 2.6614003590664272, "step": 37060}, {"loss": 0.6552, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 2.6621184919210052, "step": 37070}, {"loss": 0.6994, "grad_norm": 0.7432018518447876, "learning_rate": 0.0002, "epoch": 2.6628366247755837, "step": 37080}, {"loss": 0.7086, "grad_norm": 0.9425008296966553, "learning_rate": 0.0002, "epoch": 2.6635547576301617, "step": 37090}, {"loss": 0.716, "grad_norm": 0.7542579174041748, "learning_rate": 0.0002, "epoch": 2.6642728904847397, "step": 37100}, {"loss": 0.6714, "grad_norm": 0.8469315767288208, "learning_rate": 0.0002, "epoch": 2.6649910233393177, "step": 37110}, {"loss": 0.6638, "grad_norm": 0.865777313709259, "learning_rate": 0.0002, "epoch": 2.6657091561938957, "step": 37120}, {"loss": 0.741, "grad_norm": 0.7293250560760498, "learning_rate": 0.0002, "epoch": 2.6664272890484737, "step": 37130}, {"loss": 0.6662, "grad_norm": 0.7199395895004272, "learning_rate": 0.0002, "epoch": 2.667145421903052, "step": 37140}, {"loss": 0.7078, "grad_norm": 0.7801268100738525, "learning_rate": 0.0002, "epoch": 2.66786355475763, "step": 37150}, {"loss": 0.7083, "grad_norm": 0.8706921935081482, "learning_rate": 0.0002, "epoch": 2.668581687612208, "step": 37160}, {"loss": 0.69, "grad_norm": 0.7124722599983215, "learning_rate": 0.0002, "epoch": 2.6692998204667866, "step": 37170}, {"loss": 0.625, "grad_norm": 0.8333015441894531, "learning_rate": 0.0002, "epoch": 2.6700179533213646, "step": 37180}, {"loss": 0.636, "grad_norm": 0.8822736740112305, "learning_rate": 0.0002, "epoch": 2.6707360861759426, "step": 37190}, {"loss": 0.6731, "grad_norm": 0.8300906419754028, "learning_rate": 0.0002, "epoch": 2.6714542190305206, "step": 37200}, {"loss": 0.6883, "grad_norm": 0.887126088142395, "learning_rate": 0.0002, "epoch": 2.6721723518850986, "step": 37210}, {"loss": 0.7211, "grad_norm": 0.7473671436309814, "learning_rate": 0.0002, "epoch": 2.6728904847396766, "step": 37220}, {"loss": 0.7032, "grad_norm": 0.8121018409729004, "learning_rate": 0.0002, "epoch": 2.673608617594255, "step": 37230}, {"loss": 0.6262, "grad_norm": 0.7882586717605591, "learning_rate": 0.0002, "epoch": 2.674326750448833, "step": 37240}, {"loss": 0.7201, "grad_norm": 0.797060489654541, "learning_rate": 0.0002, "epoch": 2.675044883303411, "step": 37250}, {"loss": 0.6635, "grad_norm": 0.9776935577392578, "learning_rate": 0.0002, "epoch": 2.6757630161579895, "step": 37260}, {"loss": 0.6883, "grad_norm": 0.9527283906936646, "learning_rate": 0.0002, "epoch": 2.6764811490125675, "step": 37270}, {"loss": 0.6968, "grad_norm": 0.7232038974761963, "learning_rate": 0.0002, "epoch": 2.6771992818671455, "step": 37280}, {"loss": 0.6544, "grad_norm": 0.8514575362205505, "learning_rate": 0.0002, "epoch": 2.6779174147217235, "step": 37290}, {"loss": 0.6956, "grad_norm": 0.8951214551925659, "learning_rate": 0.0002, "epoch": 2.6786355475763015, "step": 37300}, {"loss": 0.7435, "grad_norm": 0.7569643259048462, "learning_rate": 0.0002, "epoch": 2.6793536804308795, "step": 37310}, {"loss": 0.6522, "grad_norm": 1.0522346496582031, "learning_rate": 0.0002, "epoch": 2.680071813285458, "step": 37320}, {"loss": 0.7051, "grad_norm": 0.8914180994033813, "learning_rate": 0.0002, "epoch": 2.680789946140036, "step": 37330}, {"loss": 0.6941, "grad_norm": 0.8251807689666748, "learning_rate": 0.0002, "epoch": 2.681508078994614, "step": 37340}, {"loss": 0.6783, "grad_norm": 0.8215394020080566, "learning_rate": 0.0002, "epoch": 2.682226211849192, "step": 37350}, {"loss": 0.682, "grad_norm": 0.8043696880340576, "learning_rate": 0.0002, "epoch": 2.6829443447037704, "step": 37360}, {"loss": 0.6614, "grad_norm": 0.767250657081604, "learning_rate": 0.0002, "epoch": 2.6836624775583484, "step": 37370}, {"loss": 0.7197, "grad_norm": 0.817740261554718, "learning_rate": 0.0002, "epoch": 2.6843806104129264, "step": 37380}, {"loss": 0.6839, "grad_norm": 0.7963255047798157, "learning_rate": 0.0002, "epoch": 2.6850987432675044, "step": 37390}, {"loss": 0.7469, "grad_norm": 0.839271605014801, "learning_rate": 0.0002, "epoch": 2.6858168761220824, "step": 37400}, {"loss": 0.6879, "grad_norm": 0.7882823348045349, "learning_rate": 0.0002, "epoch": 2.6865350089766604, "step": 37410}, {"loss": 0.6768, "grad_norm": 0.8316412568092346, "learning_rate": 0.0002, "epoch": 2.687253141831239, "step": 37420}, {"loss": 0.7031, "grad_norm": 1.0044993162155151, "learning_rate": 0.0002, "epoch": 2.687971274685817, "step": 37430}, {"loss": 0.6988, "grad_norm": 0.8342832326889038, "learning_rate": 0.0002, "epoch": 2.688689407540395, "step": 37440}, {"loss": 0.6685, "grad_norm": 0.6743215322494507, "learning_rate": 0.0002, "epoch": 2.6894075403949733, "step": 37450}, {"loss": 0.6567, "grad_norm": 0.6872923970222473, "learning_rate": 0.0002, "epoch": 2.6901256732495513, "step": 37460}, {"loss": 0.7089, "grad_norm": 0.7377792596817017, "learning_rate": 0.0002, "epoch": 2.6908438061041293, "step": 37470}, {"loss": 0.676, "grad_norm": 0.7677304744720459, "learning_rate": 0.0002, "epoch": 2.6915619389587073, "step": 37480}, {"loss": 0.6693, "grad_norm": 0.9951061010360718, "learning_rate": 0.0002, "epoch": 2.6922800718132853, "step": 37490}, {"loss": 0.6517, "grad_norm": 0.7452111840248108, "learning_rate": 0.0002, "epoch": 2.6929982046678633, "step": 37500}, {"loss": 0.7503, "grad_norm": 0.9663393497467041, "learning_rate": 0.0002, "epoch": 2.6937163375224418, "step": 37510}, {"loss": 0.7025, "grad_norm": 0.7919635772705078, "learning_rate": 0.0002, "epoch": 2.6944344703770198, "step": 37520}, {"loss": 0.7257, "grad_norm": 0.9977981448173523, "learning_rate": 0.0002, "epoch": 2.6951526032315978, "step": 37530}, {"loss": 0.6507, "grad_norm": 0.7279480695724487, "learning_rate": 0.0002, "epoch": 2.695870736086176, "step": 37540}, {"loss": 0.7448, "grad_norm": 0.7218075394630432, "learning_rate": 0.0002, "epoch": 2.6965888689407542, "step": 37550}, {"loss": 0.6845, "grad_norm": 0.9041047096252441, "learning_rate": 0.0002, "epoch": 2.6973070017953322, "step": 37560}, {"loss": 0.6848, "grad_norm": 0.7689407467842102, "learning_rate": 0.0002, "epoch": 2.6980251346499102, "step": 37570}, {"loss": 0.7136, "grad_norm": 0.8184728622436523, "learning_rate": 0.0002, "epoch": 2.6987432675044882, "step": 37580}, {"loss": 0.6952, "grad_norm": 0.7536661624908447, "learning_rate": 0.0002, "epoch": 2.6994614003590662, "step": 37590}, {"loss": 0.7064, "grad_norm": 0.8371431231498718, "learning_rate": 0.0002, "epoch": 2.7001795332136447, "step": 37600}, {"loss": 0.7118, "grad_norm": 0.8562723994255066, "learning_rate": 0.0002, "epoch": 2.7008976660682227, "step": 37610}, {"loss": 0.6602, "grad_norm": 0.8227898478507996, "learning_rate": 0.0002, "epoch": 2.7016157989228007, "step": 37620}, {"loss": 0.7324, "grad_norm": 0.764792799949646, "learning_rate": 0.0002, "epoch": 2.7023339317773787, "step": 37630}, {"loss": 0.7289, "grad_norm": 0.7782649993896484, "learning_rate": 0.0002, "epoch": 2.703052064631957, "step": 37640}, {"loss": 0.705, "grad_norm": 0.7669944167137146, "learning_rate": 0.0002, "epoch": 2.703770197486535, "step": 37650}, {"loss": 0.7019, "grad_norm": 0.7945750951766968, "learning_rate": 0.0002, "epoch": 2.704488330341113, "step": 37660}, {"loss": 0.6789, "grad_norm": 0.6840786337852478, "learning_rate": 0.0002, "epoch": 2.705206463195691, "step": 37670}, {"loss": 0.768, "grad_norm": 1.0565117597579956, "learning_rate": 0.0002, "epoch": 2.705924596050269, "step": 37680}, {"loss": 0.737, "grad_norm": 0.7407042384147644, "learning_rate": 0.0002, "epoch": 2.706642728904847, "step": 37690}, {"loss": 0.712, "grad_norm": 0.7862113118171692, "learning_rate": 0.0002, "epoch": 2.7073608617594256, "step": 37700}, {"loss": 0.6331, "grad_norm": 0.7487596273422241, "learning_rate": 0.0002, "epoch": 2.7080789946140036, "step": 37710}, {"loss": 0.6917, "grad_norm": 0.9416596293449402, "learning_rate": 0.0002, "epoch": 2.7087971274685816, "step": 37720}, {"loss": 0.717, "grad_norm": 0.8943207263946533, "learning_rate": 0.0002, "epoch": 2.70951526032316, "step": 37730}, {"loss": 0.6505, "grad_norm": 0.9263445138931274, "learning_rate": 0.0002, "epoch": 2.710233393177738, "step": 37740}, {"loss": 0.7423, "grad_norm": 0.6869737505912781, "learning_rate": 0.0002, "epoch": 2.710951526032316, "step": 37750}, {"loss": 0.724, "grad_norm": 0.9186407923698425, "learning_rate": 0.0002, "epoch": 2.711669658886894, "step": 37760}, {"loss": 0.6757, "grad_norm": 0.8379335999488831, "learning_rate": 0.0002, "epoch": 2.712387791741472, "step": 37770}, {"loss": 0.7352, "grad_norm": 0.7248736023902893, "learning_rate": 0.0002, "epoch": 2.71310592459605, "step": 37780}, {"loss": 0.7023, "grad_norm": 0.8636229038238525, "learning_rate": 0.0002, "epoch": 2.7138240574506285, "step": 37790}, {"loss": 0.726, "grad_norm": 0.7590767741203308, "learning_rate": 0.0002, "epoch": 2.7145421903052065, "step": 37800}, {"loss": 0.6837, "grad_norm": 0.8946404457092285, "learning_rate": 0.0002, "epoch": 2.7152603231597845, "step": 37810}, {"loss": 0.7135, "grad_norm": 0.7822132706642151, "learning_rate": 0.0002, "epoch": 2.7159784560143625, "step": 37820}, {"loss": 0.7034, "grad_norm": 0.7882820963859558, "learning_rate": 0.0002, "epoch": 2.716696588868941, "step": 37830}, {"loss": 0.6667, "grad_norm": 0.8025872707366943, "learning_rate": 0.0002, "epoch": 2.717414721723519, "step": 37840}, {"loss": 0.6967, "grad_norm": 0.8618839979171753, "learning_rate": 0.0002, "epoch": 2.718132854578097, "step": 37850}, {"loss": 0.699, "grad_norm": 0.6975733637809753, "learning_rate": 0.0002, "epoch": 2.718850987432675, "step": 37860}, {"loss": 0.6858, "grad_norm": 0.7952182292938232, "learning_rate": 0.0002, "epoch": 2.719569120287253, "step": 37870}, {"loss": 0.7018, "grad_norm": 0.7580680251121521, "learning_rate": 0.0002, "epoch": 2.7202872531418314, "step": 37880}, {"loss": 0.6838, "grad_norm": 0.9504257440567017, "learning_rate": 0.0002, "epoch": 2.7210053859964094, "step": 37890}, {"loss": 0.6801, "grad_norm": 0.856614351272583, "learning_rate": 0.0002, "epoch": 2.7217235188509874, "step": 37900}, {"loss": 0.6647, "grad_norm": 1.0092085599899292, "learning_rate": 0.0002, "epoch": 2.7224416517055654, "step": 37910}, {"loss": 0.6709, "grad_norm": 0.9009839296340942, "learning_rate": 0.0002, "epoch": 2.723159784560144, "step": 37920}, {"loss": 0.7009, "grad_norm": 0.9247435331344604, "learning_rate": 0.0002, "epoch": 2.723877917414722, "step": 37930}, {"loss": 0.6924, "grad_norm": 1.0774317979812622, "learning_rate": 0.0002, "epoch": 2.7245960502693, "step": 37940}, {"loss": 0.6706, "grad_norm": 0.9104372262954712, "learning_rate": 0.0002, "epoch": 2.725314183123878, "step": 37950}, {"loss": 0.6608, "grad_norm": 0.7904245257377625, "learning_rate": 0.0002, "epoch": 2.726032315978456, "step": 37960}, {"loss": 0.6937, "grad_norm": 0.9555521607398987, "learning_rate": 0.0002, "epoch": 2.726750448833034, "step": 37970}, {"loss": 0.6497, "grad_norm": 0.7769099473953247, "learning_rate": 0.0002, "epoch": 2.7274685816876123, "step": 37980}, {"loss": 0.63, "grad_norm": 0.9202065467834473, "learning_rate": 0.0002, "epoch": 2.7281867145421903, "step": 37990}, {"loss": 0.7021, "grad_norm": 0.732510507106781, "learning_rate": 0.0002, "epoch": 2.7289048473967683, "step": 38000}, {"loss": 0.6665, "grad_norm": 0.7723771929740906, "learning_rate": 0.0002, "epoch": 2.7296229802513468, "step": 38010}, {"loss": 0.6836, "grad_norm": 0.7948567867279053, "learning_rate": 0.0002, "epoch": 2.7303411131059248, "step": 38020}, {"loss": 0.6802, "grad_norm": 0.7702966928482056, "learning_rate": 0.0002, "epoch": 2.7310592459605028, "step": 38030}, {"loss": 0.6859, "grad_norm": 0.689098060131073, "learning_rate": 0.0002, "epoch": 2.7317773788150808, "step": 38040}, {"loss": 0.7027, "grad_norm": 0.7951080203056335, "learning_rate": 0.0002, "epoch": 2.7324955116696588, "step": 38050}, {"loss": 0.6895, "grad_norm": 0.7284924983978271, "learning_rate": 0.0002, "epoch": 2.7332136445242368, "step": 38060}, {"loss": 0.7409, "grad_norm": 0.9198044538497925, "learning_rate": 0.0002, "epoch": 2.733931777378815, "step": 38070}, {"loss": 0.6699, "grad_norm": 0.8653260469436646, "learning_rate": 0.0002, "epoch": 2.734649910233393, "step": 38080}, {"loss": 0.6832, "grad_norm": 0.8503400683403015, "learning_rate": 0.0002, "epoch": 2.735368043087971, "step": 38090}, {"loss": 0.6955, "grad_norm": 0.8388783931732178, "learning_rate": 0.0002, "epoch": 2.736086175942549, "step": 38100}, {"loss": 0.7059, "grad_norm": 0.7636904716491699, "learning_rate": 0.0002, "epoch": 2.7368043087971277, "step": 38110}, {"loss": 0.6659, "grad_norm": 0.8990790247917175, "learning_rate": 0.0002, "epoch": 2.7375224416517057, "step": 38120}, {"loss": 0.6487, "grad_norm": 0.8878970742225647, "learning_rate": 0.0002, "epoch": 2.7382405745062837, "step": 38130}, {"loss": 0.6725, "grad_norm": 0.7684310078620911, "learning_rate": 0.0002, "epoch": 2.7389587073608617, "step": 38140}, {"loss": 0.6935, "grad_norm": 1.0777359008789062, "learning_rate": 0.0002, "epoch": 2.7396768402154397, "step": 38150}, {"loss": 0.6904, "grad_norm": 0.768764317035675, "learning_rate": 0.0002, "epoch": 2.740394973070018, "step": 38160}, {"loss": 0.6509, "grad_norm": 0.7490760087966919, "learning_rate": 0.0002, "epoch": 2.741113105924596, "step": 38170}, {"loss": 0.6907, "grad_norm": 0.860373854637146, "learning_rate": 0.0002, "epoch": 2.741831238779174, "step": 38180}, {"loss": 0.6704, "grad_norm": 0.7145599722862244, "learning_rate": 0.0002, "epoch": 2.742549371633752, "step": 38190}, {"loss": 0.6798, "grad_norm": 0.8347760438919067, "learning_rate": 0.0002, "epoch": 2.7432675044883306, "step": 38200}, {"loss": 0.7029, "grad_norm": 0.8425729274749756, "learning_rate": 0.0002, "epoch": 2.7439856373429086, "step": 38210}, {"loss": 0.6442, "grad_norm": 0.9289436936378479, "learning_rate": 0.0002, "epoch": 2.7447037701974866, "step": 38220}, {"loss": 0.694, "grad_norm": 0.7608675360679626, "learning_rate": 0.0002, "epoch": 2.7454219030520646, "step": 38230}, {"loss": 0.7097, "grad_norm": 0.8067167401313782, "learning_rate": 0.0002, "epoch": 2.7461400359066426, "step": 38240}, {"loss": 0.704, "grad_norm": 0.8599629402160645, "learning_rate": 0.0002, "epoch": 2.7468581687612206, "step": 38250}, {"loss": 0.6259, "grad_norm": 0.8425742387771606, "learning_rate": 0.0002, "epoch": 2.747576301615799, "step": 38260}, {"loss": 0.6875, "grad_norm": 0.8626754283905029, "learning_rate": 0.0002, "epoch": 2.748294434470377, "step": 38270}, {"loss": 0.7357, "grad_norm": 0.797652006149292, "learning_rate": 0.0002, "epoch": 2.749012567324955, "step": 38280}, {"loss": 0.7184, "grad_norm": 0.7971500754356384, "learning_rate": 0.0002, "epoch": 2.7497307001795335, "step": 38290}, {"loss": 0.7035, "grad_norm": 0.9786333441734314, "learning_rate": 0.0002, "epoch": 2.7504488330341115, "step": 38300}, {"loss": 0.6501, "grad_norm": 0.7146100997924805, "learning_rate": 0.0002, "epoch": 2.7511669658886895, "step": 38310}, {"loss": 0.7087, "grad_norm": 0.8436099886894226, "learning_rate": 0.0002, "epoch": 2.7518850987432675, "step": 38320}, {"loss": 0.6911, "grad_norm": 0.8943847417831421, "learning_rate": 0.0002, "epoch": 2.7526032315978455, "step": 38330}, {"loss": 0.6397, "grad_norm": 0.8170148730278015, "learning_rate": 0.0002, "epoch": 2.7533213644524235, "step": 38340}, {"loss": 0.6756, "grad_norm": 0.7804728746414185, "learning_rate": 0.0002, "epoch": 2.754039497307002, "step": 38350}, {"loss": 0.6954, "grad_norm": 0.9139971137046814, "learning_rate": 0.0002, "epoch": 2.75475763016158, "step": 38360}, {"loss": 0.7083, "grad_norm": 0.835332453250885, "learning_rate": 0.0002, "epoch": 2.755475763016158, "step": 38370}, {"loss": 0.7112, "grad_norm": 1.0904794931411743, "learning_rate": 0.0002, "epoch": 2.756193895870736, "step": 38380}, {"loss": 0.6881, "grad_norm": 0.7443365454673767, "learning_rate": 0.0002, "epoch": 2.7569120287253144, "step": 38390}, {"loss": 0.6896, "grad_norm": 1.1336839199066162, "learning_rate": 0.0002, "epoch": 2.7576301615798924, "step": 38400}, {"loss": 0.6777, "grad_norm": 0.9024015665054321, "learning_rate": 0.0002, "epoch": 2.7583482944344704, "step": 38410}, {"loss": 0.629, "grad_norm": 0.7380578517913818, "learning_rate": 0.0002, "epoch": 2.7590664272890484, "step": 38420}, {"loss": 0.7708, "grad_norm": 0.9860634207725525, "learning_rate": 0.0002, "epoch": 2.7597845601436264, "step": 38430}, {"loss": 0.6694, "grad_norm": 0.7928970456123352, "learning_rate": 0.0002, "epoch": 2.760502692998205, "step": 38440}, {"loss": 0.669, "grad_norm": 1.0357221364974976, "learning_rate": 0.0002, "epoch": 2.761220825852783, "step": 38450}, {"loss": 0.6763, "grad_norm": 0.8110901117324829, "learning_rate": 0.0002, "epoch": 2.761938958707361, "step": 38460}, {"loss": 0.6528, "grad_norm": 0.8420981764793396, "learning_rate": 0.0002, "epoch": 2.762657091561939, "step": 38470}, {"loss": 0.6841, "grad_norm": 0.858955979347229, "learning_rate": 0.0002, "epoch": 2.7633752244165173, "step": 38480}, {"loss": 0.7387, "grad_norm": 0.9851368069648743, "learning_rate": 0.0002, "epoch": 2.7640933572710953, "step": 38490}, {"loss": 0.6939, "grad_norm": 0.8073325753211975, "learning_rate": 0.0002, "epoch": 2.7648114901256733, "step": 38500}, {"loss": 0.7033, "grad_norm": 1.0654062032699585, "learning_rate": 0.0002, "epoch": 2.7655296229802513, "step": 38510}, {"loss": 0.692, "grad_norm": 0.719603955745697, "learning_rate": 0.0002, "epoch": 2.7662477558348293, "step": 38520}, {"loss": 0.7032, "grad_norm": 0.9790831804275513, "learning_rate": 0.0002, "epoch": 2.7669658886894073, "step": 38530}, {"loss": 0.6613, "grad_norm": 0.907619833946228, "learning_rate": 0.0002, "epoch": 2.7676840215439857, "step": 38540}, {"loss": 0.6683, "grad_norm": 0.7463719248771667, "learning_rate": 0.0002, "epoch": 2.7684021543985637, "step": 38550}, {"loss": 0.6785, "grad_norm": 1.0687178373336792, "learning_rate": 0.0002, "epoch": 2.7691202872531417, "step": 38560}, {"loss": 0.6901, "grad_norm": 0.7397776246070862, "learning_rate": 0.0002, "epoch": 2.76983842010772, "step": 38570}, {"loss": 0.6861, "grad_norm": 0.7392559051513672, "learning_rate": 0.0002, "epoch": 2.770556552962298, "step": 38580}, {"loss": 0.6954, "grad_norm": 0.9774793982505798, "learning_rate": 0.0002, "epoch": 2.771274685816876, "step": 38590}, {"loss": 0.6641, "grad_norm": 0.9502208828926086, "learning_rate": 0.0002, "epoch": 2.771992818671454, "step": 38600}, {"loss": 0.6908, "grad_norm": 0.776108980178833, "learning_rate": 0.0002, "epoch": 2.772710951526032, "step": 38610}, {"loss": 0.6826, "grad_norm": 0.7633077502250671, "learning_rate": 0.0002, "epoch": 2.77342908438061, "step": 38620}, {"loss": 0.6559, "grad_norm": 0.9445580244064331, "learning_rate": 0.0002, "epoch": 2.7741472172351886, "step": 38630}, {"loss": 0.7085, "grad_norm": 0.943165123462677, "learning_rate": 0.0002, "epoch": 2.7748653500897666, "step": 38640}, {"loss": 0.6739, "grad_norm": 0.9045929908752441, "learning_rate": 0.0002, "epoch": 2.7755834829443446, "step": 38650}, {"loss": 0.7351, "grad_norm": 0.9425684213638306, "learning_rate": 0.0002, "epoch": 2.7763016157989227, "step": 38660}, {"loss": 0.6602, "grad_norm": 0.9106295704841614, "learning_rate": 0.0002, "epoch": 2.777019748653501, "step": 38670}, {"loss": 0.7076, "grad_norm": 0.6264749765396118, "learning_rate": 0.0002, "epoch": 2.777737881508079, "step": 38680}, {"loss": 0.7234, "grad_norm": 0.9156801700592041, "learning_rate": 0.0002, "epoch": 2.778456014362657, "step": 38690}, {"loss": 0.6804, "grad_norm": 0.9752956032752991, "learning_rate": 0.0002, "epoch": 2.779174147217235, "step": 38700}, {"loss": 0.686, "grad_norm": 0.7849555611610413, "learning_rate": 0.0002, "epoch": 2.779892280071813, "step": 38710}, {"loss": 0.72, "grad_norm": 0.8109981417655945, "learning_rate": 0.0002, "epoch": 2.780610412926391, "step": 38720}, {"loss": 0.6592, "grad_norm": 0.7882387638092041, "learning_rate": 0.0002, "epoch": 2.7813285457809696, "step": 38730}, {"loss": 0.6948, "grad_norm": 0.9049678444862366, "learning_rate": 0.0002, "epoch": 2.7820466786355476, "step": 38740}, {"loss": 0.7032, "grad_norm": 0.7678212523460388, "learning_rate": 0.0002, "epoch": 2.7827648114901256, "step": 38750}, {"loss": 0.6882, "grad_norm": 0.9754453301429749, "learning_rate": 0.0002, "epoch": 2.783482944344704, "step": 38760}, {"loss": 0.7071, "grad_norm": 0.7643493413925171, "learning_rate": 0.0002, "epoch": 2.784201077199282, "step": 38770}, {"loss": 0.6817, "grad_norm": 0.7440303564071655, "learning_rate": 0.0002, "epoch": 2.78491921005386, "step": 38780}, {"loss": 0.6869, "grad_norm": 0.8870946168899536, "learning_rate": 0.0002, "epoch": 2.785637342908438, "step": 38790}, {"loss": 0.7391, "grad_norm": 0.8100579977035522, "learning_rate": 0.0002, "epoch": 2.786355475763016, "step": 38800}, {"loss": 0.7003, "grad_norm": 0.7082616090774536, "learning_rate": 0.0002, "epoch": 2.787073608617594, "step": 38810}, {"loss": 0.697, "grad_norm": 0.7880047559738159, "learning_rate": 0.0002, "epoch": 2.7877917414721725, "step": 38820}, {"loss": 0.6635, "grad_norm": 0.7217963337898254, "learning_rate": 0.0002, "epoch": 2.7885098743267505, "step": 38830}, {"loss": 0.696, "grad_norm": 0.799124002456665, "learning_rate": 0.0002, "epoch": 2.7892280071813285, "step": 38840}, {"loss": 0.7267, "grad_norm": 1.0004022121429443, "learning_rate": 0.0002, "epoch": 2.789946140035907, "step": 38850}, {"loss": 0.6325, "grad_norm": 0.7866547107696533, "learning_rate": 0.0002, "epoch": 2.790664272890485, "step": 38860}, {"loss": 0.6573, "grad_norm": 0.891603410243988, "learning_rate": 0.0002, "epoch": 2.791382405745063, "step": 38870}, {"loss": 0.6949, "grad_norm": 0.7687129378318787, "learning_rate": 0.0002, "epoch": 2.792100538599641, "step": 38880}, {"loss": 0.6753, "grad_norm": 0.7549769282341003, "learning_rate": 0.0002, "epoch": 2.792818671454219, "step": 38890}, {"loss": 0.7103, "grad_norm": 0.7792351245880127, "learning_rate": 0.0002, "epoch": 2.793536804308797, "step": 38900}, {"loss": 0.671, "grad_norm": 0.7352819442749023, "learning_rate": 0.0002, "epoch": 2.7942549371633754, "step": 38910}, {"loss": 0.7176, "grad_norm": 0.8758018612861633, "learning_rate": 0.0002, "epoch": 2.7949730700179534, "step": 38920}, {"loss": 0.7033, "grad_norm": 0.8213023543357849, "learning_rate": 0.0002, "epoch": 2.7956912028725314, "step": 38930}, {"loss": 0.6759, "grad_norm": 0.899368941783905, "learning_rate": 0.0002, "epoch": 2.7964093357271094, "step": 38940}, {"loss": 0.6994, "grad_norm": 0.7497758269309998, "learning_rate": 0.0002, "epoch": 2.797127468581688, "step": 38950}, {"loss": 0.7006, "grad_norm": 0.870704710483551, "learning_rate": 0.0002, "epoch": 2.797845601436266, "step": 38960}, {"loss": 0.6865, "grad_norm": 0.8021528720855713, "learning_rate": 0.0002, "epoch": 2.798563734290844, "step": 38970}, {"loss": 0.7254, "grad_norm": 0.7541360855102539, "learning_rate": 0.0002, "epoch": 2.799281867145422, "step": 38980}, {"loss": 0.6275, "grad_norm": 0.8909788131713867, "learning_rate": 0.0002, "epoch": 2.8, "step": 38990}, {"loss": 0.6801, "grad_norm": 0.8175999522209167, "learning_rate": 0.0002, "epoch": 2.800718132854578, "step": 39000}, {"loss": 0.6961, "grad_norm": 0.7336044311523438, "learning_rate": 0.0002, "epoch": 2.8014362657091563, "step": 39010}, {"loss": 0.6573, "grad_norm": 0.7354168891906738, "learning_rate": 0.0002, "epoch": 2.8021543985637343, "step": 39020}, {"loss": 0.6207, "grad_norm": 0.8771968483924866, "learning_rate": 0.0002, "epoch": 2.8028725314183123, "step": 39030}, {"loss": 0.671, "grad_norm": 0.8073309063911438, "learning_rate": 0.0002, "epoch": 2.8035906642728907, "step": 39040}, {"loss": 0.6869, "grad_norm": 0.8475365042686462, "learning_rate": 0.0002, "epoch": 2.8043087971274687, "step": 39050}, {"loss": 0.6549, "grad_norm": 0.7233281135559082, "learning_rate": 0.0002, "epoch": 2.8050269299820467, "step": 39060}, {"loss": 0.6937, "grad_norm": 0.9850572347640991, "learning_rate": 0.0002, "epoch": 2.8057450628366247, "step": 39070}, {"loss": 0.7091, "grad_norm": 1.0635435581207275, "learning_rate": 0.0002, "epoch": 2.8064631956912027, "step": 39080}, {"loss": 0.6345, "grad_norm": 0.8183665871620178, "learning_rate": 0.0002, "epoch": 2.8071813285457807, "step": 39090}, {"loss": 0.7116, "grad_norm": 0.802228569984436, "learning_rate": 0.0002, "epoch": 2.807899461400359, "step": 39100}, {"loss": 0.7078, "grad_norm": 0.9861624836921692, "learning_rate": 0.0002, "epoch": 2.808617594254937, "step": 39110}, {"loss": 0.7242, "grad_norm": 0.675205409526825, "learning_rate": 0.0002, "epoch": 2.809335727109515, "step": 39120}, {"loss": 0.6599, "grad_norm": 0.7503975629806519, "learning_rate": 0.0002, "epoch": 2.8100538599640936, "step": 39130}, {"loss": 0.6684, "grad_norm": 0.8266825675964355, "learning_rate": 0.0002, "epoch": 2.8107719928186716, "step": 39140}, {"loss": 0.6869, "grad_norm": 0.6956485509872437, "learning_rate": 0.0002, "epoch": 2.8114901256732496, "step": 39150}, {"loss": 0.6495, "grad_norm": 0.7363799214363098, "learning_rate": 0.0002, "epoch": 2.8122082585278276, "step": 39160}, {"loss": 0.7047, "grad_norm": 1.3893407583236694, "learning_rate": 0.0002, "epoch": 2.8129263913824056, "step": 39170}, {"loss": 0.6501, "grad_norm": 1.0619654655456543, "learning_rate": 0.0002, "epoch": 2.8136445242369836, "step": 39180}, {"loss": 0.703, "grad_norm": 0.7924326062202454, "learning_rate": 0.0002, "epoch": 2.814362657091562, "step": 39190}, {"loss": 0.6748, "grad_norm": 0.8838121294975281, "learning_rate": 0.0002, "epoch": 2.81508078994614, "step": 39200}, {"loss": 0.6759, "grad_norm": 0.9059016108512878, "learning_rate": 0.0002, "epoch": 2.815798922800718, "step": 39210}, {"loss": 0.6812, "grad_norm": 0.9284590482711792, "learning_rate": 0.0002, "epoch": 2.816517055655296, "step": 39220}, {"loss": 0.6261, "grad_norm": 0.7992225289344788, "learning_rate": 0.0002, "epoch": 2.8172351885098745, "step": 39230}, {"loss": 0.6623, "grad_norm": 0.816376805305481, "learning_rate": 0.0002, "epoch": 2.8179533213644525, "step": 39240}, {"loss": 0.6825, "grad_norm": 0.9183637499809265, "learning_rate": 0.0002, "epoch": 2.8186714542190305, "step": 39250}, {"loss": 0.6558, "grad_norm": 0.7232057452201843, "learning_rate": 0.0002, "epoch": 2.8193895870736085, "step": 39260}, {"loss": 0.7396, "grad_norm": 0.9012457728385925, "learning_rate": 0.0002, "epoch": 2.8201077199281865, "step": 39270}, {"loss": 0.6823, "grad_norm": 0.7796093821525574, "learning_rate": 0.0002, "epoch": 2.8208258527827645, "step": 39280}, {"loss": 0.6997, "grad_norm": 0.8331146836280823, "learning_rate": 0.0002, "epoch": 2.821543985637343, "step": 39290}, {"loss": 0.6867, "grad_norm": 0.8031269907951355, "learning_rate": 0.0002, "epoch": 2.822262118491921, "step": 39300}, {"loss": 0.7451, "grad_norm": 0.8563299179077148, "learning_rate": 0.0002, "epoch": 2.822980251346499, "step": 39310}, {"loss": 0.6828, "grad_norm": 0.8083387613296509, "learning_rate": 0.0002, "epoch": 2.8236983842010774, "step": 39320}, {"loss": 0.723, "grad_norm": 0.8132631182670593, "learning_rate": 0.0002, "epoch": 2.8244165170556554, "step": 39330}, {"loss": 0.6882, "grad_norm": 0.9071316719055176, "learning_rate": 0.0002, "epoch": 2.8251346499102334, "step": 39340}, {"loss": 0.7057, "grad_norm": 0.8224168419837952, "learning_rate": 0.0002, "epoch": 2.8258527827648114, "step": 39350}, {"loss": 0.6831, "grad_norm": 1.073014497756958, "learning_rate": 0.0002, "epoch": 2.8265709156193894, "step": 39360}, {"loss": 0.7392, "grad_norm": 0.9466553926467896, "learning_rate": 0.0002, "epoch": 2.8272890484739674, "step": 39370}, {"loss": 0.7288, "grad_norm": 0.8946257829666138, "learning_rate": 0.0002, "epoch": 2.828007181328546, "step": 39380}, {"loss": 0.7023, "grad_norm": 0.8497758507728577, "learning_rate": 0.0002, "epoch": 2.828725314183124, "step": 39390}, {"loss": 0.6787, "grad_norm": 0.8952143788337708, "learning_rate": 0.0002, "epoch": 2.829443447037702, "step": 39400}, {"loss": 0.7059, "grad_norm": 0.8839313983917236, "learning_rate": 0.0002, "epoch": 2.8301615798922803, "step": 39410}, {"loss": 0.6643, "grad_norm": 0.7576757669448853, "learning_rate": 0.0002, "epoch": 2.8308797127468583, "step": 39420}, {"loss": 0.6509, "grad_norm": 0.8212469816207886, "learning_rate": 0.0002, "epoch": 2.8315978456014363, "step": 39430}, {"loss": 0.6728, "grad_norm": 0.9289504885673523, "learning_rate": 0.0002, "epoch": 2.8323159784560143, "step": 39440}, {"loss": 0.6773, "grad_norm": 0.8745405077934265, "learning_rate": 0.0002, "epoch": 2.8330341113105924, "step": 39450}, {"loss": 0.6741, "grad_norm": 0.7974533438682556, "learning_rate": 0.0002, "epoch": 2.8337522441651704, "step": 39460}, {"loss": 0.6887, "grad_norm": 0.914289116859436, "learning_rate": 0.0002, "epoch": 2.834470377019749, "step": 39470}, {"loss": 0.7009, "grad_norm": 0.7686914801597595, "learning_rate": 0.0002, "epoch": 2.835188509874327, "step": 39480}, {"loss": 0.679, "grad_norm": 0.9289370179176331, "learning_rate": 0.0002, "epoch": 2.835906642728905, "step": 39490}, {"loss": 0.684, "grad_norm": 0.8851973414421082, "learning_rate": 0.0002, "epoch": 2.836624775583483, "step": 39500}, {"loss": 0.7012, "grad_norm": 0.7754096388816833, "learning_rate": 0.0002, "epoch": 2.8373429084380613, "step": 39510}, {"loss": 0.6936, "grad_norm": 0.8801632523536682, "learning_rate": 0.0002, "epoch": 2.8380610412926393, "step": 39520}, {"loss": 0.6878, "grad_norm": 0.9031528234481812, "learning_rate": 0.0002, "epoch": 2.8387791741472173, "step": 39530}, {"loss": 0.6815, "grad_norm": 0.7113721966743469, "learning_rate": 0.0002, "epoch": 2.8394973070017953, "step": 39540}, {"loss": 0.7287, "grad_norm": 0.7880923748016357, "learning_rate": 0.0002, "epoch": 2.8402154398563733, "step": 39550}, {"loss": 0.671, "grad_norm": 2.4828813076019287, "learning_rate": 0.0002, "epoch": 2.8409335727109513, "step": 39560}, {"loss": 0.6824, "grad_norm": 0.9174619913101196, "learning_rate": 0.0002, "epoch": 2.8416517055655297, "step": 39570}, {"loss": 0.7086, "grad_norm": 0.9708074927330017, "learning_rate": 0.0002, "epoch": 2.8423698384201077, "step": 39580}, {"loss": 0.7021, "grad_norm": 0.7968248724937439, "learning_rate": 0.0002, "epoch": 2.8430879712746857, "step": 39590}, {"loss": 0.7121, "grad_norm": 0.7967682480812073, "learning_rate": 0.0002, "epoch": 2.843806104129264, "step": 39600}, {"loss": 0.6284, "grad_norm": 0.7487651109695435, "learning_rate": 0.0002, "epoch": 2.844524236983842, "step": 39610}, {"loss": 0.6624, "grad_norm": 0.6997556686401367, "learning_rate": 0.0002, "epoch": 2.84524236983842, "step": 39620}, {"loss": 0.6987, "grad_norm": 0.7639351487159729, "learning_rate": 0.0002, "epoch": 2.845960502692998, "step": 39630}, {"loss": 0.6757, "grad_norm": 0.9086648225784302, "learning_rate": 0.0002, "epoch": 2.846678635547576, "step": 39640}, {"loss": 0.6841, "grad_norm": 0.91103196144104, "learning_rate": 0.0002, "epoch": 2.847396768402154, "step": 39650}, {"loss": 0.7046, "grad_norm": 0.8096913695335388, "learning_rate": 0.0002, "epoch": 2.8481149012567326, "step": 39660}, {"loss": 0.679, "grad_norm": 0.8961427807807922, "learning_rate": 0.0002, "epoch": 2.8488330341113106, "step": 39670}, {"loss": 0.6589, "grad_norm": 0.7489904761314392, "learning_rate": 0.0002, "epoch": 2.8495511669658886, "step": 39680}, {"loss": 0.6581, "grad_norm": 0.7893617749214172, "learning_rate": 0.0002, "epoch": 2.850269299820467, "step": 39690}, {"loss": 0.7326, "grad_norm": 0.8259761929512024, "learning_rate": 0.0002, "epoch": 2.850987432675045, "step": 39700}, {"loss": 0.6763, "grad_norm": 0.7006617188453674, "learning_rate": 0.0002, "epoch": 2.851705565529623, "step": 39710}, {"loss": 0.7095, "grad_norm": 0.8922327756881714, "learning_rate": 0.0002, "epoch": 2.852423698384201, "step": 39720}, {"loss": 0.6829, "grad_norm": 0.9058550000190735, "learning_rate": 0.0002, "epoch": 2.853141831238779, "step": 39730}, {"loss": 0.6777, "grad_norm": 0.7627129554748535, "learning_rate": 0.0002, "epoch": 2.853859964093357, "step": 39740}, {"loss": 0.6937, "grad_norm": 0.9316968321800232, "learning_rate": 0.0002, "epoch": 2.8545780969479355, "step": 39750}, {"loss": 0.6882, "grad_norm": 0.8424679040908813, "learning_rate": 0.0002, "epoch": 2.8552962298025135, "step": 39760}, {"loss": 0.7018, "grad_norm": 0.6185386776924133, "learning_rate": 0.0002, "epoch": 2.8560143626570915, "step": 39770}, {"loss": 0.7106, "grad_norm": 0.709902286529541, "learning_rate": 0.0002, "epoch": 2.8567324955116695, "step": 39780}, {"loss": 0.7007, "grad_norm": 0.93730229139328, "learning_rate": 0.0002, "epoch": 2.857450628366248, "step": 39790}, {"loss": 0.6973, "grad_norm": 0.875989556312561, "learning_rate": 0.0002, "epoch": 2.858168761220826, "step": 39800}, {"loss": 0.6685, "grad_norm": 0.7424131631851196, "learning_rate": 0.0002, "epoch": 2.858886894075404, "step": 39810}, {"loss": 0.6704, "grad_norm": 0.9108477830886841, "learning_rate": 0.0002, "epoch": 2.859605026929982, "step": 39820}, {"loss": 0.6677, "grad_norm": 0.8248386383056641, "learning_rate": 0.0002, "epoch": 2.86032315978456, "step": 39830}, {"loss": 0.6591, "grad_norm": 0.8739979863166809, "learning_rate": 0.0002, "epoch": 2.861041292639138, "step": 39840}, {"loss": 0.6674, "grad_norm": 0.7940961122512817, "learning_rate": 0.0002, "epoch": 2.8617594254937164, "step": 39850}, {"loss": 0.6875, "grad_norm": 0.7594687938690186, "learning_rate": 0.0002, "epoch": 2.8624775583482944, "step": 39860}, {"loss": 0.7339, "grad_norm": 0.9884313941001892, "learning_rate": 0.0002, "epoch": 2.8631956912028724, "step": 39870}, {"loss": 0.6583, "grad_norm": 0.8537741303443909, "learning_rate": 0.0002, "epoch": 2.863913824057451, "step": 39880}, {"loss": 0.6746, "grad_norm": 0.7407512664794922, "learning_rate": 0.0002, "epoch": 2.864631956912029, "step": 39890}, {"loss": 0.7211, "grad_norm": 1.0179548263549805, "learning_rate": 0.0002, "epoch": 2.865350089766607, "step": 39900}, {"loss": 0.6916, "grad_norm": 0.8822470307350159, "learning_rate": 0.0002, "epoch": 2.866068222621185, "step": 39910}, {"loss": 0.7141, "grad_norm": 0.794448733329773, "learning_rate": 0.0002, "epoch": 2.866786355475763, "step": 39920}, {"loss": 0.6993, "grad_norm": 0.8115299940109253, "learning_rate": 0.0002, "epoch": 2.867504488330341, "step": 39930}, {"loss": 0.655, "grad_norm": 0.7998958826065063, "learning_rate": 0.0002, "epoch": 2.8682226211849193, "step": 39940}, {"loss": 0.7414, "grad_norm": 0.8222435116767883, "learning_rate": 0.0002, "epoch": 2.8689407540394973, "step": 39950}, {"loss": 0.6987, "grad_norm": 0.9495923519134521, "learning_rate": 0.0002, "epoch": 2.8696588868940753, "step": 39960}, {"loss": 0.6567, "grad_norm": 0.6749192476272583, "learning_rate": 0.0002, "epoch": 2.8703770197486533, "step": 39970}, {"loss": 0.7003, "grad_norm": 0.8910874128341675, "learning_rate": 0.0002, "epoch": 2.871095152603232, "step": 39980}, {"loss": 0.6935, "grad_norm": 0.7051638960838318, "learning_rate": 0.0002, "epoch": 2.87181328545781, "step": 39990}, {"loss": 0.663, "grad_norm": 0.8456535339355469, "learning_rate": 0.0002, "epoch": 2.872531418312388, "step": 40000}, {"loss": 0.7222, "grad_norm": 0.934894859790802, "learning_rate": 0.0002, "epoch": 2.873249551166966, "step": 40010}, {"loss": 0.7106, "grad_norm": 0.6740477681159973, "learning_rate": 0.0002, "epoch": 2.873967684021544, "step": 40020}, {"loss": 0.6981, "grad_norm": 0.6632325649261475, "learning_rate": 0.0002, "epoch": 2.8746858168761222, "step": 40030}, {"loss": 0.7037, "grad_norm": 0.8889022469520569, "learning_rate": 0.0002, "epoch": 2.8754039497307002, "step": 40040}, {"loss": 0.7094, "grad_norm": 0.7460705637931824, "learning_rate": 0.0002, "epoch": 2.8761220825852782, "step": 40050}, {"loss": 0.6994, "grad_norm": 0.9795911908149719, "learning_rate": 0.0002, "epoch": 2.8768402154398562, "step": 40060}, {"loss": 0.6602, "grad_norm": 1.0002509355545044, "learning_rate": 0.0002, "epoch": 2.8775583482944347, "step": 40070}, {"loss": 0.7191, "grad_norm": 0.7867239713668823, "learning_rate": 0.0002, "epoch": 2.8782764811490127, "step": 40080}, {"loss": 0.6772, "grad_norm": 1.0221471786499023, "learning_rate": 0.0002, "epoch": 2.8789946140035907, "step": 40090}, {"loss": 0.7317, "grad_norm": 0.8091005086898804, "learning_rate": 0.0002, "epoch": 2.8797127468581687, "step": 40100}, {"loss": 0.7334, "grad_norm": 0.8485820293426514, "learning_rate": 0.0002, "epoch": 2.8804308797127467, "step": 40110}, {"loss": 0.7221, "grad_norm": 0.7850196957588196, "learning_rate": 0.0002, "epoch": 2.8811490125673247, "step": 40120}, {"loss": 0.6696, "grad_norm": 0.7906134128570557, "learning_rate": 0.0002, "epoch": 2.881867145421903, "step": 40130}, {"loss": 0.648, "grad_norm": 0.7957962155342102, "learning_rate": 0.0002, "epoch": 2.882585278276481, "step": 40140}, {"loss": 0.6774, "grad_norm": 1.0687522888183594, "learning_rate": 0.0002, "epoch": 2.883303411131059, "step": 40150}, {"loss": 0.7256, "grad_norm": 0.713752031326294, "learning_rate": 0.0002, "epoch": 2.8840215439856376, "step": 40160}, {"loss": 0.7144, "grad_norm": 1.1603864431381226, "learning_rate": 0.0002, "epoch": 2.8847396768402156, "step": 40170}, {"loss": 0.7223, "grad_norm": 0.8423245549201965, "learning_rate": 0.0002, "epoch": 2.8854578096947936, "step": 40180}, {"loss": 0.6796, "grad_norm": 0.7554550766944885, "learning_rate": 0.0002, "epoch": 2.8861759425493716, "step": 40190}, {"loss": 0.6923, "grad_norm": 0.6006978750228882, "learning_rate": 0.0002, "epoch": 2.8868940754039496, "step": 40200}, {"loss": 0.6893, "grad_norm": 0.923068106174469, "learning_rate": 0.0002, "epoch": 2.8876122082585276, "step": 40210}, {"loss": 0.6688, "grad_norm": 0.7659787535667419, "learning_rate": 0.0002, "epoch": 2.888330341113106, "step": 40220}, {"loss": 0.6706, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 2.889048473967684, "step": 40230}, {"loss": 0.6922, "grad_norm": 1.1267355680465698, "learning_rate": 0.0002, "epoch": 2.889766606822262, "step": 40240}, {"loss": 0.7636, "grad_norm": 0.8548554182052612, "learning_rate": 0.0002, "epoch": 2.89048473967684, "step": 40250}, {"loss": 0.6847, "grad_norm": 0.7846875786781311, "learning_rate": 0.0002, "epoch": 2.8912028725314185, "step": 40260}, {"loss": 0.6796, "grad_norm": 0.8606904745101929, "learning_rate": 0.0002, "epoch": 2.8919210053859965, "step": 40270}, {"loss": 0.6864, "grad_norm": 0.6508898138999939, "learning_rate": 0.0002, "epoch": 2.8926391382405745, "step": 40280}, {"loss": 0.6793, "grad_norm": 0.7903237342834473, "learning_rate": 0.0002, "epoch": 2.8933572710951525, "step": 40290}, {"loss": 0.6642, "grad_norm": 0.7320941686630249, "learning_rate": 0.0002, "epoch": 2.8940754039497305, "step": 40300}, {"loss": 0.6813, "grad_norm": 1.0031821727752686, "learning_rate": 0.0002, "epoch": 2.894793536804309, "step": 40310}, {"loss": 0.6071, "grad_norm": 0.7463554739952087, "learning_rate": 0.0002, "epoch": 2.895511669658887, "step": 40320}, {"loss": 0.6856, "grad_norm": 0.8455599546432495, "learning_rate": 0.0002, "epoch": 2.896229802513465, "step": 40330}, {"loss": 0.7252, "grad_norm": 0.7645914554595947, "learning_rate": 0.0002, "epoch": 2.896947935368043, "step": 40340}, {"loss": 0.7181, "grad_norm": 0.9074810147285461, "learning_rate": 0.0002, "epoch": 2.8976660682226214, "step": 40350}, {"loss": 0.6935, "grad_norm": 0.9070153832435608, "learning_rate": 0.0002, "epoch": 2.8983842010771994, "step": 40360}, {"loss": 0.7021, "grad_norm": 0.8649221658706665, "learning_rate": 0.0002, "epoch": 2.8991023339317774, "step": 40370}, {"loss": 0.7402, "grad_norm": 1.0325016975402832, "learning_rate": 0.0002, "epoch": 2.8998204667863554, "step": 40380}, {"loss": 0.6889, "grad_norm": 0.8688622713088989, "learning_rate": 0.0002, "epoch": 2.9005385996409334, "step": 40390}, {"loss": 0.7209, "grad_norm": 0.83316969871521, "learning_rate": 0.0002, "epoch": 2.9012567324955114, "step": 40400}, {"loss": 0.6915, "grad_norm": 1.0146536827087402, "learning_rate": 0.0002, "epoch": 2.90197486535009, "step": 40410}, {"loss": 0.67, "grad_norm": 6.21811580657959, "learning_rate": 0.0002, "epoch": 2.902692998204668, "step": 40420}, {"loss": 0.675, "grad_norm": 0.8747655749320984, "learning_rate": 0.0002, "epoch": 2.903411131059246, "step": 40430}, {"loss": 0.6781, "grad_norm": 0.8671547174453735, "learning_rate": 0.0002, "epoch": 2.9041292639138243, "step": 40440}, {"loss": 0.693, "grad_norm": 0.7888760566711426, "learning_rate": 0.0002, "epoch": 2.9048473967684023, "step": 40450}, {"loss": 0.7208, "grad_norm": 0.7182217240333557, "learning_rate": 0.0002, "epoch": 2.9055655296229803, "step": 40460}, {"loss": 0.7393, "grad_norm": 0.8802227973937988, "learning_rate": 0.0002, "epoch": 2.9062836624775583, "step": 40470}, {"loss": 0.6755, "grad_norm": 0.8106126189231873, "learning_rate": 0.0002, "epoch": 2.9070017953321363, "step": 40480}, {"loss": 0.7251, "grad_norm": 0.7313538789749146, "learning_rate": 0.0002, "epoch": 2.9077199281867143, "step": 40490}, {"loss": 0.6927, "grad_norm": 0.6098655462265015, "learning_rate": 0.0002, "epoch": 2.9084380610412928, "step": 40500}, {"loss": 0.6667, "grad_norm": 0.8849560618400574, "learning_rate": 0.0002, "epoch": 2.9091561938958708, "step": 40510}, {"loss": 0.7199, "grad_norm": 0.8761322498321533, "learning_rate": 0.0002, "epoch": 2.9098743267504488, "step": 40520}, {"loss": 0.6952, "grad_norm": 0.8259703516960144, "learning_rate": 0.0002, "epoch": 2.9105924596050268, "step": 40530}, {"loss": 0.6547, "grad_norm": 0.6613079309463501, "learning_rate": 0.0002, "epoch": 2.911310592459605, "step": 40540}, {"loss": 0.7642, "grad_norm": 0.825678825378418, "learning_rate": 0.0002, "epoch": 2.912028725314183, "step": 40550}, {"loss": 0.7052, "grad_norm": 0.824850857257843, "learning_rate": 0.0002, "epoch": 2.912746858168761, "step": 40560}, {"loss": 0.6869, "grad_norm": 0.9629682898521423, "learning_rate": 0.0002, "epoch": 2.9134649910233392, "step": 40570}, {"loss": 0.7588, "grad_norm": 0.7446485161781311, "learning_rate": 0.0002, "epoch": 2.9141831238779172, "step": 40580}, {"loss": 0.7045, "grad_norm": 0.9028317928314209, "learning_rate": 0.0002, "epoch": 2.9149012567324957, "step": 40590}, {"loss": 0.7128, "grad_norm": 0.9646022319793701, "learning_rate": 0.0002, "epoch": 2.9156193895870737, "step": 40600}, {"loss": 0.6782, "grad_norm": 0.8845045566558838, "learning_rate": 0.0002, "epoch": 2.9163375224416517, "step": 40610}, {"loss": 0.7179, "grad_norm": 0.9660372734069824, "learning_rate": 0.0002, "epoch": 2.9170556552962297, "step": 40620}, {"loss": 0.7442, "grad_norm": 0.8914347290992737, "learning_rate": 0.0002, "epoch": 2.917773788150808, "step": 40630}, {"loss": 0.6435, "grad_norm": 0.7789235711097717, "learning_rate": 0.0002, "epoch": 2.918491921005386, "step": 40640}, {"loss": 0.7156, "grad_norm": 0.8221206665039062, "learning_rate": 0.0002, "epoch": 2.919210053859964, "step": 40650}, {"loss": 0.7363, "grad_norm": 0.9550618529319763, "learning_rate": 0.0002, "epoch": 2.919928186714542, "step": 40660}, {"loss": 0.6911, "grad_norm": 0.868315577507019, "learning_rate": 0.0002, "epoch": 2.92064631956912, "step": 40670}, {"loss": 0.6939, "grad_norm": 0.852878749370575, "learning_rate": 0.0002, "epoch": 2.921364452423698, "step": 40680}, {"loss": 0.6497, "grad_norm": 0.8388790488243103, "learning_rate": 0.0002, "epoch": 2.9220825852782766, "step": 40690}, {"loss": 0.7299, "grad_norm": 0.9897602200508118, "learning_rate": 0.0002, "epoch": 2.9228007181328546, "step": 40700}, {"loss": 0.695, "grad_norm": 0.8050527572631836, "learning_rate": 0.0002, "epoch": 2.9235188509874326, "step": 40710}, {"loss": 0.6924, "grad_norm": 0.7296929955482483, "learning_rate": 0.0002, "epoch": 2.924236983842011, "step": 40720}, {"loss": 0.759, "grad_norm": 0.917475700378418, "learning_rate": 0.0002, "epoch": 2.924955116696589, "step": 40730}, {"loss": 0.6965, "grad_norm": 0.9118483662605286, "learning_rate": 0.0002, "epoch": 2.925673249551167, "step": 40740}, {"loss": 0.6918, "grad_norm": 0.7722473740577698, "learning_rate": 0.0002, "epoch": 2.926391382405745, "step": 40750}, {"loss": 0.7103, "grad_norm": 0.7950358986854553, "learning_rate": 0.0002, "epoch": 2.927109515260323, "step": 40760}, {"loss": 0.7266, "grad_norm": 0.8868561387062073, "learning_rate": 0.0002, "epoch": 2.927827648114901, "step": 40770}, {"loss": 0.7513, "grad_norm": 0.7923154830932617, "learning_rate": 0.0002, "epoch": 2.9285457809694795, "step": 40780}, {"loss": 0.6822, "grad_norm": 0.7285428047180176, "learning_rate": 0.0002, "epoch": 2.9292639138240575, "step": 40790}, {"loss": 0.6748, "grad_norm": 0.794775664806366, "learning_rate": 0.0002, "epoch": 2.9299820466786355, "step": 40800}, {"loss": 0.6967, "grad_norm": 0.8351698517799377, "learning_rate": 0.0002, "epoch": 2.9307001795332135, "step": 40810}, {"loss": 0.6927, "grad_norm": 0.853082001209259, "learning_rate": 0.0002, "epoch": 2.931418312387792, "step": 40820}, {"loss": 0.7047, "grad_norm": 0.8209722638130188, "learning_rate": 0.0002, "epoch": 2.93213644524237, "step": 40830}, {"loss": 0.6742, "grad_norm": 0.8982136845588684, "learning_rate": 0.0002, "epoch": 2.932854578096948, "step": 40840}, {"loss": 0.6617, "grad_norm": 0.8373305201530457, "learning_rate": 0.0002, "epoch": 2.933572710951526, "step": 40850}, {"loss": 0.6754, "grad_norm": 0.8326864242553711, "learning_rate": 0.0002, "epoch": 2.934290843806104, "step": 40860}, {"loss": 0.7151, "grad_norm": 0.7232590317726135, "learning_rate": 0.0002, "epoch": 2.9350089766606824, "step": 40870}, {"loss": 0.7311, "grad_norm": 0.823615312576294, "learning_rate": 0.0002, "epoch": 2.9357271095152604, "step": 40880}, {"loss": 0.7122, "grad_norm": 0.7532811760902405, "learning_rate": 0.0002, "epoch": 2.9364452423698384, "step": 40890}, {"loss": 0.7254, "grad_norm": 0.9594773650169373, "learning_rate": 0.0002, "epoch": 2.9371633752244164, "step": 40900}, {"loss": 0.7024, "grad_norm": 0.8368398547172546, "learning_rate": 0.0002, "epoch": 2.937881508078995, "step": 40910}, {"loss": 0.7201, "grad_norm": 0.8336817026138306, "learning_rate": 0.0002, "epoch": 2.938599640933573, "step": 40920}, {"loss": 0.6402, "grad_norm": 0.8413758277893066, "learning_rate": 0.0002, "epoch": 2.939317773788151, "step": 40930}, {"loss": 0.7054, "grad_norm": 0.7117549180984497, "learning_rate": 0.0002, "epoch": 2.940035906642729, "step": 40940}, {"loss": 0.6101, "grad_norm": 0.8741925954818726, "learning_rate": 0.0002, "epoch": 2.940754039497307, "step": 40950}, {"loss": 0.7491, "grad_norm": 0.8476088047027588, "learning_rate": 0.0002, "epoch": 2.941472172351885, "step": 40960}, {"loss": 0.7084, "grad_norm": 0.674659788608551, "learning_rate": 0.0002, "epoch": 2.9421903052064633, "step": 40970}, {"loss": 0.6714, "grad_norm": 0.7087500691413879, "learning_rate": 0.0002, "epoch": 2.9429084380610413, "step": 40980}, {"loss": 0.6953, "grad_norm": 0.9202252626419067, "learning_rate": 0.0002, "epoch": 2.9436265709156193, "step": 40990}, {"loss": 0.7244, "grad_norm": 0.9775124192237854, "learning_rate": 0.0002, "epoch": 2.9443447037701977, "step": 41000}, {"loss": 0.6897, "grad_norm": 0.7465068101882935, "learning_rate": 0.0002, "epoch": 2.9450628366247757, "step": 41010}, {"loss": 0.6944, "grad_norm": 0.7229986786842346, "learning_rate": 0.0002, "epoch": 2.9457809694793538, "step": 41020}, {"loss": 0.6754, "grad_norm": 0.7228954434394836, "learning_rate": 0.0002, "epoch": 2.9464991023339318, "step": 41030}, {"loss": 0.6604, "grad_norm": 0.9396149516105652, "learning_rate": 0.0002, "epoch": 2.9472172351885098, "step": 41040}, {"loss": 0.6498, "grad_norm": 0.9458696842193604, "learning_rate": 0.0002, "epoch": 2.9479353680430878, "step": 41050}, {"loss": 0.7154, "grad_norm": 0.8276246190071106, "learning_rate": 0.0002, "epoch": 2.948653500897666, "step": 41060}, {"loss": 0.6567, "grad_norm": 0.7927420139312744, "learning_rate": 0.0002, "epoch": 2.949371633752244, "step": 41070}, {"loss": 0.7442, "grad_norm": 0.7403103709220886, "learning_rate": 0.0002, "epoch": 2.950089766606822, "step": 41080}, {"loss": 0.6856, "grad_norm": 0.9813524484634399, "learning_rate": 0.0002, "epoch": 2.9508078994614, "step": 41090}, {"loss": 0.7271, "grad_norm": 0.8560924530029297, "learning_rate": 0.0002, "epoch": 2.9515260323159787, "step": 41100}, {"loss": 0.6851, "grad_norm": 0.6937443017959595, "learning_rate": 0.0002, "epoch": 2.9522441651705567, "step": 41110}, {"loss": 0.6817, "grad_norm": 0.8440476655960083, "learning_rate": 0.0002, "epoch": 2.9529622980251347, "step": 41120}, {"loss": 0.7082, "grad_norm": 1.1260770559310913, "learning_rate": 0.0002, "epoch": 2.9536804308797127, "step": 41130}, {"loss": 0.6745, "grad_norm": 0.8789936900138855, "learning_rate": 0.0002, "epoch": 2.9543985637342907, "step": 41140}, {"loss": 0.7297, "grad_norm": 0.8205832839012146, "learning_rate": 0.0002, "epoch": 2.9551166965888687, "step": 41150}, {"loss": 0.7036, "grad_norm": 0.8148444294929504, "learning_rate": 0.0002, "epoch": 2.955834829443447, "step": 41160}, {"loss": 0.6923, "grad_norm": 0.791296660900116, "learning_rate": 0.0002, "epoch": 2.956552962298025, "step": 41170}, {"loss": 0.6589, "grad_norm": 1.3229854106903076, "learning_rate": 0.0002, "epoch": 2.957271095152603, "step": 41180}, {"loss": 0.6691, "grad_norm": 0.906423807144165, "learning_rate": 0.0002, "epoch": 2.9579892280071816, "step": 41190}, {"loss": 0.6979, "grad_norm": 0.8707411289215088, "learning_rate": 0.0002, "epoch": 2.9587073608617596, "step": 41200}, {"loss": 0.6442, "grad_norm": 1.0362473726272583, "learning_rate": 0.0002, "epoch": 2.9594254937163376, "step": 41210}, {"loss": 0.6725, "grad_norm": 0.818546712398529, "learning_rate": 0.0002, "epoch": 2.9601436265709156, "step": 41220}, {"loss": 0.7158, "grad_norm": 0.8558517098426819, "learning_rate": 0.0002, "epoch": 2.9608617594254936, "step": 41230}, {"loss": 0.7056, "grad_norm": 0.8262931704521179, "learning_rate": 0.0002, "epoch": 2.9615798922800716, "step": 41240}, {"loss": 0.6256, "grad_norm": 0.9603250026702881, "learning_rate": 0.0002, "epoch": 2.96229802513465, "step": 41250}, {"loss": 0.68, "grad_norm": 0.891610860824585, "learning_rate": 0.0002, "epoch": 2.963016157989228, "step": 41260}, {"loss": 0.7732, "grad_norm": 0.9823883175849915, "learning_rate": 0.0002, "epoch": 2.963734290843806, "step": 41270}, {"loss": 0.7144, "grad_norm": 0.8783510327339172, "learning_rate": 0.0002, "epoch": 2.9644524236983845, "step": 41280}, {"loss": 0.7196, "grad_norm": 0.873656690120697, "learning_rate": 0.0002, "epoch": 2.9651705565529625, "step": 41290}, {"loss": 0.6531, "grad_norm": 0.8281165957450867, "learning_rate": 0.0002, "epoch": 2.9658886894075405, "step": 41300}, {"loss": 0.69, "grad_norm": 0.8008899092674255, "learning_rate": 0.0002, "epoch": 2.9666068222621185, "step": 41310}, {"loss": 0.6923, "grad_norm": 0.8564065098762512, "learning_rate": 0.0002, "epoch": 2.9673249551166965, "step": 41320}, {"loss": 0.6871, "grad_norm": 0.786119818687439, "learning_rate": 0.0002, "epoch": 2.9680430879712745, "step": 41330}, {"loss": 0.7105, "grad_norm": 1.3152399063110352, "learning_rate": 0.0002, "epoch": 2.968761220825853, "step": 41340}, {"loss": 0.6575, "grad_norm": 0.7551527619361877, "learning_rate": 0.0002, "epoch": 2.969479353680431, "step": 41350}, {"loss": 0.6939, "grad_norm": 1.1397290229797363, "learning_rate": 0.0002, "epoch": 2.970197486535009, "step": 41360}, {"loss": 0.7119, "grad_norm": 0.8333854079246521, "learning_rate": 0.0002, "epoch": 2.970915619389587, "step": 41370}, {"loss": 0.6941, "grad_norm": 0.8096165657043457, "learning_rate": 0.0002, "epoch": 2.9716337522441654, "step": 41380}, {"loss": 0.7748, "grad_norm": 0.8378547430038452, "learning_rate": 0.0002, "epoch": 2.9723518850987434, "step": 41390}, {"loss": 0.7678, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 2.9730700179533214, "step": 41400}, {"loss": 0.6962, "grad_norm": 0.8722409605979919, "learning_rate": 0.0002, "epoch": 2.9737881508078994, "step": 41410}, {"loss": 0.7298, "grad_norm": 0.6680061221122742, "learning_rate": 0.0002, "epoch": 2.9745062836624774, "step": 41420}, {"loss": 0.6731, "grad_norm": 0.7666152715682983, "learning_rate": 0.0002, "epoch": 2.9752244165170554, "step": 41430}, {"loss": 0.7377, "grad_norm": 0.8489957451820374, "learning_rate": 0.0002, "epoch": 2.975942549371634, "step": 41440}, {"loss": 0.6816, "grad_norm": 0.8516127467155457, "learning_rate": 0.0002, "epoch": 2.976660682226212, "step": 41450}, {"loss": 0.697, "grad_norm": 0.8836804628372192, "learning_rate": 0.0002, "epoch": 2.97737881508079, "step": 41460}, {"loss": 0.7048, "grad_norm": 1.0963364839553833, "learning_rate": 0.0002, "epoch": 2.9780969479353683, "step": 41470}, {"loss": 0.6695, "grad_norm": 0.9908610582351685, "learning_rate": 0.0002, "epoch": 2.9788150807899463, "step": 41480}, {"loss": 0.7184, "grad_norm": 0.8822041153907776, "learning_rate": 0.0002, "epoch": 2.9795332136445243, "step": 41490}, {"loss": 0.7192, "grad_norm": 0.717723548412323, "learning_rate": 0.0002, "epoch": 2.9802513464991023, "step": 41500}, {"loss": 0.711, "grad_norm": 0.8413400053977966, "learning_rate": 0.0002, "epoch": 2.9809694793536803, "step": 41510}, {"loss": 0.6871, "grad_norm": 0.8771023750305176, "learning_rate": 0.0002, "epoch": 2.9816876122082583, "step": 41520}, {"loss": 0.6802, "grad_norm": 0.7185000777244568, "learning_rate": 0.0002, "epoch": 2.9824057450628367, "step": 41530}, {"loss": 0.706, "grad_norm": 0.8299767374992371, "learning_rate": 0.0002, "epoch": 2.9831238779174147, "step": 41540}, {"loss": 0.6569, "grad_norm": 0.9309971928596497, "learning_rate": 0.0002, "epoch": 2.9838420107719927, "step": 41550}, {"loss": 0.6598, "grad_norm": 0.7644693851470947, "learning_rate": 0.0002, "epoch": 2.984560143626571, "step": 41560}, {"loss": 0.7186, "grad_norm": 0.7888111472129822, "learning_rate": 0.0002, "epoch": 2.985278276481149, "step": 41570}, {"loss": 0.6984, "grad_norm": 1.0921967029571533, "learning_rate": 0.0002, "epoch": 2.985996409335727, "step": 41580}, {"loss": 0.6629, "grad_norm": 0.8116785883903503, "learning_rate": 0.0002, "epoch": 2.986714542190305, "step": 41590}, {"loss": 0.6842, "grad_norm": 0.983269214630127, "learning_rate": 0.0002, "epoch": 2.987432675044883, "step": 41600}, {"loss": 0.6675, "grad_norm": 0.81700599193573, "learning_rate": 0.0002, "epoch": 2.988150807899461, "step": 41610}, {"loss": 0.7525, "grad_norm": 0.7545617818832397, "learning_rate": 0.0002, "epoch": 2.9888689407540396, "step": 41620}, {"loss": 0.6698, "grad_norm": 0.8695791363716125, "learning_rate": 0.0002, "epoch": 2.9895870736086176, "step": 41630}, {"loss": 0.7446, "grad_norm": 0.8980445861816406, "learning_rate": 0.0002, "epoch": 2.9903052064631956, "step": 41640}, {"loss": 0.6616, "grad_norm": 0.7884747982025146, "learning_rate": 0.0002, "epoch": 2.9910233393177736, "step": 41650}, {"loss": 0.6461, "grad_norm": 0.8347880840301514, "learning_rate": 0.0002, "epoch": 2.991741472172352, "step": 41660}, {"loss": 0.6607, "grad_norm": 0.7786261439323425, "learning_rate": 0.0002, "epoch": 2.99245960502693, "step": 41670}, {"loss": 0.6834, "grad_norm": 0.7830624580383301, "learning_rate": 0.0002, "epoch": 2.993177737881508, "step": 41680}, {"loss": 0.7116, "grad_norm": 0.8293532133102417, "learning_rate": 0.0002, "epoch": 2.993895870736086, "step": 41690}, {"loss": 0.7029, "grad_norm": 0.8476244211196899, "learning_rate": 0.0002, "epoch": 2.994614003590664, "step": 41700}, {"loss": 0.6909, "grad_norm": 0.7218726873397827, "learning_rate": 0.0002, "epoch": 2.995332136445242, "step": 41710}, {"loss": 0.6579, "grad_norm": 0.8144199252128601, "learning_rate": 0.0002, "epoch": 2.9960502692998205, "step": 41720}, {"loss": 0.7011, "grad_norm": 0.7047123312950134, "learning_rate": 0.0002, "epoch": 2.9967684021543985, "step": 41730}, {"loss": 0.6555, "grad_norm": 0.8412184715270996, "learning_rate": 0.0002, "epoch": 2.9974865350089765, "step": 41740}, {"loss": 0.7237, "grad_norm": 0.8840848207473755, "learning_rate": 0.0002, "epoch": 2.998204667863555, "step": 41750}, {"loss": 0.6618, "grad_norm": 0.7302142977714539, "learning_rate": 0.0002, "epoch": 2.998922800718133, "step": 41760}, {"loss": 0.6596, "grad_norm": 0.7075994610786438, "learning_rate": 0.0002, "epoch": 2.999640933572711, "step": 41770}, {"eval_loss": 1.1079821586608887, "eval_runtime": 55.1897, "eval_samples_per_second": 13.281, "eval_steps_per_second": 1.667, "epoch": 3.0, "step": 41775}, {"loss": 0.6472, "grad_norm": 0.8630077838897705, "learning_rate": 0.0002, "epoch": 3.000359066427289, "step": 41780}, {"loss": 0.5843, "grad_norm": 0.8901806473731995, "learning_rate": 0.0002, "epoch": 3.001077199281867, "step": 41790}, {"loss": 0.5789, "grad_norm": 0.8291767835617065, "learning_rate": 0.0002, "epoch": 3.0017953321364454, "step": 41800}, {"loss": 0.6049, "grad_norm": 0.792519211769104, "learning_rate": 0.0002, "epoch": 3.0025134649910235, "step": 41810}, {"loss": 0.6131, "grad_norm": 1.1330063343048096, "learning_rate": 0.0002, "epoch": 3.0032315978456015, "step": 41820}, {"loss": 0.6225, "grad_norm": 0.9401350617408752, "learning_rate": 0.0002, "epoch": 3.0039497307001795, "step": 41830}, {"loss": 0.5924, "grad_norm": 0.8065463304519653, "learning_rate": 0.0002, "epoch": 3.0046678635547575, "step": 41840}, {"loss": 0.6161, "grad_norm": 0.8309979438781738, "learning_rate": 0.0002, "epoch": 3.005385996409336, "step": 41850}, {"loss": 0.6099, "grad_norm": 0.7432689070701599, "learning_rate": 0.0002, "epoch": 3.006104129263914, "step": 41860}, {"loss": 0.5901, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 3.006822262118492, "step": 41870}, {"loss": 0.6211, "grad_norm": 1.4364255666732788, "learning_rate": 0.0002, "epoch": 3.00754039497307, "step": 41880}, {"loss": 0.5988, "grad_norm": 0.9023072123527527, "learning_rate": 0.0002, "epoch": 3.008258527827648, "step": 41890}, {"loss": 0.6296, "grad_norm": 0.7790587544441223, "learning_rate": 0.0002, "epoch": 3.0089766606822264, "step": 41900}, {"loss": 0.5908, "grad_norm": 0.9163706302642822, "learning_rate": 0.0002, "epoch": 3.0096947935368044, "step": 41910}, {"loss": 0.6216, "grad_norm": 0.8147963285446167, "learning_rate": 0.0002, "epoch": 3.0104129263913824, "step": 41920}, {"loss": 0.6546, "grad_norm": 0.8432748913764954, "learning_rate": 0.0002, "epoch": 3.0111310592459604, "step": 41930}, {"loss": 0.5815, "grad_norm": 0.9216182231903076, "learning_rate": 0.0002, "epoch": 3.011849192100539, "step": 41940}, {"loss": 0.6336, "grad_norm": 0.62154221534729, "learning_rate": 0.0002, "epoch": 3.012567324955117, "step": 41950}, {"loss": 0.5868, "grad_norm": 0.8902392387390137, "learning_rate": 0.0002, "epoch": 3.013285457809695, "step": 41960}, {"loss": 0.6205, "grad_norm": 0.9601083993911743, "learning_rate": 0.0002, "epoch": 3.014003590664273, "step": 41970}, {"loss": 0.6001, "grad_norm": 0.8938809037208557, "learning_rate": 0.0002, "epoch": 3.014721723518851, "step": 41980}, {"loss": 0.6215, "grad_norm": 1.0621999502182007, "learning_rate": 0.0002, "epoch": 3.0154398563734293, "step": 41990}, {"loss": 0.6453, "grad_norm": 0.7310585379600525, "learning_rate": 0.0002, "epoch": 3.0161579892280073, "step": 42000}, {"loss": 0.5674, "grad_norm": 0.8475853800773621, "learning_rate": 0.0002, "epoch": 3.0168761220825853, "step": 42010}, {"loss": 0.605, "grad_norm": 0.8509864807128906, "learning_rate": 0.0002, "epoch": 3.0175942549371633, "step": 42020}, {"loss": 0.6487, "grad_norm": 0.7461876273155212, "learning_rate": 0.0002, "epoch": 3.0183123877917413, "step": 42030}, {"loss": 0.6136, "grad_norm": 0.7734265327453613, "learning_rate": 0.0002, "epoch": 3.0190305206463197, "step": 42040}, {"loss": 0.6073, "grad_norm": 0.9056455492973328, "learning_rate": 0.0002, "epoch": 3.0197486535008977, "step": 42050}, {"loss": 0.6015, "grad_norm": 0.9183889031410217, "learning_rate": 0.0002, "epoch": 3.0204667863554757, "step": 42060}, {"loss": 0.6502, "grad_norm": 1.0777326822280884, "learning_rate": 0.0002, "epoch": 3.0211849192100537, "step": 42070}, {"loss": 0.6775, "grad_norm": 0.9217308163642883, "learning_rate": 0.0002, "epoch": 3.021903052064632, "step": 42080}, {"loss": 0.6157, "grad_norm": 0.8220202326774597, "learning_rate": 0.0002, "epoch": 3.02262118491921, "step": 42090}, {"loss": 0.5786, "grad_norm": 0.8454978466033936, "learning_rate": 0.0002, "epoch": 3.023339317773788, "step": 42100}, {"loss": 0.5653, "grad_norm": 0.8116370439529419, "learning_rate": 0.0002, "epoch": 3.024057450628366, "step": 42110}, {"loss": 0.6307, "grad_norm": 0.8064935207366943, "learning_rate": 0.0002, "epoch": 3.024775583482944, "step": 42120}, {"loss": 0.6567, "grad_norm": 0.9718650579452515, "learning_rate": 0.0002, "epoch": 3.0254937163375226, "step": 42130}, {"loss": 0.5936, "grad_norm": 0.8817588090896606, "learning_rate": 0.0002, "epoch": 3.0262118491921006, "step": 42140}, {"loss": 0.5625, "grad_norm": 0.7757318615913391, "learning_rate": 0.0002, "epoch": 3.0269299820466786, "step": 42150}, {"loss": 0.5704, "grad_norm": 0.7500545382499695, "learning_rate": 0.0002, "epoch": 3.0276481149012566, "step": 42160}, {"loss": 0.5635, "grad_norm": 0.72913658618927, "learning_rate": 0.0002, "epoch": 3.0283662477558346, "step": 42170}, {"loss": 0.6354, "grad_norm": 0.7641891837120056, "learning_rate": 0.0002, "epoch": 3.029084380610413, "step": 42180}, {"loss": 0.621, "grad_norm": 0.7682021856307983, "learning_rate": 0.0002, "epoch": 3.029802513464991, "step": 42190}, {"loss": 0.6377, "grad_norm": 0.8145958781242371, "learning_rate": 0.0002, "epoch": 3.030520646319569, "step": 42200}, {"loss": 0.6008, "grad_norm": 1.0546396970748901, "learning_rate": 0.0002, "epoch": 3.031238779174147, "step": 42210}, {"loss": 0.6177, "grad_norm": 0.8222804665565491, "learning_rate": 0.0002, "epoch": 3.0319569120287255, "step": 42220}, {"loss": 0.6264, "grad_norm": 0.8245829343795776, "learning_rate": 0.0002, "epoch": 3.0326750448833035, "step": 42230}, {"loss": 0.5828, "grad_norm": 0.9059963822364807, "learning_rate": 0.0002, "epoch": 3.0333931777378815, "step": 42240}, {"loss": 0.6373, "grad_norm": 1.026747465133667, "learning_rate": 0.0002, "epoch": 3.0341113105924595, "step": 42250}, {"loss": 0.636, "grad_norm": 0.9108404517173767, "learning_rate": 0.0002, "epoch": 3.0348294434470375, "step": 42260}, {"loss": 0.589, "grad_norm": 0.9828516840934753, "learning_rate": 0.0002, "epoch": 3.035547576301616, "step": 42270}, {"loss": 0.6558, "grad_norm": 0.9664266705513, "learning_rate": 0.0002, "epoch": 3.036265709156194, "step": 42280}, {"loss": 0.6157, "grad_norm": 0.7577654719352722, "learning_rate": 0.0002, "epoch": 3.036983842010772, "step": 42290}, {"loss": 0.5849, "grad_norm": 0.8331853151321411, "learning_rate": 0.0002, "epoch": 3.03770197486535, "step": 42300}, {"loss": 0.6335, "grad_norm": 0.8017228245735168, "learning_rate": 0.0002, "epoch": 3.038420107719928, "step": 42310}, {"loss": 0.6148, "grad_norm": 1.0316718816757202, "learning_rate": 0.0002, "epoch": 3.0391382405745064, "step": 42320}, {"loss": 0.5934, "grad_norm": 0.9379803538322449, "learning_rate": 0.0002, "epoch": 3.0398563734290844, "step": 42330}, {"loss": 0.6358, "grad_norm": 0.7554476857185364, "learning_rate": 0.0002, "epoch": 3.0405745062836624, "step": 42340}, {"loss": 0.5951, "grad_norm": 0.7377917766571045, "learning_rate": 0.0002, "epoch": 3.0412926391382404, "step": 42350}, {"loss": 0.5769, "grad_norm": 1.0655276775360107, "learning_rate": 0.0002, "epoch": 3.042010771992819, "step": 42360}, {"loss": 0.5892, "grad_norm": 0.7748511433601379, "learning_rate": 0.0002, "epoch": 3.042728904847397, "step": 42370}, {"loss": 0.6512, "grad_norm": 0.848649799823761, "learning_rate": 0.0002, "epoch": 3.043447037701975, "step": 42380}, {"loss": 0.6411, "grad_norm": 0.7754636406898499, "learning_rate": 0.0002, "epoch": 3.044165170556553, "step": 42390}, {"loss": 0.6665, "grad_norm": 0.8173656463623047, "learning_rate": 0.0002, "epoch": 3.044883303411131, "step": 42400}, {"loss": 0.5877, "grad_norm": 0.7881983518600464, "learning_rate": 0.0002, "epoch": 3.0456014362657093, "step": 42410}, {"loss": 0.5832, "grad_norm": 0.971072256565094, "learning_rate": 0.0002, "epoch": 3.0463195691202873, "step": 42420}, {"loss": 0.6303, "grad_norm": 0.8400143384933472, "learning_rate": 0.0002, "epoch": 3.0470377019748653, "step": 42430}, {"loss": 0.6557, "grad_norm": 1.0028647184371948, "learning_rate": 0.0002, "epoch": 3.0477558348294433, "step": 42440}, {"loss": 0.5949, "grad_norm": 0.9728034734725952, "learning_rate": 0.0002, "epoch": 3.0484739676840213, "step": 42450}, {"loss": 0.6222, "grad_norm": 0.937633752822876, "learning_rate": 0.0002, "epoch": 3.0491921005386, "step": 42460}, {"loss": 0.6254, "grad_norm": 1.0265642404556274, "learning_rate": 0.0002, "epoch": 3.049910233393178, "step": 42470}, {"loss": 0.6078, "grad_norm": 0.9733216762542725, "learning_rate": 0.0002, "epoch": 3.050628366247756, "step": 42480}, {"loss": 0.5766, "grad_norm": 0.7039174437522888, "learning_rate": 0.0002, "epoch": 3.051346499102334, "step": 42490}, {"loss": 0.6422, "grad_norm": 0.7515231370925903, "learning_rate": 0.0002, "epoch": 3.0520646319569122, "step": 42500}, {"loss": 0.5517, "grad_norm": 0.9115300178527832, "learning_rate": 0.0002, "epoch": 3.0527827648114902, "step": 42510}, {"loss": 0.6738, "grad_norm": 0.7403655648231506, "learning_rate": 0.0002, "epoch": 3.0535008976660682, "step": 42520}, {"loss": 0.5528, "grad_norm": 0.7826810479164124, "learning_rate": 0.0002, "epoch": 3.0542190305206462, "step": 42530}, {"loss": 0.6513, "grad_norm": 0.8007349371910095, "learning_rate": 0.0002, "epoch": 3.0549371633752243, "step": 42540}, {"loss": 0.6118, "grad_norm": 0.7975959777832031, "learning_rate": 0.0002, "epoch": 3.0556552962298027, "step": 42550}, {"loss": 0.6157, "grad_norm": 0.9665228128433228, "learning_rate": 0.0002, "epoch": 3.0563734290843807, "step": 42560}, {"loss": 0.6095, "grad_norm": 0.8386123180389404, "learning_rate": 0.0002, "epoch": 3.0570915619389587, "step": 42570}, {"loss": 0.64, "grad_norm": 0.7437782287597656, "learning_rate": 0.0002, "epoch": 3.0578096947935367, "step": 42580}, {"loss": 0.6399, "grad_norm": 0.8360698223114014, "learning_rate": 0.0002, "epoch": 3.0585278276481147, "step": 42590}, {"loss": 0.6259, "grad_norm": 0.8982073664665222, "learning_rate": 0.0002, "epoch": 3.059245960502693, "step": 42600}, {"loss": 0.6235, "grad_norm": 0.9425758719444275, "learning_rate": 0.0002, "epoch": 3.059964093357271, "step": 42610}, {"loss": 0.631, "grad_norm": 0.8567131161689758, "learning_rate": 0.0002, "epoch": 3.060682226211849, "step": 42620}, {"loss": 0.609, "grad_norm": 0.9322942495346069, "learning_rate": 0.0002, "epoch": 3.061400359066427, "step": 42630}, {"loss": 0.6384, "grad_norm": 0.8283235430717468, "learning_rate": 0.0002, "epoch": 3.0621184919210056, "step": 42640}, {"loss": 0.6345, "grad_norm": 0.8457967638969421, "learning_rate": 0.0002, "epoch": 3.0628366247755836, "step": 42650}, {"loss": 0.631, "grad_norm": 0.8205100893974304, "learning_rate": 0.0002, "epoch": 3.0635547576301616, "step": 42660}, {"loss": 0.6094, "grad_norm": 0.8385181427001953, "learning_rate": 0.0002, "epoch": 3.0642728904847396, "step": 42670}, {"loss": 0.6169, "grad_norm": 1.2959390878677368, "learning_rate": 0.0002, "epoch": 3.0649910233393176, "step": 42680}, {"loss": 0.6531, "grad_norm": 0.7150540351867676, "learning_rate": 0.0002, "epoch": 3.065709156193896, "step": 42690}, {"loss": 0.6456, "grad_norm": 0.6647360920906067, "learning_rate": 0.0002, "epoch": 3.066427289048474, "step": 42700}, {"loss": 0.6151, "grad_norm": 0.9148316979408264, "learning_rate": 0.0002, "epoch": 3.067145421903052, "step": 42710}, {"loss": 0.6298, "grad_norm": 0.8606209754943848, "learning_rate": 0.0002, "epoch": 3.06786355475763, "step": 42720}, {"loss": 0.636, "grad_norm": 1.4255632162094116, "learning_rate": 0.0002, "epoch": 3.068581687612208, "step": 42730}, {"loss": 0.6363, "grad_norm": 0.9131710529327393, "learning_rate": 0.0002, "epoch": 3.0692998204667865, "step": 42740}, {"loss": 0.6432, "grad_norm": 0.9560360908508301, "learning_rate": 0.0002, "epoch": 3.0700179533213645, "step": 42750}, {"loss": 0.6259, "grad_norm": 0.9278100728988647, "learning_rate": 0.0002, "epoch": 3.0707360861759425, "step": 42760}, {"loss": 0.6001, "grad_norm": 0.7258471846580505, "learning_rate": 0.0002, "epoch": 3.0714542190305205, "step": 42770}, {"loss": 0.6447, "grad_norm": 1.1537690162658691, "learning_rate": 0.0002, "epoch": 3.072172351885099, "step": 42780}, {"loss": 0.6237, "grad_norm": 0.8562588691711426, "learning_rate": 0.0002, "epoch": 3.072890484739677, "step": 42790}, {"loss": 0.645, "grad_norm": 1.0271626710891724, "learning_rate": 0.0002, "epoch": 3.073608617594255, "step": 42800}, {"loss": 0.6782, "grad_norm": 0.85148024559021, "learning_rate": 0.0002, "epoch": 3.074326750448833, "step": 42810}, {"loss": 0.5905, "grad_norm": 0.805772602558136, "learning_rate": 0.0002, "epoch": 3.075044883303411, "step": 42820}, {"loss": 0.623, "grad_norm": 0.8057122230529785, "learning_rate": 0.0002, "epoch": 3.0757630161579894, "step": 42830}, {"loss": 0.6391, "grad_norm": 0.7997274994850159, "learning_rate": 0.0002, "epoch": 3.0764811490125674, "step": 42840}, {"loss": 0.5965, "grad_norm": 0.8739321231842041, "learning_rate": 0.0002, "epoch": 3.0771992818671454, "step": 42850}, {"loss": 0.6027, "grad_norm": 0.833951473236084, "learning_rate": 0.0002, "epoch": 3.0779174147217234, "step": 42860}, {"loss": 0.6251, "grad_norm": 0.8813839554786682, "learning_rate": 0.0002, "epoch": 3.0786355475763014, "step": 42870}, {"loss": 0.6485, "grad_norm": 0.9020521640777588, "learning_rate": 0.0002, "epoch": 3.07935368043088, "step": 42880}, {"loss": 0.5719, "grad_norm": 0.888148844242096, "learning_rate": 0.0002, "epoch": 3.080071813285458, "step": 42890}, {"loss": 0.6715, "grad_norm": 0.8110589385032654, "learning_rate": 0.0002, "epoch": 3.080789946140036, "step": 42900}, {"loss": 0.5931, "grad_norm": 0.818738579750061, "learning_rate": 0.0002, "epoch": 3.081508078994614, "step": 42910}, {"loss": 0.6723, "grad_norm": 0.9607479572296143, "learning_rate": 0.0002, "epoch": 3.082226211849192, "step": 42920}, {"loss": 0.6045, "grad_norm": 0.8162698745727539, "learning_rate": 0.0002, "epoch": 3.0829443447037703, "step": 42930}, {"loss": 0.5975, "grad_norm": 0.8170801997184753, "learning_rate": 0.0002, "epoch": 3.0836624775583483, "step": 42940}, {"loss": 0.5748, "grad_norm": 0.9250763654708862, "learning_rate": 0.0002, "epoch": 3.0843806104129263, "step": 42950}, {"loss": 0.6651, "grad_norm": 0.898097813129425, "learning_rate": 0.0002, "epoch": 3.0850987432675043, "step": 42960}, {"loss": 0.6573, "grad_norm": 0.9398433566093445, "learning_rate": 0.0002, "epoch": 3.0858168761220828, "step": 42970}, {"loss": 0.6243, "grad_norm": 1.052808165550232, "learning_rate": 0.0002, "epoch": 3.0865350089766608, "step": 42980}, {"loss": 0.6622, "grad_norm": 0.8974723219871521, "learning_rate": 0.0002, "epoch": 3.087253141831239, "step": 42990}, {"loss": 0.6135, "grad_norm": 0.7517408728599548, "learning_rate": 0.0002, "epoch": 3.087971274685817, "step": 43000}, {"loss": 0.6185, "grad_norm": 0.8054485321044922, "learning_rate": 0.0002, "epoch": 3.088689407540395, "step": 43010}, {"loss": 0.6199, "grad_norm": 0.9896154999732971, "learning_rate": 0.0002, "epoch": 3.0894075403949732, "step": 43020}, {"loss": 0.6308, "grad_norm": 0.7887356281280518, "learning_rate": 0.0002, "epoch": 3.0901256732495512, "step": 43030}, {"loss": 0.6173, "grad_norm": 1.0119125843048096, "learning_rate": 0.0002, "epoch": 3.0908438061041292, "step": 43040}, {"loss": 0.6294, "grad_norm": 0.8753892779350281, "learning_rate": 0.0002, "epoch": 3.0915619389587072, "step": 43050}, {"loss": 0.6068, "grad_norm": 0.8322654962539673, "learning_rate": 0.0002, "epoch": 3.0922800718132857, "step": 43060}, {"loss": 0.6237, "grad_norm": 1.0605992078781128, "learning_rate": 0.0002, "epoch": 3.0929982046678637, "step": 43070}, {"loss": 0.6507, "grad_norm": 0.8783912062644958, "learning_rate": 0.0002, "epoch": 3.0937163375224417, "step": 43080}, {"loss": 0.6023, "grad_norm": 0.8839107751846313, "learning_rate": 0.0002, "epoch": 3.0944344703770197, "step": 43090}, {"loss": 0.6588, "grad_norm": 1.1655086278915405, "learning_rate": 0.0002, "epoch": 3.0951526032315977, "step": 43100}, {"loss": 0.6367, "grad_norm": 0.7051523327827454, "learning_rate": 0.0002, "epoch": 3.095870736086176, "step": 43110}, {"loss": 0.5941, "grad_norm": 0.7793807983398438, "learning_rate": 0.0002, "epoch": 3.096588868940754, "step": 43120}, {"loss": 0.6073, "grad_norm": 0.8352194428443909, "learning_rate": 0.0002, "epoch": 3.097307001795332, "step": 43130}, {"loss": 0.6087, "grad_norm": 0.9684847593307495, "learning_rate": 0.0002, "epoch": 3.09802513464991, "step": 43140}, {"loss": 0.6347, "grad_norm": 1.1106340885162354, "learning_rate": 0.0002, "epoch": 3.098743267504488, "step": 43150}, {"loss": 0.6395, "grad_norm": 0.7814911603927612, "learning_rate": 0.0002, "epoch": 3.0994614003590666, "step": 43160}, {"loss": 0.637, "grad_norm": 0.7923110723495483, "learning_rate": 0.0002, "epoch": 3.1001795332136446, "step": 43170}, {"loss": 0.6218, "grad_norm": 0.87022864818573, "learning_rate": 0.0002, "epoch": 3.1008976660682226, "step": 43180}, {"loss": 0.6246, "grad_norm": 0.9352855682373047, "learning_rate": 0.0002, "epoch": 3.1016157989228006, "step": 43190}, {"loss": 0.5943, "grad_norm": 0.8548445105552673, "learning_rate": 0.0002, "epoch": 3.1023339317773786, "step": 43200}, {"loss": 0.6106, "grad_norm": 0.9576025009155273, "learning_rate": 0.0002, "epoch": 3.103052064631957, "step": 43210}, {"loss": 0.6222, "grad_norm": 0.7430430054664612, "learning_rate": 0.0002, "epoch": 3.103770197486535, "step": 43220}, {"loss": 0.6223, "grad_norm": 0.9619144797325134, "learning_rate": 0.0002, "epoch": 3.104488330341113, "step": 43230}, {"loss": 0.6171, "grad_norm": 0.8622338771820068, "learning_rate": 0.0002, "epoch": 3.105206463195691, "step": 43240}, {"loss": 0.6336, "grad_norm": 0.853489339351654, "learning_rate": 0.0002, "epoch": 3.1059245960502695, "step": 43250}, {"loss": 0.635, "grad_norm": 0.9253206849098206, "learning_rate": 0.0002, "epoch": 3.1066427289048475, "step": 43260}, {"loss": 0.68, "grad_norm": 0.9700671434402466, "learning_rate": 0.0002, "epoch": 3.1073608617594255, "step": 43270}, {"loss": 0.6284, "grad_norm": 1.0550731420516968, "learning_rate": 0.0002, "epoch": 3.1080789946140035, "step": 43280}, {"loss": 0.6389, "grad_norm": 0.939452052116394, "learning_rate": 0.0002, "epoch": 3.1087971274685815, "step": 43290}, {"loss": 0.621, "grad_norm": 0.8855276107788086, "learning_rate": 0.0002, "epoch": 3.10951526032316, "step": 43300}, {"loss": 0.5814, "grad_norm": 0.92197185754776, "learning_rate": 0.0002, "epoch": 3.110233393177738, "step": 43310}, {"loss": 0.6341, "grad_norm": 0.8825578689575195, "learning_rate": 0.0002, "epoch": 3.110951526032316, "step": 43320}, {"loss": 0.6412, "grad_norm": 0.9964608550071716, "learning_rate": 0.0002, "epoch": 3.111669658886894, "step": 43330}, {"loss": 0.6074, "grad_norm": 0.9070520401000977, "learning_rate": 0.0002, "epoch": 3.1123877917414724, "step": 43340}, {"loss": 0.6503, "grad_norm": 0.9699633717536926, "learning_rate": 0.0002, "epoch": 3.1131059245960504, "step": 43350}, {"loss": 0.6545, "grad_norm": 0.7384091019630432, "learning_rate": 0.0002, "epoch": 3.1138240574506284, "step": 43360}, {"loss": 0.6644, "grad_norm": 0.9445326328277588, "learning_rate": 0.0002, "epoch": 3.1145421903052064, "step": 43370}, {"loss": 0.6088, "grad_norm": 0.8906524181365967, "learning_rate": 0.0002, "epoch": 3.1152603231597844, "step": 43380}, {"loss": 0.6213, "grad_norm": 0.8850129246711731, "learning_rate": 0.0002, "epoch": 3.115978456014363, "step": 43390}, {"loss": 0.6156, "grad_norm": 0.7091860771179199, "learning_rate": 0.0002, "epoch": 3.116696588868941, "step": 43400}, {"loss": 0.6056, "grad_norm": 0.8992764949798584, "learning_rate": 0.0002, "epoch": 3.117414721723519, "step": 43410}, {"loss": 0.6336, "grad_norm": 0.9166698455810547, "learning_rate": 0.0002, "epoch": 3.118132854578097, "step": 43420}, {"loss": 0.7011, "grad_norm": 1.1195749044418335, "learning_rate": 0.0002, "epoch": 3.118850987432675, "step": 43430}, {"loss": 0.6409, "grad_norm": 0.9414069652557373, "learning_rate": 0.0002, "epoch": 3.1195691202872533, "step": 43440}, {"loss": 0.6533, "grad_norm": 0.7641217112541199, "learning_rate": 0.0002, "epoch": 3.1202872531418313, "step": 43450}, {"loss": 0.6613, "grad_norm": 1.2659285068511963, "learning_rate": 0.0002, "epoch": 3.1210053859964093, "step": 43460}, {"loss": 0.631, "grad_norm": 0.9968213438987732, "learning_rate": 0.0002, "epoch": 3.1217235188509873, "step": 43470}, {"loss": 0.5833, "grad_norm": 0.8819042444229126, "learning_rate": 0.0002, "epoch": 3.1224416517055653, "step": 43480}, {"loss": 0.6819, "grad_norm": 0.9124775528907776, "learning_rate": 0.0002, "epoch": 3.1231597845601438, "step": 43490}, {"loss": 0.675, "grad_norm": 0.868354082107544, "learning_rate": 0.0002, "epoch": 3.1238779174147218, "step": 43500}, {"loss": 0.6348, "grad_norm": 0.7367526292800903, "learning_rate": 0.0002, "epoch": 3.1245960502692998, "step": 43510}, {"loss": 0.6068, "grad_norm": 0.7553679943084717, "learning_rate": 0.0002, "epoch": 3.1253141831238778, "step": 43520}, {"loss": 0.6346, "grad_norm": 0.7970008850097656, "learning_rate": 0.0002, "epoch": 3.126032315978456, "step": 43530}, {"loss": 0.6357, "grad_norm": 0.9117488861083984, "learning_rate": 0.0002, "epoch": 3.126750448833034, "step": 43540}, {"loss": 0.6609, "grad_norm": 0.8004103899002075, "learning_rate": 0.0002, "epoch": 3.127468581687612, "step": 43550}, {"loss": 0.596, "grad_norm": 0.736518919467926, "learning_rate": 0.0002, "epoch": 3.12818671454219, "step": 43560}, {"loss": 0.5945, "grad_norm": 0.8568395376205444, "learning_rate": 0.0002, "epoch": 3.128904847396768, "step": 43570}, {"loss": 0.665, "grad_norm": 0.9344052672386169, "learning_rate": 0.0002, "epoch": 3.1296229802513467, "step": 43580}, {"loss": 0.6403, "grad_norm": 0.7986525297164917, "learning_rate": 0.0002, "epoch": 3.1303411131059247, "step": 43590}, {"loss": 0.61, "grad_norm": 0.8283242583274841, "learning_rate": 0.0002, "epoch": 3.1310592459605027, "step": 43600}, {"loss": 0.6003, "grad_norm": 0.6534292101860046, "learning_rate": 0.0002, "epoch": 3.1317773788150807, "step": 43610}, {"loss": 0.6994, "grad_norm": 0.9585428833961487, "learning_rate": 0.0002, "epoch": 3.132495511669659, "step": 43620}, {"loss": 0.6007, "grad_norm": 0.8299157023429871, "learning_rate": 0.0002, "epoch": 3.133213644524237, "step": 43630}, {"loss": 0.6169, "grad_norm": 0.9050052762031555, "learning_rate": 0.0002, "epoch": 3.133931777378815, "step": 43640}, {"loss": 0.6217, "grad_norm": 1.0457062721252441, "learning_rate": 0.0002, "epoch": 3.134649910233393, "step": 43650}, {"loss": 0.6147, "grad_norm": 0.907691240310669, "learning_rate": 0.0002, "epoch": 3.135368043087971, "step": 43660}, {"loss": 0.5808, "grad_norm": 0.8868935108184814, "learning_rate": 0.0002, "epoch": 3.1360861759425496, "step": 43670}, {"loss": 0.6427, "grad_norm": 0.8585456609725952, "learning_rate": 0.0002, "epoch": 3.1368043087971276, "step": 43680}, {"loss": 0.6242, "grad_norm": 1.0402741432189941, "learning_rate": 0.0002, "epoch": 3.1375224416517056, "step": 43690}, {"loss": 0.641, "grad_norm": 1.0866798162460327, "learning_rate": 0.0002, "epoch": 3.1382405745062836, "step": 43700}, {"loss": 0.6082, "grad_norm": 0.7637296915054321, "learning_rate": 0.0002, "epoch": 3.1389587073608616, "step": 43710}, {"loss": 0.6256, "grad_norm": 0.755235493183136, "learning_rate": 0.0002, "epoch": 3.13967684021544, "step": 43720}, {"loss": 0.6441, "grad_norm": 0.7258853316307068, "learning_rate": 0.0002, "epoch": 3.140394973070018, "step": 43730}, {"loss": 0.5891, "grad_norm": 1.0425268411636353, "learning_rate": 0.0002, "epoch": 3.141113105924596, "step": 43740}, {"loss": 0.6527, "grad_norm": 0.9171959757804871, "learning_rate": 0.0002, "epoch": 3.141831238779174, "step": 43750}, {"loss": 0.6365, "grad_norm": 0.8900150656700134, "learning_rate": 0.0002, "epoch": 3.142549371633752, "step": 43760}, {"loss": 0.6324, "grad_norm": 0.9879246354103088, "learning_rate": 0.0002, "epoch": 3.1432675044883305, "step": 43770}, {"loss": 0.6624, "grad_norm": 0.7853389382362366, "learning_rate": 0.0002, "epoch": 3.1439856373429085, "step": 43780}, {"loss": 0.6259, "grad_norm": 1.0245232582092285, "learning_rate": 0.0002, "epoch": 3.1447037701974865, "step": 43790}, {"loss": 0.6278, "grad_norm": 0.8486390113830566, "learning_rate": 0.0002, "epoch": 3.1454219030520645, "step": 43800}, {"loss": 0.6175, "grad_norm": 0.8536406755447388, "learning_rate": 0.0002, "epoch": 3.146140035906643, "step": 43810}, {"loss": 0.5901, "grad_norm": 0.9653734564781189, "learning_rate": 0.0002, "epoch": 3.146858168761221, "step": 43820}, {"loss": 0.6041, "grad_norm": 0.8292608857154846, "learning_rate": 0.0002, "epoch": 3.147576301615799, "step": 43830}, {"loss": 0.6688, "grad_norm": 1.147524118423462, "learning_rate": 0.0002, "epoch": 3.148294434470377, "step": 43840}, {"loss": 0.6155, "grad_norm": 0.9317546486854553, "learning_rate": 0.0002, "epoch": 3.149012567324955, "step": 43850}, {"loss": 0.6305, "grad_norm": 0.8651045560836792, "learning_rate": 0.0002, "epoch": 3.1497307001795334, "step": 43860}, {"loss": 0.5985, "grad_norm": 0.8718969225883484, "learning_rate": 0.0002, "epoch": 3.1504488330341114, "step": 43870}, {"loss": 0.6206, "grad_norm": 1.0140702724456787, "learning_rate": 0.0002, "epoch": 3.1511669658886894, "step": 43880}, {"loss": 0.5941, "grad_norm": 0.75941401720047, "learning_rate": 0.0002, "epoch": 3.1518850987432674, "step": 43890}, {"loss": 0.5957, "grad_norm": 0.6618940234184265, "learning_rate": 0.0002, "epoch": 3.152603231597846, "step": 43900}, {"loss": 0.6262, "grad_norm": 1.0013338327407837, "learning_rate": 0.0002, "epoch": 3.153321364452424, "step": 43910}, {"loss": 0.6263, "grad_norm": 0.8735299706459045, "learning_rate": 0.0002, "epoch": 3.154039497307002, "step": 43920}, {"loss": 0.627, "grad_norm": 1.141914963722229, "learning_rate": 0.0002, "epoch": 3.15475763016158, "step": 43930}, {"loss": 0.6604, "grad_norm": 1.0916038751602173, "learning_rate": 0.0002, "epoch": 3.155475763016158, "step": 43940}, {"loss": 0.6228, "grad_norm": 0.7042547464370728, "learning_rate": 0.0002, "epoch": 3.1561938958707363, "step": 43950}, {"loss": 0.6069, "grad_norm": 0.9885236620903015, "learning_rate": 0.0002, "epoch": 3.1569120287253143, "step": 43960}, {"loss": 0.5973, "grad_norm": 0.8083009719848633, "learning_rate": 0.0002, "epoch": 3.1576301615798923, "step": 43970}, {"loss": 0.6416, "grad_norm": 1.082627296447754, "learning_rate": 0.0002, "epoch": 3.1583482944344703, "step": 43980}, {"loss": 0.624, "grad_norm": 0.9293290376663208, "learning_rate": 0.0002, "epoch": 3.1590664272890483, "step": 43990}, {"loss": 0.5665, "grad_norm": 0.861003041267395, "learning_rate": 0.0002, "epoch": 3.1597845601436267, "step": 44000}, {"loss": 0.6221, "grad_norm": 0.9565994143486023, "learning_rate": 0.0002, "epoch": 3.1605026929982047, "step": 44010}, {"loss": 0.7038, "grad_norm": 0.9609305262565613, "learning_rate": 0.0002, "epoch": 3.1612208258527827, "step": 44020}, {"loss": 0.6064, "grad_norm": 0.847830593585968, "learning_rate": 0.0002, "epoch": 3.1619389587073607, "step": 44030}, {"loss": 0.6299, "grad_norm": 0.852357804775238, "learning_rate": 0.0002, "epoch": 3.1626570915619387, "step": 44040}, {"loss": 0.5943, "grad_norm": 0.8634562492370605, "learning_rate": 0.0002, "epoch": 3.163375224416517, "step": 44050}, {"loss": 0.6011, "grad_norm": 1.0259950160980225, "learning_rate": 0.0002, "epoch": 3.164093357271095, "step": 44060}, {"loss": 0.7039, "grad_norm": 0.9615250825881958, "learning_rate": 0.0002, "epoch": 3.164811490125673, "step": 44070}, {"loss": 0.6179, "grad_norm": 0.9892165660858154, "learning_rate": 0.0002, "epoch": 3.165529622980251, "step": 44080}, {"loss": 0.6295, "grad_norm": 0.8827354907989502, "learning_rate": 0.0002, "epoch": 3.1662477558348296, "step": 44090}, {"loss": 0.6131, "grad_norm": 0.9258168339729309, "learning_rate": 0.0002, "epoch": 3.1669658886894076, "step": 44100}, {"loss": 0.5746, "grad_norm": 0.7983399033546448, "learning_rate": 0.0002, "epoch": 3.1676840215439857, "step": 44110}, {"loss": 0.6075, "grad_norm": 0.9917809963226318, "learning_rate": 0.0002, "epoch": 3.1684021543985637, "step": 44120}, {"loss": 0.6474, "grad_norm": 1.058927297592163, "learning_rate": 0.0002, "epoch": 3.1691202872531417, "step": 44130}, {"loss": 0.6211, "grad_norm": 1.0095895528793335, "learning_rate": 0.0002, "epoch": 3.16983842010772, "step": 44140}, {"loss": 0.6586, "grad_norm": 0.9032495617866516, "learning_rate": 0.0002, "epoch": 3.170556552962298, "step": 44150}, {"loss": 0.6356, "grad_norm": 0.9391272664070129, "learning_rate": 0.0002, "epoch": 3.171274685816876, "step": 44160}, {"loss": 0.6324, "grad_norm": 0.990755558013916, "learning_rate": 0.0002, "epoch": 3.171992818671454, "step": 44170}, {"loss": 0.5647, "grad_norm": 0.9310759902000427, "learning_rate": 0.0002, "epoch": 3.172710951526032, "step": 44180}, {"loss": 0.6802, "grad_norm": 0.7698856592178345, "learning_rate": 0.0002, "epoch": 3.1734290843806106, "step": 44190}, {"loss": 0.6109, "grad_norm": 0.7735867500305176, "learning_rate": 0.0002, "epoch": 3.1741472172351886, "step": 44200}, {"loss": 0.6252, "grad_norm": 1.1447525024414062, "learning_rate": 0.0002, "epoch": 3.1748653500897666, "step": 44210}, {"loss": 0.6268, "grad_norm": 0.8667060136795044, "learning_rate": 0.0002, "epoch": 3.1755834829443446, "step": 44220}, {"loss": 0.6066, "grad_norm": 0.8596829771995544, "learning_rate": 0.0002, "epoch": 3.176301615798923, "step": 44230}, {"loss": 0.6142, "grad_norm": 0.8607654571533203, "learning_rate": 0.0002, "epoch": 3.177019748653501, "step": 44240}, {"loss": 0.6358, "grad_norm": 0.9346948266029358, "learning_rate": 0.0002, "epoch": 3.177737881508079, "step": 44250}, {"loss": 0.6099, "grad_norm": 0.852344810962677, "learning_rate": 0.0002, "epoch": 3.178456014362657, "step": 44260}, {"loss": 0.5759, "grad_norm": 0.9260450005531311, "learning_rate": 0.0002, "epoch": 3.179174147217235, "step": 44270}, {"loss": 0.6419, "grad_norm": 0.924053430557251, "learning_rate": 0.0002, "epoch": 3.1798922800718135, "step": 44280}, {"loss": 0.6456, "grad_norm": 1.001965045928955, "learning_rate": 0.0002, "epoch": 3.1806104129263915, "step": 44290}, {"loss": 0.6211, "grad_norm": 0.943215012550354, "learning_rate": 0.0002, "epoch": 3.1813285457809695, "step": 44300}, {"loss": 0.6261, "grad_norm": 1.006977915763855, "learning_rate": 0.0002, "epoch": 3.1820466786355475, "step": 44310}, {"loss": 0.6684, "grad_norm": 0.9768950343132019, "learning_rate": 0.0002, "epoch": 3.1827648114901255, "step": 44320}, {"loss": 0.6334, "grad_norm": 0.9297489523887634, "learning_rate": 0.0002, "epoch": 3.183482944344704, "step": 44330}, {"loss": 0.6291, "grad_norm": 0.9110919237136841, "learning_rate": 0.0002, "epoch": 3.184201077199282, "step": 44340}, {"loss": 0.6389, "grad_norm": 0.9821381568908691, "learning_rate": 0.0002, "epoch": 3.18491921005386, "step": 44350}, {"loss": 0.6342, "grad_norm": 0.8451243042945862, "learning_rate": 0.0002, "epoch": 3.185637342908438, "step": 44360}, {"loss": 0.6709, "grad_norm": 0.9676638245582581, "learning_rate": 0.0002, "epoch": 3.1863554757630164, "step": 44370}, {"loss": 0.6506, "grad_norm": 0.9826035499572754, "learning_rate": 0.0002, "epoch": 3.1870736086175944, "step": 44380}, {"loss": 0.6425, "grad_norm": 0.9453121423721313, "learning_rate": 0.0002, "epoch": 3.1877917414721724, "step": 44390}, {"loss": 0.6481, "grad_norm": 0.7766330242156982, "learning_rate": 0.0002, "epoch": 3.1885098743267504, "step": 44400}, {"loss": 0.6369, "grad_norm": 0.9302349090576172, "learning_rate": 0.0002, "epoch": 3.1892280071813284, "step": 44410}, {"loss": 0.5586, "grad_norm": 0.8335331082344055, "learning_rate": 0.0002, "epoch": 3.189946140035907, "step": 44420}, {"loss": 0.673, "grad_norm": 0.6722736358642578, "learning_rate": 0.0002, "epoch": 3.190664272890485, "step": 44430}, {"loss": 0.6809, "grad_norm": 0.9047536849975586, "learning_rate": 0.0002, "epoch": 3.191382405745063, "step": 44440}, {"loss": 0.6085, "grad_norm": 0.9653822183609009, "learning_rate": 0.0002, "epoch": 3.192100538599641, "step": 44450}, {"loss": 0.6071, "grad_norm": 0.7750703692436218, "learning_rate": 0.0002, "epoch": 3.192818671454219, "step": 44460}, {"loss": 0.6323, "grad_norm": 0.7767539024353027, "learning_rate": 0.0002, "epoch": 3.1935368043087973, "step": 44470}, {"loss": 0.6471, "grad_norm": 0.8597778081893921, "learning_rate": 0.0002, "epoch": 3.1942549371633753, "step": 44480}, {"loss": 0.6804, "grad_norm": 1.1711493730545044, "learning_rate": 0.0002, "epoch": 3.1949730700179533, "step": 44490}, {"loss": 0.5917, "grad_norm": 0.9025220274925232, "learning_rate": 0.0002, "epoch": 3.1956912028725313, "step": 44500}, {"loss": 0.6445, "grad_norm": 0.8084979057312012, "learning_rate": 0.0002, "epoch": 3.1964093357271093, "step": 44510}, {"loss": 0.5943, "grad_norm": 0.8475074172019958, "learning_rate": 0.0002, "epoch": 3.1971274685816877, "step": 44520}, {"loss": 0.5959, "grad_norm": 0.9915644526481628, "learning_rate": 0.0002, "epoch": 3.1978456014362657, "step": 44530}, {"loss": 0.627, "grad_norm": 0.992231547832489, "learning_rate": 0.0002, "epoch": 3.1985637342908437, "step": 44540}, {"loss": 0.625, "grad_norm": 0.9804556369781494, "learning_rate": 0.0002, "epoch": 3.1992818671454217, "step": 44550}, {"loss": 0.6534, "grad_norm": 1.045558214187622, "learning_rate": 0.0002, "epoch": 3.2, "step": 44560}, {"loss": 0.6201, "grad_norm": 1.0880261659622192, "learning_rate": 0.0002, "epoch": 3.200718132854578, "step": 44570}, {"loss": 0.6471, "grad_norm": 0.9511138200759888, "learning_rate": 0.0002, "epoch": 3.201436265709156, "step": 44580}, {"loss": 0.5961, "grad_norm": 0.9115344882011414, "learning_rate": 0.0002, "epoch": 3.202154398563734, "step": 44590}, {"loss": 0.6504, "grad_norm": 1.0738362073898315, "learning_rate": 0.0002, "epoch": 3.202872531418312, "step": 44600}, {"loss": 0.6324, "grad_norm": 0.8209697604179382, "learning_rate": 0.0002, "epoch": 3.2035906642728906, "step": 44610}, {"loss": 0.6445, "grad_norm": 0.9220197796821594, "learning_rate": 0.0002, "epoch": 3.2043087971274686, "step": 44620}, {"loss": 0.5798, "grad_norm": 0.8859700560569763, "learning_rate": 0.0002, "epoch": 3.2050269299820466, "step": 44630}, {"loss": 0.6185, "grad_norm": 0.9772757291793823, "learning_rate": 0.0002, "epoch": 3.2057450628366246, "step": 44640}, {"loss": 0.6528, "grad_norm": 0.9385574460029602, "learning_rate": 0.0002, "epoch": 3.206463195691203, "step": 44650}, {"loss": 0.6098, "grad_norm": 0.839958906173706, "learning_rate": 0.0002, "epoch": 3.207181328545781, "step": 44660}, {"loss": 0.6803, "grad_norm": 0.860478401184082, "learning_rate": 0.0002, "epoch": 3.207899461400359, "step": 44670}, {"loss": 0.683, "grad_norm": 0.846886396408081, "learning_rate": 0.0002, "epoch": 3.208617594254937, "step": 44680}, {"loss": 0.6312, "grad_norm": 0.8591006398200989, "learning_rate": 0.0002, "epoch": 3.209335727109515, "step": 44690}, {"loss": 0.6173, "grad_norm": 0.9236023426055908, "learning_rate": 0.0002, "epoch": 3.2100538599640935, "step": 44700}, {"loss": 0.6471, "grad_norm": 0.7348999977111816, "learning_rate": 0.0002, "epoch": 3.2107719928186715, "step": 44710}, {"loss": 0.6239, "grad_norm": 1.0041730403900146, "learning_rate": 0.0002, "epoch": 3.2114901256732495, "step": 44720}, {"loss": 0.6612, "grad_norm": 0.8382687568664551, "learning_rate": 0.0002, "epoch": 3.2122082585278275, "step": 44730}, {"loss": 0.6026, "grad_norm": 0.8253511190414429, "learning_rate": 0.0002, "epoch": 3.2129263913824055, "step": 44740}, {"loss": 0.6129, "grad_norm": 0.9589242935180664, "learning_rate": 0.0002, "epoch": 3.213644524236984, "step": 44750}, {"loss": 0.6476, "grad_norm": 0.8938157558441162, "learning_rate": 0.0002, "epoch": 3.214362657091562, "step": 44760}, {"loss": 0.6811, "grad_norm": 1.0085135698318481, "learning_rate": 0.0002, "epoch": 3.21508078994614, "step": 44770}, {"loss": 0.646, "grad_norm": 0.8647134304046631, "learning_rate": 0.0002, "epoch": 3.215798922800718, "step": 44780}, {"loss": 0.6169, "grad_norm": 1.09453284740448, "learning_rate": 0.0002, "epoch": 3.216517055655296, "step": 44790}, {"loss": 0.6156, "grad_norm": 0.8710666298866272, "learning_rate": 0.0002, "epoch": 3.2172351885098744, "step": 44800}, {"loss": 0.662, "grad_norm": 0.8080880641937256, "learning_rate": 0.0002, "epoch": 3.2179533213644524, "step": 44810}, {"loss": 0.6039, "grad_norm": 1.0440675020217896, "learning_rate": 0.0002, "epoch": 3.2186714542190304, "step": 44820}, {"loss": 0.6629, "grad_norm": 1.1036376953125, "learning_rate": 0.0002, "epoch": 3.2193895870736084, "step": 44830}, {"loss": 0.6474, "grad_norm": 0.8783546686172485, "learning_rate": 0.0002, "epoch": 3.220107719928187, "step": 44840}, {"loss": 0.6286, "grad_norm": 0.7816855907440186, "learning_rate": 0.0002, "epoch": 3.220825852782765, "step": 44850}, {"loss": 0.622, "grad_norm": 1.0099157094955444, "learning_rate": 0.0002, "epoch": 3.221543985637343, "step": 44860}, {"loss": 0.6668, "grad_norm": 1.054928183555603, "learning_rate": 0.0002, "epoch": 3.222262118491921, "step": 44870}, {"loss": 0.6104, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 3.222980251346499, "step": 44880}, {"loss": 0.686, "grad_norm": 0.9730798602104187, "learning_rate": 0.0002, "epoch": 3.2236983842010773, "step": 44890}, {"loss": 0.6533, "grad_norm": 0.7911382913589478, "learning_rate": 0.0002, "epoch": 3.2244165170556554, "step": 44900}, {"loss": 0.6466, "grad_norm": 0.9574400782585144, "learning_rate": 0.0002, "epoch": 3.2251346499102334, "step": 44910}, {"loss": 0.693, "grad_norm": 0.8101068139076233, "learning_rate": 0.0002, "epoch": 3.2258527827648114, "step": 44920}, {"loss": 0.6605, "grad_norm": 0.754146933555603, "learning_rate": 0.0002, "epoch": 3.22657091561939, "step": 44930}, {"loss": 0.6317, "grad_norm": 0.7471939921379089, "learning_rate": 0.0002, "epoch": 3.227289048473968, "step": 44940}, {"loss": 0.6378, "grad_norm": 1.0040855407714844, "learning_rate": 0.0002, "epoch": 3.228007181328546, "step": 44950}, {"loss": 0.6496, "grad_norm": 1.0016074180603027, "learning_rate": 0.0002, "epoch": 3.228725314183124, "step": 44960}, {"loss": 0.6, "grad_norm": 1.0432976484298706, "learning_rate": 0.0002, "epoch": 3.229443447037702, "step": 44970}, {"loss": 0.635, "grad_norm": 0.8517055511474609, "learning_rate": 0.0002, "epoch": 3.2301615798922803, "step": 44980}, {"loss": 0.6168, "grad_norm": 0.9174178242683411, "learning_rate": 0.0002, "epoch": 3.2308797127468583, "step": 44990}, {"loss": 0.6325, "grad_norm": 0.9733774065971375, "learning_rate": 0.0002, "epoch": 3.2315978456014363, "step": 45000}, {"loss": 0.6743, "grad_norm": 0.9074714779853821, "learning_rate": 0.0002, "epoch": 3.2323159784560143, "step": 45010}, {"loss": 0.6372, "grad_norm": 0.8802759051322937, "learning_rate": 0.0002, "epoch": 3.2330341113105923, "step": 45020}, {"loss": 0.6189, "grad_norm": 1.0620871782302856, "learning_rate": 0.0002, "epoch": 3.2337522441651707, "step": 45030}, {"loss": 0.6201, "grad_norm": 0.8069542050361633, "learning_rate": 0.0002, "epoch": 3.2344703770197487, "step": 45040}, {"loss": 0.618, "grad_norm": 0.9139137864112854, "learning_rate": 0.0002, "epoch": 3.2351885098743267, "step": 45050}, {"loss": 0.6389, "grad_norm": 0.8936411142349243, "learning_rate": 0.0002, "epoch": 3.2359066427289047, "step": 45060}, {"loss": 0.6602, "grad_norm": 0.9098079204559326, "learning_rate": 0.0002, "epoch": 3.2366247755834827, "step": 45070}, {"loss": 0.6423, "grad_norm": 1.062953233718872, "learning_rate": 0.0002, "epoch": 3.237342908438061, "step": 45080}, {"loss": 0.6527, "grad_norm": 0.8656470775604248, "learning_rate": 0.0002, "epoch": 3.238061041292639, "step": 45090}, {"loss": 0.6362, "grad_norm": 0.9299449920654297, "learning_rate": 0.0002, "epoch": 3.238779174147217, "step": 45100}, {"loss": 0.6469, "grad_norm": 1.0102022886276245, "learning_rate": 0.0002, "epoch": 3.239497307001795, "step": 45110}, {"loss": 0.5984, "grad_norm": 0.8074561953544617, "learning_rate": 0.0002, "epoch": 3.2402154398563736, "step": 45120}, {"loss": 0.6196, "grad_norm": 1.044105887413025, "learning_rate": 0.0002, "epoch": 3.2409335727109516, "step": 45130}, {"loss": 0.6471, "grad_norm": 0.8742762207984924, "learning_rate": 0.0002, "epoch": 3.2416517055655296, "step": 45140}, {"loss": 0.648, "grad_norm": 0.8240015506744385, "learning_rate": 0.0002, "epoch": 3.2423698384201076, "step": 45150}, {"loss": 0.6599, "grad_norm": 0.8438951373100281, "learning_rate": 0.0002, "epoch": 3.2430879712746856, "step": 45160}, {"loss": 0.6406, "grad_norm": 1.02358877658844, "learning_rate": 0.0002, "epoch": 3.243806104129264, "step": 45170}, {"loss": 0.6581, "grad_norm": 0.8824774026870728, "learning_rate": 0.0002, "epoch": 3.244524236983842, "step": 45180}, {"loss": 0.658, "grad_norm": 0.971015989780426, "learning_rate": 0.0002, "epoch": 3.24524236983842, "step": 45190}, {"loss": 0.6473, "grad_norm": 0.9282383918762207, "learning_rate": 0.0002, "epoch": 3.245960502692998, "step": 45200}, {"loss": 0.6376, "grad_norm": 0.7908362746238708, "learning_rate": 0.0002, "epoch": 3.2466786355475765, "step": 45210}, {"loss": 0.6765, "grad_norm": 1.0721662044525146, "learning_rate": 0.0002, "epoch": 3.2473967684021545, "step": 45220}, {"loss": 0.7102, "grad_norm": 0.9516810774803162, "learning_rate": 0.0002, "epoch": 3.2481149012567325, "step": 45230}, {"loss": 0.6332, "grad_norm": 0.7914131283760071, "learning_rate": 0.0002, "epoch": 3.2488330341113105, "step": 45240}, {"loss": 0.6018, "grad_norm": 0.8492292761802673, "learning_rate": 0.0002, "epoch": 3.2495511669658885, "step": 45250}, {"loss": 0.6272, "grad_norm": 0.8880114555358887, "learning_rate": 0.0002, "epoch": 3.250269299820467, "step": 45260}, {"loss": 0.6394, "grad_norm": 0.7808310985565186, "learning_rate": 0.0002, "epoch": 3.250987432675045, "step": 45270}, {"loss": 0.6161, "grad_norm": 0.8566828966140747, "learning_rate": 0.0002, "epoch": 3.251705565529623, "step": 45280}, {"loss": 0.6408, "grad_norm": 0.7929658889770508, "learning_rate": 0.0002, "epoch": 3.252423698384201, "step": 45290}, {"loss": 0.6182, "grad_norm": 0.678207516670227, "learning_rate": 0.0002, "epoch": 3.253141831238779, "step": 45300}, {"loss": 0.6315, "grad_norm": 0.9963029623031616, "learning_rate": 0.0002, "epoch": 3.2538599640933574, "step": 45310}, {"loss": 0.6496, "grad_norm": 0.835304856300354, "learning_rate": 0.0002, "epoch": 3.2545780969479354, "step": 45320}, {"loss": 0.6099, "grad_norm": 0.7281617522239685, "learning_rate": 0.0002, "epoch": 3.2552962298025134, "step": 45330}, {"loss": 0.6224, "grad_norm": 1.244890570640564, "learning_rate": 0.0002, "epoch": 3.2560143626570914, "step": 45340}, {"loss": 0.6317, "grad_norm": 0.8372750282287598, "learning_rate": 0.0002, "epoch": 3.2567324955116694, "step": 45350}, {"loss": 0.604, "grad_norm": 1.0029667615890503, "learning_rate": 0.0002, "epoch": 3.257450628366248, "step": 45360}, {"loss": 0.596, "grad_norm": 0.8561908602714539, "learning_rate": 0.0002, "epoch": 3.258168761220826, "step": 45370}, {"loss": 0.6185, "grad_norm": 1.0058085918426514, "learning_rate": 0.0002, "epoch": 3.258886894075404, "step": 45380}, {"loss": 0.6415, "grad_norm": 0.7768221497535706, "learning_rate": 0.0002, "epoch": 3.259605026929982, "step": 45390}, {"loss": 0.635, "grad_norm": 0.8443793058395386, "learning_rate": 0.0002, "epoch": 3.2603231597845603, "step": 45400}, {"loss": 0.6579, "grad_norm": 1.0140392780303955, "learning_rate": 0.0002, "epoch": 3.2610412926391383, "step": 45410}, {"loss": 0.6434, "grad_norm": 0.8397058248519897, "learning_rate": 0.0002, "epoch": 3.2617594254937163, "step": 45420}, {"loss": 0.6361, "grad_norm": 0.9717063903808594, "learning_rate": 0.0002, "epoch": 3.2624775583482943, "step": 45430}, {"loss": 0.6837, "grad_norm": 1.0279473066329956, "learning_rate": 0.0002, "epoch": 3.2631956912028723, "step": 45440}, {"loss": 0.6274, "grad_norm": 1.207457184791565, "learning_rate": 0.0002, "epoch": 3.263913824057451, "step": 45450}, {"loss": 0.681, "grad_norm": 0.8121998906135559, "learning_rate": 0.0002, "epoch": 3.264631956912029, "step": 45460}, {"loss": 0.6202, "grad_norm": 1.037733554840088, "learning_rate": 0.0002, "epoch": 3.265350089766607, "step": 45470}, {"loss": 0.6146, "grad_norm": 0.9305754899978638, "learning_rate": 0.0002, "epoch": 3.266068222621185, "step": 45480}, {"loss": 0.6186, "grad_norm": 0.9733602404594421, "learning_rate": 0.0002, "epoch": 3.2667863554757632, "step": 45490}, {"loss": 0.6713, "grad_norm": 0.8345039486885071, "learning_rate": 0.0002, "epoch": 3.2675044883303412, "step": 45500}, {"loss": 0.6315, "grad_norm": 0.8601692318916321, "learning_rate": 0.0002, "epoch": 3.2682226211849192, "step": 45510}, {"loss": 0.5953, "grad_norm": 0.7921277284622192, "learning_rate": 0.0002, "epoch": 3.2689407540394972, "step": 45520}, {"loss": 0.6781, "grad_norm": 0.8324153423309326, "learning_rate": 0.0002, "epoch": 3.2696588868940752, "step": 45530}, {"loss": 0.6413, "grad_norm": 0.85141521692276, "learning_rate": 0.0002, "epoch": 3.2703770197486537, "step": 45540}, {"loss": 0.654, "grad_norm": 0.9399608373641968, "learning_rate": 0.0002, "epoch": 3.2710951526032317, "step": 45550}, {"loss": 0.6364, "grad_norm": 0.9829166531562805, "learning_rate": 0.0002, "epoch": 3.2718132854578097, "step": 45560}, {"loss": 0.627, "grad_norm": 0.9936266541481018, "learning_rate": 0.0002, "epoch": 3.2725314183123877, "step": 45570}, {"loss": 0.6465, "grad_norm": 1.036165714263916, "learning_rate": 0.0002, "epoch": 3.2732495511669657, "step": 45580}, {"loss": 0.6216, "grad_norm": 0.8988680243492126, "learning_rate": 0.0002, "epoch": 3.273967684021544, "step": 45590}, {"loss": 0.6368, "grad_norm": 0.9173405766487122, "learning_rate": 0.0002, "epoch": 3.274685816876122, "step": 45600}, {"loss": 0.6455, "grad_norm": 0.9967324733734131, "learning_rate": 0.0002, "epoch": 3.2754039497307, "step": 45610}, {"loss": 0.6236, "grad_norm": 0.9097777009010315, "learning_rate": 0.0002, "epoch": 3.276122082585278, "step": 45620}, {"loss": 0.632, "grad_norm": 1.0559430122375488, "learning_rate": 0.0002, "epoch": 3.276840215439856, "step": 45630}, {"loss": 0.5999, "grad_norm": 0.9583360552787781, "learning_rate": 0.0002, "epoch": 3.2775583482944346, "step": 45640}, {"loss": 0.6329, "grad_norm": 0.7630334496498108, "learning_rate": 0.0002, "epoch": 3.2782764811490126, "step": 45650}, {"loss": 0.6873, "grad_norm": 0.9955230355262756, "learning_rate": 0.0002, "epoch": 3.2789946140035906, "step": 45660}, {"loss": 0.6216, "grad_norm": 0.8685793876647949, "learning_rate": 0.0002, "epoch": 3.2797127468581686, "step": 45670}, {"loss": 0.6243, "grad_norm": 0.919913113117218, "learning_rate": 0.0002, "epoch": 3.280430879712747, "step": 45680}, {"loss": 0.6334, "grad_norm": 0.826144814491272, "learning_rate": 0.0002, "epoch": 3.281149012567325, "step": 45690}, {"loss": 0.6359, "grad_norm": 0.9750179052352905, "learning_rate": 0.0002, "epoch": 3.281867145421903, "step": 45700}, {"loss": 0.6589, "grad_norm": 0.7931897640228271, "learning_rate": 0.0002, "epoch": 3.282585278276481, "step": 45710}, {"loss": 0.6785, "grad_norm": 1.0380089282989502, "learning_rate": 0.0002, "epoch": 3.283303411131059, "step": 45720}, {"loss": 0.6219, "grad_norm": 0.8220566511154175, "learning_rate": 0.0002, "epoch": 3.2840215439856375, "step": 45730}, {"loss": 0.5737, "grad_norm": 0.9688239693641663, "learning_rate": 0.0002, "epoch": 3.2847396768402155, "step": 45740}, {"loss": 0.603, "grad_norm": 0.8760311603546143, "learning_rate": 0.0002, "epoch": 3.2854578096947935, "step": 45750}, {"loss": 0.6134, "grad_norm": 0.8103382587432861, "learning_rate": 0.0002, "epoch": 3.2861759425493715, "step": 45760}, {"loss": 0.6475, "grad_norm": 0.8835865259170532, "learning_rate": 0.0002, "epoch": 3.28689407540395, "step": 45770}, {"loss": 0.6423, "grad_norm": 0.9021160006523132, "learning_rate": 0.0002, "epoch": 3.287612208258528, "step": 45780}, {"loss": 0.6693, "grad_norm": 0.8182386159896851, "learning_rate": 0.0002, "epoch": 3.288330341113106, "step": 45790}, {"loss": 0.6408, "grad_norm": 0.8555024862289429, "learning_rate": 0.0002, "epoch": 3.289048473967684, "step": 45800}, {"loss": 0.6839, "grad_norm": 1.0982348918914795, "learning_rate": 0.0002, "epoch": 3.289766606822262, "step": 45810}, {"loss": 0.6323, "grad_norm": 1.06246817111969, "learning_rate": 0.0002, "epoch": 3.2904847396768404, "step": 45820}, {"loss": 0.5924, "grad_norm": 1.1727149486541748, "learning_rate": 0.0002, "epoch": 3.2912028725314184, "step": 45830}, {"loss": 0.624, "grad_norm": 0.8224700093269348, "learning_rate": 0.0002, "epoch": 3.2919210053859964, "step": 45840}, {"loss": 0.6445, "grad_norm": 0.8195698261260986, "learning_rate": 0.0002, "epoch": 3.2926391382405744, "step": 45850}, {"loss": 0.6106, "grad_norm": 0.8424476981163025, "learning_rate": 0.0002, "epoch": 3.2933572710951524, "step": 45860}, {"loss": 0.6705, "grad_norm": 0.9804632067680359, "learning_rate": 0.0002, "epoch": 3.294075403949731, "step": 45870}, {"loss": 0.6538, "grad_norm": 0.8701804876327515, "learning_rate": 0.0002, "epoch": 3.294793536804309, "step": 45880}, {"loss": 0.6264, "grad_norm": 0.8876864910125732, "learning_rate": 0.0002, "epoch": 3.295511669658887, "step": 45890}, {"loss": 0.6401, "grad_norm": 1.0105448961257935, "learning_rate": 0.0002, "epoch": 3.296229802513465, "step": 45900}, {"loss": 0.687, "grad_norm": 0.847017228603363, "learning_rate": 0.0002, "epoch": 3.296947935368043, "step": 45910}, {"loss": 0.6433, "grad_norm": 0.7610297799110413, "learning_rate": 0.0002, "epoch": 3.2976660682226213, "step": 45920}, {"loss": 0.6499, "grad_norm": 0.7272670269012451, "learning_rate": 0.0002, "epoch": 3.2983842010771993, "step": 45930}, {"loss": 0.6366, "grad_norm": 0.8243510127067566, "learning_rate": 0.0002, "epoch": 3.2991023339317773, "step": 45940}, {"loss": 0.6498, "grad_norm": 1.0113074779510498, "learning_rate": 0.0002, "epoch": 3.2998204667863553, "step": 45950}, {"loss": 0.6639, "grad_norm": 0.8578087687492371, "learning_rate": 0.0002, "epoch": 3.3005385996409338, "step": 45960}, {"loss": 0.6137, "grad_norm": 0.9511606097221375, "learning_rate": 0.0002, "epoch": 3.3012567324955118, "step": 45970}, {"loss": 0.6115, "grad_norm": 0.8612566590309143, "learning_rate": 0.0002, "epoch": 3.3019748653500898, "step": 45980}, {"loss": 0.6799, "grad_norm": 0.8702331185340881, "learning_rate": 0.0002, "epoch": 3.3026929982046678, "step": 45990}, {"loss": 0.6429, "grad_norm": 1.0229583978652954, "learning_rate": 0.0002, "epoch": 3.3034111310592458, "step": 46000}, {"loss": 0.6054, "grad_norm": 1.1775577068328857, "learning_rate": 0.0002, "epoch": 3.304129263913824, "step": 46010}, {"loss": 0.6958, "grad_norm": 0.9922171831130981, "learning_rate": 0.0002, "epoch": 3.3048473967684022, "step": 46020}, {"loss": 0.6642, "grad_norm": 0.8246880769729614, "learning_rate": 0.0002, "epoch": 3.3055655296229802, "step": 46030}, {"loss": 0.678, "grad_norm": 0.9351653456687927, "learning_rate": 0.0002, "epoch": 3.3062836624775582, "step": 46040}, {"loss": 0.649, "grad_norm": 0.9617429375648499, "learning_rate": 0.0002, "epoch": 3.3070017953321367, "step": 46050}, {"loss": 0.6314, "grad_norm": 0.9753885269165039, "learning_rate": 0.0002, "epoch": 3.3077199281867147, "step": 46060}, {"loss": 0.6434, "grad_norm": 0.8532425165176392, "learning_rate": 0.0002, "epoch": 3.3084380610412927, "step": 46070}, {"loss": 0.6312, "grad_norm": 0.9722012877464294, "learning_rate": 0.0002, "epoch": 3.3091561938958707, "step": 46080}, {"loss": 0.6629, "grad_norm": 0.8950021266937256, "learning_rate": 0.0002, "epoch": 3.3098743267504487, "step": 46090}, {"loss": 0.6278, "grad_norm": 0.8536333441734314, "learning_rate": 0.0002, "epoch": 3.3105924596050267, "step": 46100}, {"loss": 0.6359, "grad_norm": 0.9423946738243103, "learning_rate": 0.0002, "epoch": 3.311310592459605, "step": 46110}, {"loss": 0.6647, "grad_norm": 0.8573169112205505, "learning_rate": 0.0002, "epoch": 3.312028725314183, "step": 46120}, {"loss": 0.6127, "grad_norm": 1.0122376680374146, "learning_rate": 0.0002, "epoch": 3.312746858168761, "step": 46130}, {"loss": 0.6782, "grad_norm": 0.7492560744285583, "learning_rate": 0.0002, "epoch": 3.313464991023339, "step": 46140}, {"loss": 0.6315, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 3.3141831238779176, "step": 46150}, {"loss": 0.6051, "grad_norm": 1.1191970109939575, "learning_rate": 0.0002, "epoch": 3.3149012567324956, "step": 46160}, {"loss": 0.6247, "grad_norm": 0.9847373962402344, "learning_rate": 0.0002, "epoch": 3.3156193895870736, "step": 46170}, {"loss": 0.661, "grad_norm": 0.7315911054611206, "learning_rate": 0.0002, "epoch": 3.3163375224416516, "step": 46180}, {"loss": 0.6017, "grad_norm": 0.8267890214920044, "learning_rate": 0.0002, "epoch": 3.3170556552962296, "step": 46190}, {"loss": 0.6202, "grad_norm": 0.8898099064826965, "learning_rate": 0.0002, "epoch": 3.317773788150808, "step": 46200}, {"loss": 0.651, "grad_norm": 0.8525369167327881, "learning_rate": 0.0002, "epoch": 3.318491921005386, "step": 46210}, {"loss": 0.6705, "grad_norm": 0.8074760437011719, "learning_rate": 0.0002, "epoch": 3.319210053859964, "step": 46220}, {"loss": 0.641, "grad_norm": 0.8473616242408752, "learning_rate": 0.0002, "epoch": 3.319928186714542, "step": 46230}, {"loss": 0.6092, "grad_norm": 0.8678314089775085, "learning_rate": 0.0002, "epoch": 3.3206463195691205, "step": 46240}, {"loss": 0.655, "grad_norm": 0.8718782067298889, "learning_rate": 0.0002, "epoch": 3.3213644524236985, "step": 46250}, {"loss": 0.6266, "grad_norm": 0.9384858012199402, "learning_rate": 0.0002, "epoch": 3.3220825852782765, "step": 46260}, {"loss": 0.6393, "grad_norm": 0.9295032620429993, "learning_rate": 0.0002, "epoch": 3.3228007181328545, "step": 46270}, {"loss": 0.6824, "grad_norm": 0.9472482800483704, "learning_rate": 0.0002, "epoch": 3.3235188509874325, "step": 46280}, {"loss": 0.6177, "grad_norm": 0.7970638275146484, "learning_rate": 0.0002, "epoch": 3.324236983842011, "step": 46290}, {"loss": 0.6431, "grad_norm": 0.9508723020553589, "learning_rate": 0.0002, "epoch": 3.324955116696589, "step": 46300}, {"loss": 0.6126, "grad_norm": 0.9153636693954468, "learning_rate": 0.0002, "epoch": 3.325673249551167, "step": 46310}, {"loss": 0.6042, "grad_norm": 0.7890323400497437, "learning_rate": 0.0002, "epoch": 3.326391382405745, "step": 46320}, {"loss": 0.6525, "grad_norm": 0.8711825609207153, "learning_rate": 0.0002, "epoch": 3.3271095152603234, "step": 46330}, {"loss": 0.6253, "grad_norm": 0.9938926696777344, "learning_rate": 0.0002, "epoch": 3.3278276481149014, "step": 46340}, {"loss": 0.6227, "grad_norm": 0.8497524857521057, "learning_rate": 0.0002, "epoch": 3.3285457809694794, "step": 46350}, {"loss": 0.6472, "grad_norm": 0.9191650748252869, "learning_rate": 0.0002, "epoch": 3.3292639138240574, "step": 46360}, {"loss": 0.6385, "grad_norm": 0.8974085450172424, "learning_rate": 0.0002, "epoch": 3.3299820466786354, "step": 46370}, {"loss": 0.618, "grad_norm": 0.9928934574127197, "learning_rate": 0.0002, "epoch": 3.3307001795332134, "step": 46380}, {"loss": 0.6254, "grad_norm": 0.9011030197143555, "learning_rate": 0.0002, "epoch": 3.331418312387792, "step": 46390}, {"loss": 0.6146, "grad_norm": 0.898594856262207, "learning_rate": 0.0002, "epoch": 3.33213644524237, "step": 46400}, {"loss": 0.6321, "grad_norm": 0.7506672143936157, "learning_rate": 0.0002, "epoch": 3.332854578096948, "step": 46410}, {"loss": 0.6329, "grad_norm": 0.9239172339439392, "learning_rate": 0.0002, "epoch": 3.333572710951526, "step": 46420}, {"loss": 0.6278, "grad_norm": 1.0749682188034058, "learning_rate": 0.0002, "epoch": 3.3342908438061043, "step": 46430}, {"loss": 0.6568, "grad_norm": 0.9262617230415344, "learning_rate": 0.0002, "epoch": 3.3350089766606823, "step": 46440}, {"loss": 0.6034, "grad_norm": 0.8681274056434631, "learning_rate": 0.0002, "epoch": 3.3357271095152603, "step": 46450}, {"loss": 0.6261, "grad_norm": 0.9558620452880859, "learning_rate": 0.0002, "epoch": 3.3364452423698383, "step": 46460}, {"loss": 0.6087, "grad_norm": 0.8907097578048706, "learning_rate": 0.0002, "epoch": 3.3371633752244163, "step": 46470}, {"loss": 0.6356, "grad_norm": 1.0941565036773682, "learning_rate": 0.0002, "epoch": 3.3378815080789948, "step": 46480}, {"loss": 0.6536, "grad_norm": 0.8971590995788574, "learning_rate": 0.0002, "epoch": 3.3385996409335728, "step": 46490}, {"loss": 0.6252, "grad_norm": 1.0315606594085693, "learning_rate": 0.0002, "epoch": 3.3393177737881508, "step": 46500}, {"loss": 0.5819, "grad_norm": 0.7717124223709106, "learning_rate": 0.0002, "epoch": 3.3400359066427288, "step": 46510}, {"loss": 0.612, "grad_norm": 0.8060970902442932, "learning_rate": 0.0002, "epoch": 3.340754039497307, "step": 46520}, {"loss": 0.7036, "grad_norm": 0.969510018825531, "learning_rate": 0.0002, "epoch": 3.341472172351885, "step": 46530}, {"loss": 0.6163, "grad_norm": 0.8837248682975769, "learning_rate": 0.0002, "epoch": 3.342190305206463, "step": 46540}, {"loss": 0.6762, "grad_norm": 0.9561076164245605, "learning_rate": 0.0002, "epoch": 3.342908438061041, "step": 46550}, {"loss": 0.687, "grad_norm": 0.8529208898544312, "learning_rate": 0.0002, "epoch": 3.343626570915619, "step": 46560}, {"loss": 0.611, "grad_norm": 1.1300519704818726, "learning_rate": 0.0002, "epoch": 3.3443447037701977, "step": 46570}, {"loss": 0.6088, "grad_norm": 0.8330956101417542, "learning_rate": 0.0002, "epoch": 3.3450628366247757, "step": 46580}, {"loss": 0.6725, "grad_norm": 0.7699366211891174, "learning_rate": 0.0002, "epoch": 3.3457809694793537, "step": 46590}, {"loss": 0.6667, "grad_norm": 1.0470821857452393, "learning_rate": 0.0002, "epoch": 3.3464991023339317, "step": 46600}, {"loss": 0.6408, "grad_norm": 0.9933704137802124, "learning_rate": 0.0002, "epoch": 3.34721723518851, "step": 46610}, {"loss": 0.6416, "grad_norm": 0.8130798935890198, "learning_rate": 0.0002, "epoch": 3.347935368043088, "step": 46620}, {"loss": 0.6576, "grad_norm": 0.9746946692466736, "learning_rate": 0.0002, "epoch": 3.348653500897666, "step": 46630}, {"loss": 0.6254, "grad_norm": 0.8607267141342163, "learning_rate": 0.0002, "epoch": 3.349371633752244, "step": 46640}, {"loss": 0.6639, "grad_norm": 0.800335705280304, "learning_rate": 0.0002, "epoch": 3.350089766606822, "step": 46650}, {"loss": 0.6749, "grad_norm": 1.0083239078521729, "learning_rate": 0.0002, "epoch": 3.3508078994614, "step": 46660}, {"loss": 0.6606, "grad_norm": 1.0774433612823486, "learning_rate": 0.0002, "epoch": 3.3515260323159786, "step": 46670}, {"loss": 0.6408, "grad_norm": 0.9378824234008789, "learning_rate": 0.0002, "epoch": 3.3522441651705566, "step": 46680}, {"loss": 0.5879, "grad_norm": 0.8490564227104187, "learning_rate": 0.0002, "epoch": 3.3529622980251346, "step": 46690}, {"loss": 0.6364, "grad_norm": 1.0415582656860352, "learning_rate": 0.0002, "epoch": 3.3536804308797126, "step": 46700}, {"loss": 0.5813, "grad_norm": 0.8514367938041687, "learning_rate": 0.0002, "epoch": 3.354398563734291, "step": 46710}, {"loss": 0.6847, "grad_norm": 0.7691360712051392, "learning_rate": 0.0002, "epoch": 3.355116696588869, "step": 46720}, {"loss": 0.6295, "grad_norm": 0.8345438241958618, "learning_rate": 0.0002, "epoch": 3.355834829443447, "step": 46730}, {"loss": 0.6093, "grad_norm": 1.023492693901062, "learning_rate": 0.0002, "epoch": 3.356552962298025, "step": 46740}, {"loss": 0.5997, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 3.357271095152603, "step": 46750}, {"loss": 0.6379, "grad_norm": 0.9029248356819153, "learning_rate": 0.0002, "epoch": 3.3579892280071815, "step": 46760}, {"loss": 0.6551, "grad_norm": 0.9109513759613037, "learning_rate": 0.0002, "epoch": 3.3587073608617595, "step": 46770}, {"loss": 0.6616, "grad_norm": 0.7757390141487122, "learning_rate": 0.0002, "epoch": 3.3594254937163375, "step": 46780}, {"loss": 0.6088, "grad_norm": 0.794035792350769, "learning_rate": 0.0002, "epoch": 3.3601436265709155, "step": 46790}, {"loss": 0.6405, "grad_norm": 0.8211429715156555, "learning_rate": 0.0002, "epoch": 3.360861759425494, "step": 46800}, {"loss": 0.6359, "grad_norm": 0.8620322346687317, "learning_rate": 0.0002, "epoch": 3.361579892280072, "step": 46810}, {"loss": 0.6357, "grad_norm": 0.9392538070678711, "learning_rate": 0.0002, "epoch": 3.36229802513465, "step": 46820}, {"loss": 0.6225, "grad_norm": 0.8297873139381409, "learning_rate": 0.0002, "epoch": 3.363016157989228, "step": 46830}, {"loss": 0.639, "grad_norm": 0.9158190488815308, "learning_rate": 0.0002, "epoch": 3.363734290843806, "step": 46840}, {"loss": 0.6168, "grad_norm": 1.1449424028396606, "learning_rate": 0.0002, "epoch": 3.3644524236983844, "step": 46850}, {"loss": 0.6413, "grad_norm": 0.8718444108963013, "learning_rate": 0.0002, "epoch": 3.3651705565529624, "step": 46860}, {"loss": 0.624, "grad_norm": 0.7744014263153076, "learning_rate": 0.0002, "epoch": 3.3658886894075404, "step": 46870}, {"loss": 0.6238, "grad_norm": 0.8392460942268372, "learning_rate": 0.0002, "epoch": 3.3666068222621184, "step": 46880}, {"loss": 0.6753, "grad_norm": 1.0424989461898804, "learning_rate": 0.0002, "epoch": 3.367324955116697, "step": 46890}, {"loss": 0.6038, "grad_norm": 1.4696359634399414, "learning_rate": 0.0002, "epoch": 3.368043087971275, "step": 46900}, {"loss": 0.6525, "grad_norm": 0.9298201203346252, "learning_rate": 0.0002, "epoch": 3.368761220825853, "step": 46910}, {"loss": 0.6351, "grad_norm": 0.8965262770652771, "learning_rate": 0.0002, "epoch": 3.369479353680431, "step": 46920}, {"loss": 0.6505, "grad_norm": 0.9395381808280945, "learning_rate": 0.0002, "epoch": 3.370197486535009, "step": 46930}, {"loss": 0.6161, "grad_norm": 0.9069047570228577, "learning_rate": 0.0002, "epoch": 3.370915619389587, "step": 46940}, {"loss": 0.6576, "grad_norm": 0.9208605885505676, "learning_rate": 0.0002, "epoch": 3.3716337522441653, "step": 46950}, {"loss": 0.6456, "grad_norm": 0.9493077397346497, "learning_rate": 0.0002, "epoch": 3.3723518850987433, "step": 46960}, {"loss": 0.6609, "grad_norm": 1.0804208517074585, "learning_rate": 0.0002, "epoch": 3.3730700179533213, "step": 46970}, {"loss": 0.6267, "grad_norm": 0.9465714693069458, "learning_rate": 0.0002, "epoch": 3.3737881508078993, "step": 46980}, {"loss": 0.6633, "grad_norm": 0.9189882278442383, "learning_rate": 0.0002, "epoch": 3.3745062836624777, "step": 46990}, {"loss": 0.6518, "grad_norm": 1.0199357271194458, "learning_rate": 0.0002, "epoch": 3.3752244165170557, "step": 47000}, {"loss": 0.6645, "grad_norm": 0.8999426960945129, "learning_rate": 0.0002, "epoch": 3.3759425493716337, "step": 47010}, {"loss": 0.637, "grad_norm": 0.8923690319061279, "learning_rate": 0.0002, "epoch": 3.3766606822262117, "step": 47020}, {"loss": 0.6543, "grad_norm": 0.7459347248077393, "learning_rate": 0.0002, "epoch": 3.3773788150807897, "step": 47030}, {"loss": 0.6269, "grad_norm": 0.7702858448028564, "learning_rate": 0.0002, "epoch": 3.378096947935368, "step": 47040}, {"loss": 0.6399, "grad_norm": 0.8296625018119812, "learning_rate": 0.0002, "epoch": 3.378815080789946, "step": 47050}, {"loss": 0.6552, "grad_norm": 1.2952555418014526, "learning_rate": 0.0002, "epoch": 3.379533213644524, "step": 47060}, {"loss": 0.6264, "grad_norm": 0.7778869271278381, "learning_rate": 0.0002, "epoch": 3.380251346499102, "step": 47070}, {"loss": 0.6906, "grad_norm": 0.9151549339294434, "learning_rate": 0.0002, "epoch": 3.3809694793536806, "step": 47080}, {"loss": 0.6443, "grad_norm": 0.7883925437927246, "learning_rate": 0.0002, "epoch": 3.3816876122082586, "step": 47090}, {"loss": 0.6124, "grad_norm": 0.9602295756340027, "learning_rate": 0.0002, "epoch": 3.3824057450628366, "step": 47100}, {"loss": 0.651, "grad_norm": 0.7953121066093445, "learning_rate": 0.0002, "epoch": 3.3831238779174146, "step": 47110}, {"loss": 0.638, "grad_norm": 1.110148549079895, "learning_rate": 0.0002, "epoch": 3.3838420107719926, "step": 47120}, {"loss": 0.6386, "grad_norm": 0.9359608888626099, "learning_rate": 0.0002, "epoch": 3.384560143626571, "step": 47130}, {"loss": 0.6075, "grad_norm": 0.7877762317657471, "learning_rate": 0.0002, "epoch": 3.385278276481149, "step": 47140}, {"loss": 0.6657, "grad_norm": 0.8586933016777039, "learning_rate": 0.0002, "epoch": 3.385996409335727, "step": 47150}, {"loss": 0.6438, "grad_norm": 0.8920878767967224, "learning_rate": 0.0002, "epoch": 3.386714542190305, "step": 47160}, {"loss": 0.6584, "grad_norm": 0.9692603349685669, "learning_rate": 0.0002, "epoch": 3.3874326750448835, "step": 47170}, {"loss": 0.6643, "grad_norm": 0.9038610458374023, "learning_rate": 0.0002, "epoch": 3.3881508078994615, "step": 47180}, {"loss": 0.6002, "grad_norm": 1.6299188137054443, "learning_rate": 0.0002, "epoch": 3.3888689407540395, "step": 47190}, {"loss": 0.6423, "grad_norm": 0.9704291820526123, "learning_rate": 0.0002, "epoch": 3.3895870736086176, "step": 47200}, {"loss": 0.6808, "grad_norm": 0.9503401517868042, "learning_rate": 0.0002, "epoch": 3.3903052064631956, "step": 47210}, {"loss": 0.6871, "grad_norm": 1.0051378011703491, "learning_rate": 0.0002, "epoch": 3.3910233393177736, "step": 47220}, {"loss": 0.6207, "grad_norm": 0.7336357235908508, "learning_rate": 0.0002, "epoch": 3.391741472172352, "step": 47230}, {"loss": 0.6688, "grad_norm": 0.9847398996353149, "learning_rate": 0.0002, "epoch": 3.39245960502693, "step": 47240}, {"loss": 0.6305, "grad_norm": 0.8100917339324951, "learning_rate": 0.0002, "epoch": 3.393177737881508, "step": 47250}, {"loss": 0.6418, "grad_norm": 0.9752838611602783, "learning_rate": 0.0002, "epoch": 3.393895870736086, "step": 47260}, {"loss": 0.6237, "grad_norm": 0.9400623440742493, "learning_rate": 0.0002, "epoch": 3.3946140035906645, "step": 47270}, {"loss": 0.6321, "grad_norm": 0.7310057878494263, "learning_rate": 0.0002, "epoch": 3.3953321364452425, "step": 47280}, {"loss": 0.6209, "grad_norm": 0.8898789286613464, "learning_rate": 0.0002, "epoch": 3.3960502692998205, "step": 47290}, {"loss": 0.6496, "grad_norm": 1.0157585144042969, "learning_rate": 0.0002, "epoch": 3.3967684021543985, "step": 47300}, {"loss": 0.6497, "grad_norm": 0.9108527898788452, "learning_rate": 0.0002, "epoch": 3.3974865350089765, "step": 47310}, {"loss": 0.5928, "grad_norm": 0.9796249270439148, "learning_rate": 0.0002, "epoch": 3.398204667863555, "step": 47320}, {"loss": 0.6169, "grad_norm": 0.8176435232162476, "learning_rate": 0.0002, "epoch": 3.398922800718133, "step": 47330}, {"loss": 0.6279, "grad_norm": 0.9981188178062439, "learning_rate": 0.0002, "epoch": 3.399640933572711, "step": 47340}, {"loss": 0.6657, "grad_norm": 0.9774404764175415, "learning_rate": 0.0002, "epoch": 3.400359066427289, "step": 47350}, {"loss": 0.68, "grad_norm": 0.8624991774559021, "learning_rate": 0.0002, "epoch": 3.4010771992818674, "step": 47360}, {"loss": 0.6597, "grad_norm": 0.9191665053367615, "learning_rate": 0.0002, "epoch": 3.4017953321364454, "step": 47370}, {"loss": 0.6249, "grad_norm": 0.7971290946006775, "learning_rate": 0.0002, "epoch": 3.4025134649910234, "step": 47380}, {"loss": 0.617, "grad_norm": 0.8336732983589172, "learning_rate": 0.0002, "epoch": 3.4032315978456014, "step": 47390}, {"loss": 0.6435, "grad_norm": 0.7730334401130676, "learning_rate": 0.0002, "epoch": 3.4039497307001794, "step": 47400}, {"loss": 0.6348, "grad_norm": 0.8559145927429199, "learning_rate": 0.0002, "epoch": 3.404667863554758, "step": 47410}, {"loss": 0.6466, "grad_norm": 1.0261447429656982, "learning_rate": 0.0002, "epoch": 3.405385996409336, "step": 47420}, {"loss": 0.6556, "grad_norm": 0.9931781888008118, "learning_rate": 0.0002, "epoch": 3.406104129263914, "step": 47430}, {"loss": 0.6226, "grad_norm": 0.8971807360649109, "learning_rate": 0.0002, "epoch": 3.406822262118492, "step": 47440}, {"loss": 0.656, "grad_norm": 0.8886999487876892, "learning_rate": 0.0002, "epoch": 3.4075403949730703, "step": 47450}, {"loss": 0.6256, "grad_norm": 0.9551735520362854, "learning_rate": 0.0002, "epoch": 3.4082585278276483, "step": 47460}, {"loss": 0.6646, "grad_norm": 0.9066859483718872, "learning_rate": 0.0002, "epoch": 3.4089766606822263, "step": 47470}, {"loss": 0.6655, "grad_norm": 0.9192125201225281, "learning_rate": 0.0002, "epoch": 3.4096947935368043, "step": 47480}, {"loss": 0.6197, "grad_norm": 0.9332839250564575, "learning_rate": 0.0002, "epoch": 3.4104129263913823, "step": 47490}, {"loss": 0.6134, "grad_norm": 0.745563805103302, "learning_rate": 0.0002, "epoch": 3.4111310592459603, "step": 47500}, {"loss": 0.6206, "grad_norm": 0.6843905448913574, "learning_rate": 0.0002, "epoch": 3.4118491921005387, "step": 47510}, {"loss": 0.6742, "grad_norm": 0.8063111305236816, "learning_rate": 0.0002, "epoch": 3.4125673249551167, "step": 47520}, {"loss": 0.6138, "grad_norm": 0.9666593670845032, "learning_rate": 0.0002, "epoch": 3.4132854578096947, "step": 47530}, {"loss": 0.635, "grad_norm": 0.8112747073173523, "learning_rate": 0.0002, "epoch": 3.4140035906642727, "step": 47540}, {"loss": 0.6225, "grad_norm": 0.820807933807373, "learning_rate": 0.0002, "epoch": 3.414721723518851, "step": 47550}, {"loss": 0.6262, "grad_norm": 0.8476285338401794, "learning_rate": 0.0002, "epoch": 3.415439856373429, "step": 47560}, {"loss": 0.6134, "grad_norm": 1.0232552289962769, "learning_rate": 0.0002, "epoch": 3.416157989228007, "step": 47570}, {"loss": 0.604, "grad_norm": 0.8749372363090515, "learning_rate": 0.0002, "epoch": 3.416876122082585, "step": 47580}, {"loss": 0.6463, "grad_norm": 0.8117937445640564, "learning_rate": 0.0002, "epoch": 3.417594254937163, "step": 47590}, {"loss": 0.623, "grad_norm": 0.9010460376739502, "learning_rate": 0.0002, "epoch": 3.4183123877917416, "step": 47600}, {"loss": 0.6676, "grad_norm": 0.8955527544021606, "learning_rate": 0.0002, "epoch": 3.4190305206463196, "step": 47610}, {"loss": 0.6424, "grad_norm": 0.884186327457428, "learning_rate": 0.0002, "epoch": 3.4197486535008976, "step": 47620}, {"loss": 0.6377, "grad_norm": 0.8995241522789001, "learning_rate": 0.0002, "epoch": 3.4204667863554756, "step": 47630}, {"loss": 0.651, "grad_norm": 1.0627013444900513, "learning_rate": 0.0002, "epoch": 3.421184919210054, "step": 47640}, {"loss": 0.6338, "grad_norm": 0.8619979619979858, "learning_rate": 0.0002, "epoch": 3.421903052064632, "step": 47650}, {"loss": 0.6483, "grad_norm": 0.9682498574256897, "learning_rate": 0.0002, "epoch": 3.42262118491921, "step": 47660}, {"loss": 0.6006, "grad_norm": 0.9614400863647461, "learning_rate": 0.0002, "epoch": 3.423339317773788, "step": 47670}, {"loss": 0.6088, "grad_norm": 0.7986962795257568, "learning_rate": 0.0002, "epoch": 3.424057450628366, "step": 47680}, {"loss": 0.6056, "grad_norm": 0.8255957961082458, "learning_rate": 0.0002, "epoch": 3.4247755834829445, "step": 47690}, {"loss": 0.663, "grad_norm": 0.9139757752418518, "learning_rate": 0.0002, "epoch": 3.4254937163375225, "step": 47700}, {"loss": 0.61, "grad_norm": 0.8086292743682861, "learning_rate": 0.0002, "epoch": 3.4262118491921005, "step": 47710}, {"loss": 0.6604, "grad_norm": 0.8852273225784302, "learning_rate": 0.0002, "epoch": 3.4269299820466785, "step": 47720}, {"loss": 0.6168, "grad_norm": 0.7568784356117249, "learning_rate": 0.0002, "epoch": 3.427648114901257, "step": 47730}, {"loss": 0.6559, "grad_norm": 0.8933039903640747, "learning_rate": 0.0002, "epoch": 3.428366247755835, "step": 47740}, {"loss": 0.6406, "grad_norm": 0.8101669549942017, "learning_rate": 0.0002, "epoch": 3.429084380610413, "step": 47750}, {"loss": 0.6287, "grad_norm": 0.7021054625511169, "learning_rate": 0.0002, "epoch": 3.429802513464991, "step": 47760}, {"loss": 0.6159, "grad_norm": 0.8282538652420044, "learning_rate": 0.0002, "epoch": 3.430520646319569, "step": 47770}, {"loss": 0.6439, "grad_norm": 0.8168348670005798, "learning_rate": 0.0002, "epoch": 3.431238779174147, "step": 47780}, {"loss": 0.6265, "grad_norm": 0.9504001140594482, "learning_rate": 0.0002, "epoch": 3.4319569120287254, "step": 47790}, {"loss": 0.6688, "grad_norm": 0.7500190734863281, "learning_rate": 0.0002, "epoch": 3.4326750448833034, "step": 47800}, {"loss": 0.6818, "grad_norm": 0.8645710945129395, "learning_rate": 0.0002, "epoch": 3.4333931777378814, "step": 47810}, {"loss": 0.6268, "grad_norm": 0.8088704943656921, "learning_rate": 0.0002, "epoch": 3.4341113105924594, "step": 47820}, {"loss": 0.6795, "grad_norm": 0.9981673955917358, "learning_rate": 0.0002, "epoch": 3.434829443447038, "step": 47830}, {"loss": 0.6615, "grad_norm": 0.9363315105438232, "learning_rate": 0.0002, "epoch": 3.435547576301616, "step": 47840}, {"loss": 0.6028, "grad_norm": 0.8471030592918396, "learning_rate": 0.0002, "epoch": 3.436265709156194, "step": 47850}, {"loss": 0.6658, "grad_norm": 0.9447668790817261, "learning_rate": 0.0002, "epoch": 3.436983842010772, "step": 47860}, {"loss": 0.6511, "grad_norm": 0.9494127631187439, "learning_rate": 0.0002, "epoch": 3.43770197486535, "step": 47870}, {"loss": 0.6134, "grad_norm": 0.8340432643890381, "learning_rate": 0.0002, "epoch": 3.4384201077199283, "step": 47880}, {"loss": 0.6731, "grad_norm": 0.8466387987136841, "learning_rate": 0.0002, "epoch": 3.4391382405745063, "step": 47890}, {"loss": 0.6552, "grad_norm": 0.9498962759971619, "learning_rate": 0.0002, "epoch": 3.4398563734290843, "step": 47900}, {"loss": 0.6593, "grad_norm": 0.8490501046180725, "learning_rate": 0.0002, "epoch": 3.4405745062836623, "step": 47910}, {"loss": 0.6038, "grad_norm": 0.9506490230560303, "learning_rate": 0.0002, "epoch": 3.441292639138241, "step": 47920}, {"loss": 0.6317, "grad_norm": 0.7944257855415344, "learning_rate": 0.0002, "epoch": 3.442010771992819, "step": 47930}, {"loss": 0.6193, "grad_norm": 0.9725518226623535, "learning_rate": 0.0002, "epoch": 3.442728904847397, "step": 47940}, {"loss": 0.635, "grad_norm": 0.7823024392127991, "learning_rate": 0.0002, "epoch": 3.443447037701975, "step": 47950}, {"loss": 0.6221, "grad_norm": 0.810565173625946, "learning_rate": 0.0002, "epoch": 3.444165170556553, "step": 47960}, {"loss": 0.6519, "grad_norm": 0.9809024333953857, "learning_rate": 0.0002, "epoch": 3.4448833034111312, "step": 47970}, {"loss": 0.6441, "grad_norm": 0.8818578720092773, "learning_rate": 0.0002, "epoch": 3.4456014362657092, "step": 47980}, {"loss": 0.6452, "grad_norm": 0.9843092560768127, "learning_rate": 0.0002, "epoch": 3.4463195691202873, "step": 47990}, {"loss": 0.6076, "grad_norm": 0.916313886642456, "learning_rate": 0.0002, "epoch": 3.4470377019748653, "step": 48000}, {"loss": 0.6399, "grad_norm": 0.908442497253418, "learning_rate": 0.0002, "epoch": 3.4477558348294433, "step": 48010}, {"loss": 0.6263, "grad_norm": 0.9880178570747375, "learning_rate": 0.0002, "epoch": 3.4484739676840217, "step": 48020}, {"loss": 0.6802, "grad_norm": 0.9276854991912842, "learning_rate": 0.0002, "epoch": 3.4491921005385997, "step": 48030}, {"loss": 0.6522, "grad_norm": 1.0879448652267456, "learning_rate": 0.0002, "epoch": 3.4499102333931777, "step": 48040}, {"loss": 0.6362, "grad_norm": 0.7430389523506165, "learning_rate": 0.0002, "epoch": 3.4506283662477557, "step": 48050}, {"loss": 0.6064, "grad_norm": 1.0880072116851807, "learning_rate": 0.0002, "epoch": 3.4513464991023337, "step": 48060}, {"loss": 0.6152, "grad_norm": 1.0424141883850098, "learning_rate": 0.0002, "epoch": 3.452064631956912, "step": 48070}, {"loss": 0.6485, "grad_norm": 0.926330029964447, "learning_rate": 0.0002, "epoch": 3.45278276481149, "step": 48080}, {"loss": 0.6261, "grad_norm": 0.8911219239234924, "learning_rate": 0.0002, "epoch": 3.453500897666068, "step": 48090}, {"loss": 0.6883, "grad_norm": 0.8727201223373413, "learning_rate": 0.0002, "epoch": 3.454219030520646, "step": 48100}, {"loss": 0.6473, "grad_norm": 0.8573940396308899, "learning_rate": 0.0002, "epoch": 3.4549371633752246, "step": 48110}, {"loss": 0.6645, "grad_norm": 1.0427064895629883, "learning_rate": 0.0002, "epoch": 3.4556552962298026, "step": 48120}, {"loss": 0.6489, "grad_norm": 0.8688231706619263, "learning_rate": 0.0002, "epoch": 3.4563734290843806, "step": 48130}, {"loss": 0.5947, "grad_norm": 0.8856009244918823, "learning_rate": 0.0002, "epoch": 3.4570915619389586, "step": 48140}, {"loss": 0.6482, "grad_norm": 0.9535353183746338, "learning_rate": 0.0002, "epoch": 3.4578096947935366, "step": 48150}, {"loss": 0.6435, "grad_norm": 0.9466010928153992, "learning_rate": 0.0002, "epoch": 3.458527827648115, "step": 48160}, {"loss": 0.6231, "grad_norm": 0.9783535599708557, "learning_rate": 0.0002, "epoch": 3.459245960502693, "step": 48170}, {"loss": 0.6926, "grad_norm": 0.8010456562042236, "learning_rate": 0.0002, "epoch": 3.459964093357271, "step": 48180}, {"loss": 0.6141, "grad_norm": 0.8928955793380737, "learning_rate": 0.0002, "epoch": 3.460682226211849, "step": 48190}, {"loss": 0.6699, "grad_norm": 0.7565838694572449, "learning_rate": 0.0002, "epoch": 3.4614003590664275, "step": 48200}, {"loss": 0.6218, "grad_norm": 1.0044180154800415, "learning_rate": 0.0002, "epoch": 3.4621184919210055, "step": 48210}, {"loss": 0.6182, "grad_norm": 0.8161038160324097, "learning_rate": 0.0002, "epoch": 3.4628366247755835, "step": 48220}, {"loss": 0.6869, "grad_norm": 1.1000211238861084, "learning_rate": 0.0002, "epoch": 3.4635547576301615, "step": 48230}, {"loss": 0.7141, "grad_norm": 0.7942240238189697, "learning_rate": 0.0002, "epoch": 3.4642728904847395, "step": 48240}, {"loss": 0.6247, "grad_norm": 0.7546432018280029, "learning_rate": 0.0002, "epoch": 3.464991023339318, "step": 48250}, {"loss": 0.6319, "grad_norm": 0.7705255150794983, "learning_rate": 0.0002, "epoch": 3.465709156193896, "step": 48260}, {"loss": 0.6414, "grad_norm": 0.7958067059516907, "learning_rate": 0.0002, "epoch": 3.466427289048474, "step": 48270}, {"loss": 0.6526, "grad_norm": 0.9199120402336121, "learning_rate": 0.0002, "epoch": 3.467145421903052, "step": 48280}, {"loss": 0.6476, "grad_norm": 1.118672251701355, "learning_rate": 0.0002, "epoch": 3.46786355475763, "step": 48290}, {"loss": 0.6543, "grad_norm": 0.9161015748977661, "learning_rate": 0.0002, "epoch": 3.4685816876122084, "step": 48300}, {"loss": 0.6767, "grad_norm": 1.1086218357086182, "learning_rate": 0.0002, "epoch": 3.4692998204667864, "step": 48310}, {"loss": 0.5917, "grad_norm": 1.0123368501663208, "learning_rate": 0.0002, "epoch": 3.4700179533213644, "step": 48320}, {"loss": 0.6277, "grad_norm": 0.7380602359771729, "learning_rate": 0.0002, "epoch": 3.4707360861759424, "step": 48330}, {"loss": 0.6407, "grad_norm": 0.8967105150222778, "learning_rate": 0.0002, "epoch": 3.4714542190305204, "step": 48340}, {"loss": 0.6526, "grad_norm": 1.0134044885635376, "learning_rate": 0.0002, "epoch": 3.472172351885099, "step": 48350}, {"loss": 0.6436, "grad_norm": 1.080815076828003, "learning_rate": 0.0002, "epoch": 3.472890484739677, "step": 48360}, {"loss": 0.6644, "grad_norm": 1.151721477508545, "learning_rate": 0.0002, "epoch": 3.473608617594255, "step": 48370}, {"loss": 0.6612, "grad_norm": 0.9436505436897278, "learning_rate": 0.0002, "epoch": 3.474326750448833, "step": 48380}, {"loss": 0.6503, "grad_norm": 0.9154609441757202, "learning_rate": 0.0002, "epoch": 3.4750448833034113, "step": 48390}, {"loss": 0.6151, "grad_norm": 0.8943037986755371, "learning_rate": 0.0002, "epoch": 3.4757630161579893, "step": 48400}, {"loss": 0.6316, "grad_norm": 0.936988115310669, "learning_rate": 0.0002, "epoch": 3.4764811490125673, "step": 48410}, {"loss": 0.6638, "grad_norm": 0.826960027217865, "learning_rate": 0.0002, "epoch": 3.4771992818671453, "step": 48420}, {"loss": 0.6242, "grad_norm": 1.0487587451934814, "learning_rate": 0.0002, "epoch": 3.4779174147217233, "step": 48430}, {"loss": 0.6302, "grad_norm": 0.729163646697998, "learning_rate": 0.0002, "epoch": 3.478635547576302, "step": 48440}, {"loss": 0.6115, "grad_norm": 0.8156948089599609, "learning_rate": 0.0002, "epoch": 3.47935368043088, "step": 48450}, {"loss": 0.6455, "grad_norm": 0.8004332184791565, "learning_rate": 0.0002, "epoch": 3.480071813285458, "step": 48460}, {"loss": 0.621, "grad_norm": 0.9632692337036133, "learning_rate": 0.0002, "epoch": 3.480789946140036, "step": 48470}, {"loss": 0.6214, "grad_norm": 1.0950212478637695, "learning_rate": 0.0002, "epoch": 3.4815080789946142, "step": 48480}, {"loss": 0.6659, "grad_norm": 0.8574318885803223, "learning_rate": 0.0002, "epoch": 3.4822262118491922, "step": 48490}, {"loss": 0.6969, "grad_norm": 0.8552606701850891, "learning_rate": 0.0002, "epoch": 3.4829443447037702, "step": 48500}, {"loss": 0.6253, "grad_norm": 0.9698445200920105, "learning_rate": 0.0002, "epoch": 3.4836624775583482, "step": 48510}, {"loss": 0.6844, "grad_norm": 0.9427815675735474, "learning_rate": 0.0002, "epoch": 3.4843806104129262, "step": 48520}, {"loss": 0.6722, "grad_norm": 0.7902070879936218, "learning_rate": 0.0002, "epoch": 3.4850987432675042, "step": 48530}, {"loss": 0.6708, "grad_norm": 1.0300066471099854, "learning_rate": 0.0002, "epoch": 3.4858168761220827, "step": 48540}, {"loss": 0.6113, "grad_norm": 1.1688778400421143, "learning_rate": 0.0002, "epoch": 3.4865350089766607, "step": 48550}, {"loss": 0.5956, "grad_norm": 1.0012071132659912, "learning_rate": 0.0002, "epoch": 3.4872531418312387, "step": 48560}, {"loss": 0.6536, "grad_norm": 1.112094759941101, "learning_rate": 0.0002, "epoch": 3.4879712746858167, "step": 48570}, {"loss": 0.6625, "grad_norm": 0.8547284603118896, "learning_rate": 0.0002, "epoch": 3.488689407540395, "step": 48580}, {"loss": 0.6488, "grad_norm": 0.8827278017997742, "learning_rate": 0.0002, "epoch": 3.489407540394973, "step": 48590}, {"loss": 0.6437, "grad_norm": 0.9255490303039551, "learning_rate": 0.0002, "epoch": 3.490125673249551, "step": 48600}, {"loss": 0.6089, "grad_norm": 0.8000030517578125, "learning_rate": 0.0002, "epoch": 3.490843806104129, "step": 48610}, {"loss": 0.647, "grad_norm": 0.9327391386032104, "learning_rate": 0.0002, "epoch": 3.491561938958707, "step": 48620}, {"loss": 0.6678, "grad_norm": 0.9004138708114624, "learning_rate": 0.0002, "epoch": 3.4922800718132856, "step": 48630}, {"loss": 0.6145, "grad_norm": 0.9886971116065979, "learning_rate": 0.0002, "epoch": 3.4929982046678636, "step": 48640}, {"loss": 0.6309, "grad_norm": 0.9890487194061279, "learning_rate": 0.0002, "epoch": 3.4937163375224416, "step": 48650}, {"loss": 0.655, "grad_norm": 0.7024438977241516, "learning_rate": 0.0002, "epoch": 3.4944344703770196, "step": 48660}, {"loss": 0.6313, "grad_norm": 0.8397303223609924, "learning_rate": 0.0002, "epoch": 3.495152603231598, "step": 48670}, {"loss": 0.6429, "grad_norm": 0.9120950698852539, "learning_rate": 0.0002, "epoch": 3.495870736086176, "step": 48680}, {"loss": 0.631, "grad_norm": 1.057299017906189, "learning_rate": 0.0002, "epoch": 3.496588868940754, "step": 48690}, {"loss": 0.6459, "grad_norm": 0.821325957775116, "learning_rate": 0.0002, "epoch": 3.497307001795332, "step": 48700}, {"loss": 0.6174, "grad_norm": 1.0029970407485962, "learning_rate": 0.0002, "epoch": 3.49802513464991, "step": 48710}, {"loss": 0.6374, "grad_norm": 0.9483712911605835, "learning_rate": 0.0002, "epoch": 3.4987432675044885, "step": 48720}, {"loss": 0.6472, "grad_norm": 0.9637855291366577, "learning_rate": 0.0002, "epoch": 3.4994614003590665, "step": 48730}, {"loss": 0.6639, "grad_norm": 0.6848894357681274, "learning_rate": 0.0002, "epoch": 3.5001795332136445, "step": 48740}, {"loss": 0.6129, "grad_norm": 0.7848573327064514, "learning_rate": 0.0002, "epoch": 3.5008976660682225, "step": 48750}, {"loss": 0.6306, "grad_norm": 1.0341308116912842, "learning_rate": 0.0002, "epoch": 3.501615798922801, "step": 48760}, {"loss": 0.6063, "grad_norm": 0.8858218193054199, "learning_rate": 0.0002, "epoch": 3.502333931777379, "step": 48770}, {"loss": 0.6729, "grad_norm": 0.8366939425468445, "learning_rate": 0.0002, "epoch": 3.503052064631957, "step": 48780}, {"loss": 0.6736, "grad_norm": 0.7926092147827148, "learning_rate": 0.0002, "epoch": 3.503770197486535, "step": 48790}, {"loss": 0.6279, "grad_norm": 0.8503843545913696, "learning_rate": 0.0002, "epoch": 3.504488330341113, "step": 48800}, {"loss": 0.6162, "grad_norm": 0.8867869973182678, "learning_rate": 0.0002, "epoch": 3.505206463195691, "step": 48810}, {"loss": 0.6987, "grad_norm": 1.0336930751800537, "learning_rate": 0.0002, "epoch": 3.5059245960502694, "step": 48820}, {"loss": 0.6333, "grad_norm": 0.8564051985740662, "learning_rate": 0.0002, "epoch": 3.5066427289048474, "step": 48830}, {"loss": 0.6574, "grad_norm": 0.9202605485916138, "learning_rate": 0.0002, "epoch": 3.5073608617594254, "step": 48840}, {"loss": 0.6457, "grad_norm": 0.8838639855384827, "learning_rate": 0.0002, "epoch": 3.508078994614004, "step": 48850}, {"loss": 0.631, "grad_norm": 0.8975196480751038, "learning_rate": 0.0002, "epoch": 3.508797127468582, "step": 48860}, {"loss": 0.6335, "grad_norm": 0.8842370510101318, "learning_rate": 0.0002, "epoch": 3.50951526032316, "step": 48870}, {"loss": 0.6569, "grad_norm": 0.9195886254310608, "learning_rate": 0.0002, "epoch": 3.510233393177738, "step": 48880}, {"loss": 0.6647, "grad_norm": 0.986130952835083, "learning_rate": 0.0002, "epoch": 3.510951526032316, "step": 48890}, {"loss": 0.6676, "grad_norm": 0.8119593858718872, "learning_rate": 0.0002, "epoch": 3.511669658886894, "step": 48900}, {"loss": 0.653, "grad_norm": 0.9027136564254761, "learning_rate": 0.0002, "epoch": 3.5123877917414723, "step": 48910}, {"loss": 0.6731, "grad_norm": 0.8560537099838257, "learning_rate": 0.0002, "epoch": 3.5131059245960503, "step": 48920}, {"loss": 0.7032, "grad_norm": 0.7073559165000916, "learning_rate": 0.0002, "epoch": 3.5138240574506283, "step": 48930}, {"loss": 0.6738, "grad_norm": 0.8753304481506348, "learning_rate": 0.0002, "epoch": 3.5145421903052063, "step": 48940}, {"loss": 0.6366, "grad_norm": 0.9151145815849304, "learning_rate": 0.0002, "epoch": 3.5152603231597848, "step": 48950}, {"loss": 0.6135, "grad_norm": 0.7794315814971924, "learning_rate": 0.0002, "epoch": 3.5159784560143628, "step": 48960}, {"loss": 0.658, "grad_norm": 0.9226023554801941, "learning_rate": 0.0002, "epoch": 3.5166965888689408, "step": 48970}, {"loss": 0.6473, "grad_norm": 0.8442051410675049, "learning_rate": 0.0002, "epoch": 3.5174147217235188, "step": 48980}, {"loss": 0.6267, "grad_norm": 0.9769423007965088, "learning_rate": 0.0002, "epoch": 3.5181328545780968, "step": 48990}, {"loss": 0.6333, "grad_norm": 0.740347146987915, "learning_rate": 0.0002, "epoch": 3.5188509874326748, "step": 49000}, {"loss": 0.6652, "grad_norm": 0.8963457345962524, "learning_rate": 0.0002, "epoch": 3.519569120287253, "step": 49010}, {"loss": 0.6782, "grad_norm": 0.8410176634788513, "learning_rate": 0.0002, "epoch": 3.520287253141831, "step": 49020}, {"loss": 0.6496, "grad_norm": 1.0486022233963013, "learning_rate": 0.0002, "epoch": 3.521005385996409, "step": 49030}, {"loss": 0.6275, "grad_norm": 0.95393967628479, "learning_rate": 0.0002, "epoch": 3.5217235188509877, "step": 49040}, {"loss": 0.6328, "grad_norm": 0.8261157274246216, "learning_rate": 0.0002, "epoch": 3.5224416517055657, "step": 49050}, {"loss": 0.6441, "grad_norm": 0.9321704506874084, "learning_rate": 0.0002, "epoch": 3.5231597845601437, "step": 49060}, {"loss": 0.6202, "grad_norm": 1.2596088647842407, "learning_rate": 0.0002, "epoch": 3.5238779174147217, "step": 49070}, {"loss": 0.6596, "grad_norm": 0.8584637641906738, "learning_rate": 0.0002, "epoch": 3.5245960502692997, "step": 49080}, {"loss": 0.6708, "grad_norm": 0.850520670413971, "learning_rate": 0.0002, "epoch": 3.5253141831238777, "step": 49090}, {"loss": 0.6543, "grad_norm": 0.8915920257568359, "learning_rate": 0.0002, "epoch": 3.526032315978456, "step": 49100}, {"loss": 0.6558, "grad_norm": 0.9070239067077637, "learning_rate": 0.0002, "epoch": 3.526750448833034, "step": 49110}, {"loss": 0.6128, "grad_norm": 0.699878990650177, "learning_rate": 0.0002, "epoch": 3.527468581687612, "step": 49120}, {"loss": 0.6454, "grad_norm": 0.9003779888153076, "learning_rate": 0.0002, "epoch": 3.5281867145421906, "step": 49130}, {"loss": 0.6177, "grad_norm": 0.7886711955070496, "learning_rate": 0.0002, "epoch": 3.5289048473967686, "step": 49140}, {"loss": 0.6499, "grad_norm": 0.7368922233581543, "learning_rate": 0.0002, "epoch": 3.5296229802513466, "step": 49150}, {"loss": 0.6382, "grad_norm": 0.8585197329521179, "learning_rate": 0.0002, "epoch": 3.5303411131059246, "step": 49160}, {"loss": 0.6761, "grad_norm": 1.0205435752868652, "learning_rate": 0.0002, "epoch": 3.5310592459605026, "step": 49170}, {"loss": 0.6544, "grad_norm": 0.8756650686264038, "learning_rate": 0.0002, "epoch": 3.5317773788150806, "step": 49180}, {"loss": 0.6592, "grad_norm": 1.0278643369674683, "learning_rate": 0.0002, "epoch": 3.532495511669659, "step": 49190}, {"loss": 0.6682, "grad_norm": 0.8641911745071411, "learning_rate": 0.0002, "epoch": 3.533213644524237, "step": 49200}, {"loss": 0.6531, "grad_norm": 0.8730159401893616, "learning_rate": 0.0002, "epoch": 3.533931777378815, "step": 49210}, {"loss": 0.636, "grad_norm": 0.918637216091156, "learning_rate": 0.0002, "epoch": 3.534649910233393, "step": 49220}, {"loss": 0.6815, "grad_norm": 1.0467222929000854, "learning_rate": 0.0002, "epoch": 3.5353680430879715, "step": 49230}, {"loss": 0.6554, "grad_norm": 1.005009412765503, "learning_rate": 0.0002, "epoch": 3.5360861759425495, "step": 49240}, {"loss": 0.649, "grad_norm": 0.9775063395500183, "learning_rate": 0.0002, "epoch": 3.5368043087971275, "step": 49250}, {"loss": 0.6527, "grad_norm": 0.8198322057723999, "learning_rate": 0.0002, "epoch": 3.5375224416517055, "step": 49260}, {"loss": 0.664, "grad_norm": 0.8184829354286194, "learning_rate": 0.0002, "epoch": 3.5382405745062835, "step": 49270}, {"loss": 0.6493, "grad_norm": 0.9520270824432373, "learning_rate": 0.0002, "epoch": 3.5389587073608615, "step": 49280}, {"loss": 0.5935, "grad_norm": 0.7816803455352783, "learning_rate": 0.0002, "epoch": 3.53967684021544, "step": 49290}, {"loss": 0.6424, "grad_norm": 0.6915702819824219, "learning_rate": 0.0002, "epoch": 3.540394973070018, "step": 49300}, {"loss": 0.6447, "grad_norm": 0.8282375931739807, "learning_rate": 0.0002, "epoch": 3.541113105924596, "step": 49310}, {"loss": 0.6164, "grad_norm": 1.0797513723373413, "learning_rate": 0.0002, "epoch": 3.5418312387791744, "step": 49320}, {"loss": 0.6836, "grad_norm": 0.868671715259552, "learning_rate": 0.0002, "epoch": 3.5425493716337524, "step": 49330}, {"loss": 0.6453, "grad_norm": 0.8534455895423889, "learning_rate": 0.0002, "epoch": 3.5432675044883304, "step": 49340}, {"loss": 0.6706, "grad_norm": 0.816411554813385, "learning_rate": 0.0002, "epoch": 3.5439856373429084, "step": 49350}, {"loss": 0.6101, "grad_norm": 0.7813423275947571, "learning_rate": 0.0002, "epoch": 3.5447037701974864, "step": 49360}, {"loss": 0.6617, "grad_norm": 0.8002013564109802, "learning_rate": 0.0002, "epoch": 3.5454219030520644, "step": 49370}, {"loss": 0.6667, "grad_norm": 0.9740113615989685, "learning_rate": 0.0002, "epoch": 3.546140035906643, "step": 49380}, {"loss": 0.6938, "grad_norm": 0.9046127200126648, "learning_rate": 0.0002, "epoch": 3.546858168761221, "step": 49390}, {"loss": 0.6444, "grad_norm": 0.8635150194168091, "learning_rate": 0.0002, "epoch": 3.547576301615799, "step": 49400}, {"loss": 0.6273, "grad_norm": 0.9488558769226074, "learning_rate": 0.0002, "epoch": 3.5482944344703773, "step": 49410}, {"loss": 0.6542, "grad_norm": 0.9637090563774109, "learning_rate": 0.0002, "epoch": 3.5490125673249553, "step": 49420}, {"loss": 0.6468, "grad_norm": 1.042245626449585, "learning_rate": 0.0002, "epoch": 3.5497307001795333, "step": 49430}, {"loss": 0.6999, "grad_norm": 0.9076175689697266, "learning_rate": 0.0002, "epoch": 3.5504488330341113, "step": 49440}, {"loss": 0.6192, "grad_norm": 0.8480596542358398, "learning_rate": 0.0002, "epoch": 3.5511669658886893, "step": 49450}, {"loss": 0.6835, "grad_norm": 0.8483007550239563, "learning_rate": 0.0002, "epoch": 3.5518850987432673, "step": 49460}, {"loss": 0.6607, "grad_norm": 0.7855815887451172, "learning_rate": 0.0002, "epoch": 3.5526032315978457, "step": 49470}, {"loss": 0.6364, "grad_norm": 0.8435823917388916, "learning_rate": 0.0002, "epoch": 3.5533213644524237, "step": 49480}, {"loss": 0.6674, "grad_norm": 0.8613026142120361, "learning_rate": 0.0002, "epoch": 3.5540394973070017, "step": 49490}, {"loss": 0.6651, "grad_norm": 0.9654812812805176, "learning_rate": 0.0002, "epoch": 3.5547576301615798, "step": 49500}, {"loss": 0.6471, "grad_norm": 0.8888838887214661, "learning_rate": 0.0002, "epoch": 3.555475763016158, "step": 49510}, {"loss": 0.622, "grad_norm": 0.7718146443367004, "learning_rate": 0.0002, "epoch": 3.556193895870736, "step": 49520}, {"loss": 0.6297, "grad_norm": 0.9487382173538208, "learning_rate": 0.0002, "epoch": 3.556912028725314, "step": 49530}, {"loss": 0.6516, "grad_norm": 0.9256559610366821, "learning_rate": 0.0002, "epoch": 3.557630161579892, "step": 49540}, {"loss": 0.6461, "grad_norm": 0.8879945874214172, "learning_rate": 0.0002, "epoch": 3.55834829443447, "step": 49550}, {"loss": 0.6367, "grad_norm": 0.8498744368553162, "learning_rate": 0.0002, "epoch": 3.559066427289048, "step": 49560}, {"loss": 0.6274, "grad_norm": 0.9550948143005371, "learning_rate": 0.0002, "epoch": 3.5597845601436267, "step": 49570}, {"loss": 0.635, "grad_norm": 0.8386164903640747, "learning_rate": 0.0002, "epoch": 3.5605026929982047, "step": 49580}, {"loss": 0.6495, "grad_norm": 0.925573468208313, "learning_rate": 0.0002, "epoch": 3.5612208258527827, "step": 49590}, {"loss": 0.676, "grad_norm": 0.8867112398147583, "learning_rate": 0.0002, "epoch": 3.561938958707361, "step": 49600}, {"loss": 0.6156, "grad_norm": 0.7638537883758545, "learning_rate": 0.0002, "epoch": 3.562657091561939, "step": 49610}, {"loss": 0.6597, "grad_norm": 0.9491845965385437, "learning_rate": 0.0002, "epoch": 3.563375224416517, "step": 49620}, {"loss": 0.6237, "grad_norm": 0.8384189605712891, "learning_rate": 0.0002, "epoch": 3.564093357271095, "step": 49630}, {"loss": 0.6102, "grad_norm": 0.8850575089454651, "learning_rate": 0.0002, "epoch": 3.564811490125673, "step": 49640}, {"loss": 0.6517, "grad_norm": 1.020916223526001, "learning_rate": 0.0002, "epoch": 3.565529622980251, "step": 49650}, {"loss": 0.6569, "grad_norm": 0.9298280477523804, "learning_rate": 0.0002, "epoch": 3.5662477558348296, "step": 49660}, {"loss": 0.6094, "grad_norm": 0.9795742034912109, "learning_rate": 0.0002, "epoch": 3.5669658886894076, "step": 49670}, {"loss": 0.6147, "grad_norm": 0.9401193261146545, "learning_rate": 0.0002, "epoch": 3.5676840215439856, "step": 49680}, {"loss": 0.622, "grad_norm": 1.0383585691452026, "learning_rate": 0.0002, "epoch": 3.568402154398564, "step": 49690}, {"loss": 0.6304, "grad_norm": 0.8370866179466248, "learning_rate": 0.0002, "epoch": 3.569120287253142, "step": 49700}, {"loss": 0.6356, "grad_norm": 0.8207486271858215, "learning_rate": 0.0002, "epoch": 3.56983842010772, "step": 49710}, {"loss": 0.6328, "grad_norm": 0.8551223278045654, "learning_rate": 0.0002, "epoch": 3.570556552962298, "step": 49720}, {"loss": 0.621, "grad_norm": 0.8041176199913025, "learning_rate": 0.0002, "epoch": 3.571274685816876, "step": 49730}, {"loss": 0.5818, "grad_norm": 0.9862527847290039, "learning_rate": 0.0002, "epoch": 3.571992818671454, "step": 49740}, {"loss": 0.6448, "grad_norm": 0.7557165622711182, "learning_rate": 0.0002, "epoch": 3.5727109515260325, "step": 49750}, {"loss": 0.6484, "grad_norm": 1.0908563137054443, "learning_rate": 0.0002, "epoch": 3.5734290843806105, "step": 49760}, {"loss": 0.6497, "grad_norm": 0.7245369553565979, "learning_rate": 0.0002, "epoch": 3.5741472172351885, "step": 49770}, {"loss": 0.6315, "grad_norm": 0.7851184010505676, "learning_rate": 0.0002, "epoch": 3.5748653500897665, "step": 49780}, {"loss": 0.6245, "grad_norm": 0.9443599581718445, "learning_rate": 0.0002, "epoch": 3.575583482944345, "step": 49790}, {"loss": 0.6481, "grad_norm": 1.021196961402893, "learning_rate": 0.0002, "epoch": 3.576301615798923, "step": 49800}, {"loss": 0.6368, "grad_norm": 0.9099196195602417, "learning_rate": 0.0002, "epoch": 3.577019748653501, "step": 49810}, {"loss": 0.6372, "grad_norm": 0.9397716522216797, "learning_rate": 0.0002, "epoch": 3.577737881508079, "step": 49820}, {"loss": 0.6208, "grad_norm": 0.9214922785758972, "learning_rate": 0.0002, "epoch": 3.578456014362657, "step": 49830}, {"loss": 0.6219, "grad_norm": 1.0053879022598267, "learning_rate": 0.0002, "epoch": 3.579174147217235, "step": 49840}, {"loss": 0.6283, "grad_norm": 0.9415460228919983, "learning_rate": 0.0002, "epoch": 3.5798922800718134, "step": 49850}, {"loss": 0.6759, "grad_norm": 1.0807833671569824, "learning_rate": 0.0002, "epoch": 3.5806104129263914, "step": 49860}, {"loss": 0.6404, "grad_norm": 1.0070871114730835, "learning_rate": 0.0002, "epoch": 3.5813285457809694, "step": 49870}, {"loss": 0.6411, "grad_norm": 0.9707024693489075, "learning_rate": 0.0002, "epoch": 3.582046678635548, "step": 49880}, {"loss": 0.6852, "grad_norm": 0.9979593753814697, "learning_rate": 0.0002, "epoch": 3.582764811490126, "step": 49890}, {"loss": 0.6519, "grad_norm": 0.7238648533821106, "learning_rate": 0.0002, "epoch": 3.583482944344704, "step": 49900}, {"loss": 0.6452, "grad_norm": 0.8168631792068481, "learning_rate": 0.0002, "epoch": 3.584201077199282, "step": 49910}, {"loss": 0.6174, "grad_norm": 0.8156409859657288, "learning_rate": 0.0002, "epoch": 3.58491921005386, "step": 49920}, {"loss": 0.6248, "grad_norm": 0.9256414175033569, "learning_rate": 0.0002, "epoch": 3.585637342908438, "step": 49930}, {"loss": 0.6077, "grad_norm": 1.0090070962905884, "learning_rate": 0.0002, "epoch": 3.5863554757630163, "step": 49940}, {"loss": 0.6016, "grad_norm": 0.8257701992988586, "learning_rate": 0.0002, "epoch": 3.5870736086175943, "step": 49950}, {"loss": 0.6996, "grad_norm": 0.9189013242721558, "learning_rate": 0.0002, "epoch": 3.5877917414721723, "step": 49960}, {"loss": 0.661, "grad_norm": 0.8497788310050964, "learning_rate": 0.0002, "epoch": 3.5885098743267507, "step": 49970}, {"loss": 0.6335, "grad_norm": 0.9596505761146545, "learning_rate": 0.0002, "epoch": 3.5892280071813287, "step": 49980}, {"loss": 0.697, "grad_norm": 0.8773331642150879, "learning_rate": 0.0002, "epoch": 3.5899461400359067, "step": 49990}, {"loss": 0.6259, "grad_norm": 0.8952302932739258, "learning_rate": 0.0002, "epoch": 3.5906642728904847, "step": 50000}, {"loss": 0.6152, "grad_norm": 0.7713809609413147, "learning_rate": 0.0002, "epoch": 3.5913824057450627, "step": 50010}, {"loss": 0.6127, "grad_norm": 1.0151346921920776, "learning_rate": 0.0002, "epoch": 3.5921005385996407, "step": 50020}, {"loss": 0.6093, "grad_norm": 0.8793733716011047, "learning_rate": 0.0002, "epoch": 3.592818671454219, "step": 50030}, {"loss": 0.5986, "grad_norm": 0.8881325721740723, "learning_rate": 0.0002, "epoch": 3.593536804308797, "step": 50040}, {"loss": 0.6351, "grad_norm": 0.9346749782562256, "learning_rate": 0.0002, "epoch": 3.594254937163375, "step": 50050}, {"loss": 0.6501, "grad_norm": 0.8705052137374878, "learning_rate": 0.0002, "epoch": 3.594973070017953, "step": 50060}, {"loss": 0.6753, "grad_norm": 1.039197564125061, "learning_rate": 0.0002, "epoch": 3.5956912028725316, "step": 50070}, {"loss": 0.6565, "grad_norm": 0.7053273320198059, "learning_rate": 0.0002, "epoch": 3.5964093357271096, "step": 50080}, {"loss": 0.6546, "grad_norm": 0.8268665671348572, "learning_rate": 0.0002, "epoch": 3.5971274685816876, "step": 50090}, {"loss": 0.6637, "grad_norm": 0.8921764492988586, "learning_rate": 0.0002, "epoch": 3.5978456014362656, "step": 50100}, {"loss": 0.6827, "grad_norm": 0.9756084680557251, "learning_rate": 0.0002, "epoch": 3.5985637342908436, "step": 50110}, {"loss": 0.6746, "grad_norm": 0.9275530576705933, "learning_rate": 0.0002, "epoch": 3.5992818671454216, "step": 50120}, {"loss": 0.6709, "grad_norm": 0.9030009508132935, "learning_rate": 0.0002, "epoch": 3.6, "step": 50130}, {"loss": 0.6344, "grad_norm": 0.7805638909339905, "learning_rate": 0.0002, "epoch": 3.600718132854578, "step": 50140}, {"loss": 0.6437, "grad_norm": 0.7627325057983398, "learning_rate": 0.0002, "epoch": 3.601436265709156, "step": 50150}, {"loss": 0.6523, "grad_norm": 0.7809714078903198, "learning_rate": 0.0002, "epoch": 3.6021543985637345, "step": 50160}, {"loss": 0.6578, "grad_norm": 0.7910378575325012, "learning_rate": 0.0002, "epoch": 3.6028725314183125, "step": 50170}, {"loss": 0.6522, "grad_norm": 1.004438042640686, "learning_rate": 0.0002, "epoch": 3.6035906642728905, "step": 50180}, {"loss": 0.6657, "grad_norm": 0.825969934463501, "learning_rate": 0.0002, "epoch": 3.6043087971274685, "step": 50190}, {"loss": 0.6788, "grad_norm": 0.8866565227508545, "learning_rate": 0.0002, "epoch": 3.6050269299820465, "step": 50200}, {"loss": 0.6643, "grad_norm": 0.8920543193817139, "learning_rate": 0.0002, "epoch": 3.6057450628366245, "step": 50210}, {"loss": 0.668, "grad_norm": 1.106584906578064, "learning_rate": 0.0002, "epoch": 3.606463195691203, "step": 50220}, {"loss": 0.6878, "grad_norm": 0.916607677936554, "learning_rate": 0.0002, "epoch": 3.607181328545781, "step": 50230}, {"loss": 0.6084, "grad_norm": 0.8014767169952393, "learning_rate": 0.0002, "epoch": 3.607899461400359, "step": 50240}, {"loss": 0.6718, "grad_norm": 0.9556822776794434, "learning_rate": 0.0002, "epoch": 3.608617594254937, "step": 50250}, {"loss": 0.6896, "grad_norm": 0.9630016684532166, "learning_rate": 0.0002, "epoch": 3.6093357271095154, "step": 50260}, {"loss": 0.692, "grad_norm": 0.9862125515937805, "learning_rate": 0.0002, "epoch": 3.6100538599640934, "step": 50270}, {"loss": 0.5981, "grad_norm": 1.0043333768844604, "learning_rate": 0.0002, "epoch": 3.6107719928186714, "step": 50280}, {"loss": 0.6243, "grad_norm": 0.9255319833755493, "learning_rate": 0.0002, "epoch": 3.6114901256732495, "step": 50290}, {"loss": 0.6374, "grad_norm": 1.012023687362671, "learning_rate": 0.0002, "epoch": 3.6122082585278275, "step": 50300}, {"loss": 0.6896, "grad_norm": 1.0701122283935547, "learning_rate": 0.0002, "epoch": 3.612926391382406, "step": 50310}, {"loss": 0.6474, "grad_norm": 0.8270810842514038, "learning_rate": 0.0002, "epoch": 3.613644524236984, "step": 50320}, {"loss": 0.6667, "grad_norm": 0.8881328105926514, "learning_rate": 0.0002, "epoch": 3.614362657091562, "step": 50330}, {"loss": 0.6517, "grad_norm": 0.9536844491958618, "learning_rate": 0.0002, "epoch": 3.61508078994614, "step": 50340}, {"loss": 0.62, "grad_norm": 0.8044326305389404, "learning_rate": 0.0002, "epoch": 3.6157989228007184, "step": 50350}, {"loss": 0.6259, "grad_norm": 0.834591805934906, "learning_rate": 0.0002, "epoch": 3.6165170556552964, "step": 50360}, {"loss": 0.7173, "grad_norm": 0.903752863407135, "learning_rate": 0.0002, "epoch": 3.6172351885098744, "step": 50370}, {"loss": 0.6305, "grad_norm": 0.9148632884025574, "learning_rate": 0.0002, "epoch": 3.6179533213644524, "step": 50380}, {"loss": 0.6624, "grad_norm": 0.9280176162719727, "learning_rate": 0.0002, "epoch": 3.6186714542190304, "step": 50390}, {"loss": 0.6457, "grad_norm": 0.9524136781692505, "learning_rate": 0.0002, "epoch": 3.6193895870736084, "step": 50400}, {"loss": 0.6918, "grad_norm": 1.1751197576522827, "learning_rate": 0.0002, "epoch": 3.620107719928187, "step": 50410}, {"loss": 0.6161, "grad_norm": 1.032279133796692, "learning_rate": 0.0002, "epoch": 3.620825852782765, "step": 50420}, {"loss": 0.6347, "grad_norm": 0.790741503238678, "learning_rate": 0.0002, "epoch": 3.621543985637343, "step": 50430}, {"loss": 0.695, "grad_norm": 0.9584221243858337, "learning_rate": 0.0002, "epoch": 3.6222621184919213, "step": 50440}, {"loss": 0.6393, "grad_norm": 0.7792508006095886, "learning_rate": 0.0002, "epoch": 3.6229802513464993, "step": 50450}, {"loss": 0.6398, "grad_norm": 0.8273448944091797, "learning_rate": 0.0002, "epoch": 3.6236983842010773, "step": 50460}, {"loss": 0.6436, "grad_norm": 0.8001132607460022, "learning_rate": 0.0002, "epoch": 3.6244165170556553, "step": 50470}, {"loss": 0.6499, "grad_norm": 1.077109694480896, "learning_rate": 0.0002, "epoch": 3.6251346499102333, "step": 50480}, {"loss": 0.6587, "grad_norm": 1.111274003982544, "learning_rate": 0.0002, "epoch": 3.6258527827648113, "step": 50490}, {"loss": 0.6842, "grad_norm": 0.7757347822189331, "learning_rate": 0.0002, "epoch": 3.6265709156193897, "step": 50500}, {"loss": 0.6887, "grad_norm": 0.9217049479484558, "learning_rate": 0.0002, "epoch": 3.6272890484739677, "step": 50510}, {"loss": 0.6903, "grad_norm": 0.9362251162528992, "learning_rate": 0.0002, "epoch": 3.6280071813285457, "step": 50520}, {"loss": 0.625, "grad_norm": 0.9435479044914246, "learning_rate": 0.0002, "epoch": 3.6287253141831237, "step": 50530}, {"loss": 0.5869, "grad_norm": 0.7748915553092957, "learning_rate": 0.0002, "epoch": 3.629443447037702, "step": 50540}, {"loss": 0.637, "grad_norm": 0.8238945007324219, "learning_rate": 0.0002, "epoch": 3.63016157989228, "step": 50550}, {"loss": 0.6251, "grad_norm": 0.8421505093574524, "learning_rate": 0.0002, "epoch": 3.630879712746858, "step": 50560}, {"loss": 0.6544, "grad_norm": 1.0272293090820312, "learning_rate": 0.0002, "epoch": 3.631597845601436, "step": 50570}, {"loss": 0.6467, "grad_norm": 0.7643818259239197, "learning_rate": 0.0002, "epoch": 3.632315978456014, "step": 50580}, {"loss": 0.6716, "grad_norm": 0.9756225347518921, "learning_rate": 0.0002, "epoch": 3.6330341113105926, "step": 50590}, {"loss": 0.6534, "grad_norm": 0.9311570525169373, "learning_rate": 0.0002, "epoch": 3.6337522441651706, "step": 50600}, {"loss": 0.6465, "grad_norm": 0.8829827904701233, "learning_rate": 0.0002, "epoch": 3.6344703770197486, "step": 50610}, {"loss": 0.626, "grad_norm": 0.9473454356193542, "learning_rate": 0.0002, "epoch": 3.6351885098743266, "step": 50620}, {"loss": 0.713, "grad_norm": 1.1023668050765991, "learning_rate": 0.0002, "epoch": 3.635906642728905, "step": 50630}, {"loss": 0.6287, "grad_norm": 0.8490299582481384, "learning_rate": 0.0002, "epoch": 3.636624775583483, "step": 50640}, {"loss": 0.6373, "grad_norm": 1.1129392385482788, "learning_rate": 0.0002, "epoch": 3.637342908438061, "step": 50650}, {"loss": 0.7351, "grad_norm": 1.0334501266479492, "learning_rate": 0.0002, "epoch": 3.638061041292639, "step": 50660}, {"loss": 0.69, "grad_norm": 0.8397296667098999, "learning_rate": 0.0002, "epoch": 3.638779174147217, "step": 50670}, {"loss": 0.6075, "grad_norm": 0.7984256744384766, "learning_rate": 0.0002, "epoch": 3.639497307001795, "step": 50680}, {"loss": 0.651, "grad_norm": 1.1182054281234741, "learning_rate": 0.0002, "epoch": 3.6402154398563735, "step": 50690}, {"loss": 0.6511, "grad_norm": 0.8743279576301575, "learning_rate": 0.0002, "epoch": 3.6409335727109515, "step": 50700}, {"loss": 0.6894, "grad_norm": 0.9101628661155701, "learning_rate": 0.0002, "epoch": 3.6416517055655295, "step": 50710}, {"loss": 0.6591, "grad_norm": 0.8866934180259705, "learning_rate": 0.0002, "epoch": 3.642369838420108, "step": 50720}, {"loss": 0.6483, "grad_norm": 0.863945484161377, "learning_rate": 0.0002, "epoch": 3.643087971274686, "step": 50730}, {"loss": 0.6443, "grad_norm": 1.0845744609832764, "learning_rate": 0.0002, "epoch": 3.643806104129264, "step": 50740}, {"loss": 0.6611, "grad_norm": 0.8610911965370178, "learning_rate": 0.0002, "epoch": 3.644524236983842, "step": 50750}, {"loss": 0.6617, "grad_norm": 0.8502625226974487, "learning_rate": 0.0002, "epoch": 3.64524236983842, "step": 50760}, {"loss": 0.6283, "grad_norm": 0.847372829914093, "learning_rate": 0.0002, "epoch": 3.645960502692998, "step": 50770}, {"loss": 0.5724, "grad_norm": 0.8649292588233948, "learning_rate": 0.0002, "epoch": 3.6466786355475764, "step": 50780}, {"loss": 0.6253, "grad_norm": 0.8742905855178833, "learning_rate": 0.0002, "epoch": 3.6473967684021544, "step": 50790}, {"loss": 0.68, "grad_norm": 0.9546048641204834, "learning_rate": 0.0002, "epoch": 3.6481149012567324, "step": 50800}, {"loss": 0.6212, "grad_norm": 0.7893161773681641, "learning_rate": 0.0002, "epoch": 3.6488330341113104, "step": 50810}, {"loss": 0.6328, "grad_norm": 0.9350247979164124, "learning_rate": 0.0002, "epoch": 3.649551166965889, "step": 50820}, {"loss": 0.6893, "grad_norm": 0.772149384021759, "learning_rate": 0.0002, "epoch": 3.650269299820467, "step": 50830}, {"loss": 0.6107, "grad_norm": 0.8281718492507935, "learning_rate": 0.0002, "epoch": 3.650987432675045, "step": 50840}, {"loss": 0.6136, "grad_norm": 0.8063850402832031, "learning_rate": 0.0002, "epoch": 3.651705565529623, "step": 50850}, {"loss": 0.6416, "grad_norm": 0.8101351261138916, "learning_rate": 0.0002, "epoch": 3.652423698384201, "step": 50860}, {"loss": 0.6636, "grad_norm": 0.8747833371162415, "learning_rate": 0.0002, "epoch": 3.6531418312387793, "step": 50870}, {"loss": 0.6575, "grad_norm": 0.9634656310081482, "learning_rate": 0.0002, "epoch": 3.6538599640933573, "step": 50880}, {"loss": 0.6227, "grad_norm": 1.1646045446395874, "learning_rate": 0.0002, "epoch": 3.6545780969479353, "step": 50890}, {"loss": 0.6628, "grad_norm": 0.8538454174995422, "learning_rate": 0.0002, "epoch": 3.6552962298025133, "step": 50900}, {"loss": 0.6488, "grad_norm": 0.7639184594154358, "learning_rate": 0.0002, "epoch": 3.656014362657092, "step": 50910}, {"loss": 0.6495, "grad_norm": 0.8750212788581848, "learning_rate": 0.0002, "epoch": 3.65673249551167, "step": 50920}, {"loss": 0.6601, "grad_norm": 0.9161198735237122, "learning_rate": 0.0002, "epoch": 3.657450628366248, "step": 50930}, {"loss": 0.6809, "grad_norm": 0.7987924814224243, "learning_rate": 0.0002, "epoch": 3.658168761220826, "step": 50940}, {"loss": 0.6228, "grad_norm": 0.8939290642738342, "learning_rate": 0.0002, "epoch": 3.658886894075404, "step": 50950}, {"loss": 0.687, "grad_norm": 0.9803797602653503, "learning_rate": 0.0002, "epoch": 3.659605026929982, "step": 50960}, {"loss": 0.6368, "grad_norm": 1.2423512935638428, "learning_rate": 0.0002, "epoch": 3.6603231597845602, "step": 50970}, {"loss": 0.6477, "grad_norm": 1.0023225545883179, "learning_rate": 0.0002, "epoch": 3.6610412926391382, "step": 50980}, {"loss": 0.6659, "grad_norm": 0.9066677689552307, "learning_rate": 0.0002, "epoch": 3.6617594254937162, "step": 50990}, {"loss": 0.6348, "grad_norm": 0.8906226754188538, "learning_rate": 0.0002, "epoch": 3.6624775583482947, "step": 51000}, {"loss": 0.5967, "grad_norm": 0.7449954152107239, "learning_rate": 0.0002, "epoch": 3.6631956912028727, "step": 51010}, {"loss": 0.6167, "grad_norm": 0.812612771987915, "learning_rate": 0.0002, "epoch": 3.6639138240574507, "step": 51020}, {"loss": 0.6414, "grad_norm": 0.861818253993988, "learning_rate": 0.0002, "epoch": 3.6646319569120287, "step": 51030}, {"loss": 0.6418, "grad_norm": 0.849726676940918, "learning_rate": 0.0002, "epoch": 3.6653500897666067, "step": 51040}, {"loss": 0.6613, "grad_norm": 0.9738494753837585, "learning_rate": 0.0002, "epoch": 3.6660682226211847, "step": 51050}, {"loss": 0.6094, "grad_norm": 0.928989827632904, "learning_rate": 0.0002, "epoch": 3.666786355475763, "step": 51060}, {"loss": 0.623, "grad_norm": 0.9725563526153564, "learning_rate": 0.0002, "epoch": 3.667504488330341, "step": 51070}, {"loss": 0.5967, "grad_norm": 0.9366095066070557, "learning_rate": 0.0002, "epoch": 3.668222621184919, "step": 51080}, {"loss": 0.6175, "grad_norm": 0.8012986779212952, "learning_rate": 0.0002, "epoch": 3.668940754039497, "step": 51090}, {"loss": 0.6428, "grad_norm": 1.0646892786026, "learning_rate": 0.0002, "epoch": 3.6696588868940756, "step": 51100}, {"loss": 0.6333, "grad_norm": 0.7245157361030579, "learning_rate": 0.0002, "epoch": 3.6703770197486536, "step": 51110}, {"loss": 0.6618, "grad_norm": 0.6938936114311218, "learning_rate": 0.0002, "epoch": 3.6710951526032316, "step": 51120}, {"loss": 0.6511, "grad_norm": 0.8461366295814514, "learning_rate": 0.0002, "epoch": 3.6718132854578096, "step": 51130}, {"loss": 0.6168, "grad_norm": 0.8392583131790161, "learning_rate": 0.0002, "epoch": 3.6725314183123876, "step": 51140}, {"loss": 0.6616, "grad_norm": 0.7245259284973145, "learning_rate": 0.0002, "epoch": 3.673249551166966, "step": 51150}, {"loss": 0.6165, "grad_norm": 1.0742167234420776, "learning_rate": 0.0002, "epoch": 3.673967684021544, "step": 51160}, {"loss": 0.6805, "grad_norm": 0.9553889036178589, "learning_rate": 0.0002, "epoch": 3.674685816876122, "step": 51170}, {"loss": 0.6065, "grad_norm": 0.8713715672492981, "learning_rate": 0.0002, "epoch": 3.6754039497307, "step": 51180}, {"loss": 0.599, "grad_norm": 0.7499800324440002, "learning_rate": 0.0002, "epoch": 3.6761220825852785, "step": 51190}, {"loss": 0.7143, "grad_norm": 1.1118139028549194, "learning_rate": 0.0002, "epoch": 3.6768402154398565, "step": 51200}, {"loss": 0.6694, "grad_norm": 0.8146613836288452, "learning_rate": 0.0002, "epoch": 3.6775583482944345, "step": 51210}, {"loss": 0.6528, "grad_norm": 0.9331285357475281, "learning_rate": 0.0002, "epoch": 3.6782764811490125, "step": 51220}, {"loss": 0.6429, "grad_norm": 1.0497597455978394, "learning_rate": 0.0002, "epoch": 3.6789946140035905, "step": 51230}, {"loss": 0.6404, "grad_norm": 0.879814863204956, "learning_rate": 0.0002, "epoch": 3.6797127468581685, "step": 51240}, {"loss": 0.6617, "grad_norm": 0.9896606802940369, "learning_rate": 0.0002, "epoch": 3.680430879712747, "step": 51250}, {"loss": 0.6461, "grad_norm": 0.928236186504364, "learning_rate": 0.0002, "epoch": 3.681149012567325, "step": 51260}, {"loss": 0.6516, "grad_norm": 0.8436732292175293, "learning_rate": 0.0002, "epoch": 3.681867145421903, "step": 51270}, {"loss": 0.6428, "grad_norm": 0.93634432554245, "learning_rate": 0.0002, "epoch": 3.6825852782764814, "step": 51280}, {"loss": 0.6081, "grad_norm": 0.8477143049240112, "learning_rate": 0.0002, "epoch": 3.6833034111310594, "step": 51290}, {"loss": 0.6536, "grad_norm": 0.8720934987068176, "learning_rate": 0.0002, "epoch": 3.6840215439856374, "step": 51300}, {"loss": 0.6523, "grad_norm": 0.7322931289672852, "learning_rate": 0.0002, "epoch": 3.6847396768402154, "step": 51310}, {"loss": 0.6475, "grad_norm": 1.0064427852630615, "learning_rate": 0.0002, "epoch": 3.6854578096947934, "step": 51320}, {"loss": 0.681, "grad_norm": 1.0197817087173462, "learning_rate": 0.0002, "epoch": 3.6861759425493714, "step": 51330}, {"loss": 0.5904, "grad_norm": 0.8764060139656067, "learning_rate": 0.0002, "epoch": 3.68689407540395, "step": 51340}, {"loss": 0.625, "grad_norm": 0.9763964414596558, "learning_rate": 0.0002, "epoch": 3.687612208258528, "step": 51350}, {"loss": 0.6299, "grad_norm": 0.8389105200767517, "learning_rate": 0.0002, "epoch": 3.688330341113106, "step": 51360}, {"loss": 0.6885, "grad_norm": 0.9215750694274902, "learning_rate": 0.0002, "epoch": 3.689048473967684, "step": 51370}, {"loss": 0.6325, "grad_norm": 0.8444913625717163, "learning_rate": 0.0002, "epoch": 3.6897666068222623, "step": 51380}, {"loss": 0.657, "grad_norm": 0.9635153412818909, "learning_rate": 0.0002, "epoch": 3.6904847396768403, "step": 51390}, {"loss": 0.7045, "grad_norm": 1.0397378206253052, "learning_rate": 0.0002, "epoch": 3.6912028725314183, "step": 51400}, {"loss": 0.6635, "grad_norm": 0.9154748320579529, "learning_rate": 0.0002, "epoch": 3.6919210053859963, "step": 51410}, {"loss": 0.6757, "grad_norm": 0.906445324420929, "learning_rate": 0.0002, "epoch": 3.6926391382405743, "step": 51420}, {"loss": 0.6533, "grad_norm": 0.9237992763519287, "learning_rate": 0.0002, "epoch": 3.6933572710951523, "step": 51430}, {"loss": 0.6257, "grad_norm": 0.8796338438987732, "learning_rate": 0.0002, "epoch": 3.6940754039497308, "step": 51440}, {"loss": 0.7063, "grad_norm": 0.8613203763961792, "learning_rate": 0.0002, "epoch": 3.6947935368043088, "step": 51450}, {"loss": 0.6455, "grad_norm": 0.7957607507705688, "learning_rate": 0.0002, "epoch": 3.6955116696588868, "step": 51460}, {"loss": 0.6328, "grad_norm": 0.9183711409568787, "learning_rate": 0.0002, "epoch": 3.6962298025134652, "step": 51470}, {"loss": 0.6289, "grad_norm": 1.0108308792114258, "learning_rate": 0.0002, "epoch": 3.6969479353680432, "step": 51480}, {"loss": 0.668, "grad_norm": 0.7768247127532959, "learning_rate": 0.0002, "epoch": 3.6976660682226212, "step": 51490}, {"loss": 0.6483, "grad_norm": 1.0051485300064087, "learning_rate": 0.0002, "epoch": 3.6983842010771992, "step": 51500}, {"loss": 0.6268, "grad_norm": 0.82451993227005, "learning_rate": 0.0002, "epoch": 3.6991023339317772, "step": 51510}, {"loss": 0.6258, "grad_norm": 0.9542286992073059, "learning_rate": 0.0002, "epoch": 3.6998204667863552, "step": 51520}, {"loss": 0.6415, "grad_norm": 0.693890392780304, "learning_rate": 0.0002, "epoch": 3.7005385996409337, "step": 51530}, {"loss": 0.6445, "grad_norm": 0.9068924784660339, "learning_rate": 0.0002, "epoch": 3.7012567324955117, "step": 51540}, {"loss": 0.6386, "grad_norm": 0.8694922924041748, "learning_rate": 0.0002, "epoch": 3.7019748653500897, "step": 51550}, {"loss": 0.6563, "grad_norm": 0.941081702709198, "learning_rate": 0.0002, "epoch": 3.702692998204668, "step": 51560}, {"loss": 0.6068, "grad_norm": 0.7385984659194946, "learning_rate": 0.0002, "epoch": 3.703411131059246, "step": 51570}, {"loss": 0.6243, "grad_norm": 1.0399216413497925, "learning_rate": 0.0002, "epoch": 3.704129263913824, "step": 51580}, {"loss": 0.6776, "grad_norm": 0.9802294969558716, "learning_rate": 0.0002, "epoch": 3.704847396768402, "step": 51590}, {"loss": 0.6243, "grad_norm": 1.0409669876098633, "learning_rate": 0.0002, "epoch": 3.70556552962298, "step": 51600}, {"loss": 0.6812, "grad_norm": 0.8972786068916321, "learning_rate": 0.0002, "epoch": 3.706283662477558, "step": 51610}, {"loss": 0.5993, "grad_norm": 1.1916245222091675, "learning_rate": 0.0002, "epoch": 3.7070017953321366, "step": 51620}, {"loss": 0.6566, "grad_norm": 0.9545385241508484, "learning_rate": 0.0002, "epoch": 3.7077199281867146, "step": 51630}, {"loss": 0.6497, "grad_norm": 1.0773427486419678, "learning_rate": 0.0002, "epoch": 3.7084380610412926, "step": 51640}, {"loss": 0.6768, "grad_norm": 1.0856024026870728, "learning_rate": 0.0002, "epoch": 3.7091561938958706, "step": 51650}, {"loss": 0.6404, "grad_norm": 0.7678500413894653, "learning_rate": 0.0002, "epoch": 3.709874326750449, "step": 51660}, {"loss": 0.6571, "grad_norm": 0.7276270985603333, "learning_rate": 0.0002, "epoch": 3.710592459605027, "step": 51670}, {"loss": 0.6498, "grad_norm": 0.8859017491340637, "learning_rate": 0.0002, "epoch": 3.711310592459605, "step": 51680}, {"loss": 0.6602, "grad_norm": 0.9037614464759827, "learning_rate": 0.0002, "epoch": 3.712028725314183, "step": 51690}, {"loss": 0.685, "grad_norm": 0.9223412275314331, "learning_rate": 0.0002, "epoch": 3.712746858168761, "step": 51700}, {"loss": 0.647, "grad_norm": 0.8812923431396484, "learning_rate": 0.0002, "epoch": 3.713464991023339, "step": 51710}, {"loss": 0.6546, "grad_norm": 0.8242456912994385, "learning_rate": 0.0002, "epoch": 3.7141831238779175, "step": 51720}, {"loss": 0.6462, "grad_norm": 0.8368834257125854, "learning_rate": 0.0002, "epoch": 3.7149012567324955, "step": 51730}, {"loss": 0.6432, "grad_norm": 0.8624704480171204, "learning_rate": 0.0002, "epoch": 3.7156193895870735, "step": 51740}, {"loss": 0.6367, "grad_norm": 0.9138273596763611, "learning_rate": 0.0002, "epoch": 3.716337522441652, "step": 51750}, {"loss": 0.6717, "grad_norm": 0.8088571429252625, "learning_rate": 0.0002, "epoch": 3.71705565529623, "step": 51760}, {"loss": 0.658, "grad_norm": 0.882808268070221, "learning_rate": 0.0002, "epoch": 3.717773788150808, "step": 51770}, {"loss": 0.6686, "grad_norm": 0.9368035197257996, "learning_rate": 0.0002, "epoch": 3.718491921005386, "step": 51780}, {"loss": 0.6482, "grad_norm": 0.8341794013977051, "learning_rate": 0.0002, "epoch": 3.719210053859964, "step": 51790}, {"loss": 0.6486, "grad_norm": 0.8692073225975037, "learning_rate": 0.0002, "epoch": 3.719928186714542, "step": 51800}, {"loss": 0.6591, "grad_norm": 0.7566918730735779, "learning_rate": 0.0002, "epoch": 3.7206463195691204, "step": 51810}, {"loss": 0.707, "grad_norm": 1.113138198852539, "learning_rate": 0.0002, "epoch": 3.7213644524236984, "step": 51820}, {"loss": 0.6683, "grad_norm": 0.8793158531188965, "learning_rate": 0.0002, "epoch": 3.7220825852782764, "step": 51830}, {"loss": 0.6343, "grad_norm": 0.8856439590454102, "learning_rate": 0.0002, "epoch": 3.722800718132855, "step": 51840}, {"loss": 0.6238, "grad_norm": 1.0182029008865356, "learning_rate": 0.0002, "epoch": 3.723518850987433, "step": 51850}, {"loss": 0.6743, "grad_norm": 1.1177181005477905, "learning_rate": 0.0002, "epoch": 3.724236983842011, "step": 51860}, {"loss": 0.6477, "grad_norm": 0.6600990295410156, "learning_rate": 0.0002, "epoch": 3.724955116696589, "step": 51870}, {"loss": 0.6532, "grad_norm": 1.0563536882400513, "learning_rate": 0.0002, "epoch": 3.725673249551167, "step": 51880}, {"loss": 0.6648, "grad_norm": 1.1067734956741333, "learning_rate": 0.0002, "epoch": 3.726391382405745, "step": 51890}, {"loss": 0.6547, "grad_norm": 1.0204616785049438, "learning_rate": 0.0002, "epoch": 3.7271095152603233, "step": 51900}, {"loss": 0.685, "grad_norm": 0.8647155165672302, "learning_rate": 0.0002, "epoch": 3.7278276481149013, "step": 51910}, {"loss": 0.739, "grad_norm": 1.0754971504211426, "learning_rate": 0.0002, "epoch": 3.7285457809694793, "step": 51920}, {"loss": 0.6535, "grad_norm": 1.0448992252349854, "learning_rate": 0.0002, "epoch": 3.7292639138240573, "step": 51930}, {"loss": 0.6802, "grad_norm": 0.963434100151062, "learning_rate": 0.0002, "epoch": 3.7299820466786358, "step": 51940}, {"loss": 0.6367, "grad_norm": 0.8112701773643494, "learning_rate": 0.0002, "epoch": 3.7307001795332138, "step": 51950}, {"loss": 0.6785, "grad_norm": 0.7975119948387146, "learning_rate": 0.0002, "epoch": 3.7314183123877918, "step": 51960}, {"loss": 0.6748, "grad_norm": 0.7953376173973083, "learning_rate": 0.0002, "epoch": 3.7321364452423698, "step": 51970}, {"loss": 0.6464, "grad_norm": 0.9519981741905212, "learning_rate": 0.0002, "epoch": 3.7328545780969478, "step": 51980}, {"loss": 0.6247, "grad_norm": 0.8705791234970093, "learning_rate": 0.0002, "epoch": 3.7335727109515258, "step": 51990}, {"loss": 0.6876, "grad_norm": 0.870205283164978, "learning_rate": 0.0002, "epoch": 3.734290843806104, "step": 52000}, {"loss": 0.6681, "grad_norm": 0.9558930993080139, "learning_rate": 0.0002, "epoch": 3.735008976660682, "step": 52010}, {"loss": 0.6772, "grad_norm": 0.9330434799194336, "learning_rate": 0.0002, "epoch": 3.73572710951526, "step": 52020}, {"loss": 0.6365, "grad_norm": 0.783620297908783, "learning_rate": 0.0002, "epoch": 3.7364452423698387, "step": 52030}, {"loss": 0.6275, "grad_norm": 0.7575166821479797, "learning_rate": 0.0002, "epoch": 3.7371633752244167, "step": 52040}, {"loss": 0.6859, "grad_norm": 1.0592705011367798, "learning_rate": 0.0002, "epoch": 3.7378815080789947, "step": 52050}, {"loss": 0.6704, "grad_norm": 0.9309433102607727, "learning_rate": 0.0002, "epoch": 3.7385996409335727, "step": 52060}, {"loss": 0.6607, "grad_norm": 0.972861647605896, "learning_rate": 0.0002, "epoch": 3.7393177737881507, "step": 52070}, {"loss": 0.6267, "grad_norm": 0.9318740963935852, "learning_rate": 0.0002, "epoch": 3.7400359066427287, "step": 52080}, {"loss": 0.6404, "grad_norm": 0.7938477396965027, "learning_rate": 0.0002, "epoch": 3.740754039497307, "step": 52090}, {"loss": 0.6451, "grad_norm": 1.1515966653823853, "learning_rate": 0.0002, "epoch": 3.741472172351885, "step": 52100}, {"loss": 0.6179, "grad_norm": 1.076869010925293, "learning_rate": 0.0002, "epoch": 3.742190305206463, "step": 52110}, {"loss": 0.6477, "grad_norm": 0.8516066670417786, "learning_rate": 0.0002, "epoch": 3.7429084380610416, "step": 52120}, {"loss": 0.6741, "grad_norm": 0.6853429079055786, "learning_rate": 0.0002, "epoch": 3.7436265709156196, "step": 52130}, {"loss": 0.6392, "grad_norm": 0.8179695010185242, "learning_rate": 0.0002, "epoch": 3.7443447037701976, "step": 52140}, {"loss": 0.6692, "grad_norm": 0.8395232558250427, "learning_rate": 0.0002, "epoch": 3.7450628366247756, "step": 52150}, {"loss": 0.6902, "grad_norm": 1.0178003311157227, "learning_rate": 0.0002, "epoch": 3.7457809694793536, "step": 52160}, {"loss": 0.6726, "grad_norm": 1.1801023483276367, "learning_rate": 0.0002, "epoch": 3.7464991023339316, "step": 52170}, {"loss": 0.6334, "grad_norm": 0.8215751647949219, "learning_rate": 0.0002, "epoch": 3.74721723518851, "step": 52180}, {"loss": 0.5992, "grad_norm": 1.17083740234375, "learning_rate": 0.0002, "epoch": 3.747935368043088, "step": 52190}, {"loss": 0.6219, "grad_norm": 0.9230290651321411, "learning_rate": 0.0002, "epoch": 3.748653500897666, "step": 52200}, {"loss": 0.6503, "grad_norm": 0.8431521058082581, "learning_rate": 0.0002, "epoch": 3.749371633752244, "step": 52210}, {"loss": 0.6983, "grad_norm": 0.9690840244293213, "learning_rate": 0.0002, "epoch": 3.7500897666068225, "step": 52220}, {"loss": 0.6204, "grad_norm": 1.0022395849227905, "learning_rate": 0.0002, "epoch": 3.7508078994614005, "step": 52230}, {"loss": 0.6683, "grad_norm": 1.0489065647125244, "learning_rate": 0.0002, "epoch": 3.7515260323159785, "step": 52240}, {"loss": 0.6439, "grad_norm": 0.7880696058273315, "learning_rate": 0.0002, "epoch": 3.7522441651705565, "step": 52250}, {"loss": 0.6933, "grad_norm": 1.0255829095840454, "learning_rate": 0.0002, "epoch": 3.7529622980251345, "step": 52260}, {"loss": 0.6631, "grad_norm": 0.8470141291618347, "learning_rate": 0.0002, "epoch": 3.7536804308797125, "step": 52270}, {"loss": 0.5956, "grad_norm": 0.9040523171424866, "learning_rate": 0.0002, "epoch": 3.754398563734291, "step": 52280}, {"loss": 0.6759, "grad_norm": 0.9564392566680908, "learning_rate": 0.0002, "epoch": 3.755116696588869, "step": 52290}, {"loss": 0.6717, "grad_norm": 0.907857358455658, "learning_rate": 0.0002, "epoch": 3.755834829443447, "step": 52300}, {"loss": 0.6821, "grad_norm": 0.8929873704910278, "learning_rate": 0.0002, "epoch": 3.7565529622980254, "step": 52310}, {"loss": 0.655, "grad_norm": 0.854434072971344, "learning_rate": 0.0002, "epoch": 3.7572710951526034, "step": 52320}, {"loss": 0.6668, "grad_norm": 0.8744779229164124, "learning_rate": 0.0002, "epoch": 3.7579892280071814, "step": 52330}, {"loss": 0.6628, "grad_norm": 0.9022667407989502, "learning_rate": 0.0002, "epoch": 3.7587073608617594, "step": 52340}, {"loss": 0.6275, "grad_norm": 0.8884857892990112, "learning_rate": 0.0002, "epoch": 3.7594254937163374, "step": 52350}, {"loss": 0.6585, "grad_norm": 1.0228430032730103, "learning_rate": 0.0002, "epoch": 3.7601436265709154, "step": 52360}, {"loss": 0.6092, "grad_norm": 0.8593528270721436, "learning_rate": 0.0002, "epoch": 3.760861759425494, "step": 52370}, {"loss": 0.664, "grad_norm": 0.9435563087463379, "learning_rate": 0.0002, "epoch": 3.761579892280072, "step": 52380}, {"loss": 0.6326, "grad_norm": 0.7545679807662964, "learning_rate": 0.0002, "epoch": 3.76229802513465, "step": 52390}, {"loss": 0.6628, "grad_norm": 0.9411585927009583, "learning_rate": 0.0002, "epoch": 3.7630161579892283, "step": 52400}, {"loss": 0.62, "grad_norm": 0.9764377474784851, "learning_rate": 0.0002, "epoch": 3.7637342908438063, "step": 52410}, {"loss": 0.671, "grad_norm": 1.0718384981155396, "learning_rate": 0.0002, "epoch": 3.7644524236983843, "step": 52420}, {"loss": 0.6654, "grad_norm": 0.8765230774879456, "learning_rate": 0.0002, "epoch": 3.7651705565529623, "step": 52430}, {"loss": 0.6602, "grad_norm": 0.9275036454200745, "learning_rate": 0.0002, "epoch": 3.7658886894075403, "step": 52440}, {"loss": 0.6098, "grad_norm": 0.967410147190094, "learning_rate": 0.0002, "epoch": 3.7666068222621183, "step": 52450}, {"loss": 0.6195, "grad_norm": 0.7738949060440063, "learning_rate": 0.0002, "epoch": 3.7673249551166967, "step": 52460}, {"loss": 0.6054, "grad_norm": 1.0828070640563965, "learning_rate": 0.0002, "epoch": 3.7680430879712747, "step": 52470}, {"loss": 0.6208, "grad_norm": 0.9570213556289673, "learning_rate": 0.0002, "epoch": 3.7687612208258527, "step": 52480}, {"loss": 0.6703, "grad_norm": 1.0688215494155884, "learning_rate": 0.0002, "epoch": 3.7694793536804307, "step": 52490}, {"loss": 0.5993, "grad_norm": 0.7970073223114014, "learning_rate": 0.0002, "epoch": 3.770197486535009, "step": 52500}, {"loss": 0.6537, "grad_norm": 0.7132976651191711, "learning_rate": 0.0002, "epoch": 3.770915619389587, "step": 52510}, {"loss": 0.6571, "grad_norm": 1.152268648147583, "learning_rate": 0.0002, "epoch": 3.771633752244165, "step": 52520}, {"loss": 0.6548, "grad_norm": 0.8645235896110535, "learning_rate": 0.0002, "epoch": 3.772351885098743, "step": 52530}, {"loss": 0.6918, "grad_norm": 0.7725570201873779, "learning_rate": 0.0002, "epoch": 3.773070017953321, "step": 52540}, {"loss": 0.6796, "grad_norm": 0.9718102812767029, "learning_rate": 0.0002, "epoch": 3.773788150807899, "step": 52550}, {"loss": 0.6298, "grad_norm": 0.7568017840385437, "learning_rate": 0.0002, "epoch": 3.7745062836624776, "step": 52560}, {"loss": 0.6652, "grad_norm": 0.9578912854194641, "learning_rate": 0.0002, "epoch": 3.7752244165170556, "step": 52570}, {"loss": 0.6417, "grad_norm": 0.8657314777374268, "learning_rate": 0.0002, "epoch": 3.7759425493716336, "step": 52580}, {"loss": 0.6552, "grad_norm": 0.7564393281936646, "learning_rate": 0.0002, "epoch": 3.776660682226212, "step": 52590}, {"loss": 0.69, "grad_norm": 0.7631160616874695, "learning_rate": 0.0002, "epoch": 3.77737881508079, "step": 52600}, {"loss": 0.6427, "grad_norm": 1.1852056980133057, "learning_rate": 0.0002, "epoch": 3.778096947935368, "step": 52610}, {"loss": 0.6369, "grad_norm": 1.0620790719985962, "learning_rate": 0.0002, "epoch": 3.778815080789946, "step": 52620}, {"loss": 0.6782, "grad_norm": 0.8677777647972107, "learning_rate": 0.0002, "epoch": 3.779533213644524, "step": 52630}, {"loss": 0.6249, "grad_norm": 0.9913218021392822, "learning_rate": 0.0002, "epoch": 3.780251346499102, "step": 52640}, {"loss": 0.625, "grad_norm": 0.9868429899215698, "learning_rate": 0.0002, "epoch": 3.7809694793536806, "step": 52650}, {"loss": 0.6252, "grad_norm": 0.8791782259941101, "learning_rate": 0.0002, "epoch": 3.7816876122082586, "step": 52660}, {"loss": 0.6675, "grad_norm": 0.9503955245018005, "learning_rate": 0.0002, "epoch": 3.7824057450628366, "step": 52670}, {"loss": 0.6406, "grad_norm": 0.8647131323814392, "learning_rate": 0.0002, "epoch": 3.7831238779174146, "step": 52680}, {"loss": 0.6654, "grad_norm": 0.9819629788398743, "learning_rate": 0.0002, "epoch": 3.783842010771993, "step": 52690}, {"loss": 0.593, "grad_norm": 0.8548610210418701, "learning_rate": 0.0002, "epoch": 3.784560143626571, "step": 52700}, {"loss": 0.6614, "grad_norm": 0.8706230521202087, "learning_rate": 0.0002, "epoch": 3.785278276481149, "step": 52710}, {"loss": 0.6326, "grad_norm": 1.0032461881637573, "learning_rate": 0.0002, "epoch": 3.785996409335727, "step": 52720}, {"loss": 0.6172, "grad_norm": 1.0578246116638184, "learning_rate": 0.0002, "epoch": 3.786714542190305, "step": 52730}, {"loss": 0.6392, "grad_norm": 0.9854007363319397, "learning_rate": 0.0002, "epoch": 3.7874326750448835, "step": 52740}, {"loss": 0.6462, "grad_norm": 0.8389187455177307, "learning_rate": 0.0002, "epoch": 3.7881508078994615, "step": 52750}, {"loss": 0.6515, "grad_norm": 0.9192399978637695, "learning_rate": 0.0002, "epoch": 3.7888689407540395, "step": 52760}, {"loss": 0.6436, "grad_norm": 0.9518283605575562, "learning_rate": 0.0002, "epoch": 3.7895870736086175, "step": 52770}, {"loss": 0.6548, "grad_norm": 1.1296825408935547, "learning_rate": 0.0002, "epoch": 3.790305206463196, "step": 52780}, {"loss": 0.6073, "grad_norm": 1.0589144229888916, "learning_rate": 0.0002, "epoch": 3.791023339317774, "step": 52790}, {"loss": 0.6593, "grad_norm": 0.8954343199729919, "learning_rate": 0.0002, "epoch": 3.791741472172352, "step": 52800}, {"loss": 0.6678, "grad_norm": 0.8283370733261108, "learning_rate": 0.0002, "epoch": 3.79245960502693, "step": 52810}, {"loss": 0.6865, "grad_norm": 0.910642683506012, "learning_rate": 0.0002, "epoch": 3.793177737881508, "step": 52820}, {"loss": 0.6672, "grad_norm": 0.9255108833312988, "learning_rate": 0.0002, "epoch": 3.793895870736086, "step": 52830}, {"loss": 0.6836, "grad_norm": 0.8773723244667053, "learning_rate": 0.0002, "epoch": 3.7946140035906644, "step": 52840}, {"loss": 0.6815, "grad_norm": 0.8454240560531616, "learning_rate": 0.0002, "epoch": 3.7953321364452424, "step": 52850}, {"loss": 0.6594, "grad_norm": 0.7636052966117859, "learning_rate": 0.0002, "epoch": 3.7960502692998204, "step": 52860}, {"loss": 0.6663, "grad_norm": 0.9358382821083069, "learning_rate": 0.0002, "epoch": 3.796768402154399, "step": 52870}, {"loss": 0.6761, "grad_norm": 0.9662801623344421, "learning_rate": 0.0002, "epoch": 3.797486535008977, "step": 52880}, {"loss": 0.6749, "grad_norm": 0.995907187461853, "learning_rate": 0.0002, "epoch": 3.798204667863555, "step": 52890}, {"loss": 0.6715, "grad_norm": 0.8700127005577087, "learning_rate": 0.0002, "epoch": 3.798922800718133, "step": 52900}, {"loss": 0.6554, "grad_norm": 0.8987792134284973, "learning_rate": 0.0002, "epoch": 3.799640933572711, "step": 52910}, {"loss": 0.6655, "grad_norm": 0.9753904938697815, "learning_rate": 0.0002, "epoch": 3.800359066427289, "step": 52920}, {"loss": 0.6536, "grad_norm": 0.7873555421829224, "learning_rate": 0.0002, "epoch": 3.8010771992818673, "step": 52930}, {"loss": 0.6233, "grad_norm": 0.8177929520606995, "learning_rate": 0.0002, "epoch": 3.8017953321364453, "step": 52940}, {"loss": 0.6508, "grad_norm": 0.8865532279014587, "learning_rate": 0.0002, "epoch": 3.8025134649910233, "step": 52950}, {"loss": 0.6922, "grad_norm": 0.9113775491714478, "learning_rate": 0.0002, "epoch": 3.8032315978456013, "step": 52960}, {"loss": 0.6382, "grad_norm": 0.9424585700035095, "learning_rate": 0.0002, "epoch": 3.8039497307001797, "step": 52970}, {"loss": 0.6694, "grad_norm": 0.8347237706184387, "learning_rate": 0.0002, "epoch": 3.8046678635547577, "step": 52980}, {"loss": 0.643, "grad_norm": 0.826863169670105, "learning_rate": 0.0002, "epoch": 3.8053859964093357, "step": 52990}, {"loss": 0.639, "grad_norm": 0.7313310503959656, "learning_rate": 0.0002, "epoch": 3.8061041292639137, "step": 53000}, {"loss": 0.6831, "grad_norm": 0.8352667093276978, "learning_rate": 0.0002, "epoch": 3.8068222621184917, "step": 53010}, {"loss": 0.6265, "grad_norm": 0.748461127281189, "learning_rate": 0.0002, "epoch": 3.80754039497307, "step": 53020}, {"loss": 0.6433, "grad_norm": 0.943256139755249, "learning_rate": 0.0002, "epoch": 3.808258527827648, "step": 53030}, {"loss": 0.6702, "grad_norm": 1.0448410511016846, "learning_rate": 0.0002, "epoch": 3.808976660682226, "step": 53040}, {"loss": 0.6901, "grad_norm": 0.9047636985778809, "learning_rate": 0.0002, "epoch": 3.809694793536804, "step": 53050}, {"loss": 0.6774, "grad_norm": 0.8594381213188171, "learning_rate": 0.0002, "epoch": 3.8104129263913826, "step": 53060}, {"loss": 0.6664, "grad_norm": 0.7593536972999573, "learning_rate": 0.0002, "epoch": 3.8111310592459606, "step": 53070}, {"loss": 0.6651, "grad_norm": 0.7189019918441772, "learning_rate": 0.0002, "epoch": 3.8118491921005386, "step": 53080}, {"loss": 0.6657, "grad_norm": 0.8569809198379517, "learning_rate": 0.0002, "epoch": 3.8125673249551166, "step": 53090}, {"loss": 0.6689, "grad_norm": 0.923378050327301, "learning_rate": 0.0002, "epoch": 3.8132854578096946, "step": 53100}, {"loss": 0.6168, "grad_norm": 0.9088824391365051, "learning_rate": 0.0002, "epoch": 3.8140035906642726, "step": 53110}, {"loss": 0.6514, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 3.814721723518851, "step": 53120}, {"loss": 0.6182, "grad_norm": 0.8389552235603333, "learning_rate": 0.0002, "epoch": 3.815439856373429, "step": 53130}, {"loss": 0.6779, "grad_norm": 0.7940975427627563, "learning_rate": 0.0002, "epoch": 3.816157989228007, "step": 53140}, {"loss": 0.6825, "grad_norm": 0.8389907479286194, "learning_rate": 0.0002, "epoch": 3.8168761220825855, "step": 53150}, {"loss": 0.6763, "grad_norm": 0.774206280708313, "learning_rate": 0.0002, "epoch": 3.8175942549371635, "step": 53160}, {"loss": 0.7011, "grad_norm": 1.189447283744812, "learning_rate": 0.0002, "epoch": 3.8183123877917415, "step": 53170}, {"loss": 0.6206, "grad_norm": 0.9875882863998413, "learning_rate": 0.0002, "epoch": 3.8190305206463195, "step": 53180}, {"loss": 0.6254, "grad_norm": 0.9205945134162903, "learning_rate": 0.0002, "epoch": 3.8197486535008975, "step": 53190}, {"loss": 0.5845, "grad_norm": 0.8312796354293823, "learning_rate": 0.0002, "epoch": 3.8204667863554755, "step": 53200}, {"loss": 0.6415, "grad_norm": 0.9755756855010986, "learning_rate": 0.0002, "epoch": 3.821184919210054, "step": 53210}, {"loss": 0.6657, "grad_norm": 1.0722965002059937, "learning_rate": 0.0002, "epoch": 3.821903052064632, "step": 53220}, {"loss": 0.6547, "grad_norm": 0.7720510959625244, "learning_rate": 0.0002, "epoch": 3.82262118491921, "step": 53230}, {"loss": 0.6383, "grad_norm": 1.020147681236267, "learning_rate": 0.0002, "epoch": 3.823339317773788, "step": 53240}, {"loss": 0.6491, "grad_norm": 0.8241816759109497, "learning_rate": 0.0002, "epoch": 3.8240574506283664, "step": 53250}, {"loss": 0.6914, "grad_norm": 0.8939895629882812, "learning_rate": 0.0002, "epoch": 3.8247755834829444, "step": 53260}, {"loss": 0.6725, "grad_norm": 1.010852336883545, "learning_rate": 0.0002, "epoch": 3.8254937163375224, "step": 53270}, {"loss": 0.6841, "grad_norm": 0.8201420307159424, "learning_rate": 0.0002, "epoch": 3.8262118491921004, "step": 53280}, {"loss": 0.6739, "grad_norm": 0.8797973990440369, "learning_rate": 0.0002, "epoch": 3.8269299820466784, "step": 53290}, {"loss": 0.658, "grad_norm": 0.9034950137138367, "learning_rate": 0.0002, "epoch": 3.827648114901257, "step": 53300}, {"loss": 0.6314, "grad_norm": 0.926802933216095, "learning_rate": 0.0002, "epoch": 3.828366247755835, "step": 53310}, {"loss": 0.6526, "grad_norm": 1.0205509662628174, "learning_rate": 0.0002, "epoch": 3.829084380610413, "step": 53320}, {"loss": 0.6596, "grad_norm": 0.9524099230766296, "learning_rate": 0.0002, "epoch": 3.829802513464991, "step": 53330}, {"loss": 0.6796, "grad_norm": 0.9692625999450684, "learning_rate": 0.0002, "epoch": 3.8305206463195693, "step": 53340}, {"loss": 0.628, "grad_norm": 0.7255275845527649, "learning_rate": 0.0002, "epoch": 3.8312387791741473, "step": 53350}, {"loss": 0.6104, "grad_norm": 0.7199059724807739, "learning_rate": 0.0002, "epoch": 3.8319569120287253, "step": 53360}, {"loss": 0.6703, "grad_norm": 1.004464864730835, "learning_rate": 0.0002, "epoch": 3.8326750448833034, "step": 53370}, {"loss": 0.7032, "grad_norm": 0.9092583060264587, "learning_rate": 0.0002, "epoch": 3.8333931777378814, "step": 53380}, {"loss": 0.6811, "grad_norm": 0.945091724395752, "learning_rate": 0.0002, "epoch": 3.8341113105924594, "step": 53390}, {"loss": 0.611, "grad_norm": 0.7980135679244995, "learning_rate": 0.0002, "epoch": 3.834829443447038, "step": 53400}, {"loss": 0.6604, "grad_norm": 0.7812868356704712, "learning_rate": 0.0002, "epoch": 3.835547576301616, "step": 53410}, {"loss": 0.6104, "grad_norm": 0.8957077860832214, "learning_rate": 0.0002, "epoch": 3.836265709156194, "step": 53420}, {"loss": 0.6754, "grad_norm": 0.9119600653648376, "learning_rate": 0.0002, "epoch": 3.8369838420107722, "step": 53430}, {"loss": 0.7346, "grad_norm": 0.8208187222480774, "learning_rate": 0.0002, "epoch": 3.8377019748653503, "step": 53440}, {"loss": 0.6549, "grad_norm": 0.7930439114570618, "learning_rate": 0.0002, "epoch": 3.8384201077199283, "step": 53450}, {"loss": 0.6192, "grad_norm": 0.8937777280807495, "learning_rate": 0.0002, "epoch": 3.8391382405745063, "step": 53460}, {"loss": 0.5954, "grad_norm": 0.7583796977996826, "learning_rate": 0.0002, "epoch": 3.8398563734290843, "step": 53470}, {"loss": 0.6217, "grad_norm": 1.0735969543457031, "learning_rate": 0.0002, "epoch": 3.8405745062836623, "step": 53480}, {"loss": 0.6472, "grad_norm": 1.1106033325195312, "learning_rate": 0.0002, "epoch": 3.8412926391382407, "step": 53490}, {"loss": 0.6813, "grad_norm": 1.092631220817566, "learning_rate": 0.0002, "epoch": 3.8420107719928187, "step": 53500}, {"loss": 0.6437, "grad_norm": 0.9961787462234497, "learning_rate": 0.0002, "epoch": 3.8427289048473967, "step": 53510}, {"loss": 0.6382, "grad_norm": 0.833831250667572, "learning_rate": 0.0002, "epoch": 3.8434470377019747, "step": 53520}, {"loss": 0.6403, "grad_norm": 1.0000009536743164, "learning_rate": 0.0002, "epoch": 3.844165170556553, "step": 53530}, {"loss": 0.6824, "grad_norm": 0.9784213304519653, "learning_rate": 0.0002, "epoch": 3.844883303411131, "step": 53540}, {"loss": 0.6816, "grad_norm": 0.8582558035850525, "learning_rate": 0.0002, "epoch": 3.845601436265709, "step": 53550}, {"loss": 0.5944, "grad_norm": 0.8267415761947632, "learning_rate": 0.0002, "epoch": 3.846319569120287, "step": 53560}, {"loss": 0.6562, "grad_norm": 0.8783000111579895, "learning_rate": 0.0002, "epoch": 3.847037701974865, "step": 53570}, {"loss": 0.6795, "grad_norm": 0.9866999983787537, "learning_rate": 0.0002, "epoch": 3.8477558348294436, "step": 53580}, {"loss": 0.7222, "grad_norm": 0.8459296226501465, "learning_rate": 0.0002, "epoch": 3.8484739676840216, "step": 53590}, {"loss": 0.6748, "grad_norm": 0.9804834723472595, "learning_rate": 0.0002, "epoch": 3.8491921005385996, "step": 53600}, {"loss": 0.6115, "grad_norm": 0.951074481010437, "learning_rate": 0.0002, "epoch": 3.8499102333931776, "step": 53610}, {"loss": 0.5914, "grad_norm": 0.8020104169845581, "learning_rate": 0.0002, "epoch": 3.850628366247756, "step": 53620}, {"loss": 0.6237, "grad_norm": 0.9296963214874268, "learning_rate": 0.0002, "epoch": 3.851346499102334, "step": 53630}, {"loss": 0.6384, "grad_norm": 0.8983652591705322, "learning_rate": 0.0002, "epoch": 3.852064631956912, "step": 53640}, {"loss": 0.6855, "grad_norm": 1.031858205795288, "learning_rate": 0.0002, "epoch": 3.85278276481149, "step": 53650}, {"loss": 0.622, "grad_norm": 0.8943952918052673, "learning_rate": 0.0002, "epoch": 3.853500897666068, "step": 53660}, {"loss": 0.6745, "grad_norm": 1.0072312355041504, "learning_rate": 0.0002, "epoch": 3.854219030520646, "step": 53670}, {"loss": 0.677, "grad_norm": 1.0604884624481201, "learning_rate": 0.0002, "epoch": 3.8549371633752245, "step": 53680}, {"loss": 0.5873, "grad_norm": 0.834223210811615, "learning_rate": 0.0002, "epoch": 3.8556552962298025, "step": 53690}, {"loss": 0.665, "grad_norm": 0.9872867465019226, "learning_rate": 0.0002, "epoch": 3.8563734290843805, "step": 53700}, {"loss": 0.6689, "grad_norm": 0.7999459505081177, "learning_rate": 0.0002, "epoch": 3.857091561938959, "step": 53710}, {"loss": 0.6744, "grad_norm": 0.717722475528717, "learning_rate": 0.0002, "epoch": 3.857809694793537, "step": 53720}, {"loss": 0.6348, "grad_norm": 1.0675442218780518, "learning_rate": 0.0002, "epoch": 3.858527827648115, "step": 53730}, {"loss": 0.6141, "grad_norm": 0.9789777398109436, "learning_rate": 0.0002, "epoch": 3.859245960502693, "step": 53740}, {"loss": 0.6455, "grad_norm": 0.9318669438362122, "learning_rate": 0.0002, "epoch": 3.859964093357271, "step": 53750}, {"loss": 0.6587, "grad_norm": 0.9848631024360657, "learning_rate": 0.0002, "epoch": 3.860682226211849, "step": 53760}, {"loss": 0.6202, "grad_norm": 0.8754391670227051, "learning_rate": 0.0002, "epoch": 3.8614003590664274, "step": 53770}, {"loss": 0.6411, "grad_norm": 0.9024585485458374, "learning_rate": 0.0002, "epoch": 3.8621184919210054, "step": 53780}, {"loss": 0.6643, "grad_norm": 0.8974794745445251, "learning_rate": 0.0002, "epoch": 3.8628366247755834, "step": 53790}, {"loss": 0.6729, "grad_norm": 0.8342790603637695, "learning_rate": 0.0002, "epoch": 3.8635547576301614, "step": 53800}, {"loss": 0.6322, "grad_norm": 0.8177682757377625, "learning_rate": 0.0002, "epoch": 3.86427289048474, "step": 53810}, {"loss": 0.6525, "grad_norm": 1.0259089469909668, "learning_rate": 0.0002, "epoch": 3.864991023339318, "step": 53820}, {"loss": 0.6508, "grad_norm": 1.042290210723877, "learning_rate": 0.0002, "epoch": 3.865709156193896, "step": 53830}, {"loss": 0.6963, "grad_norm": 0.7316540479660034, "learning_rate": 0.0002, "epoch": 3.866427289048474, "step": 53840}, {"loss": 0.6491, "grad_norm": 0.9384970664978027, "learning_rate": 0.0002, "epoch": 3.867145421903052, "step": 53850}, {"loss": 0.6689, "grad_norm": 0.9273143410682678, "learning_rate": 0.0002, "epoch": 3.86786355475763, "step": 53860}, {"loss": 0.6443, "grad_norm": 1.1183570623397827, "learning_rate": 0.0002, "epoch": 3.8685816876122083, "step": 53870}, {"loss": 0.6712, "grad_norm": 0.9455275535583496, "learning_rate": 0.0002, "epoch": 3.8692998204667863, "step": 53880}, {"loss": 0.6662, "grad_norm": 0.8702114820480347, "learning_rate": 0.0002, "epoch": 3.8700179533213643, "step": 53890}, {"loss": 0.7032, "grad_norm": 0.8751053214073181, "learning_rate": 0.0002, "epoch": 3.870736086175943, "step": 53900}, {"loss": 0.6398, "grad_norm": 0.9793110489845276, "learning_rate": 0.0002, "epoch": 3.871454219030521, "step": 53910}, {"loss": 0.6577, "grad_norm": 0.9705014824867249, "learning_rate": 0.0002, "epoch": 3.872172351885099, "step": 53920}, {"loss": 0.751, "grad_norm": 1.051504373550415, "learning_rate": 0.0002, "epoch": 3.872890484739677, "step": 53930}, {"loss": 0.6606, "grad_norm": 0.8590622544288635, "learning_rate": 0.0002, "epoch": 3.873608617594255, "step": 53940}, {"loss": 0.6495, "grad_norm": 0.7828099727630615, "learning_rate": 0.0002, "epoch": 3.874326750448833, "step": 53950}, {"loss": 0.6294, "grad_norm": 0.86341792345047, "learning_rate": 0.0002, "epoch": 3.8750448833034112, "step": 53960}, {"loss": 0.6677, "grad_norm": 1.114670991897583, "learning_rate": 0.0002, "epoch": 3.8757630161579892, "step": 53970}, {"loss": 0.6533, "grad_norm": 0.8559519052505493, "learning_rate": 0.0002, "epoch": 3.8764811490125672, "step": 53980}, {"loss": 0.6517, "grad_norm": 1.0518953800201416, "learning_rate": 0.0002, "epoch": 3.8771992818671457, "step": 53990}, {"loss": 0.6359, "grad_norm": 0.7157500982284546, "learning_rate": 0.0002, "epoch": 3.8779174147217237, "step": 54000}, {"loss": 0.6847, "grad_norm": 0.8390372395515442, "learning_rate": 0.0002, "epoch": 3.8786355475763017, "step": 54010}, {"loss": 0.6376, "grad_norm": 0.8486756086349487, "learning_rate": 0.0002, "epoch": 3.8793536804308797, "step": 54020}, {"loss": 0.6184, "grad_norm": 0.8361587524414062, "learning_rate": 0.0002, "epoch": 3.8800718132854577, "step": 54030}, {"loss": 0.6552, "grad_norm": 0.9490554928779602, "learning_rate": 0.0002, "epoch": 3.8807899461400357, "step": 54040}, {"loss": 0.6653, "grad_norm": 1.0311323404312134, "learning_rate": 0.0002, "epoch": 3.881508078994614, "step": 54050}, {"loss": 0.6484, "grad_norm": 0.84800124168396, "learning_rate": 0.0002, "epoch": 3.882226211849192, "step": 54060}, {"loss": 0.6995, "grad_norm": 0.8940879702568054, "learning_rate": 0.0002, "epoch": 3.88294434470377, "step": 54070}, {"loss": 0.6157, "grad_norm": 0.985542356967926, "learning_rate": 0.0002, "epoch": 3.883662477558348, "step": 54080}, {"loss": 0.6221, "grad_norm": 0.8846475481987, "learning_rate": 0.0002, "epoch": 3.8843806104129266, "step": 54090}, {"loss": 0.6656, "grad_norm": 0.9186338186264038, "learning_rate": 0.0002, "epoch": 3.8850987432675046, "step": 54100}, {"loss": 0.6367, "grad_norm": 1.106598973274231, "learning_rate": 0.0002, "epoch": 3.8858168761220826, "step": 54110}, {"loss": 0.6311, "grad_norm": 0.8167300224304199, "learning_rate": 0.0002, "epoch": 3.8865350089766606, "step": 54120}, {"loss": 0.694, "grad_norm": 0.9153622984886169, "learning_rate": 0.0002, "epoch": 3.8872531418312386, "step": 54130}, {"loss": 0.6669, "grad_norm": 0.8464475274085999, "learning_rate": 0.0002, "epoch": 3.8879712746858166, "step": 54140}, {"loss": 0.6658, "grad_norm": 0.8889452815055847, "learning_rate": 0.0002, "epoch": 3.888689407540395, "step": 54150}, {"loss": 0.6291, "grad_norm": 0.7861065864562988, "learning_rate": 0.0002, "epoch": 3.889407540394973, "step": 54160}, {"loss": 0.6315, "grad_norm": 0.882674515247345, "learning_rate": 0.0002, "epoch": 3.890125673249551, "step": 54170}, {"loss": 0.6223, "grad_norm": 0.8503835201263428, "learning_rate": 0.0002, "epoch": 3.8908438061041295, "step": 54180}, {"loss": 0.6176, "grad_norm": 0.888455331325531, "learning_rate": 0.0002, "epoch": 3.8915619389587075, "step": 54190}, {"loss": 0.6985, "grad_norm": 1.0473699569702148, "learning_rate": 0.0002, "epoch": 3.8922800718132855, "step": 54200}, {"loss": 0.6513, "grad_norm": 0.9548208713531494, "learning_rate": 0.0002, "epoch": 3.8929982046678635, "step": 54210}, {"loss": 0.6089, "grad_norm": 0.9158754944801331, "learning_rate": 0.0002, "epoch": 3.8937163375224415, "step": 54220}, {"loss": 0.6352, "grad_norm": 0.9001154899597168, "learning_rate": 0.0002, "epoch": 3.8944344703770195, "step": 54230}, {"loss": 0.6657, "grad_norm": 0.9736626148223877, "learning_rate": 0.0002, "epoch": 3.895152603231598, "step": 54240}, {"loss": 0.7248, "grad_norm": 0.8809846043586731, "learning_rate": 0.0002, "epoch": 3.895870736086176, "step": 54250}, {"loss": 0.6364, "grad_norm": 0.887583315372467, "learning_rate": 0.0002, "epoch": 3.896588868940754, "step": 54260}, {"loss": 0.6252, "grad_norm": 0.8395712971687317, "learning_rate": 0.0002, "epoch": 3.8973070017953324, "step": 54270}, {"loss": 0.681, "grad_norm": 0.8391315937042236, "learning_rate": 0.0002, "epoch": 3.8980251346499104, "step": 54280}, {"loss": 0.6352, "grad_norm": 0.8210049271583557, "learning_rate": 0.0002, "epoch": 3.8987432675044884, "step": 54290}, {"loss": 0.6484, "grad_norm": 1.1364530324935913, "learning_rate": 0.0002, "epoch": 3.8994614003590664, "step": 54300}, {"loss": 0.6383, "grad_norm": 0.7712056636810303, "learning_rate": 0.0002, "epoch": 3.9001795332136444, "step": 54310}, {"loss": 0.6516, "grad_norm": 0.9466049671173096, "learning_rate": 0.0002, "epoch": 3.9008976660682224, "step": 54320}, {"loss": 0.6938, "grad_norm": 1.0367140769958496, "learning_rate": 0.0002, "epoch": 3.901615798922801, "step": 54330}, {"loss": 0.672, "grad_norm": 1.0168321132659912, "learning_rate": 0.0002, "epoch": 3.902333931777379, "step": 54340}, {"loss": 0.6306, "grad_norm": 0.7830407619476318, "learning_rate": 0.0002, "epoch": 3.903052064631957, "step": 54350}, {"loss": 0.7198, "grad_norm": 0.9649789333343506, "learning_rate": 0.0002, "epoch": 3.903770197486535, "step": 54360}, {"loss": 0.6644, "grad_norm": 0.681077778339386, "learning_rate": 0.0002, "epoch": 3.9044883303411133, "step": 54370}, {"loss": 0.6677, "grad_norm": 0.8970136046409607, "learning_rate": 0.0002, "epoch": 3.9052064631956913, "step": 54380}, {"loss": 0.6581, "grad_norm": 0.9155173301696777, "learning_rate": 0.0002, "epoch": 3.9059245960502693, "step": 54390}, {"loss": 0.6711, "grad_norm": 1.0447794198989868, "learning_rate": 0.0002, "epoch": 3.9066427289048473, "step": 54400}, {"loss": 0.6883, "grad_norm": 0.7823813557624817, "learning_rate": 0.0002, "epoch": 3.9073608617594253, "step": 54410}, {"loss": 0.6688, "grad_norm": 0.9289445877075195, "learning_rate": 0.0002, "epoch": 3.9080789946140033, "step": 54420}, {"loss": 0.7024, "grad_norm": 0.9983111619949341, "learning_rate": 0.0002, "epoch": 3.9087971274685818, "step": 54430}, {"loss": 0.6687, "grad_norm": 0.7952495813369751, "learning_rate": 0.0002, "epoch": 3.9095152603231598, "step": 54440}, {"loss": 0.6118, "grad_norm": 0.8045601844787598, "learning_rate": 0.0002, "epoch": 3.9102333931777378, "step": 54450}, {"loss": 0.6388, "grad_norm": 0.936585009098053, "learning_rate": 0.0002, "epoch": 3.910951526032316, "step": 54460}, {"loss": 0.6217, "grad_norm": 0.745793879032135, "learning_rate": 0.0002, "epoch": 3.911669658886894, "step": 54470}, {"loss": 0.6814, "grad_norm": 0.9137616157531738, "learning_rate": 0.0002, "epoch": 3.912387791741472, "step": 54480}, {"loss": 0.6792, "grad_norm": 0.826316237449646, "learning_rate": 0.0002, "epoch": 3.9131059245960502, "step": 54490}, {"loss": 0.6914, "grad_norm": 0.94313645362854, "learning_rate": 0.0002, "epoch": 3.9138240574506282, "step": 54500}, {"loss": 0.62, "grad_norm": 1.045893907546997, "learning_rate": 0.0002, "epoch": 3.9145421903052062, "step": 54510}, {"loss": 0.5841, "grad_norm": 0.9122704863548279, "learning_rate": 0.0002, "epoch": 3.9152603231597847, "step": 54520}, {"loss": 0.7029, "grad_norm": 1.0999689102172852, "learning_rate": 0.0002, "epoch": 3.9159784560143627, "step": 54530}, {"loss": 0.6387, "grad_norm": 0.9281555414199829, "learning_rate": 0.0002, "epoch": 3.9166965888689407, "step": 54540}, {"loss": 0.6227, "grad_norm": 1.1439622640609741, "learning_rate": 0.0002, "epoch": 3.917414721723519, "step": 54550}, {"loss": 0.6733, "grad_norm": 0.9375617504119873, "learning_rate": 0.0002, "epoch": 3.918132854578097, "step": 54560}, {"loss": 0.6503, "grad_norm": 0.92906653881073, "learning_rate": 0.0002, "epoch": 3.918850987432675, "step": 54570}, {"loss": 0.6361, "grad_norm": 1.0840893983840942, "learning_rate": 0.0002, "epoch": 3.919569120287253, "step": 54580}, {"loss": 0.6476, "grad_norm": 0.8145509362220764, "learning_rate": 0.0002, "epoch": 3.920287253141831, "step": 54590}, {"loss": 0.6826, "grad_norm": 0.973737895488739, "learning_rate": 0.0002, "epoch": 3.921005385996409, "step": 54600}, {"loss": 0.6822, "grad_norm": 0.9302353858947754, "learning_rate": 0.0002, "epoch": 3.9217235188509876, "step": 54610}, {"loss": 0.6522, "grad_norm": 0.9167897701263428, "learning_rate": 0.0002, "epoch": 3.9224416517055656, "step": 54620}, {"loss": 0.6783, "grad_norm": 0.8096851706504822, "learning_rate": 0.0002, "epoch": 3.9231597845601436, "step": 54630}, {"loss": 0.6369, "grad_norm": 0.8006368279457092, "learning_rate": 0.0002, "epoch": 3.9238779174147216, "step": 54640}, {"loss": 0.6533, "grad_norm": 0.7800863981246948, "learning_rate": 0.0002, "epoch": 3.9245960502693, "step": 54650}, {"loss": 0.6518, "grad_norm": 1.0331560373306274, "learning_rate": 0.0002, "epoch": 3.925314183123878, "step": 54660}, {"loss": 0.6764, "grad_norm": 1.0057517290115356, "learning_rate": 0.0002, "epoch": 3.926032315978456, "step": 54670}, {"loss": 0.6636, "grad_norm": 0.8920564651489258, "learning_rate": 0.0002, "epoch": 3.926750448833034, "step": 54680}, {"loss": 0.6432, "grad_norm": 0.7704599499702454, "learning_rate": 0.0002, "epoch": 3.927468581687612, "step": 54690}, {"loss": 0.6532, "grad_norm": 0.827032208442688, "learning_rate": 0.0002, "epoch": 3.92818671454219, "step": 54700}, {"loss": 0.7083, "grad_norm": 1.0019268989562988, "learning_rate": 0.0002, "epoch": 3.9289048473967685, "step": 54710}, {"loss": 0.6026, "grad_norm": 0.862033486366272, "learning_rate": 0.0002, "epoch": 3.9296229802513465, "step": 54720}, {"loss": 0.599, "grad_norm": 0.8965592980384827, "learning_rate": 0.0002, "epoch": 3.9303411131059245, "step": 54730}, {"loss": 0.6739, "grad_norm": 0.7689077854156494, "learning_rate": 0.0002, "epoch": 3.931059245960503, "step": 54740}, {"loss": 0.6401, "grad_norm": 0.846276581287384, "learning_rate": 0.0002, "epoch": 3.931777378815081, "step": 54750}, {"loss": 0.6942, "grad_norm": 0.8932713866233826, "learning_rate": 0.0002, "epoch": 3.932495511669659, "step": 54760}, {"loss": 0.6697, "grad_norm": 0.9711386561393738, "learning_rate": 0.0002, "epoch": 3.933213644524237, "step": 54770}, {"loss": 0.6672, "grad_norm": 0.9290250539779663, "learning_rate": 0.0002, "epoch": 3.933931777378815, "step": 54780}, {"loss": 0.6365, "grad_norm": 1.0897367000579834, "learning_rate": 0.0002, "epoch": 3.934649910233393, "step": 54790}, {"loss": 0.6647, "grad_norm": 0.8451842665672302, "learning_rate": 0.0002, "epoch": 3.9353680430879714, "step": 54800}, {"loss": 0.6705, "grad_norm": 0.8400090336799622, "learning_rate": 0.0002, "epoch": 3.9360861759425494, "step": 54810}, {"loss": 0.6577, "grad_norm": 0.951383650302887, "learning_rate": 0.0002, "epoch": 3.9368043087971274, "step": 54820}, {"loss": 0.654, "grad_norm": 0.848838210105896, "learning_rate": 0.0002, "epoch": 3.937522441651706, "step": 54830}, {"loss": 0.6852, "grad_norm": 0.735763669013977, "learning_rate": 0.0002, "epoch": 3.938240574506284, "step": 54840}, {"loss": 0.6574, "grad_norm": 0.979037344455719, "learning_rate": 0.0002, "epoch": 3.938958707360862, "step": 54850}, {"loss": 0.5851, "grad_norm": 0.933674693107605, "learning_rate": 0.0002, "epoch": 3.93967684021544, "step": 54860}, {"loss": 0.6931, "grad_norm": 0.835593044757843, "learning_rate": 0.0002, "epoch": 3.940394973070018, "step": 54870}, {"loss": 0.6967, "grad_norm": 1.0034281015396118, "learning_rate": 0.0002, "epoch": 3.941113105924596, "step": 54880}, {"loss": 0.6442, "grad_norm": 0.9732975959777832, "learning_rate": 0.0002, "epoch": 3.9418312387791743, "step": 54890}, {"loss": 0.6657, "grad_norm": 0.9666336178779602, "learning_rate": 0.0002, "epoch": 3.9425493716337523, "step": 54900}, {"loss": 0.6521, "grad_norm": 0.755310595035553, "learning_rate": 0.0002, "epoch": 3.9432675044883303, "step": 54910}, {"loss": 0.6562, "grad_norm": 0.8732092976570129, "learning_rate": 0.0002, "epoch": 3.9439856373429083, "step": 54920}, {"loss": 0.6486, "grad_norm": 1.139453649520874, "learning_rate": 0.0002, "epoch": 3.9447037701974867, "step": 54930}, {"loss": 0.6609, "grad_norm": 0.9044837951660156, "learning_rate": 0.0002, "epoch": 3.9454219030520647, "step": 54940}, {"loss": 0.6344, "grad_norm": 1.0496679544448853, "learning_rate": 0.0002, "epoch": 3.9461400359066428, "step": 54950}, {"loss": 0.6471, "grad_norm": 1.0099035501480103, "learning_rate": 0.0002, "epoch": 3.9468581687612208, "step": 54960}, {"loss": 0.6143, "grad_norm": 1.0694963932037354, "learning_rate": 0.0002, "epoch": 3.9475763016157988, "step": 54970}, {"loss": 0.6209, "grad_norm": 1.0012997388839722, "learning_rate": 0.0002, "epoch": 3.9482944344703768, "step": 54980}, {"loss": 0.7379, "grad_norm": 0.8910513520240784, "learning_rate": 0.0002, "epoch": 3.949012567324955, "step": 54990}, {"loss": 0.7184, "grad_norm": 1.0267579555511475, "learning_rate": 0.0002, "epoch": 3.949730700179533, "step": 55000}, {"loss": 0.6844, "grad_norm": 0.9786432385444641, "learning_rate": 0.0002, "epoch": 3.950448833034111, "step": 55010}, {"loss": 0.6499, "grad_norm": 0.8703538775444031, "learning_rate": 0.0002, "epoch": 3.9511669658886897, "step": 55020}, {"loss": 0.5989, "grad_norm": 0.8970484137535095, "learning_rate": 0.0002, "epoch": 3.9518850987432677, "step": 55030}, {"loss": 0.659, "grad_norm": 0.8781577944755554, "learning_rate": 0.0002, "epoch": 3.9526032315978457, "step": 55040}, {"loss": 0.6944, "grad_norm": 0.8040280938148499, "learning_rate": 0.0002, "epoch": 3.9533213644524237, "step": 55050}, {"loss": 0.6359, "grad_norm": 0.851926326751709, "learning_rate": 0.0002, "epoch": 3.9540394973070017, "step": 55060}, {"loss": 0.6806, "grad_norm": 0.8597240447998047, "learning_rate": 0.0002, "epoch": 3.9547576301615797, "step": 55070}, {"loss": 0.6499, "grad_norm": 0.9461944699287415, "learning_rate": 0.0002, "epoch": 3.955475763016158, "step": 55080}, {"loss": 0.6222, "grad_norm": 0.7576611042022705, "learning_rate": 0.0002, "epoch": 3.956193895870736, "step": 55090}, {"loss": 0.6735, "grad_norm": 0.9484710693359375, "learning_rate": 0.0002, "epoch": 3.956912028725314, "step": 55100}, {"loss": 0.6586, "grad_norm": 0.9487117528915405, "learning_rate": 0.0002, "epoch": 3.957630161579892, "step": 55110}, {"loss": 0.6632, "grad_norm": 0.870090663433075, "learning_rate": 0.0002, "epoch": 3.9583482944344706, "step": 55120}, {"loss": 0.6786, "grad_norm": 0.8496458530426025, "learning_rate": 0.0002, "epoch": 3.9590664272890486, "step": 55130}, {"loss": 0.6631, "grad_norm": 1.0121779441833496, "learning_rate": 0.0002, "epoch": 3.9597845601436266, "step": 55140}, {"loss": 0.7005, "grad_norm": 0.8912323713302612, "learning_rate": 0.0002, "epoch": 3.9605026929982046, "step": 55150}, {"loss": 0.6398, "grad_norm": 0.8398444652557373, "learning_rate": 0.0002, "epoch": 3.9612208258527826, "step": 55160}, {"loss": 0.6183, "grad_norm": 0.8046348690986633, "learning_rate": 0.0002, "epoch": 3.961938958707361, "step": 55170}, {"loss": 0.6357, "grad_norm": 1.0369254350662231, "learning_rate": 0.0002, "epoch": 3.962657091561939, "step": 55180}, {"loss": 0.6053, "grad_norm": 1.172431230545044, "learning_rate": 0.0002, "epoch": 3.963375224416517, "step": 55190}, {"loss": 0.643, "grad_norm": 0.8093554377555847, "learning_rate": 0.0002, "epoch": 3.964093357271095, "step": 55200}, {"loss": 0.6416, "grad_norm": 0.8851078748703003, "learning_rate": 0.0002, "epoch": 3.9648114901256735, "step": 55210}, {"loss": 0.6516, "grad_norm": 0.7494266033172607, "learning_rate": 0.0002, "epoch": 3.9655296229802515, "step": 55220}, {"loss": 0.629, "grad_norm": 0.9556898474693298, "learning_rate": 0.0002, "epoch": 3.9662477558348295, "step": 55230}, {"loss": 0.6481, "grad_norm": 1.016017198562622, "learning_rate": 0.0002, "epoch": 3.9669658886894075, "step": 55240}, {"loss": 0.7185, "grad_norm": 0.8425998091697693, "learning_rate": 0.0002, "epoch": 3.9676840215439855, "step": 55250}, {"loss": 0.6609, "grad_norm": 0.717673122882843, "learning_rate": 0.0002, "epoch": 3.9684021543985635, "step": 55260}, {"loss": 0.6453, "grad_norm": 0.8366572856903076, "learning_rate": 0.0002, "epoch": 3.969120287253142, "step": 55270}, {"loss": 0.6841, "grad_norm": 0.8981583118438721, "learning_rate": 0.0002, "epoch": 3.96983842010772, "step": 55280}, {"loss": 0.6351, "grad_norm": 0.8868781328201294, "learning_rate": 0.0002, "epoch": 3.970556552962298, "step": 55290}, {"loss": 0.6755, "grad_norm": 1.0632785558700562, "learning_rate": 0.0002, "epoch": 3.9712746858168764, "step": 55300}, {"loss": 0.6433, "grad_norm": 0.8813109993934631, "learning_rate": 0.0002, "epoch": 3.9719928186714544, "step": 55310}, {"loss": 0.5699, "grad_norm": 0.8225542306900024, "learning_rate": 0.0002, "epoch": 3.9727109515260324, "step": 55320}, {"loss": 0.6591, "grad_norm": 1.1391420364379883, "learning_rate": 0.0002, "epoch": 3.9734290843806104, "step": 55330}, {"loss": 0.6551, "grad_norm": 1.0371832847595215, "learning_rate": 0.0002, "epoch": 3.9741472172351884, "step": 55340}, {"loss": 0.7538, "grad_norm": 1.0542186498641968, "learning_rate": 0.0002, "epoch": 3.9748653500897664, "step": 55350}, {"loss": 0.6799, "grad_norm": 1.0178009271621704, "learning_rate": 0.0002, "epoch": 3.975583482944345, "step": 55360}, {"loss": 0.6394, "grad_norm": 0.7927802205085754, "learning_rate": 0.0002, "epoch": 3.976301615798923, "step": 55370}, {"loss": 0.6632, "grad_norm": 0.9350495934486389, "learning_rate": 0.0002, "epoch": 3.977019748653501, "step": 55380}, {"loss": 0.6889, "grad_norm": 1.0240116119384766, "learning_rate": 0.0002, "epoch": 3.977737881508079, "step": 55390}, {"loss": 0.6756, "grad_norm": 1.0279067754745483, "learning_rate": 0.0002, "epoch": 3.9784560143626573, "step": 55400}, {"loss": 0.6979, "grad_norm": 1.1228227615356445, "learning_rate": 0.0002, "epoch": 3.9791741472172353, "step": 55410}, {"loss": 0.6595, "grad_norm": 0.9500134587287903, "learning_rate": 0.0002, "epoch": 3.9798922800718133, "step": 55420}, {"loss": 0.6875, "grad_norm": 0.9229732155799866, "learning_rate": 0.0002, "epoch": 3.9806104129263913, "step": 55430}, {"loss": 0.6742, "grad_norm": 0.7946729063987732, "learning_rate": 0.0002, "epoch": 3.9813285457809693, "step": 55440}, {"loss": 0.6643, "grad_norm": 0.9987489581108093, "learning_rate": 0.0002, "epoch": 3.9820466786355477, "step": 55450}, {"loss": 0.6642, "grad_norm": 0.9670467972755432, "learning_rate": 0.0002, "epoch": 3.9827648114901257, "step": 55460}, {"loss": 0.6603, "grad_norm": 0.835028350353241, "learning_rate": 0.0002, "epoch": 3.9834829443447037, "step": 55470}, {"loss": 0.6198, "grad_norm": 0.8678702712059021, "learning_rate": 0.0002, "epoch": 3.9842010771992817, "step": 55480}, {"loss": 0.6581, "grad_norm": 0.8581197261810303, "learning_rate": 0.0002, "epoch": 3.98491921005386, "step": 55490}, {"loss": 0.614, "grad_norm": 0.779848039150238, "learning_rate": 0.0002, "epoch": 3.985637342908438, "step": 55500}, {"loss": 0.634, "grad_norm": 0.8827589154243469, "learning_rate": 0.0002, "epoch": 3.986355475763016, "step": 55510}, {"loss": 0.624, "grad_norm": 1.0108301639556885, "learning_rate": 0.0002, "epoch": 3.987073608617594, "step": 55520}, {"loss": 0.6553, "grad_norm": 0.8506004214286804, "learning_rate": 0.0002, "epoch": 3.987791741472172, "step": 55530}, {"loss": 0.6229, "grad_norm": 1.0297727584838867, "learning_rate": 0.0002, "epoch": 3.98850987432675, "step": 55540}, {"loss": 0.6551, "grad_norm": 0.8579224944114685, "learning_rate": 0.0002, "epoch": 3.9892280071813286, "step": 55550}, {"loss": 0.6491, "grad_norm": 0.8503788113594055, "learning_rate": 0.0002, "epoch": 3.9899461400359066, "step": 55560}, {"loss": 0.6941, "grad_norm": 1.1144801378250122, "learning_rate": 0.0002, "epoch": 3.9906642728904846, "step": 55570}, {"loss": 0.6956, "grad_norm": 0.8418305516242981, "learning_rate": 0.0002, "epoch": 3.991382405745063, "step": 55580}, {"loss": 0.6226, "grad_norm": 1.0065871477127075, "learning_rate": 0.0002, "epoch": 3.992100538599641, "step": 55590}, {"loss": 0.6775, "grad_norm": 0.8160259127616882, "learning_rate": 0.0002, "epoch": 3.992818671454219, "step": 55600}, {"loss": 0.624, "grad_norm": 0.8678009510040283, "learning_rate": 0.0002, "epoch": 3.993536804308797, "step": 55610}, {"loss": 0.6552, "grad_norm": 0.863465428352356, "learning_rate": 0.0002, "epoch": 3.994254937163375, "step": 55620}, {"loss": 0.6764, "grad_norm": 0.9242135286331177, "learning_rate": 0.0002, "epoch": 3.994973070017953, "step": 55630}, {"loss": 0.6774, "grad_norm": 1.0285470485687256, "learning_rate": 0.0002, "epoch": 3.9956912028725315, "step": 55640}, {"loss": 0.6882, "grad_norm": 0.8953320384025574, "learning_rate": 0.0002, "epoch": 3.9964093357271095, "step": 55650}, {"loss": 0.6935, "grad_norm": 0.915892481803894, "learning_rate": 0.0002, "epoch": 3.9971274685816875, "step": 55660}, {"loss": 0.641, "grad_norm": 0.8235118985176086, "learning_rate": 0.0002, "epoch": 3.9978456014362656, "step": 55670}, {"loss": 0.6417, "grad_norm": 1.0178656578063965, "learning_rate": 0.0002, "epoch": 3.998563734290844, "step": 55680}, {"loss": 0.6635, "grad_norm": 0.9926803708076477, "learning_rate": 0.0002, "epoch": 3.999281867145422, "step": 55690}, {"loss": 0.6476, "grad_norm": 0.9213629961013794, "learning_rate": 0.0002, "epoch": 4.0, "step": 55700}, {"eval_loss": 1.1152480840682983, "eval_runtime": 55.2237, "eval_samples_per_second": 13.273, "eval_steps_per_second": 1.666, "epoch": 4.0, "step": 55700}, {"loss": 0.6085, "grad_norm": 1.0820496082305908, "learning_rate": 0.0002, "epoch": 4.000718132854578, "step": 55710}, {"loss": 0.5506, "grad_norm": 0.9036441445350647, "learning_rate": 0.0002, "epoch": 4.001436265709156, "step": 55720}, {"loss": 0.5924, "grad_norm": 1.102754831314087, "learning_rate": 0.0002, "epoch": 4.002154398563734, "step": 55730}, {"loss": 0.6192, "grad_norm": 0.98259437084198, "learning_rate": 0.0002, "epoch": 4.002872531418312, "step": 55740}, {"loss": 0.567, "grad_norm": 1.1935845613479614, "learning_rate": 0.0002, "epoch": 4.003590664272891, "step": 55750}, {"loss": 0.6205, "grad_norm": 0.9925830960273743, "learning_rate": 0.0002, "epoch": 4.004308797127469, "step": 55760}, {"loss": 0.5545, "grad_norm": 1.075087070465088, "learning_rate": 0.0002, "epoch": 4.005026929982047, "step": 55770}, {"loss": 0.5591, "grad_norm": 0.8746396899223328, "learning_rate": 0.0002, "epoch": 4.005745062836625, "step": 55780}, {"loss": 0.5745, "grad_norm": 0.7635995745658875, "learning_rate": 0.0002, "epoch": 4.006463195691203, "step": 55790}, {"loss": 0.599, "grad_norm": 0.9064885377883911, "learning_rate": 0.0002, "epoch": 4.007181328545781, "step": 55800}, {"loss": 0.5668, "grad_norm": 1.018478274345398, "learning_rate": 0.0002, "epoch": 4.007899461400359, "step": 55810}, {"loss": 0.5573, "grad_norm": 0.9797589778900146, "learning_rate": 0.0002, "epoch": 4.008617594254937, "step": 55820}, {"loss": 0.5784, "grad_norm": 0.7867457866668701, "learning_rate": 0.0002, "epoch": 4.009335727109515, "step": 55830}, {"loss": 0.5607, "grad_norm": 0.9998070597648621, "learning_rate": 0.0002, "epoch": 4.010053859964093, "step": 55840}, {"loss": 0.5655, "grad_norm": 0.8656311631202698, "learning_rate": 0.0002, "epoch": 4.010771992818672, "step": 55850}, {"loss": 0.533, "grad_norm": 0.945469081401825, "learning_rate": 0.0002, "epoch": 4.01149012567325, "step": 55860}, {"loss": 0.625, "grad_norm": 0.8809926509857178, "learning_rate": 0.0002, "epoch": 4.012208258527828, "step": 55870}, {"loss": 0.5795, "grad_norm": 0.8047897219657898, "learning_rate": 0.0002, "epoch": 4.012926391382406, "step": 55880}, {"loss": 0.5322, "grad_norm": 1.0563900470733643, "learning_rate": 0.0002, "epoch": 4.013644524236984, "step": 55890}, {"loss": 0.5597, "grad_norm": 0.8578300476074219, "learning_rate": 0.0002, "epoch": 4.014362657091562, "step": 55900}, {"loss": 0.5634, "grad_norm": 1.0304765701293945, "learning_rate": 0.0002, "epoch": 4.01508078994614, "step": 55910}, {"loss": 0.558, "grad_norm": 0.8087666034698486, "learning_rate": 0.0002, "epoch": 4.015798922800718, "step": 55920}, {"loss": 0.5557, "grad_norm": 1.0192348957061768, "learning_rate": 0.0002, "epoch": 4.016517055655296, "step": 55930}, {"loss": 0.6269, "grad_norm": 1.061194658279419, "learning_rate": 0.0002, "epoch": 4.017235188509875, "step": 55940}, {"loss": 0.5812, "grad_norm": 0.93668133020401, "learning_rate": 0.0002, "epoch": 4.017953321364453, "step": 55950}, {"loss": 0.6104, "grad_norm": 1.1569286584854126, "learning_rate": 0.0002, "epoch": 4.018671454219031, "step": 55960}, {"loss": 0.5832, "grad_norm": 0.9853817224502563, "learning_rate": 0.0002, "epoch": 4.019389587073609, "step": 55970}, {"loss": 0.6154, "grad_norm": 0.851109504699707, "learning_rate": 0.0002, "epoch": 4.020107719928187, "step": 55980}, {"loss": 0.5993, "grad_norm": 1.053525447845459, "learning_rate": 0.0002, "epoch": 4.020825852782765, "step": 55990}, {"loss": 0.571, "grad_norm": 0.8307225704193115, "learning_rate": 0.0002, "epoch": 4.021543985637343, "step": 56000}, {"loss": 0.5419, "grad_norm": 1.2741150856018066, "learning_rate": 0.0002, "epoch": 4.022262118491921, "step": 56010}, {"loss": 0.6001, "grad_norm": 0.9708344340324402, "learning_rate": 0.0002, "epoch": 4.022980251346499, "step": 56020}, {"loss": 0.5989, "grad_norm": 1.265034556388855, "learning_rate": 0.0002, "epoch": 4.023698384201078, "step": 56030}, {"loss": 0.5852, "grad_norm": 0.9364367723464966, "learning_rate": 0.0002, "epoch": 4.024416517055656, "step": 56040}, {"loss": 0.6108, "grad_norm": 0.8643592000007629, "learning_rate": 0.0002, "epoch": 4.025134649910234, "step": 56050}, {"loss": 0.6074, "grad_norm": 0.9742133021354675, "learning_rate": 0.0002, "epoch": 4.025852782764812, "step": 56060}, {"loss": 0.5699, "grad_norm": 1.1793473958969116, "learning_rate": 0.0002, "epoch": 4.02657091561939, "step": 56070}, {"loss": 0.5911, "grad_norm": 0.9641149044036865, "learning_rate": 0.0002, "epoch": 4.027289048473968, "step": 56080}, {"loss": 0.6083, "grad_norm": 0.9426136016845703, "learning_rate": 0.0002, "epoch": 4.028007181328546, "step": 56090}, {"loss": 0.5692, "grad_norm": 0.9211869835853577, "learning_rate": 0.0002, "epoch": 4.028725314183124, "step": 56100}, {"loss": 0.6109, "grad_norm": 1.1576565504074097, "learning_rate": 0.0002, "epoch": 4.029443447037702, "step": 56110}, {"loss": 0.5684, "grad_norm": 1.0014013051986694, "learning_rate": 0.0002, "epoch": 4.03016157989228, "step": 56120}, {"loss": 0.6017, "grad_norm": 0.9307010769844055, "learning_rate": 0.0002, "epoch": 4.0308797127468585, "step": 56130}, {"loss": 0.5582, "grad_norm": 0.8290148377418518, "learning_rate": 0.0002, "epoch": 4.0315978456014365, "step": 56140}, {"loss": 0.5921, "grad_norm": 1.0648446083068848, "learning_rate": 0.0002, "epoch": 4.0323159784560145, "step": 56150}, {"loss": 0.6116, "grad_norm": 1.1545547246932983, "learning_rate": 0.0002, "epoch": 4.0330341113105925, "step": 56160}, {"loss": 0.6301, "grad_norm": 0.9643545150756836, "learning_rate": 0.0002, "epoch": 4.0337522441651705, "step": 56170}, {"loss": 0.5655, "grad_norm": 0.8913900256156921, "learning_rate": 0.0002, "epoch": 4.0344703770197485, "step": 56180}, {"loss": 0.5897, "grad_norm": 0.9445754289627075, "learning_rate": 0.0002, "epoch": 4.0351885098743265, "step": 56190}, {"loss": 0.6204, "grad_norm": 0.9353124499320984, "learning_rate": 0.0002, "epoch": 4.0359066427289045, "step": 56200}, {"loss": 0.6017, "grad_norm": 1.1780431270599365, "learning_rate": 0.0002, "epoch": 4.0366247755834825, "step": 56210}, {"loss": 0.5767, "grad_norm": 0.9208880662918091, "learning_rate": 0.0002, "epoch": 4.037342908438061, "step": 56220}, {"loss": 0.5367, "grad_norm": 0.9475517272949219, "learning_rate": 0.0002, "epoch": 4.038061041292639, "step": 56230}, {"loss": 0.576, "grad_norm": 0.7478583455085754, "learning_rate": 0.0002, "epoch": 4.038779174147217, "step": 56240}, {"loss": 0.5616, "grad_norm": 1.0026403665542603, "learning_rate": 0.0002, "epoch": 4.039497307001795, "step": 56250}, {"loss": 0.6031, "grad_norm": 0.9664973020553589, "learning_rate": 0.0002, "epoch": 4.040215439856373, "step": 56260}, {"loss": 0.5764, "grad_norm": 1.0655616521835327, "learning_rate": 0.0002, "epoch": 4.040933572710951, "step": 56270}, {"loss": 0.5862, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.041651705565529, "step": 56280}, {"loss": 0.5828, "grad_norm": 0.7982191443443298, "learning_rate": 0.0002, "epoch": 4.042369838420107, "step": 56290}, {"loss": 0.5637, "grad_norm": 0.8304495215415955, "learning_rate": 0.0002, "epoch": 4.043087971274685, "step": 56300}, {"loss": 0.5974, "grad_norm": 0.95123291015625, "learning_rate": 0.0002, "epoch": 4.043806104129264, "step": 56310}, {"loss": 0.617, "grad_norm": 0.9504102468490601, "learning_rate": 0.0002, "epoch": 4.044524236983842, "step": 56320}, {"loss": 0.6143, "grad_norm": 0.7432710528373718, "learning_rate": 0.0002, "epoch": 4.04524236983842, "step": 56330}, {"loss": 0.6157, "grad_norm": 0.9327874183654785, "learning_rate": 0.0002, "epoch": 4.045960502692998, "step": 56340}, {"loss": 0.591, "grad_norm": 0.9161670804023743, "learning_rate": 0.0002, "epoch": 4.046678635547576, "step": 56350}, {"loss": 0.6111, "grad_norm": 0.9371771812438965, "learning_rate": 0.0002, "epoch": 4.047396768402154, "step": 56360}, {"loss": 0.6101, "grad_norm": 1.0332437753677368, "learning_rate": 0.0002, "epoch": 4.048114901256732, "step": 56370}, {"loss": 0.5451, "grad_norm": 0.7346320748329163, "learning_rate": 0.0002, "epoch": 4.04883303411131, "step": 56380}, {"loss": 0.6416, "grad_norm": 0.8247857689857483, "learning_rate": 0.0002, "epoch": 4.049551166965888, "step": 56390}, {"loss": 0.6208, "grad_norm": 0.925325334072113, "learning_rate": 0.0002, "epoch": 4.050269299820466, "step": 56400}, {"loss": 0.558, "grad_norm": 0.7344088554382324, "learning_rate": 0.0002, "epoch": 4.050987432675045, "step": 56410}, {"loss": 0.5978, "grad_norm": 0.9204918146133423, "learning_rate": 0.0002, "epoch": 4.051705565529623, "step": 56420}, {"loss": 0.5788, "grad_norm": 0.8273472785949707, "learning_rate": 0.0002, "epoch": 4.052423698384201, "step": 56430}, {"loss": 0.5551, "grad_norm": 0.9524998068809509, "learning_rate": 0.0002, "epoch": 4.053141831238779, "step": 56440}, {"loss": 0.5836, "grad_norm": 0.9168205857276917, "learning_rate": 0.0002, "epoch": 4.053859964093357, "step": 56450}, {"loss": 0.6035, "grad_norm": 0.9634994864463806, "learning_rate": 0.0002, "epoch": 4.054578096947935, "step": 56460}, {"loss": 0.5907, "grad_norm": 1.2027593851089478, "learning_rate": 0.0002, "epoch": 4.055296229802513, "step": 56470}, {"loss": 0.5691, "grad_norm": 1.2347805500030518, "learning_rate": 0.0002, "epoch": 4.056014362657091, "step": 56480}, {"loss": 0.5789, "grad_norm": 0.8621458411216736, "learning_rate": 0.0002, "epoch": 4.056732495511669, "step": 56490}, {"loss": 0.6082, "grad_norm": 0.9194608330726624, "learning_rate": 0.0002, "epoch": 4.057450628366248, "step": 56500}, {"loss": 0.5667, "grad_norm": 1.0153663158416748, "learning_rate": 0.0002, "epoch": 4.058168761220826, "step": 56510}, {"loss": 0.5908, "grad_norm": 0.9170986413955688, "learning_rate": 0.0002, "epoch": 4.058886894075404, "step": 56520}, {"loss": 0.5672, "grad_norm": 1.033057689666748, "learning_rate": 0.0002, "epoch": 4.059605026929982, "step": 56530}, {"loss": 0.5577, "grad_norm": 1.0125197172164917, "learning_rate": 0.0002, "epoch": 4.06032315978456, "step": 56540}, {"loss": 0.5821, "grad_norm": 0.9429898262023926, "learning_rate": 0.0002, "epoch": 4.061041292639138, "step": 56550}, {"loss": 0.5655, "grad_norm": 0.9242179989814758, "learning_rate": 0.0002, "epoch": 4.061759425493716, "step": 56560}, {"loss": 0.5568, "grad_norm": 0.9365091323852539, "learning_rate": 0.0002, "epoch": 4.062477558348294, "step": 56570}, {"loss": 0.6104, "grad_norm": 0.9148455858230591, "learning_rate": 0.0002, "epoch": 4.063195691202872, "step": 56580}, {"loss": 0.5891, "grad_norm": 0.8546709418296814, "learning_rate": 0.0002, "epoch": 4.063913824057451, "step": 56590}, {"loss": 0.6079, "grad_norm": 0.9743902087211609, "learning_rate": 0.0002, "epoch": 4.064631956912029, "step": 56600}, {"loss": 0.6109, "grad_norm": 1.0599974393844604, "learning_rate": 0.0002, "epoch": 4.065350089766607, "step": 56610}, {"loss": 0.5746, "grad_norm": 0.9677841067314148, "learning_rate": 0.0002, "epoch": 4.066068222621185, "step": 56620}, {"loss": 0.5957, "grad_norm": 0.8892754316329956, "learning_rate": 0.0002, "epoch": 4.066786355475763, "step": 56630}, {"loss": 0.5899, "grad_norm": 0.8837814331054688, "learning_rate": 0.0002, "epoch": 4.067504488330341, "step": 56640}, {"loss": 0.5784, "grad_norm": 0.9284095764160156, "learning_rate": 0.0002, "epoch": 4.068222621184919, "step": 56650}, {"loss": 0.5829, "grad_norm": 1.0163567066192627, "learning_rate": 0.0002, "epoch": 4.068940754039497, "step": 56660}, {"loss": 0.5349, "grad_norm": 0.8713456988334656, "learning_rate": 0.0002, "epoch": 4.069658886894075, "step": 56670}, {"loss": 0.5345, "grad_norm": 0.8356686234474182, "learning_rate": 0.0002, "epoch": 4.070377019748653, "step": 56680}, {"loss": 0.5473, "grad_norm": 0.8998766541481018, "learning_rate": 0.0002, "epoch": 4.071095152603232, "step": 56690}, {"loss": 0.5896, "grad_norm": 1.0441967248916626, "learning_rate": 0.0002, "epoch": 4.07181328545781, "step": 56700}, {"loss": 0.5817, "grad_norm": 0.9313125610351562, "learning_rate": 0.0002, "epoch": 4.072531418312388, "step": 56710}, {"loss": 0.5477, "grad_norm": 0.9912964701652527, "learning_rate": 0.0002, "epoch": 4.073249551166966, "step": 56720}, {"loss": 0.5974, "grad_norm": 0.9048459529876709, "learning_rate": 0.0002, "epoch": 4.073967684021544, "step": 56730}, {"loss": 0.5927, "grad_norm": 1.0248944759368896, "learning_rate": 0.0002, "epoch": 4.074685816876122, "step": 56740}, {"loss": 0.6019, "grad_norm": 1.4526786804199219, "learning_rate": 0.0002, "epoch": 4.0754039497307, "step": 56750}, {"loss": 0.6267, "grad_norm": 0.9813178181648254, "learning_rate": 0.0002, "epoch": 4.076122082585278, "step": 56760}, {"loss": 0.5707, "grad_norm": 1.0686813592910767, "learning_rate": 0.0002, "epoch": 4.076840215439856, "step": 56770}, {"loss": 0.5857, "grad_norm": 1.1093482971191406, "learning_rate": 0.0002, "epoch": 4.077558348294435, "step": 56780}, {"loss": 0.5768, "grad_norm": 0.9377819895744324, "learning_rate": 0.0002, "epoch": 4.078276481149013, "step": 56790}, {"loss": 0.6342, "grad_norm": 0.8043649196624756, "learning_rate": 0.0002, "epoch": 4.078994614003591, "step": 56800}, {"loss": 0.6005, "grad_norm": 0.7995415925979614, "learning_rate": 0.0002, "epoch": 4.079712746858169, "step": 56810}, {"loss": 0.5466, "grad_norm": 1.0076148509979248, "learning_rate": 0.0002, "epoch": 4.080430879712747, "step": 56820}, {"loss": 0.6021, "grad_norm": 0.8192076683044434, "learning_rate": 0.0002, "epoch": 4.081149012567325, "step": 56830}, {"loss": 0.5439, "grad_norm": 0.9226266145706177, "learning_rate": 0.0002, "epoch": 4.081867145421903, "step": 56840}, {"loss": 0.5893, "grad_norm": 0.8877972960472107, "learning_rate": 0.0002, "epoch": 4.082585278276481, "step": 56850}, {"loss": 0.5774, "grad_norm": 0.9578937888145447, "learning_rate": 0.0002, "epoch": 4.083303411131059, "step": 56860}, {"loss": 0.5946, "grad_norm": 0.8929167985916138, "learning_rate": 0.0002, "epoch": 4.084021543985638, "step": 56870}, {"loss": 0.5226, "grad_norm": 1.0015977621078491, "learning_rate": 0.0002, "epoch": 4.084739676840216, "step": 56880}, {"loss": 0.5931, "grad_norm": 0.9768750667572021, "learning_rate": 0.0002, "epoch": 4.085457809694794, "step": 56890}, {"loss": 0.5983, "grad_norm": 1.0834569931030273, "learning_rate": 0.0002, "epoch": 4.086175942549372, "step": 56900}, {"loss": 0.5786, "grad_norm": 0.8761230707168579, "learning_rate": 0.0002, "epoch": 4.08689407540395, "step": 56910}, {"loss": 0.5708, "grad_norm": 1.027064323425293, "learning_rate": 0.0002, "epoch": 4.087612208258528, "step": 56920}, {"loss": 0.601, "grad_norm": 1.130336880683899, "learning_rate": 0.0002, "epoch": 4.088330341113106, "step": 56930}, {"loss": 0.5664, "grad_norm": 0.8157579898834229, "learning_rate": 0.0002, "epoch": 4.089048473967684, "step": 56940}, {"loss": 0.5789, "grad_norm": 1.071175217628479, "learning_rate": 0.0002, "epoch": 4.089766606822262, "step": 56950}, {"loss": 0.5942, "grad_norm": 0.9534492492675781, "learning_rate": 0.0002, "epoch": 4.09048473967684, "step": 56960}, {"loss": 0.5803, "grad_norm": 0.9584037661552429, "learning_rate": 0.0002, "epoch": 4.091202872531419, "step": 56970}, {"loss": 0.5647, "grad_norm": 1.1513131856918335, "learning_rate": 0.0002, "epoch": 4.091921005385997, "step": 56980}, {"loss": 0.5971, "grad_norm": 1.0167666673660278, "learning_rate": 0.0002, "epoch": 4.092639138240575, "step": 56990}, {"loss": 0.5981, "grad_norm": 1.0630987882614136, "learning_rate": 0.0002, "epoch": 4.093357271095153, "step": 57000}, {"loss": 0.5734, "grad_norm": 1.0326893329620361, "learning_rate": 0.0002, "epoch": 4.094075403949731, "step": 57010}, {"loss": 0.572, "grad_norm": 0.9701678156852722, "learning_rate": 0.0002, "epoch": 4.094793536804309, "step": 57020}, {"loss": 0.5815, "grad_norm": 0.839935302734375, "learning_rate": 0.0002, "epoch": 4.095511669658887, "step": 57030}, {"loss": 0.6051, "grad_norm": 0.8995838761329651, "learning_rate": 0.0002, "epoch": 4.096229802513465, "step": 57040}, {"loss": 0.6037, "grad_norm": 0.8039916157722473, "learning_rate": 0.0002, "epoch": 4.096947935368043, "step": 57050}, {"loss": 0.5597, "grad_norm": 1.126122236251831, "learning_rate": 0.0002, "epoch": 4.097666068222622, "step": 57060}, {"loss": 0.5943, "grad_norm": 0.8749837875366211, "learning_rate": 0.0002, "epoch": 4.0983842010772, "step": 57070}, {"loss": 0.6017, "grad_norm": 0.8630341291427612, "learning_rate": 0.0002, "epoch": 4.099102333931778, "step": 57080}, {"loss": 0.6083, "grad_norm": 0.8889496922492981, "learning_rate": 0.0002, "epoch": 4.099820466786356, "step": 57090}, {"loss": 0.5727, "grad_norm": 0.9050310254096985, "learning_rate": 0.0002, "epoch": 4.100538599640934, "step": 57100}, {"loss": 0.5824, "grad_norm": 0.943072497844696, "learning_rate": 0.0002, "epoch": 4.101256732495512, "step": 57110}, {"loss": 0.6036, "grad_norm": 0.9031552672386169, "learning_rate": 0.0002, "epoch": 4.10197486535009, "step": 57120}, {"loss": 0.5913, "grad_norm": 0.939862847328186, "learning_rate": 0.0002, "epoch": 4.102692998204668, "step": 57130}, {"loss": 0.5738, "grad_norm": 0.8080634474754333, "learning_rate": 0.0002, "epoch": 4.103411131059246, "step": 57140}, {"loss": 0.5841, "grad_norm": 0.9181693196296692, "learning_rate": 0.0002, "epoch": 4.1041292639138245, "step": 57150}, {"loss": 0.5561, "grad_norm": 0.9609217643737793, "learning_rate": 0.0002, "epoch": 4.1048473967684025, "step": 57160}, {"loss": 0.5572, "grad_norm": 1.1246516704559326, "learning_rate": 0.0002, "epoch": 4.1055655296229805, "step": 57170}, {"loss": 0.5886, "grad_norm": 1.0616880655288696, "learning_rate": 0.0002, "epoch": 4.1062836624775585, "step": 57180}, {"loss": 0.5579, "grad_norm": 0.9954505562782288, "learning_rate": 0.0002, "epoch": 4.1070017953321365, "step": 57190}, {"loss": 0.5899, "grad_norm": 1.0602279901504517, "learning_rate": 0.0002, "epoch": 4.1077199281867145, "step": 57200}, {"loss": 0.5747, "grad_norm": 0.8984764814376831, "learning_rate": 0.0002, "epoch": 4.1084380610412925, "step": 57210}, {"loss": 0.5502, "grad_norm": 0.845167875289917, "learning_rate": 0.0002, "epoch": 4.1091561938958705, "step": 57220}, {"loss": 0.6147, "grad_norm": 0.7901500463485718, "learning_rate": 0.0002, "epoch": 4.1098743267504485, "step": 57230}, {"loss": 0.5883, "grad_norm": 1.0462526082992554, "learning_rate": 0.0002, "epoch": 4.1105924596050265, "step": 57240}, {"loss": 0.6334, "grad_norm": 0.9098827838897705, "learning_rate": 0.0002, "epoch": 4.111310592459605, "step": 57250}, {"loss": 0.5794, "grad_norm": 0.9234077334403992, "learning_rate": 0.0002, "epoch": 4.112028725314183, "step": 57260}, {"loss": 0.623, "grad_norm": 1.0033560991287231, "learning_rate": 0.0002, "epoch": 4.112746858168761, "step": 57270}, {"loss": 0.5392, "grad_norm": 1.0620051622390747, "learning_rate": 0.0002, "epoch": 4.113464991023339, "step": 57280}, {"loss": 0.6144, "grad_norm": 0.8679345846176147, "learning_rate": 0.0002, "epoch": 4.114183123877917, "step": 57290}, {"loss": 0.5951, "grad_norm": 0.7557345628738403, "learning_rate": 0.0002, "epoch": 4.114901256732495, "step": 57300}, {"loss": 0.575, "grad_norm": 0.8970935344696045, "learning_rate": 0.0002, "epoch": 4.115619389587073, "step": 57310}, {"loss": 0.5595, "grad_norm": 1.0779842138290405, "learning_rate": 0.0002, "epoch": 4.116337522441651, "step": 57320}, {"loss": 0.5532, "grad_norm": 1.2036106586456299, "learning_rate": 0.0002, "epoch": 4.117055655296229, "step": 57330}, {"loss": 0.5959, "grad_norm": 0.8337953686714172, "learning_rate": 0.0002, "epoch": 4.117773788150808, "step": 57340}, {"loss": 0.6128, "grad_norm": 0.9850410223007202, "learning_rate": 0.0002, "epoch": 4.118491921005386, "step": 57350}, {"loss": 0.5676, "grad_norm": 0.8028770685195923, "learning_rate": 0.0002, "epoch": 4.119210053859964, "step": 57360}, {"loss": 0.5693, "grad_norm": 0.8693217039108276, "learning_rate": 0.0002, "epoch": 4.119928186714542, "step": 57370}, {"loss": 0.5897, "grad_norm": 0.8795534372329712, "learning_rate": 0.0002, "epoch": 4.12064631956912, "step": 57380}, {"loss": 0.5692, "grad_norm": 1.0081543922424316, "learning_rate": 0.0002, "epoch": 4.121364452423698, "step": 57390}, {"loss": 0.6027, "grad_norm": 0.8776742219924927, "learning_rate": 0.0002, "epoch": 4.122082585278276, "step": 57400}, {"loss": 0.6418, "grad_norm": 0.8247824311256409, "learning_rate": 0.0002, "epoch": 4.122800718132854, "step": 57410}, {"loss": 0.5537, "grad_norm": 1.1346335411071777, "learning_rate": 0.0002, "epoch": 4.123518850987432, "step": 57420}, {"loss": 0.5949, "grad_norm": 1.0671089887619019, "learning_rate": 0.0002, "epoch": 4.124236983842011, "step": 57430}, {"loss": 0.5908, "grad_norm": 0.8548333048820496, "learning_rate": 0.0002, "epoch": 4.124955116696589, "step": 57440}, {"loss": 0.5967, "grad_norm": 1.0221573114395142, "learning_rate": 0.0002, "epoch": 4.125673249551167, "step": 57450}, {"loss": 0.6238, "grad_norm": 0.9746617674827576, "learning_rate": 0.0002, "epoch": 4.126391382405745, "step": 57460}, {"loss": 0.5855, "grad_norm": 0.8104965090751648, "learning_rate": 0.0002, "epoch": 4.127109515260323, "step": 57470}, {"loss": 0.5724, "grad_norm": 1.0401487350463867, "learning_rate": 0.0002, "epoch": 4.127827648114901, "step": 57480}, {"loss": 0.5956, "grad_norm": 0.8828882575035095, "learning_rate": 0.0002, "epoch": 4.128545780969479, "step": 57490}, {"loss": 0.5851, "grad_norm": 1.0121098756790161, "learning_rate": 0.0002, "epoch": 4.129263913824057, "step": 57500}, {"loss": 0.5923, "grad_norm": 0.8789737820625305, "learning_rate": 0.0002, "epoch": 4.129982046678635, "step": 57510}, {"loss": 0.5929, "grad_norm": 1.0386744737625122, "learning_rate": 0.0002, "epoch": 4.130700179533213, "step": 57520}, {"loss": 0.6104, "grad_norm": 1.0092610120773315, "learning_rate": 0.0002, "epoch": 4.131418312387792, "step": 57530}, {"loss": 0.5974, "grad_norm": 0.8706282377243042, "learning_rate": 0.0002, "epoch": 4.13213644524237, "step": 57540}, {"loss": 0.5829, "grad_norm": 0.9270507097244263, "learning_rate": 0.0002, "epoch": 4.132854578096948, "step": 57550}, {"loss": 0.5826, "grad_norm": 1.0303068161010742, "learning_rate": 0.0002, "epoch": 4.133572710951526, "step": 57560}, {"loss": 0.5515, "grad_norm": 1.1169062852859497, "learning_rate": 0.0002, "epoch": 4.134290843806104, "step": 57570}, {"loss": 0.5848, "grad_norm": 0.8530599474906921, "learning_rate": 0.0002, "epoch": 4.135008976660682, "step": 57580}, {"loss": 0.6231, "grad_norm": 1.1395039558410645, "learning_rate": 0.0002, "epoch": 4.13572710951526, "step": 57590}, {"loss": 0.5739, "grad_norm": 0.8944115042686462, "learning_rate": 0.0002, "epoch": 4.136445242369838, "step": 57600}, {"loss": 0.6212, "grad_norm": 1.137966275215149, "learning_rate": 0.0002, "epoch": 4.137163375224416, "step": 57610}, {"loss": 0.6041, "grad_norm": 0.8244962692260742, "learning_rate": 0.0002, "epoch": 4.137881508078995, "step": 57620}, {"loss": 0.6078, "grad_norm": 1.1935817003250122, "learning_rate": 0.0002, "epoch": 4.138599640933573, "step": 57630}, {"loss": 0.5939, "grad_norm": 0.9774235486984253, "learning_rate": 0.0002, "epoch": 4.139317773788151, "step": 57640}, {"loss": 0.5963, "grad_norm": 1.066219449043274, "learning_rate": 0.0002, "epoch": 4.140035906642729, "step": 57650}, {"loss": 0.6008, "grad_norm": 0.8631396293640137, "learning_rate": 0.0002, "epoch": 4.140754039497307, "step": 57660}, {"loss": 0.5622, "grad_norm": 0.888410747051239, "learning_rate": 0.0002, "epoch": 4.141472172351885, "step": 57670}, {"loss": 0.5675, "grad_norm": 1.002642035484314, "learning_rate": 0.0002, "epoch": 4.142190305206463, "step": 57680}, {"loss": 0.5269, "grad_norm": 1.0092825889587402, "learning_rate": 0.0002, "epoch": 4.142908438061041, "step": 57690}, {"loss": 0.588, "grad_norm": 0.9126971364021301, "learning_rate": 0.0002, "epoch": 4.143626570915619, "step": 57700}, {"loss": 0.5593, "grad_norm": 1.0303562879562378, "learning_rate": 0.0002, "epoch": 4.144344703770198, "step": 57710}, {"loss": 0.6183, "grad_norm": 1.1230897903442383, "learning_rate": 0.0002, "epoch": 4.145062836624776, "step": 57720}, {"loss": 0.5934, "grad_norm": 1.0494099855422974, "learning_rate": 0.0002, "epoch": 4.145780969479354, "step": 57730}, {"loss": 0.6022, "grad_norm": 0.9555442333221436, "learning_rate": 0.0002, "epoch": 4.146499102333932, "step": 57740}, {"loss": 0.609, "grad_norm": 0.8255124092102051, "learning_rate": 0.0002, "epoch": 4.14721723518851, "step": 57750}, {"loss": 0.5659, "grad_norm": 1.097853660583496, "learning_rate": 0.0002, "epoch": 4.147935368043088, "step": 57760}, {"loss": 0.5698, "grad_norm": 1.0272663831710815, "learning_rate": 0.0002, "epoch": 4.148653500897666, "step": 57770}, {"loss": 0.5701, "grad_norm": 1.022571086883545, "learning_rate": 0.0002, "epoch": 4.149371633752244, "step": 57780}, {"loss": 0.579, "grad_norm": 0.964543342590332, "learning_rate": 0.0002, "epoch": 4.150089766606822, "step": 57790}, {"loss": 0.6175, "grad_norm": 0.9251219034194946, "learning_rate": 0.0002, "epoch": 4.1508078994614, "step": 57800}, {"loss": 0.564, "grad_norm": 1.081840991973877, "learning_rate": 0.0002, "epoch": 4.151526032315979, "step": 57810}, {"loss": 0.5956, "grad_norm": 0.8989445567131042, "learning_rate": 0.0002, "epoch": 4.152244165170557, "step": 57820}, {"loss": 0.5849, "grad_norm": 0.903629720211029, "learning_rate": 0.0002, "epoch": 4.152962298025135, "step": 57830}, {"loss": 0.6202, "grad_norm": 0.8985397219657898, "learning_rate": 0.0002, "epoch": 4.153680430879713, "step": 57840}, {"loss": 0.5629, "grad_norm": 1.047778844833374, "learning_rate": 0.0002, "epoch": 4.154398563734291, "step": 57850}, {"loss": 0.6045, "grad_norm": 0.9804165363311768, "learning_rate": 0.0002, "epoch": 4.155116696588869, "step": 57860}, {"loss": 0.5815, "grad_norm": 1.187309980392456, "learning_rate": 0.0002, "epoch": 4.155834829443447, "step": 57870}, {"loss": 0.6304, "grad_norm": 0.9854836463928223, "learning_rate": 0.0002, "epoch": 4.156552962298025, "step": 57880}, {"loss": 0.6076, "grad_norm": 0.8494308590888977, "learning_rate": 0.0002, "epoch": 4.157271095152603, "step": 57890}, {"loss": 0.6033, "grad_norm": 0.9359684586524963, "learning_rate": 0.0002, "epoch": 4.157989228007182, "step": 57900}, {"loss": 0.5546, "grad_norm": 0.8971988558769226, "learning_rate": 0.0002, "epoch": 4.15870736086176, "step": 57910}, {"loss": 0.5934, "grad_norm": 0.8848021030426025, "learning_rate": 0.0002, "epoch": 4.159425493716338, "step": 57920}, {"loss": 0.6102, "grad_norm": 0.982877790927887, "learning_rate": 0.0002, "epoch": 4.160143626570916, "step": 57930}, {"loss": 0.6091, "grad_norm": 0.8668819069862366, "learning_rate": 0.0002, "epoch": 4.160861759425494, "step": 57940}, {"loss": 0.5969, "grad_norm": 1.06569504737854, "learning_rate": 0.0002, "epoch": 4.161579892280072, "step": 57950}, {"loss": 0.5799, "grad_norm": 1.165740728378296, "learning_rate": 0.0002, "epoch": 4.16229802513465, "step": 57960}, {"loss": 0.6038, "grad_norm": 1.0534512996673584, "learning_rate": 0.0002, "epoch": 4.163016157989228, "step": 57970}, {"loss": 0.594, "grad_norm": 0.8785330653190613, "learning_rate": 0.0002, "epoch": 4.163734290843806, "step": 57980}, {"loss": 0.5981, "grad_norm": 1.1244874000549316, "learning_rate": 0.0002, "epoch": 4.164452423698384, "step": 57990}, {"loss": 0.6456, "grad_norm": 0.8839399218559265, "learning_rate": 0.0002, "epoch": 4.165170556552963, "step": 58000}, {"loss": 0.5767, "grad_norm": 1.0603798627853394, "learning_rate": 0.0002, "epoch": 4.165888689407541, "step": 58010}, {"loss": 0.6334, "grad_norm": 0.9737853407859802, "learning_rate": 0.0002, "epoch": 4.166606822262119, "step": 58020}, {"loss": 0.5901, "grad_norm": 1.0650558471679688, "learning_rate": 0.0002, "epoch": 4.167324955116697, "step": 58030}, {"loss": 0.6549, "grad_norm": 0.7528959512710571, "learning_rate": 0.0002, "epoch": 4.168043087971275, "step": 58040}, {"loss": 0.5593, "grad_norm": 0.9286156892776489, "learning_rate": 0.0002, "epoch": 4.168761220825853, "step": 58050}, {"loss": 0.6093, "grad_norm": 1.0225880146026611, "learning_rate": 0.0002, "epoch": 4.169479353680431, "step": 58060}, {"loss": 0.5993, "grad_norm": 0.9990654587745667, "learning_rate": 0.0002, "epoch": 4.170197486535009, "step": 58070}, {"loss": 0.6002, "grad_norm": 1.052057147026062, "learning_rate": 0.0002, "epoch": 4.170915619389587, "step": 58080}, {"loss": 0.5911, "grad_norm": 0.7366801500320435, "learning_rate": 0.0002, "epoch": 4.1716337522441655, "step": 58090}, {"loss": 0.6273, "grad_norm": 1.0943711996078491, "learning_rate": 0.0002, "epoch": 4.1723518850987436, "step": 58100}, {"loss": 0.6095, "grad_norm": 1.1297656297683716, "learning_rate": 0.0002, "epoch": 4.1730700179533216, "step": 58110}, {"loss": 0.6123, "grad_norm": 0.7861461639404297, "learning_rate": 0.0002, "epoch": 4.1737881508078996, "step": 58120}, {"loss": 0.6188, "grad_norm": 0.8643335103988647, "learning_rate": 0.0002, "epoch": 4.174506283662478, "step": 58130}, {"loss": 0.6103, "grad_norm": 0.957288384437561, "learning_rate": 0.0002, "epoch": 4.175224416517056, "step": 58140}, {"loss": 0.5636, "grad_norm": 0.9175366759300232, "learning_rate": 0.0002, "epoch": 4.175942549371634, "step": 58150}, {"loss": 0.6288, "grad_norm": 1.129935622215271, "learning_rate": 0.0002, "epoch": 4.176660682226212, "step": 58160}, {"loss": 0.5969, "grad_norm": 0.9683087468147278, "learning_rate": 0.0002, "epoch": 4.17737881508079, "step": 58170}, {"loss": 0.6249, "grad_norm": 1.045171856880188, "learning_rate": 0.0002, "epoch": 4.1780969479353685, "step": 58180}, {"loss": 0.5611, "grad_norm": 0.9858742952346802, "learning_rate": 0.0002, "epoch": 4.1788150807899465, "step": 58190}, {"loss": 0.5946, "grad_norm": 0.8513413071632385, "learning_rate": 0.0002, "epoch": 4.1795332136445245, "step": 58200}, {"loss": 0.5928, "grad_norm": 0.9584265947341919, "learning_rate": 0.0002, "epoch": 4.1802513464991025, "step": 58210}, {"loss": 0.5864, "grad_norm": 0.8828920722007751, "learning_rate": 0.0002, "epoch": 4.1809694793536805, "step": 58220}, {"loss": 0.5745, "grad_norm": 0.9849961400032043, "learning_rate": 0.0002, "epoch": 4.1816876122082585, "step": 58230}, {"loss": 0.5355, "grad_norm": 1.0601637363433838, "learning_rate": 0.0002, "epoch": 4.1824057450628365, "step": 58240}, {"loss": 0.6063, "grad_norm": 1.2206604480743408, "learning_rate": 0.0002, "epoch": 4.1831238779174145, "step": 58250}, {"loss": 0.6176, "grad_norm": 1.1768009662628174, "learning_rate": 0.0002, "epoch": 4.1838420107719925, "step": 58260}, {"loss": 0.5572, "grad_norm": 0.9521295428276062, "learning_rate": 0.0002, "epoch": 4.184560143626571, "step": 58270}, {"loss": 0.5978, "grad_norm": 0.892971932888031, "learning_rate": 0.0002, "epoch": 4.185278276481149, "step": 58280}, {"loss": 0.5727, "grad_norm": 0.8712016940116882, "learning_rate": 0.0002, "epoch": 4.185996409335727, "step": 58290}, {"loss": 0.6124, "grad_norm": 1.0190843343734741, "learning_rate": 0.0002, "epoch": 4.186714542190305, "step": 58300}, {"loss": 0.6324, "grad_norm": 1.0149270296096802, "learning_rate": 0.0002, "epoch": 4.187432675044883, "step": 58310}, {"loss": 0.6337, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 4.188150807899461, "step": 58320}, {"loss": 0.5588, "grad_norm": 0.7892335653305054, "learning_rate": 0.0002, "epoch": 4.188868940754039, "step": 58330}, {"loss": 0.6132, "grad_norm": 0.9792808890342712, "learning_rate": 0.0002, "epoch": 4.189587073608617, "step": 58340}, {"loss": 0.5841, "grad_norm": 0.9946883320808411, "learning_rate": 0.0002, "epoch": 4.190305206463195, "step": 58350}, {"loss": 0.6043, "grad_norm": 1.0363789796829224, "learning_rate": 0.0002, "epoch": 4.191023339317773, "step": 58360}, {"loss": 0.5843, "grad_norm": 0.9285917282104492, "learning_rate": 0.0002, "epoch": 4.191741472172352, "step": 58370}, {"loss": 0.6042, "grad_norm": 0.9461679458618164, "learning_rate": 0.0002, "epoch": 4.19245960502693, "step": 58380}, {"loss": 0.5666, "grad_norm": 1.0344175100326538, "learning_rate": 0.0002, "epoch": 4.193177737881508, "step": 58390}, {"loss": 0.6032, "grad_norm": 0.9530242085456848, "learning_rate": 0.0002, "epoch": 4.193895870736086, "step": 58400}, {"loss": 0.5887, "grad_norm": 0.9171900749206543, "learning_rate": 0.0002, "epoch": 4.194614003590664, "step": 58410}, {"loss": 0.6116, "grad_norm": 0.8094898462295532, "learning_rate": 0.0002, "epoch": 4.195332136445242, "step": 58420}, {"loss": 0.5268, "grad_norm": 0.921981930732727, "learning_rate": 0.0002, "epoch": 4.19605026929982, "step": 58430}, {"loss": 0.551, "grad_norm": 0.9783532023429871, "learning_rate": 0.0002, "epoch": 4.196768402154398, "step": 58440}, {"loss": 0.5774, "grad_norm": 1.017805576324463, "learning_rate": 0.0002, "epoch": 4.197486535008976, "step": 58450}, {"loss": 0.6261, "grad_norm": 0.9244308471679688, "learning_rate": 0.0002, "epoch": 4.198204667863555, "step": 58460}, {"loss": 0.6247, "grad_norm": 0.9942585229873657, "learning_rate": 0.0002, "epoch": 4.198922800718133, "step": 58470}, {"loss": 0.5803, "grad_norm": 1.1045037508010864, "learning_rate": 0.0002, "epoch": 4.199640933572711, "step": 58480}, {"loss": 0.5846, "grad_norm": 0.9483149647712708, "learning_rate": 0.0002, "epoch": 4.200359066427289, "step": 58490}, {"loss": 0.5997, "grad_norm": 1.0807271003723145, "learning_rate": 0.0002, "epoch": 4.201077199281867, "step": 58500}, {"loss": 0.5474, "grad_norm": 0.7697445750236511, "learning_rate": 0.0002, "epoch": 4.201795332136445, "step": 58510}, {"loss": 0.5692, "grad_norm": 1.0761178731918335, "learning_rate": 0.0002, "epoch": 4.202513464991023, "step": 58520}, {"loss": 0.5667, "grad_norm": 0.9992024898529053, "learning_rate": 0.0002, "epoch": 4.203231597845601, "step": 58530}, {"loss": 0.5606, "grad_norm": 0.8741498589515686, "learning_rate": 0.0002, "epoch": 4.203949730700179, "step": 58540}, {"loss": 0.6012, "grad_norm": 0.8557528853416443, "learning_rate": 0.0002, "epoch": 4.204667863554757, "step": 58550}, {"loss": 0.5191, "grad_norm": 0.8853630423545837, "learning_rate": 0.0002, "epoch": 4.205385996409336, "step": 58560}, {"loss": 0.5806, "grad_norm": 0.9858933687210083, "learning_rate": 0.0002, "epoch": 4.206104129263914, "step": 58570}, {"loss": 0.5908, "grad_norm": 1.104732871055603, "learning_rate": 0.0002, "epoch": 4.206822262118492, "step": 58580}, {"loss": 0.5993, "grad_norm": 0.9345462322235107, "learning_rate": 0.0002, "epoch": 4.20754039497307, "step": 58590}, {"loss": 0.6101, "grad_norm": 0.9620407819747925, "learning_rate": 0.0002, "epoch": 4.208258527827648, "step": 58600}, {"loss": 0.5848, "grad_norm": 0.8546963334083557, "learning_rate": 0.0002, "epoch": 4.208976660682226, "step": 58610}, {"loss": 0.5747, "grad_norm": 0.8125145435333252, "learning_rate": 0.0002, "epoch": 4.209694793536804, "step": 58620}, {"loss": 0.604, "grad_norm": 0.8481138944625854, "learning_rate": 0.0002, "epoch": 4.210412926391382, "step": 58630}, {"loss": 0.5928, "grad_norm": 0.8884692788124084, "learning_rate": 0.0002, "epoch": 4.21113105924596, "step": 58640}, {"loss": 0.5612, "grad_norm": 1.09279465675354, "learning_rate": 0.0002, "epoch": 4.211849192100539, "step": 58650}, {"loss": 0.644, "grad_norm": 0.9806583523750305, "learning_rate": 0.0002, "epoch": 4.212567324955117, "step": 58660}, {"loss": 0.5737, "grad_norm": 0.9510366916656494, "learning_rate": 0.0002, "epoch": 4.213285457809695, "step": 58670}, {"loss": 0.5996, "grad_norm": 0.7517459988594055, "learning_rate": 0.0002, "epoch": 4.214003590664273, "step": 58680}, {"loss": 0.6274, "grad_norm": 1.1134123802185059, "learning_rate": 0.0002, "epoch": 4.214721723518851, "step": 58690}, {"loss": 0.5842, "grad_norm": 0.8307328820228577, "learning_rate": 0.0002, "epoch": 4.215439856373429, "step": 58700}, {"loss": 0.5795, "grad_norm": 0.8211639523506165, "learning_rate": 0.0002, "epoch": 4.216157989228007, "step": 58710}, {"loss": 0.5613, "grad_norm": 1.0749584436416626, "learning_rate": 0.0002, "epoch": 4.216876122082585, "step": 58720}, {"loss": 0.5956, "grad_norm": 1.1394833326339722, "learning_rate": 0.0002, "epoch": 4.217594254937163, "step": 58730}, {"loss": 0.609, "grad_norm": 1.05130934715271, "learning_rate": 0.0002, "epoch": 4.218312387791742, "step": 58740}, {"loss": 0.6294, "grad_norm": 0.7949456572532654, "learning_rate": 0.0002, "epoch": 4.21903052064632, "step": 58750}, {"loss": 0.6148, "grad_norm": 0.906506359577179, "learning_rate": 0.0002, "epoch": 4.219748653500898, "step": 58760}, {"loss": 0.5778, "grad_norm": 0.8338989615440369, "learning_rate": 0.0002, "epoch": 4.220466786355476, "step": 58770}, {"loss": 0.5402, "grad_norm": 0.9325370788574219, "learning_rate": 0.0002, "epoch": 4.221184919210054, "step": 58780}, {"loss": 0.5657, "grad_norm": 1.0208096504211426, "learning_rate": 0.0002, "epoch": 4.221903052064632, "step": 58790}, {"loss": 0.6523, "grad_norm": 1.0075920820236206, "learning_rate": 0.0002, "epoch": 4.22262118491921, "step": 58800}, {"loss": 0.5545, "grad_norm": 0.9858701229095459, "learning_rate": 0.0002, "epoch": 4.223339317773788, "step": 58810}, {"loss": 0.6343, "grad_norm": 1.0010110139846802, "learning_rate": 0.0002, "epoch": 4.224057450628366, "step": 58820}, {"loss": 0.5991, "grad_norm": 0.9360540509223938, "learning_rate": 0.0002, "epoch": 4.224775583482945, "step": 58830}, {"loss": 0.5887, "grad_norm": 0.9021786451339722, "learning_rate": 0.0002, "epoch": 4.225493716337523, "step": 58840}, {"loss": 0.6132, "grad_norm": 1.1778476238250732, "learning_rate": 0.0002, "epoch": 4.226211849192101, "step": 58850}, {"loss": 0.5956, "grad_norm": 1.0061023235321045, "learning_rate": 0.0002, "epoch": 4.226929982046679, "step": 58860}, {"loss": 0.5846, "grad_norm": 0.8839752674102783, "learning_rate": 0.0002, "epoch": 4.227648114901257, "step": 58870}, {"loss": 0.6129, "grad_norm": 1.0078870058059692, "learning_rate": 0.0002, "epoch": 4.228366247755835, "step": 58880}, {"loss": 0.6403, "grad_norm": 0.8926451206207275, "learning_rate": 0.0002, "epoch": 4.229084380610413, "step": 58890}, {"loss": 0.5987, "grad_norm": 1.4018772840499878, "learning_rate": 0.0002, "epoch": 4.229802513464991, "step": 58900}, {"loss": 0.5925, "grad_norm": 0.9911289215087891, "learning_rate": 0.0002, "epoch": 4.230520646319569, "step": 58910}, {"loss": 0.5846, "grad_norm": 0.9374576807022095, "learning_rate": 0.0002, "epoch": 4.231238779174147, "step": 58920}, {"loss": 0.5856, "grad_norm": 1.179650068283081, "learning_rate": 0.0002, "epoch": 4.231956912028726, "step": 58930}, {"loss": 0.601, "grad_norm": 0.9434911012649536, "learning_rate": 0.0002, "epoch": 4.232675044883304, "step": 58940}, {"loss": 0.6137, "grad_norm": 1.0061911344528198, "learning_rate": 0.0002, "epoch": 4.233393177737882, "step": 58950}, {"loss": 0.5847, "grad_norm": 0.9663233757019043, "learning_rate": 0.0002, "epoch": 4.23411131059246, "step": 58960}, {"loss": 0.5748, "grad_norm": 0.8897581696510315, "learning_rate": 0.0002, "epoch": 4.234829443447038, "step": 58970}, {"loss": 0.5586, "grad_norm": 0.873281717300415, "learning_rate": 0.0002, "epoch": 4.235547576301616, "step": 58980}, {"loss": 0.6027, "grad_norm": 0.9146949052810669, "learning_rate": 0.0002, "epoch": 4.236265709156194, "step": 58990}, {"loss": 0.6356, "grad_norm": 0.9381195306777954, "learning_rate": 0.0002, "epoch": 4.236983842010772, "step": 59000}, {"loss": 0.5641, "grad_norm": 0.9700697064399719, "learning_rate": 0.0002, "epoch": 4.23770197486535, "step": 59010}, {"loss": 0.6099, "grad_norm": 0.9050154685974121, "learning_rate": 0.0002, "epoch": 4.238420107719929, "step": 59020}, {"loss": 0.552, "grad_norm": 0.9901503324508667, "learning_rate": 0.0002, "epoch": 4.239138240574507, "step": 59030}, {"loss": 0.6333, "grad_norm": 0.9009594321250916, "learning_rate": 0.0002, "epoch": 4.239856373429085, "step": 59040}, {"loss": 0.6104, "grad_norm": 1.0924968719482422, "learning_rate": 0.0002, "epoch": 4.240574506283663, "step": 59050}, {"loss": 0.6269, "grad_norm": 0.9939947724342346, "learning_rate": 0.0002, "epoch": 4.241292639138241, "step": 59060}, {"loss": 0.6039, "grad_norm": 1.0577857494354248, "learning_rate": 0.0002, "epoch": 4.242010771992819, "step": 59070}, {"loss": 0.5992, "grad_norm": 1.0836747884750366, "learning_rate": 0.0002, "epoch": 4.242728904847397, "step": 59080}, {"loss": 0.6518, "grad_norm": 0.97043377161026, "learning_rate": 0.0002, "epoch": 4.243447037701975, "step": 59090}, {"loss": 0.5877, "grad_norm": 0.7711901664733887, "learning_rate": 0.0002, "epoch": 4.244165170556553, "step": 59100}, {"loss": 0.6017, "grad_norm": 1.0143170356750488, "learning_rate": 0.0002, "epoch": 4.244883303411131, "step": 59110}, {"loss": 0.6245, "grad_norm": 0.9151925444602966, "learning_rate": 0.0002, "epoch": 4.2456014362657095, "step": 59120}, {"loss": 0.6436, "grad_norm": 0.9252700209617615, "learning_rate": 0.0002, "epoch": 4.2463195691202875, "step": 59130}, {"loss": 0.5696, "grad_norm": 0.8429408073425293, "learning_rate": 0.0002, "epoch": 4.2470377019748655, "step": 59140}, {"loss": 0.5737, "grad_norm": 0.9645987153053284, "learning_rate": 0.0002, "epoch": 4.2477558348294435, "step": 59150}, {"loss": 0.6045, "grad_norm": 0.9949791431427002, "learning_rate": 0.0002, "epoch": 4.2484739676840215, "step": 59160}, {"loss": 0.6069, "grad_norm": 0.9128350615501404, "learning_rate": 0.0002, "epoch": 4.2491921005385995, "step": 59170}, {"loss": 0.596, "grad_norm": 0.7406911849975586, "learning_rate": 0.0002, "epoch": 4.2499102333931775, "step": 59180}, {"loss": 0.5796, "grad_norm": 1.0237419605255127, "learning_rate": 0.0002, "epoch": 4.2506283662477555, "step": 59190}, {"loss": 0.631, "grad_norm": 0.805459201335907, "learning_rate": 0.0002, "epoch": 4.2513464991023335, "step": 59200}, {"loss": 0.6104, "grad_norm": 0.8477254509925842, "learning_rate": 0.0002, "epoch": 4.252064631956912, "step": 59210}, {"loss": 0.5608, "grad_norm": 0.984023928642273, "learning_rate": 0.0002, "epoch": 4.25278276481149, "step": 59220}, {"loss": 0.6185, "grad_norm": 1.0667484998703003, "learning_rate": 0.0002, "epoch": 4.253500897666068, "step": 59230}, {"loss": 0.5596, "grad_norm": 0.7192284464836121, "learning_rate": 0.0002, "epoch": 4.254219030520646, "step": 59240}, {"loss": 0.5971, "grad_norm": 0.9557451009750366, "learning_rate": 0.0002, "epoch": 4.254937163375224, "step": 59250}, {"loss": 0.6012, "grad_norm": 0.9209784865379333, "learning_rate": 0.0002, "epoch": 4.255655296229802, "step": 59260}, {"loss": 0.67, "grad_norm": 0.9785363674163818, "learning_rate": 0.0002, "epoch": 4.25637342908438, "step": 59270}, {"loss": 0.6185, "grad_norm": 0.910214364528656, "learning_rate": 0.0002, "epoch": 4.257091561938958, "step": 59280}, {"loss": 0.6451, "grad_norm": 0.8945858478546143, "learning_rate": 0.0002, "epoch": 4.257809694793536, "step": 59290}, {"loss": 0.5876, "grad_norm": 1.0984420776367188, "learning_rate": 0.0002, "epoch": 4.258527827648114, "step": 59300}, {"loss": 0.5616, "grad_norm": 1.0256640911102295, "learning_rate": 0.0002, "epoch": 4.259245960502693, "step": 59310}, {"loss": 0.5825, "grad_norm": 0.978397786617279, "learning_rate": 0.0002, "epoch": 4.259964093357271, "step": 59320}, {"loss": 0.6043, "grad_norm": 0.7587000727653503, "learning_rate": 0.0002, "epoch": 4.260682226211849, "step": 59330}, {"loss": 0.5616, "grad_norm": 0.9384620785713196, "learning_rate": 0.0002, "epoch": 4.261400359066427, "step": 59340}, {"loss": 0.6669, "grad_norm": 0.893992006778717, "learning_rate": 0.0002, "epoch": 4.262118491921005, "step": 59350}, {"loss": 0.561, "grad_norm": 1.0231536626815796, "learning_rate": 0.0002, "epoch": 4.262836624775583, "step": 59360}, {"loss": 0.5912, "grad_norm": 0.9810128211975098, "learning_rate": 0.0002, "epoch": 4.263554757630161, "step": 59370}, {"loss": 0.5871, "grad_norm": 1.0868116617202759, "learning_rate": 0.0002, "epoch": 4.264272890484739, "step": 59380}, {"loss": 0.5986, "grad_norm": 1.1433676481246948, "learning_rate": 0.0002, "epoch": 4.264991023339318, "step": 59390}, {"loss": 0.6306, "grad_norm": 0.9836946725845337, "learning_rate": 0.0002, "epoch": 4.265709156193896, "step": 59400}, {"loss": 0.5854, "grad_norm": 0.9473603963851929, "learning_rate": 0.0002, "epoch": 4.266427289048474, "step": 59410}, {"loss": 0.6095, "grad_norm": 0.9066835641860962, "learning_rate": 0.0002, "epoch": 4.267145421903052, "step": 59420}, {"loss": 0.656, "grad_norm": 1.0534718036651611, "learning_rate": 0.0002, "epoch": 4.26786355475763, "step": 59430}, {"loss": 0.5624, "grad_norm": 1.0392775535583496, "learning_rate": 0.0002, "epoch": 4.268581687612208, "step": 59440}, {"loss": 0.5697, "grad_norm": 1.011472463607788, "learning_rate": 0.0002, "epoch": 4.269299820466786, "step": 59450}, {"loss": 0.5971, "grad_norm": 1.0704147815704346, "learning_rate": 0.0002, "epoch": 4.270017953321364, "step": 59460}, {"loss": 0.5719, "grad_norm": 0.9349238872528076, "learning_rate": 0.0002, "epoch": 4.270736086175942, "step": 59470}, {"loss": 0.5637, "grad_norm": 0.8745087385177612, "learning_rate": 0.0002, "epoch": 4.27145421903052, "step": 59480}, {"loss": 0.6246, "grad_norm": 0.8823763728141785, "learning_rate": 0.0002, "epoch": 4.272172351885099, "step": 59490}, {"loss": 0.6021, "grad_norm": 1.110912799835205, "learning_rate": 0.0002, "epoch": 4.272890484739677, "step": 59500}, {"loss": 0.5939, "grad_norm": 1.0000925064086914, "learning_rate": 0.0002, "epoch": 4.273608617594255, "step": 59510}, {"loss": 0.5531, "grad_norm": 1.1578227281570435, "learning_rate": 0.0002, "epoch": 4.274326750448833, "step": 59520}, {"loss": 0.6372, "grad_norm": 0.875720202922821, "learning_rate": 0.0002, "epoch": 4.275044883303411, "step": 59530}, {"loss": 0.5956, "grad_norm": 0.9562238454818726, "learning_rate": 0.0002, "epoch": 4.275763016157989, "step": 59540}, {"loss": 0.5996, "grad_norm": 0.8384222388267517, "learning_rate": 0.0002, "epoch": 4.276481149012567, "step": 59550}, {"loss": 0.6001, "grad_norm": 1.2719428539276123, "learning_rate": 0.0002, "epoch": 4.277199281867145, "step": 59560}, {"loss": 0.6286, "grad_norm": 1.0656434297561646, "learning_rate": 0.0002, "epoch": 4.277917414721723, "step": 59570}, {"loss": 0.5895, "grad_norm": 1.0766716003417969, "learning_rate": 0.0002, "epoch": 4.278635547576302, "step": 59580}, {"loss": 0.5831, "grad_norm": 0.8892807960510254, "learning_rate": 0.0002, "epoch": 4.27935368043088, "step": 59590}, {"loss": 0.5717, "grad_norm": 0.8956300020217896, "learning_rate": 0.0002, "epoch": 4.280071813285458, "step": 59600}, {"loss": 0.5965, "grad_norm": 0.9562926888465881, "learning_rate": 0.0002, "epoch": 4.280789946140036, "step": 59610}, {"loss": 0.5487, "grad_norm": 1.009141445159912, "learning_rate": 0.0002, "epoch": 4.281508078994614, "step": 59620}, {"loss": 0.6337, "grad_norm": 1.0546064376831055, "learning_rate": 0.0002, "epoch": 4.282226211849192, "step": 59630}, {"loss": 0.5771, "grad_norm": 0.8831254243850708, "learning_rate": 0.0002, "epoch": 4.28294434470377, "step": 59640}, {"loss": 0.6241, "grad_norm": 0.9560053944587708, "learning_rate": 0.0002, "epoch": 4.283662477558348, "step": 59650}, {"loss": 0.6012, "grad_norm": 1.030339241027832, "learning_rate": 0.0002, "epoch": 4.284380610412926, "step": 59660}, {"loss": 0.6174, "grad_norm": 1.00662100315094, "learning_rate": 0.0002, "epoch": 4.285098743267504, "step": 59670}, {"loss": 0.5802, "grad_norm": 1.0759116411209106, "learning_rate": 0.0002, "epoch": 4.285816876122083, "step": 59680}, {"loss": 0.6429, "grad_norm": 0.9985393285751343, "learning_rate": 0.0002, "epoch": 4.286535008976661, "step": 59690}, {"loss": 0.5992, "grad_norm": 0.9044474959373474, "learning_rate": 0.0002, "epoch": 4.287253141831239, "step": 59700}, {"loss": 0.6263, "grad_norm": 1.1224442720413208, "learning_rate": 0.0002, "epoch": 4.287971274685817, "step": 59710}, {"loss": 0.6118, "grad_norm": 0.8436414003372192, "learning_rate": 0.0002, "epoch": 4.288689407540395, "step": 59720}, {"loss": 0.5881, "grad_norm": 1.0695041418075562, "learning_rate": 0.0002, "epoch": 4.289407540394973, "step": 59730}, {"loss": 0.5994, "grad_norm": 0.8809951543807983, "learning_rate": 0.0002, "epoch": 4.290125673249551, "step": 59740}, {"loss": 0.6508, "grad_norm": 1.0213792324066162, "learning_rate": 0.0002, "epoch": 4.290843806104129, "step": 59750}, {"loss": 0.5851, "grad_norm": 0.9660196900367737, "learning_rate": 0.0002, "epoch": 4.291561938958707, "step": 59760}, {"loss": 0.6582, "grad_norm": 0.8005787134170532, "learning_rate": 0.0002, "epoch": 4.292280071813286, "step": 59770}, {"loss": 0.6504, "grad_norm": 1.0016109943389893, "learning_rate": 0.0002, "epoch": 4.292998204667864, "step": 59780}, {"loss": 0.5765, "grad_norm": 0.9112903475761414, "learning_rate": 0.0002, "epoch": 4.293716337522442, "step": 59790}, {"loss": 0.5925, "grad_norm": 0.9999852180480957, "learning_rate": 0.0002, "epoch": 4.29443447037702, "step": 59800}, {"loss": 0.636, "grad_norm": 0.9323953986167908, "learning_rate": 0.0002, "epoch": 4.295152603231598, "step": 59810}, {"loss": 0.5743, "grad_norm": 0.903037965297699, "learning_rate": 0.0002, "epoch": 4.295870736086176, "step": 59820}, {"loss": 0.6008, "grad_norm": 1.2462431192398071, "learning_rate": 0.0002, "epoch": 4.296588868940754, "step": 59830}, {"loss": 0.6126, "grad_norm": 1.2322230339050293, "learning_rate": 0.0002, "epoch": 4.297307001795332, "step": 59840}, {"loss": 0.6029, "grad_norm": 0.9584668278694153, "learning_rate": 0.0002, "epoch": 4.29802513464991, "step": 59850}, {"loss": 0.6179, "grad_norm": 0.9664767980575562, "learning_rate": 0.0002, "epoch": 4.298743267504488, "step": 59860}, {"loss": 0.5909, "grad_norm": 0.8860437273979187, "learning_rate": 0.0002, "epoch": 4.299461400359067, "step": 59870}, {"loss": 0.5708, "grad_norm": 1.0825127363204956, "learning_rate": 0.0002, "epoch": 4.300179533213645, "step": 59880}, {"loss": 0.6338, "grad_norm": 1.1312100887298584, "learning_rate": 0.0002, "epoch": 4.300897666068223, "step": 59890}, {"loss": 0.6362, "grad_norm": 0.8289751410484314, "learning_rate": 0.0002, "epoch": 4.301615798922801, "step": 59900}, {"loss": 0.6061, "grad_norm": 0.8990927934646606, "learning_rate": 0.0002, "epoch": 4.302333931777379, "step": 59910}, {"loss": 0.5993, "grad_norm": 0.9667525887489319, "learning_rate": 0.0002, "epoch": 4.303052064631957, "step": 59920}, {"loss": 0.5756, "grad_norm": 0.8656060695648193, "learning_rate": 0.0002, "epoch": 4.303770197486535, "step": 59930}, {"loss": 0.6271, "grad_norm": 0.8909396529197693, "learning_rate": 0.0002, "epoch": 4.304488330341113, "step": 59940}, {"loss": 0.5918, "grad_norm": 0.9533283114433289, "learning_rate": 0.0002, "epoch": 4.305206463195692, "step": 59950}, {"loss": 0.6146, "grad_norm": 0.9090739488601685, "learning_rate": 0.0002, "epoch": 4.30592459605027, "step": 59960}, {"loss": 0.5949, "grad_norm": 1.096656322479248, "learning_rate": 0.0002, "epoch": 4.306642728904848, "step": 59970}, {"loss": 0.582, "grad_norm": 1.0392465591430664, "learning_rate": 0.0002, "epoch": 4.307360861759426, "step": 59980}, {"loss": 0.6552, "grad_norm": 0.8733913898468018, "learning_rate": 0.0002, "epoch": 4.308078994614004, "step": 59990}, {"loss": 0.5771, "grad_norm": 0.8287094235420227, "learning_rate": 0.0002, "epoch": 4.308797127468582, "step": 60000}, {"loss": 0.6157, "grad_norm": 0.9267017245292664, "learning_rate": 0.0002, "epoch": 4.30951526032316, "step": 60010}, {"loss": 0.6402, "grad_norm": 0.9969515800476074, "learning_rate": 0.0002, "epoch": 4.310233393177738, "step": 60020}, {"loss": 0.541, "grad_norm": 1.0005015134811401, "learning_rate": 0.0002, "epoch": 4.310951526032316, "step": 60030}, {"loss": 0.6295, "grad_norm": 1.1215369701385498, "learning_rate": 0.0002, "epoch": 4.311669658886894, "step": 60040}, {"loss": 0.6225, "grad_norm": 1.0434890985488892, "learning_rate": 0.0002, "epoch": 4.312387791741473, "step": 60050}, {"loss": 0.5962, "grad_norm": 0.967989981174469, "learning_rate": 0.0002, "epoch": 4.313105924596051, "step": 60060}, {"loss": 0.5862, "grad_norm": 1.007599115371704, "learning_rate": 0.0002, "epoch": 4.313824057450629, "step": 60070}, {"loss": 0.6233, "grad_norm": 0.9356340765953064, "learning_rate": 0.0002, "epoch": 4.314542190305207, "step": 60080}, {"loss": 0.5642, "grad_norm": 0.9566757678985596, "learning_rate": 0.0002, "epoch": 4.315260323159785, "step": 60090}, {"loss": 0.6142, "grad_norm": 1.1066830158233643, "learning_rate": 0.0002, "epoch": 4.315978456014363, "step": 60100}, {"loss": 0.5432, "grad_norm": 0.9895772933959961, "learning_rate": 0.0002, "epoch": 4.316696588868941, "step": 60110}, {"loss": 0.5542, "grad_norm": 1.07423734664917, "learning_rate": 0.0002, "epoch": 4.317414721723519, "step": 60120}, {"loss": 0.5975, "grad_norm": 1.0777037143707275, "learning_rate": 0.0002, "epoch": 4.318132854578097, "step": 60130}, {"loss": 0.6168, "grad_norm": 1.1475656032562256, "learning_rate": 0.0002, "epoch": 4.3188509874326755, "step": 60140}, {"loss": 0.6038, "grad_norm": 1.0705864429473877, "learning_rate": 0.0002, "epoch": 4.3195691202872535, "step": 60150}, {"loss": 0.6032, "grad_norm": 0.8676854968070984, "learning_rate": 0.0002, "epoch": 4.3202872531418315, "step": 60160}, {"loss": 0.632, "grad_norm": 0.9488174319267273, "learning_rate": 0.0002, "epoch": 4.3210053859964095, "step": 60170}, {"loss": 0.6137, "grad_norm": 1.1171153783798218, "learning_rate": 0.0002, "epoch": 4.3217235188509875, "step": 60180}, {"loss": 0.6477, "grad_norm": 1.091435194015503, "learning_rate": 0.0002, "epoch": 4.3224416517055655, "step": 60190}, {"loss": 0.6105, "grad_norm": 0.880944013595581, "learning_rate": 0.0002, "epoch": 4.3231597845601435, "step": 60200}, {"loss": 0.5736, "grad_norm": 0.8458809852600098, "learning_rate": 0.0002, "epoch": 4.3238779174147215, "step": 60210}, {"loss": 0.6211, "grad_norm": 0.7900225520133972, "learning_rate": 0.0002, "epoch": 4.3245960502692995, "step": 60220}, {"loss": 0.6205, "grad_norm": 0.966742753982544, "learning_rate": 0.0002, "epoch": 4.3253141831238775, "step": 60230}, {"loss": 0.6178, "grad_norm": 0.8948110342025757, "learning_rate": 0.0002, "epoch": 4.326032315978456, "step": 60240}, {"loss": 0.6176, "grad_norm": 0.8598700165748596, "learning_rate": 0.0002, "epoch": 4.326750448833034, "step": 60250}, {"loss": 0.6373, "grad_norm": 1.127610206604004, "learning_rate": 0.0002, "epoch": 4.327468581687612, "step": 60260}, {"loss": 0.6081, "grad_norm": 0.8357340693473816, "learning_rate": 0.0002, "epoch": 4.32818671454219, "step": 60270}, {"loss": 0.5839, "grad_norm": 0.8771896362304688, "learning_rate": 0.0002, "epoch": 4.328904847396768, "step": 60280}, {"loss": 0.5959, "grad_norm": 0.9202101826667786, "learning_rate": 0.0002, "epoch": 4.329622980251346, "step": 60290}, {"loss": 0.6387, "grad_norm": 1.1427538394927979, "learning_rate": 0.0002, "epoch": 4.330341113105924, "step": 60300}, {"loss": 0.6306, "grad_norm": 0.8711863160133362, "learning_rate": 0.0002, "epoch": 4.331059245960502, "step": 60310}, {"loss": 0.6011, "grad_norm": 0.972723662853241, "learning_rate": 0.0002, "epoch": 4.33177737881508, "step": 60320}, {"loss": 0.5761, "grad_norm": 1.1496877670288086, "learning_rate": 0.0002, "epoch": 4.332495511669659, "step": 60330}, {"loss": 0.6472, "grad_norm": 1.008581519126892, "learning_rate": 0.0002, "epoch": 4.333213644524237, "step": 60340}, {"loss": 0.6479, "grad_norm": 1.0802706480026245, "learning_rate": 0.0002, "epoch": 4.333931777378815, "step": 60350}, {"loss": 0.6105, "grad_norm": 0.8394291996955872, "learning_rate": 0.0002, "epoch": 4.334649910233393, "step": 60360}, {"loss": 0.6241, "grad_norm": 0.8355905413627625, "learning_rate": 0.0002, "epoch": 4.335368043087971, "step": 60370}, {"loss": 0.6282, "grad_norm": 0.9583960175514221, "learning_rate": 0.0002, "epoch": 4.336086175942549, "step": 60380}, {"loss": 0.6436, "grad_norm": 1.138934850692749, "learning_rate": 0.0002, "epoch": 4.336804308797127, "step": 60390}, {"loss": 0.587, "grad_norm": 1.0334709882736206, "learning_rate": 0.0002, "epoch": 4.337522441651705, "step": 60400}, {"loss": 0.5596, "grad_norm": 0.729686439037323, "learning_rate": 0.0002, "epoch": 4.338240574506283, "step": 60410}, {"loss": 0.5863, "grad_norm": 0.8735929727554321, "learning_rate": 0.0002, "epoch": 4.338958707360861, "step": 60420}, {"loss": 0.5732, "grad_norm": 0.9617681503295898, "learning_rate": 0.0002, "epoch": 4.33967684021544, "step": 60430}, {"loss": 0.5865, "grad_norm": 0.9439655542373657, "learning_rate": 0.0002, "epoch": 4.340394973070018, "step": 60440}, {"loss": 0.5959, "grad_norm": 0.9275408387184143, "learning_rate": 0.0002, "epoch": 4.341113105924596, "step": 60450}, {"loss": 0.6295, "grad_norm": 1.0693308115005493, "learning_rate": 0.0002, "epoch": 4.341831238779174, "step": 60460}, {"loss": 0.6455, "grad_norm": 0.9234438538551331, "learning_rate": 0.0002, "epoch": 4.342549371633752, "step": 60470}, {"loss": 0.6308, "grad_norm": 1.1376168727874756, "learning_rate": 0.0002, "epoch": 4.34326750448833, "step": 60480}, {"loss": 0.623, "grad_norm": 0.9218108654022217, "learning_rate": 0.0002, "epoch": 4.343985637342908, "step": 60490}, {"loss": 0.6291, "grad_norm": 1.1467362642288208, "learning_rate": 0.0002, "epoch": 4.344703770197486, "step": 60500}, {"loss": 0.5757, "grad_norm": 0.9459165930747986, "learning_rate": 0.0002, "epoch": 4.345421903052064, "step": 60510}, {"loss": 0.5963, "grad_norm": 0.9460827708244324, "learning_rate": 0.0002, "epoch": 4.346140035906643, "step": 60520}, {"loss": 0.5822, "grad_norm": 1.0845041275024414, "learning_rate": 0.0002, "epoch": 4.346858168761221, "step": 60530}, {"loss": 0.6326, "grad_norm": 1.082675576210022, "learning_rate": 0.0002, "epoch": 4.347576301615799, "step": 60540}, {"loss": 0.5419, "grad_norm": 0.8443698883056641, "learning_rate": 0.0002, "epoch": 4.348294434470377, "step": 60550}, {"loss": 0.5634, "grad_norm": 1.018393874168396, "learning_rate": 0.0002, "epoch": 4.349012567324955, "step": 60560}, {"loss": 0.6447, "grad_norm": 0.8796373009681702, "learning_rate": 0.0002, "epoch": 4.349730700179533, "step": 60570}, {"loss": 0.6108, "grad_norm": 1.097942590713501, "learning_rate": 0.0002, "epoch": 4.350448833034111, "step": 60580}, {"loss": 0.6161, "grad_norm": 0.8750485181808472, "learning_rate": 0.0002, "epoch": 4.351166965888689, "step": 60590}, {"loss": 0.5849, "grad_norm": 1.0339995622634888, "learning_rate": 0.0002, "epoch": 4.351885098743267, "step": 60600}, {"loss": 0.6097, "grad_norm": 0.9077731966972351, "learning_rate": 0.0002, "epoch": 4.352603231597846, "step": 60610}, {"loss": 0.5657, "grad_norm": 1.051321029663086, "learning_rate": 0.0002, "epoch": 4.353321364452424, "step": 60620}, {"loss": 0.6089, "grad_norm": 1.0018669366836548, "learning_rate": 0.0002, "epoch": 4.354039497307002, "step": 60630}, {"loss": 0.5957, "grad_norm": 1.0349196195602417, "learning_rate": 0.0002, "epoch": 4.35475763016158, "step": 60640}, {"loss": 0.6212, "grad_norm": 1.009589672088623, "learning_rate": 0.0002, "epoch": 4.355475763016158, "step": 60650}, {"loss": 0.5542, "grad_norm": 1.0463480949401855, "learning_rate": 0.0002, "epoch": 4.356193895870736, "step": 60660}, {"loss": 0.5797, "grad_norm": 0.9815132021903992, "learning_rate": 0.0002, "epoch": 4.356912028725314, "step": 60670}, {"loss": 0.6089, "grad_norm": 1.0977262258529663, "learning_rate": 0.0002, "epoch": 4.357630161579892, "step": 60680}, {"loss": 0.6061, "grad_norm": 0.8450005054473877, "learning_rate": 0.0002, "epoch": 4.35834829443447, "step": 60690}, {"loss": 0.5913, "grad_norm": 1.0959078073501587, "learning_rate": 0.0002, "epoch": 4.359066427289049, "step": 60700}, {"loss": 0.5957, "grad_norm": 0.9155098795890808, "learning_rate": 0.0002, "epoch": 4.359784560143627, "step": 60710}, {"loss": 0.6084, "grad_norm": 0.9267987012863159, "learning_rate": 0.0002, "epoch": 4.360502692998205, "step": 60720}, {"loss": 0.5974, "grad_norm": 1.177472472190857, "learning_rate": 0.0002, "epoch": 4.361220825852783, "step": 60730}, {"loss": 0.5911, "grad_norm": 0.8615312576293945, "learning_rate": 0.0002, "epoch": 4.361938958707361, "step": 60740}, {"loss": 0.5819, "grad_norm": 1.0939710140228271, "learning_rate": 0.0002, "epoch": 4.362657091561939, "step": 60750}, {"loss": 0.6263, "grad_norm": 1.0928049087524414, "learning_rate": 0.0002, "epoch": 4.363375224416517, "step": 60760}, {"loss": 0.5772, "grad_norm": 1.0796833038330078, "learning_rate": 0.0002, "epoch": 4.364093357271095, "step": 60770}, {"loss": 0.5879, "grad_norm": 0.9768339991569519, "learning_rate": 0.0002, "epoch": 4.364811490125673, "step": 60780}, {"loss": 0.6335, "grad_norm": 0.9082722067832947, "learning_rate": 0.0002, "epoch": 4.365529622980251, "step": 60790}, {"loss": 0.6037, "grad_norm": 0.9614832997322083, "learning_rate": 0.0002, "epoch": 4.36624775583483, "step": 60800}, {"loss": 0.6185, "grad_norm": 0.8874651789665222, "learning_rate": 0.0002, "epoch": 4.366965888689408, "step": 60810}, {"loss": 0.6524, "grad_norm": 0.8810178637504578, "learning_rate": 0.0002, "epoch": 4.367684021543986, "step": 60820}, {"loss": 0.5908, "grad_norm": 1.0893806219100952, "learning_rate": 0.0002, "epoch": 4.368402154398564, "step": 60830}, {"loss": 0.5782, "grad_norm": 0.9042278528213501, "learning_rate": 0.0002, "epoch": 4.369120287253142, "step": 60840}, {"loss": 0.5798, "grad_norm": 1.0832217931747437, "learning_rate": 0.0002, "epoch": 4.36983842010772, "step": 60850}, {"loss": 0.6235, "grad_norm": 0.9431114792823792, "learning_rate": 0.0002, "epoch": 4.370556552962298, "step": 60860}, {"loss": 0.5869, "grad_norm": 1.031553030014038, "learning_rate": 0.0002, "epoch": 4.371274685816876, "step": 60870}, {"loss": 0.5839, "grad_norm": 0.8702824711799622, "learning_rate": 0.0002, "epoch": 4.371992818671454, "step": 60880}, {"loss": 0.6028, "grad_norm": 1.1109199523925781, "learning_rate": 0.0002, "epoch": 4.372710951526033, "step": 60890}, {"loss": 0.6423, "grad_norm": 0.8369361162185669, "learning_rate": 0.0002, "epoch": 4.373429084380611, "step": 60900}, {"loss": 0.6011, "grad_norm": 0.988915205001831, "learning_rate": 0.0002, "epoch": 4.374147217235189, "step": 60910}, {"loss": 0.6266, "grad_norm": 0.9365919232368469, "learning_rate": 0.0002, "epoch": 4.374865350089767, "step": 60920}, {"loss": 0.5786, "grad_norm": 0.9789398908615112, "learning_rate": 0.0002, "epoch": 4.375583482944345, "step": 60930}, {"loss": 0.6459, "grad_norm": 0.8786931037902832, "learning_rate": 0.0002, "epoch": 4.376301615798923, "step": 60940}, {"loss": 0.631, "grad_norm": 0.8891511559486389, "learning_rate": 0.0002, "epoch": 4.377019748653501, "step": 60950}, {"loss": 0.5909, "grad_norm": 0.9561707377433777, "learning_rate": 0.0002, "epoch": 4.377737881508079, "step": 60960}, {"loss": 0.5815, "grad_norm": 0.8674200177192688, "learning_rate": 0.0002, "epoch": 4.378456014362657, "step": 60970}, {"loss": 0.5664, "grad_norm": 0.9285916090011597, "learning_rate": 0.0002, "epoch": 4.379174147217235, "step": 60980}, {"loss": 0.5727, "grad_norm": 0.9185547232627869, "learning_rate": 0.0002, "epoch": 4.379892280071814, "step": 60990}, {"loss": 0.6296, "grad_norm": 1.081664800643921, "learning_rate": 0.0002, "epoch": 4.380610412926392, "step": 61000}, {"loss": 0.6346, "grad_norm": 1.0475854873657227, "learning_rate": 0.0002, "epoch": 4.38132854578097, "step": 61010}, {"loss": 0.6394, "grad_norm": 1.1519653797149658, "learning_rate": 0.0002, "epoch": 4.382046678635548, "step": 61020}, {"loss": 0.6437, "grad_norm": 0.8757607936859131, "learning_rate": 0.0002, "epoch": 4.382764811490126, "step": 61030}, {"loss": 0.6143, "grad_norm": 0.8707934021949768, "learning_rate": 0.0002, "epoch": 4.383482944344704, "step": 61040}, {"loss": 0.5782, "grad_norm": 1.1807516813278198, "learning_rate": 0.0002, "epoch": 4.384201077199282, "step": 61050}, {"loss": 0.5901, "grad_norm": 1.0674688816070557, "learning_rate": 0.0002, "epoch": 4.38491921005386, "step": 61060}, {"loss": 0.6247, "grad_norm": 0.9321209788322449, "learning_rate": 0.0002, "epoch": 4.385637342908438, "step": 61070}, {"loss": 0.5882, "grad_norm": 1.0786446332931519, "learning_rate": 0.0002, "epoch": 4.3863554757630165, "step": 61080}, {"loss": 0.5966, "grad_norm": 0.9733907580375671, "learning_rate": 0.0002, "epoch": 4.3870736086175945, "step": 61090}, {"loss": 0.5826, "grad_norm": 0.9476010203361511, "learning_rate": 0.0002, "epoch": 4.3877917414721725, "step": 61100}, {"loss": 0.6204, "grad_norm": 1.1321563720703125, "learning_rate": 0.0002, "epoch": 4.3885098743267505, "step": 61110}, {"loss": 0.5908, "grad_norm": 0.9379117488861084, "learning_rate": 0.0002, "epoch": 4.3892280071813286, "step": 61120}, {"loss": 0.586, "grad_norm": 0.8409728407859802, "learning_rate": 0.0002, "epoch": 4.3899461400359066, "step": 61130}, {"loss": 0.614, "grad_norm": 0.8309189081192017, "learning_rate": 0.0002, "epoch": 4.3906642728904846, "step": 61140}, {"loss": 0.6284, "grad_norm": 0.8922196626663208, "learning_rate": 0.0002, "epoch": 4.391382405745063, "step": 61150}, {"loss": 0.6358, "grad_norm": 0.8274614214897156, "learning_rate": 0.0002, "epoch": 4.392100538599641, "step": 61160}, {"loss": 0.5827, "grad_norm": 1.0928618907928467, "learning_rate": 0.0002, "epoch": 4.392818671454219, "step": 61170}, {"loss": 0.616, "grad_norm": 0.9771125316619873, "learning_rate": 0.0002, "epoch": 4.3935368043087974, "step": 61180}, {"loss": 0.6238, "grad_norm": 0.8844535946846008, "learning_rate": 0.0002, "epoch": 4.3942549371633755, "step": 61190}, {"loss": 0.5974, "grad_norm": 1.0498822927474976, "learning_rate": 0.0002, "epoch": 4.3949730700179535, "step": 61200}, {"loss": 0.596, "grad_norm": 0.9882155060768127, "learning_rate": 0.0002, "epoch": 4.3956912028725315, "step": 61210}, {"loss": 0.6385, "grad_norm": 1.090356707572937, "learning_rate": 0.0002, "epoch": 4.3964093357271095, "step": 61220}, {"loss": 0.6298, "grad_norm": 1.0908088684082031, "learning_rate": 0.0002, "epoch": 4.3971274685816875, "step": 61230}, {"loss": 0.6405, "grad_norm": 1.0013501644134521, "learning_rate": 0.0002, "epoch": 4.3978456014362655, "step": 61240}, {"loss": 0.5995, "grad_norm": 1.0916062593460083, "learning_rate": 0.0002, "epoch": 4.3985637342908435, "step": 61250}, {"loss": 0.5938, "grad_norm": 1.0817667245864868, "learning_rate": 0.0002, "epoch": 4.399281867145422, "step": 61260}, {"loss": 0.604, "grad_norm": 0.9745162129402161, "learning_rate": 0.0002, "epoch": 4.4, "step": 61270}, {"loss": 0.6028, "grad_norm": 1.0653400421142578, "learning_rate": 0.0002, "epoch": 4.400718132854578, "step": 61280}, {"loss": 0.6064, "grad_norm": 1.0082067251205444, "learning_rate": 0.0002, "epoch": 4.401436265709156, "step": 61290}, {"loss": 0.5719, "grad_norm": 0.7963659167289734, "learning_rate": 0.0002, "epoch": 4.402154398563734, "step": 61300}, {"loss": 0.6724, "grad_norm": 1.0428845882415771, "learning_rate": 0.0002, "epoch": 4.402872531418312, "step": 61310}, {"loss": 0.5991, "grad_norm": 0.9205707311630249, "learning_rate": 0.0002, "epoch": 4.40359066427289, "step": 61320}, {"loss": 0.6169, "grad_norm": 1.0103533267974854, "learning_rate": 0.0002, "epoch": 4.404308797127468, "step": 61330}, {"loss": 0.6284, "grad_norm": 1.113547682762146, "learning_rate": 0.0002, "epoch": 4.405026929982046, "step": 61340}, {"loss": 0.6071, "grad_norm": 1.137488842010498, "learning_rate": 0.0002, "epoch": 4.405745062836624, "step": 61350}, {"loss": 0.6303, "grad_norm": 1.1284101009368896, "learning_rate": 0.0002, "epoch": 4.406463195691203, "step": 61360}, {"loss": 0.5613, "grad_norm": 0.8010451197624207, "learning_rate": 0.0002, "epoch": 4.407181328545781, "step": 61370}, {"loss": 0.5963, "grad_norm": 0.8893977403640747, "learning_rate": 0.0002, "epoch": 4.407899461400359, "step": 61380}, {"loss": 0.6154, "grad_norm": 0.9098272323608398, "learning_rate": 0.0002, "epoch": 4.408617594254937, "step": 61390}, {"loss": 0.6091, "grad_norm": 1.0613329410552979, "learning_rate": 0.0002, "epoch": 4.409335727109515, "step": 61400}, {"loss": 0.6222, "grad_norm": 1.0070269107818604, "learning_rate": 0.0002, "epoch": 4.410053859964093, "step": 61410}, {"loss": 0.5894, "grad_norm": 0.8632227778434753, "learning_rate": 0.0002, "epoch": 4.410771992818671, "step": 61420}, {"loss": 0.6412, "grad_norm": 1.0183731317520142, "learning_rate": 0.0002, "epoch": 4.411490125673249, "step": 61430}, {"loss": 0.596, "grad_norm": 0.9049941897392273, "learning_rate": 0.0002, "epoch": 4.412208258527827, "step": 61440}, {"loss": 0.5991, "grad_norm": 1.0184082984924316, "learning_rate": 0.0002, "epoch": 4.412926391382406, "step": 61450}, {"loss": 0.5758, "grad_norm": 0.9994277358055115, "learning_rate": 0.0002, "epoch": 4.413644524236984, "step": 61460}, {"loss": 0.6009, "grad_norm": 1.0112420320510864, "learning_rate": 0.0002, "epoch": 4.414362657091562, "step": 61470}, {"loss": 0.584, "grad_norm": 0.9751759171485901, "learning_rate": 0.0002, "epoch": 4.41508078994614, "step": 61480}, {"loss": 0.6307, "grad_norm": 1.047135591506958, "learning_rate": 0.0002, "epoch": 4.415798922800718, "step": 61490}, {"loss": 0.6645, "grad_norm": 0.886282742023468, "learning_rate": 0.0002, "epoch": 4.416517055655296, "step": 61500}, {"loss": 0.6168, "grad_norm": 0.971964418888092, "learning_rate": 0.0002, "epoch": 4.417235188509874, "step": 61510}, {"loss": 0.5822, "grad_norm": 0.9603846073150635, "learning_rate": 0.0002, "epoch": 4.417953321364452, "step": 61520}, {"loss": 0.6349, "grad_norm": 1.060042142868042, "learning_rate": 0.0002, "epoch": 4.41867145421903, "step": 61530}, {"loss": 0.6223, "grad_norm": 1.1231369972229004, "learning_rate": 0.0002, "epoch": 4.419389587073608, "step": 61540}, {"loss": 0.6175, "grad_norm": 0.8269591331481934, "learning_rate": 0.0002, "epoch": 4.420107719928187, "step": 61550}, {"loss": 0.6285, "grad_norm": 1.0341241359710693, "learning_rate": 0.0002, "epoch": 4.420825852782765, "step": 61560}, {"loss": 0.6054, "grad_norm": 0.7276636958122253, "learning_rate": 0.0002, "epoch": 4.421543985637343, "step": 61570}, {"loss": 0.6321, "grad_norm": 1.0663669109344482, "learning_rate": 0.0002, "epoch": 4.422262118491921, "step": 61580}, {"loss": 0.5944, "grad_norm": 0.9764387011528015, "learning_rate": 0.0002, "epoch": 4.422980251346499, "step": 61590}, {"loss": 0.6065, "grad_norm": 1.0953258275985718, "learning_rate": 0.0002, "epoch": 4.423698384201077, "step": 61600}, {"loss": 0.5815, "grad_norm": 0.8877012729644775, "learning_rate": 0.0002, "epoch": 4.424416517055655, "step": 61610}, {"loss": 0.5798, "grad_norm": 0.8781440854072571, "learning_rate": 0.0002, "epoch": 4.425134649910233, "step": 61620}, {"loss": 0.6223, "grad_norm": 0.8333432674407959, "learning_rate": 0.0002, "epoch": 4.425852782764811, "step": 61630}, {"loss": 0.5949, "grad_norm": 0.9647989869117737, "learning_rate": 0.0002, "epoch": 4.42657091561939, "step": 61640}, {"loss": 0.6135, "grad_norm": 1.0801783800125122, "learning_rate": 0.0002, "epoch": 4.427289048473968, "step": 61650}, {"loss": 0.6065, "grad_norm": 0.8215882778167725, "learning_rate": 0.0002, "epoch": 4.428007181328546, "step": 61660}, {"loss": 0.5851, "grad_norm": 0.9853931665420532, "learning_rate": 0.0002, "epoch": 4.428725314183124, "step": 61670}, {"loss": 0.5942, "grad_norm": 0.8658010959625244, "learning_rate": 0.0002, "epoch": 4.429443447037702, "step": 61680}, {"loss": 0.6413, "grad_norm": 1.124064326286316, "learning_rate": 0.0002, "epoch": 4.43016157989228, "step": 61690}, {"loss": 0.6021, "grad_norm": 1.009340763092041, "learning_rate": 0.0002, "epoch": 4.430879712746858, "step": 61700}, {"loss": 0.6127, "grad_norm": 0.8705293536186218, "learning_rate": 0.0002, "epoch": 4.431597845601436, "step": 61710}, {"loss": 0.5971, "grad_norm": 1.1323511600494385, "learning_rate": 0.0002, "epoch": 4.432315978456014, "step": 61720}, {"loss": 0.5985, "grad_norm": 1.1203019618988037, "learning_rate": 0.0002, "epoch": 4.433034111310592, "step": 61730}, {"loss": 0.6178, "grad_norm": 1.1683770418167114, "learning_rate": 0.0002, "epoch": 4.433752244165171, "step": 61740}, {"loss": 0.6132, "grad_norm": 1.0735899209976196, "learning_rate": 0.0002, "epoch": 4.434470377019749, "step": 61750}, {"loss": 0.5664, "grad_norm": 1.142496109008789, "learning_rate": 0.0002, "epoch": 4.435188509874327, "step": 61760}, {"loss": 0.6276, "grad_norm": 1.1157732009887695, "learning_rate": 0.0002, "epoch": 4.435906642728905, "step": 61770}, {"loss": 0.6237, "grad_norm": 0.8845949172973633, "learning_rate": 0.0002, "epoch": 4.436624775583483, "step": 61780}, {"loss": 0.5964, "grad_norm": 1.1212759017944336, "learning_rate": 0.0002, "epoch": 4.437342908438061, "step": 61790}, {"loss": 0.6185, "grad_norm": 0.8832488656044006, "learning_rate": 0.0002, "epoch": 4.438061041292639, "step": 61800}, {"loss": 0.6264, "grad_norm": 0.9059590101242065, "learning_rate": 0.0002, "epoch": 4.438779174147217, "step": 61810}, {"loss": 0.6303, "grad_norm": 1.0625685453414917, "learning_rate": 0.0002, "epoch": 4.439497307001796, "step": 61820}, {"loss": 0.5795, "grad_norm": 0.9565598368644714, "learning_rate": 0.0002, "epoch": 4.440215439856374, "step": 61830}, {"loss": 0.6027, "grad_norm": 0.8975377082824707, "learning_rate": 0.0002, "epoch": 4.440933572710952, "step": 61840}, {"loss": 0.6334, "grad_norm": 1.0412718057632446, "learning_rate": 0.0002, "epoch": 4.44165170556553, "step": 61850}, {"loss": 0.6455, "grad_norm": 0.9923529624938965, "learning_rate": 0.0002, "epoch": 4.442369838420108, "step": 61860}, {"loss": 0.5931, "grad_norm": 1.3025734424591064, "learning_rate": 0.0002, "epoch": 4.443087971274686, "step": 61870}, {"loss": 0.5804, "grad_norm": 1.0031960010528564, "learning_rate": 0.0002, "epoch": 4.443806104129264, "step": 61880}, {"loss": 0.602, "grad_norm": 1.0974701642990112, "learning_rate": 0.0002, "epoch": 4.444524236983842, "step": 61890}, {"loss": 0.6078, "grad_norm": 1.1044024229049683, "learning_rate": 0.0002, "epoch": 4.44524236983842, "step": 61900}, {"loss": 0.6454, "grad_norm": 1.0782772302627563, "learning_rate": 0.0002, "epoch": 4.445960502692998, "step": 61910}, {"loss": 0.6453, "grad_norm": 1.006304383277893, "learning_rate": 0.0002, "epoch": 4.446678635547577, "step": 61920}, {"loss": 0.5449, "grad_norm": 0.9258833527565002, "learning_rate": 0.0002, "epoch": 4.447396768402155, "step": 61930}, {"loss": 0.5744, "grad_norm": 0.9888426065444946, "learning_rate": 0.0002, "epoch": 4.448114901256733, "step": 61940}, {"loss": 0.5853, "grad_norm": 0.9592963457107544, "learning_rate": 0.0002, "epoch": 4.448833034111311, "step": 61950}, {"loss": 0.6142, "grad_norm": 1.0527986288070679, "learning_rate": 0.0002, "epoch": 4.449551166965889, "step": 61960}, {"loss": 0.5829, "grad_norm": 0.8613291382789612, "learning_rate": 0.0002, "epoch": 4.450269299820467, "step": 61970}, {"loss": 0.6176, "grad_norm": 1.1083767414093018, "learning_rate": 0.0002, "epoch": 4.450987432675045, "step": 61980}, {"loss": 0.5768, "grad_norm": 0.772679328918457, "learning_rate": 0.0002, "epoch": 4.451705565529623, "step": 61990}, {"loss": 0.6348, "grad_norm": 0.9052274227142334, "learning_rate": 0.0002, "epoch": 4.452423698384201, "step": 62000}, {"loss": 0.6202, "grad_norm": 1.129667043685913, "learning_rate": 0.0002, "epoch": 4.45314183123878, "step": 62010}, {"loss": 0.6265, "grad_norm": 0.9994529485702515, "learning_rate": 0.0002, "epoch": 4.453859964093358, "step": 62020}, {"loss": 0.6249, "grad_norm": 0.982155978679657, "learning_rate": 0.0002, "epoch": 4.454578096947936, "step": 62030}, {"loss": 0.6255, "grad_norm": 0.9139904975891113, "learning_rate": 0.0002, "epoch": 4.455296229802514, "step": 62040}, {"loss": 0.6237, "grad_norm": 1.0877810716629028, "learning_rate": 0.0002, "epoch": 4.456014362657092, "step": 62050}, {"loss": 0.6105, "grad_norm": 1.0535308122634888, "learning_rate": 0.0002, "epoch": 4.45673249551167, "step": 62060}, {"loss": 0.6084, "grad_norm": 1.0225313901901245, "learning_rate": 0.0002, "epoch": 4.457450628366248, "step": 62070}, {"loss": 0.6239, "grad_norm": 0.8443132042884827, "learning_rate": 0.0002, "epoch": 4.458168761220826, "step": 62080}, {"loss": 0.5895, "grad_norm": 1.0426654815673828, "learning_rate": 0.0002, "epoch": 4.458886894075404, "step": 62090}, {"loss": 0.6022, "grad_norm": 1.1110700368881226, "learning_rate": 0.0002, "epoch": 4.459605026929982, "step": 62100}, {"loss": 0.6436, "grad_norm": 1.0200893878936768, "learning_rate": 0.0002, "epoch": 4.4603231597845605, "step": 62110}, {"loss": 0.628, "grad_norm": 0.9102830290794373, "learning_rate": 0.0002, "epoch": 4.4610412926391385, "step": 62120}, {"loss": 0.5894, "grad_norm": 1.1395094394683838, "learning_rate": 0.0002, "epoch": 4.4617594254937165, "step": 62130}, {"loss": 0.5765, "grad_norm": 1.1202316284179688, "learning_rate": 0.0002, "epoch": 4.4624775583482945, "step": 62140}, {"loss": 0.6238, "grad_norm": 1.142580509185791, "learning_rate": 0.0002, "epoch": 4.4631956912028725, "step": 62150}, {"loss": 0.6502, "grad_norm": 0.9843677878379822, "learning_rate": 0.0002, "epoch": 4.4639138240574505, "step": 62160}, {"loss": 0.6734, "grad_norm": 1.0351676940917969, "learning_rate": 0.0002, "epoch": 4.4646319569120285, "step": 62170}, {"loss": 0.6371, "grad_norm": 0.9365093111991882, "learning_rate": 0.0002, "epoch": 4.4653500897666065, "step": 62180}, {"loss": 0.5827, "grad_norm": 1.041193962097168, "learning_rate": 0.0002, "epoch": 4.4660682226211845, "step": 62190}, {"loss": 0.555, "grad_norm": 0.9686329960823059, "learning_rate": 0.0002, "epoch": 4.466786355475763, "step": 62200}, {"loss": 0.6405, "grad_norm": 1.028622031211853, "learning_rate": 0.0002, "epoch": 4.467504488330341, "step": 62210}, {"loss": 0.5928, "grad_norm": 0.9717516899108887, "learning_rate": 0.0002, "epoch": 4.468222621184919, "step": 62220}, {"loss": 0.6028, "grad_norm": 1.0467450618743896, "learning_rate": 0.0002, "epoch": 4.468940754039497, "step": 62230}, {"loss": 0.593, "grad_norm": 0.943717896938324, "learning_rate": 0.0002, "epoch": 4.469658886894075, "step": 62240}, {"loss": 0.5861, "grad_norm": 0.909429132938385, "learning_rate": 0.0002, "epoch": 4.470377019748653, "step": 62250}, {"loss": 0.6211, "grad_norm": 1.0294792652130127, "learning_rate": 0.0002, "epoch": 4.471095152603231, "step": 62260}, {"loss": 0.6215, "grad_norm": 1.1044281721115112, "learning_rate": 0.0002, "epoch": 4.471813285457809, "step": 62270}, {"loss": 0.6147, "grad_norm": 1.1555784940719604, "learning_rate": 0.0002, "epoch": 4.472531418312387, "step": 62280}, {"loss": 0.627, "grad_norm": 0.9441297650337219, "learning_rate": 0.0002, "epoch": 4.473249551166965, "step": 62290}, {"loss": 0.6205, "grad_norm": 0.9164380431175232, "learning_rate": 0.0002, "epoch": 4.473967684021544, "step": 62300}, {"loss": 0.6413, "grad_norm": 1.1139159202575684, "learning_rate": 0.0002, "epoch": 4.474685816876122, "step": 62310}, {"loss": 0.6013, "grad_norm": 1.0201882123947144, "learning_rate": 0.0002, "epoch": 4.4754039497307, "step": 62320}, {"loss": 0.6127, "grad_norm": 1.1471681594848633, "learning_rate": 0.0002, "epoch": 4.476122082585278, "step": 62330}, {"loss": 0.6322, "grad_norm": 1.0333549976348877, "learning_rate": 0.0002, "epoch": 4.476840215439856, "step": 62340}, {"loss": 0.654, "grad_norm": 0.8929767608642578, "learning_rate": 0.0002, "epoch": 4.477558348294434, "step": 62350}, {"loss": 0.6325, "grad_norm": 0.9465752840042114, "learning_rate": 0.0002, "epoch": 4.478276481149012, "step": 62360}, {"loss": 0.619, "grad_norm": 1.2155033349990845, "learning_rate": 0.0002, "epoch": 4.47899461400359, "step": 62370}, {"loss": 0.5538, "grad_norm": 0.7181217074394226, "learning_rate": 0.0002, "epoch": 4.479712746858169, "step": 62380}, {"loss": 0.6236, "grad_norm": 1.0052744150161743, "learning_rate": 0.0002, "epoch": 4.480430879712747, "step": 62390}, {"loss": 0.6443, "grad_norm": 0.8522219061851501, "learning_rate": 0.0002, "epoch": 4.481149012567325, "step": 62400}, {"loss": 0.6073, "grad_norm": 0.8844723105430603, "learning_rate": 0.0002, "epoch": 4.481867145421903, "step": 62410}, {"loss": 0.6193, "grad_norm": 0.9542465209960938, "learning_rate": 0.0002, "epoch": 4.482585278276481, "step": 62420}, {"loss": 0.6099, "grad_norm": 0.8963674306869507, "learning_rate": 0.0002, "epoch": 4.483303411131059, "step": 62430}, {"loss": 0.5826, "grad_norm": 0.8105363845825195, "learning_rate": 0.0002, "epoch": 4.484021543985637, "step": 62440}, {"loss": 0.6688, "grad_norm": 0.9618421196937561, "learning_rate": 0.0002, "epoch": 4.484739676840215, "step": 62450}, {"loss": 0.6042, "grad_norm": 1.1931076049804688, "learning_rate": 0.0002, "epoch": 4.485457809694793, "step": 62460}, {"loss": 0.5869, "grad_norm": 0.7406999468803406, "learning_rate": 0.0002, "epoch": 4.486175942549371, "step": 62470}, {"loss": 0.604, "grad_norm": 0.7698216438293457, "learning_rate": 0.0002, "epoch": 4.48689407540395, "step": 62480}, {"loss": 0.6062, "grad_norm": 0.862271249294281, "learning_rate": 0.0002, "epoch": 4.487612208258528, "step": 62490}, {"loss": 0.645, "grad_norm": 1.0025171041488647, "learning_rate": 0.0002, "epoch": 4.488330341113106, "step": 62500}, {"loss": 0.5727, "grad_norm": 0.8474493622779846, "learning_rate": 0.0002, "epoch": 4.489048473967684, "step": 62510}, {"loss": 0.6907, "grad_norm": 0.8965697884559631, "learning_rate": 0.0002, "epoch": 4.489766606822262, "step": 62520}, {"loss": 0.5846, "grad_norm": 1.1276488304138184, "learning_rate": 0.0002, "epoch": 4.49048473967684, "step": 62530}, {"loss": 0.6018, "grad_norm": 1.0253537893295288, "learning_rate": 0.0002, "epoch": 4.491202872531418, "step": 62540}, {"loss": 0.5831, "grad_norm": 1.1750596761703491, "learning_rate": 0.0002, "epoch": 4.491921005385996, "step": 62550}, {"loss": 0.6272, "grad_norm": 0.9951794147491455, "learning_rate": 0.0002, "epoch": 4.492639138240574, "step": 62560}, {"loss": 0.5931, "grad_norm": 1.2510017156600952, "learning_rate": 0.0002, "epoch": 4.493357271095153, "step": 62570}, {"loss": 0.6268, "grad_norm": 1.4066375494003296, "learning_rate": 0.0002, "epoch": 4.494075403949731, "step": 62580}, {"loss": 0.6274, "grad_norm": 0.988175094127655, "learning_rate": 0.0002, "epoch": 4.494793536804309, "step": 62590}, {"loss": 0.607, "grad_norm": 1.2049115896224976, "learning_rate": 0.0002, "epoch": 4.495511669658887, "step": 62600}, {"loss": 0.6384, "grad_norm": 0.962464451789856, "learning_rate": 0.0002, "epoch": 4.496229802513465, "step": 62610}, {"loss": 0.6436, "grad_norm": 0.9324793815612793, "learning_rate": 0.0002, "epoch": 4.496947935368043, "step": 62620}, {"loss": 0.6568, "grad_norm": 0.9174214005470276, "learning_rate": 0.0002, "epoch": 4.497666068222621, "step": 62630}, {"loss": 0.6146, "grad_norm": 0.9729902148246765, "learning_rate": 0.0002, "epoch": 4.498384201077199, "step": 62640}, {"loss": 0.6564, "grad_norm": 1.0190484523773193, "learning_rate": 0.0002, "epoch": 4.499102333931777, "step": 62650}, {"loss": 0.6571, "grad_norm": 1.1473679542541504, "learning_rate": 0.0002, "epoch": 4.499820466786355, "step": 62660}, {"loss": 0.6115, "grad_norm": 1.0160558223724365, "learning_rate": 0.0002, "epoch": 4.500538599640934, "step": 62670}, {"loss": 0.6206, "grad_norm": 0.8083887100219727, "learning_rate": 0.0002, "epoch": 4.501256732495512, "step": 62680}, {"loss": 0.6107, "grad_norm": 0.941933274269104, "learning_rate": 0.0002, "epoch": 4.50197486535009, "step": 62690}, {"loss": 0.6181, "grad_norm": 0.9962822794914246, "learning_rate": 0.0002, "epoch": 4.502692998204668, "step": 62700}, {"loss": 0.6364, "grad_norm": 0.8993943333625793, "learning_rate": 0.0002, "epoch": 4.503411131059246, "step": 62710}, {"loss": 0.6141, "grad_norm": 0.9438319206237793, "learning_rate": 0.0002, "epoch": 4.504129263913824, "step": 62720}, {"loss": 0.6453, "grad_norm": 0.7951892018318176, "learning_rate": 0.0002, "epoch": 4.504847396768402, "step": 62730}, {"loss": 0.616, "grad_norm": 0.8875413537025452, "learning_rate": 0.0002, "epoch": 4.50556552962298, "step": 62740}, {"loss": 0.5702, "grad_norm": 0.993819534778595, "learning_rate": 0.0002, "epoch": 4.506283662477558, "step": 62750}, {"loss": 0.6427, "grad_norm": 0.9177559018135071, "learning_rate": 0.0002, "epoch": 4.507001795332137, "step": 62760}, {"loss": 0.6278, "grad_norm": 0.8632771968841553, "learning_rate": 0.0002, "epoch": 4.507719928186715, "step": 62770}, {"loss": 0.6665, "grad_norm": 0.943778395652771, "learning_rate": 0.0002, "epoch": 4.508438061041293, "step": 62780}, {"loss": 0.6068, "grad_norm": 0.8754997849464417, "learning_rate": 0.0002, "epoch": 4.509156193895871, "step": 62790}, {"loss": 0.6345, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 4.509874326750449, "step": 62800}, {"loss": 0.6057, "grad_norm": 1.1156457662582397, "learning_rate": 0.0002, "epoch": 4.510592459605027, "step": 62810}, {"loss": 0.5915, "grad_norm": 0.9178887009620667, "learning_rate": 0.0002, "epoch": 4.511310592459605, "step": 62820}, {"loss": 0.6081, "grad_norm": 0.9520689249038696, "learning_rate": 0.0002, "epoch": 4.512028725314183, "step": 62830}, {"loss": 0.6434, "grad_norm": 0.8880525231361389, "learning_rate": 0.0002, "epoch": 4.512746858168761, "step": 62840}, {"loss": 0.6895, "grad_norm": 0.9541497826576233, "learning_rate": 0.0002, "epoch": 4.513464991023339, "step": 62850}, {"loss": 0.6675, "grad_norm": 1.003766417503357, "learning_rate": 0.0002, "epoch": 4.514183123877918, "step": 62860}, {"loss": 0.6412, "grad_norm": 0.8844705820083618, "learning_rate": 0.0002, "epoch": 4.514901256732496, "step": 62870}, {"loss": 0.6289, "grad_norm": 1.1870828866958618, "learning_rate": 0.0002, "epoch": 4.515619389587074, "step": 62880}, {"loss": 0.6611, "grad_norm": 0.863487184047699, "learning_rate": 0.0002, "epoch": 4.516337522441652, "step": 62890}, {"loss": 0.59, "grad_norm": 0.997770369052887, "learning_rate": 0.0002, "epoch": 4.51705565529623, "step": 62900}, {"loss": 0.6476, "grad_norm": 0.9708612561225891, "learning_rate": 0.0002, "epoch": 4.517773788150808, "step": 62910}, {"loss": 0.6084, "grad_norm": 1.1381206512451172, "learning_rate": 0.0002, "epoch": 4.518491921005386, "step": 62920}, {"loss": 0.5739, "grad_norm": 1.0386693477630615, "learning_rate": 0.0002, "epoch": 4.519210053859964, "step": 62930}, {"loss": 0.6038, "grad_norm": 1.1711705923080444, "learning_rate": 0.0002, "epoch": 4.519928186714543, "step": 62940}, {"loss": 0.6276, "grad_norm": 0.8727447390556335, "learning_rate": 0.0002, "epoch": 4.520646319569121, "step": 62950}, {"loss": 0.6298, "grad_norm": 0.9215193390846252, "learning_rate": 0.0002, "epoch": 4.521364452423699, "step": 62960}, {"loss": 0.6199, "grad_norm": 1.005467176437378, "learning_rate": 0.0002, "epoch": 4.522082585278277, "step": 62970}, {"loss": 0.6324, "grad_norm": 0.8761187791824341, "learning_rate": 0.0002, "epoch": 4.522800718132855, "step": 62980}, {"loss": 0.6152, "grad_norm": 0.957848310470581, "learning_rate": 0.0002, "epoch": 4.523518850987433, "step": 62990}, {"loss": 0.5752, "grad_norm": 0.8634148836135864, "learning_rate": 0.0002, "epoch": 4.524236983842011, "step": 63000}, {"loss": 0.6127, "grad_norm": 0.9557477235794067, "learning_rate": 0.0002, "epoch": 4.524955116696589, "step": 63010}, {"loss": 0.5708, "grad_norm": 1.017720341682434, "learning_rate": 0.0002, "epoch": 4.525673249551167, "step": 63020}, {"loss": 0.6186, "grad_norm": 1.0281825065612793, "learning_rate": 0.0002, "epoch": 4.526391382405745, "step": 63030}, {"loss": 0.6221, "grad_norm": 1.253974437713623, "learning_rate": 0.0002, "epoch": 4.527109515260323, "step": 63040}, {"loss": 0.6381, "grad_norm": 0.8489068150520325, "learning_rate": 0.0002, "epoch": 4.527827648114902, "step": 63050}, {"loss": 0.6022, "grad_norm": 0.9681686162948608, "learning_rate": 0.0002, "epoch": 4.52854578096948, "step": 63060}, {"loss": 0.6166, "grad_norm": 1.10277259349823, "learning_rate": 0.0002, "epoch": 4.529263913824058, "step": 63070}, {"loss": 0.5838, "grad_norm": 0.9469163417816162, "learning_rate": 0.0002, "epoch": 4.529982046678636, "step": 63080}, {"loss": 0.6323, "grad_norm": 1.1228134632110596, "learning_rate": 0.0002, "epoch": 4.530700179533214, "step": 63090}, {"loss": 0.6143, "grad_norm": 0.9673212170600891, "learning_rate": 0.0002, "epoch": 4.531418312387792, "step": 63100}, {"loss": 0.713, "grad_norm": 1.0221107006072998, "learning_rate": 0.0002, "epoch": 4.53213644524237, "step": 63110}, {"loss": 0.6099, "grad_norm": 0.826372504234314, "learning_rate": 0.0002, "epoch": 4.532854578096948, "step": 63120}, {"loss": 0.6487, "grad_norm": 1.1805331707000732, "learning_rate": 0.0002, "epoch": 4.5335727109515265, "step": 63130}, {"loss": 0.6088, "grad_norm": 0.9645666480064392, "learning_rate": 0.0002, "epoch": 4.5342908438061045, "step": 63140}, {"loss": 0.6049, "grad_norm": 1.0838309526443481, "learning_rate": 0.0002, "epoch": 4.5350089766606825, "step": 63150}, {"loss": 0.5972, "grad_norm": 1.061414361000061, "learning_rate": 0.0002, "epoch": 4.5357271095152605, "step": 63160}, {"loss": 0.5706, "grad_norm": 0.841961145401001, "learning_rate": 0.0002, "epoch": 4.5364452423698385, "step": 63170}, {"loss": 0.6168, "grad_norm": 1.1220186948776245, "learning_rate": 0.0002, "epoch": 4.5371633752244165, "step": 63180}, {"loss": 0.6055, "grad_norm": 1.036441445350647, "learning_rate": 0.0002, "epoch": 4.5378815080789945, "step": 63190}, {"loss": 0.619, "grad_norm": 0.9089716076850891, "learning_rate": 0.0002, "epoch": 4.5385996409335725, "step": 63200}, {"loss": 0.6373, "grad_norm": 0.8699982762336731, "learning_rate": 0.0002, "epoch": 4.5393177737881505, "step": 63210}, {"loss": 0.6082, "grad_norm": 0.8489565253257751, "learning_rate": 0.0002, "epoch": 4.5400359066427285, "step": 63220}, {"loss": 0.5957, "grad_norm": 0.7778416275978088, "learning_rate": 0.0002, "epoch": 4.540754039497307, "step": 63230}, {"loss": 0.6109, "grad_norm": 1.0625852346420288, "learning_rate": 0.0002, "epoch": 4.541472172351885, "step": 63240}, {"loss": 0.6039, "grad_norm": 0.8515732884407043, "learning_rate": 0.0002, "epoch": 4.542190305206463, "step": 63250}, {"loss": 0.5827, "grad_norm": 0.7679561376571655, "learning_rate": 0.0002, "epoch": 4.542908438061041, "step": 63260}, {"loss": 0.5948, "grad_norm": 0.7358446717262268, "learning_rate": 0.0002, "epoch": 4.543626570915619, "step": 63270}, {"loss": 0.6265, "grad_norm": 1.0866128206253052, "learning_rate": 0.0002, "epoch": 4.544344703770197, "step": 63280}, {"loss": 0.6622, "grad_norm": 1.0870225429534912, "learning_rate": 0.0002, "epoch": 4.545062836624775, "step": 63290}, {"loss": 0.5859, "grad_norm": 0.951095461845398, "learning_rate": 0.0002, "epoch": 4.545780969479353, "step": 63300}, {"loss": 0.6252, "grad_norm": 1.0914306640625, "learning_rate": 0.0002, "epoch": 4.546499102333931, "step": 63310}, {"loss": 0.6504, "grad_norm": 0.8676106333732605, "learning_rate": 0.0002, "epoch": 4.54721723518851, "step": 63320}, {"loss": 0.6088, "grad_norm": 1.0129096508026123, "learning_rate": 0.0002, "epoch": 4.547935368043088, "step": 63330}, {"loss": 0.617, "grad_norm": 0.8710526823997498, "learning_rate": 0.0002, "epoch": 4.548653500897666, "step": 63340}, {"loss": 0.6336, "grad_norm": 0.7014815807342529, "learning_rate": 0.0002, "epoch": 4.549371633752244, "step": 63350}, {"loss": 0.5758, "grad_norm": 1.1546777486801147, "learning_rate": 0.0002, "epoch": 4.550089766606822, "step": 63360}, {"loss": 0.5976, "grad_norm": 0.7464957237243652, "learning_rate": 0.0002, "epoch": 4.5508078994614, "step": 63370}, {"loss": 0.6016, "grad_norm": 0.9976209998130798, "learning_rate": 0.0002, "epoch": 4.551526032315978, "step": 63380}, {"loss": 0.5784, "grad_norm": 0.9543681740760803, "learning_rate": 0.0002, "epoch": 4.552244165170556, "step": 63390}, {"loss": 0.5873, "grad_norm": 1.1498578786849976, "learning_rate": 0.0002, "epoch": 4.552962298025134, "step": 63400}, {"loss": 0.6445, "grad_norm": 1.0162293910980225, "learning_rate": 0.0002, "epoch": 4.553680430879712, "step": 63410}, {"loss": 0.5677, "grad_norm": 0.9015304446220398, "learning_rate": 0.0002, "epoch": 4.554398563734291, "step": 63420}, {"loss": 0.6257, "grad_norm": 1.1639831066131592, "learning_rate": 0.0002, "epoch": 4.555116696588869, "step": 63430}, {"loss": 0.6763, "grad_norm": 0.9494703412055969, "learning_rate": 0.0002, "epoch": 4.555834829443447, "step": 63440}, {"loss": 0.5955, "grad_norm": 1.0555956363677979, "learning_rate": 0.0002, "epoch": 4.556552962298025, "step": 63450}, {"loss": 0.6634, "grad_norm": 0.8513827919960022, "learning_rate": 0.0002, "epoch": 4.557271095152603, "step": 63460}, {"loss": 0.6507, "grad_norm": 1.0614275932312012, "learning_rate": 0.0002, "epoch": 4.557989228007181, "step": 63470}, {"loss": 0.5619, "grad_norm": 0.8341137766838074, "learning_rate": 0.0002, "epoch": 4.558707360861759, "step": 63480}, {"loss": 0.6147, "grad_norm": 1.2136222124099731, "learning_rate": 0.0002, "epoch": 4.559425493716337, "step": 63490}, {"loss": 0.6313, "grad_norm": 0.8806019425392151, "learning_rate": 0.0002, "epoch": 4.560143626570916, "step": 63500}, {"loss": 0.6012, "grad_norm": 1.2548854351043701, "learning_rate": 0.0002, "epoch": 4.560861759425494, "step": 63510}, {"loss": 0.5995, "grad_norm": 1.0162668228149414, "learning_rate": 0.0002, "epoch": 4.561579892280072, "step": 63520}, {"loss": 0.5895, "grad_norm": 1.0487624406814575, "learning_rate": 0.0002, "epoch": 4.56229802513465, "step": 63530}, {"loss": 0.5997, "grad_norm": 1.2505502700805664, "learning_rate": 0.0002, "epoch": 4.563016157989228, "step": 63540}, {"loss": 0.618, "grad_norm": 0.9930511713027954, "learning_rate": 0.0002, "epoch": 4.563734290843806, "step": 63550}, {"loss": 0.6695, "grad_norm": 0.8132568001747131, "learning_rate": 0.0002, "epoch": 4.564452423698384, "step": 63560}, {"loss": 0.6221, "grad_norm": 1.0129177570343018, "learning_rate": 0.0002, "epoch": 4.565170556552962, "step": 63570}, {"loss": 0.6463, "grad_norm": 0.9011693596839905, "learning_rate": 0.0002, "epoch": 4.56588868940754, "step": 63580}, {"loss": 0.6046, "grad_norm": 0.9161545634269714, "learning_rate": 0.0002, "epoch": 4.566606822262118, "step": 63590}, {"loss": 0.6413, "grad_norm": 0.8852348327636719, "learning_rate": 0.0002, "epoch": 4.567324955116696, "step": 63600}, {"loss": 0.6282, "grad_norm": 0.8579391837120056, "learning_rate": 0.0002, "epoch": 4.568043087971275, "step": 63610}, {"loss": 0.6041, "grad_norm": 0.9271050095558167, "learning_rate": 0.0002, "epoch": 4.568761220825853, "step": 63620}, {"loss": 0.6156, "grad_norm": 0.9881834983825684, "learning_rate": 0.0002, "epoch": 4.569479353680431, "step": 63630}, {"loss": 0.6164, "grad_norm": 1.0255686044692993, "learning_rate": 0.0002, "epoch": 4.570197486535009, "step": 63640}, {"loss": 0.6416, "grad_norm": 0.8758876919746399, "learning_rate": 0.0002, "epoch": 4.570915619389587, "step": 63650}, {"loss": 0.6787, "grad_norm": 1.0134185552597046, "learning_rate": 0.0002, "epoch": 4.571633752244165, "step": 63660}, {"loss": 0.6245, "grad_norm": 0.8535705208778381, "learning_rate": 0.0002, "epoch": 4.572351885098743, "step": 63670}, {"loss": 0.6282, "grad_norm": 0.9614834785461426, "learning_rate": 0.0002, "epoch": 4.573070017953321, "step": 63680}, {"loss": 0.6461, "grad_norm": 0.9004243612289429, "learning_rate": 0.0002, "epoch": 4.5737881508079, "step": 63690}, {"loss": 0.6172, "grad_norm": 0.9563080072402954, "learning_rate": 0.0002, "epoch": 4.574506283662478, "step": 63700}, {"loss": 0.6059, "grad_norm": 1.024857521057129, "learning_rate": 0.0002, "epoch": 4.575224416517056, "step": 63710}, {"loss": 0.6188, "grad_norm": 0.9345638155937195, "learning_rate": 0.0002, "epoch": 4.575942549371634, "step": 63720}, {"loss": 0.6814, "grad_norm": 1.27083158493042, "learning_rate": 0.0002, "epoch": 4.576660682226212, "step": 63730}, {"loss": 0.5987, "grad_norm": 1.0866559743881226, "learning_rate": 0.0002, "epoch": 4.57737881508079, "step": 63740}, {"loss": 0.5738, "grad_norm": 0.9253925681114197, "learning_rate": 0.0002, "epoch": 4.578096947935368, "step": 63750}, {"loss": 0.5981, "grad_norm": 0.8127399682998657, "learning_rate": 0.0002, "epoch": 4.578815080789946, "step": 63760}, {"loss": 0.6321, "grad_norm": 1.0453993082046509, "learning_rate": 0.0002, "epoch": 4.579533213644524, "step": 63770}, {"loss": 0.6423, "grad_norm": 1.2227544784545898, "learning_rate": 0.0002, "epoch": 4.580251346499102, "step": 63780}, {"loss": 0.6405, "grad_norm": 1.0207865238189697, "learning_rate": 0.0002, "epoch": 4.580969479353681, "step": 63790}, {"loss": 0.6268, "grad_norm": 1.030447244644165, "learning_rate": 0.0002, "epoch": 4.581687612208259, "step": 63800}, {"loss": 0.6014, "grad_norm": 1.0855677127838135, "learning_rate": 0.0002, "epoch": 4.582405745062837, "step": 63810}, {"loss": 0.6204, "grad_norm": 0.9572556018829346, "learning_rate": 0.0002, "epoch": 4.583123877917415, "step": 63820}, {"loss": 0.6094, "grad_norm": 0.9061040282249451, "learning_rate": 0.0002, "epoch": 4.583842010771993, "step": 63830}, {"loss": 0.6074, "grad_norm": 0.9267677068710327, "learning_rate": 0.0002, "epoch": 4.584560143626571, "step": 63840}, {"loss": 0.6525, "grad_norm": 1.070076823234558, "learning_rate": 0.0002, "epoch": 4.585278276481149, "step": 63850}, {"loss": 0.6074, "grad_norm": 1.045881748199463, "learning_rate": 0.0002, "epoch": 4.585996409335727, "step": 63860}, {"loss": 0.6106, "grad_norm": 0.9190576672554016, "learning_rate": 0.0002, "epoch": 4.586714542190305, "step": 63870}, {"loss": 0.6213, "grad_norm": 0.9263932704925537, "learning_rate": 0.0002, "epoch": 4.587432675044884, "step": 63880}, {"loss": 0.6077, "grad_norm": 1.0217589139938354, "learning_rate": 0.0002, "epoch": 4.588150807899462, "step": 63890}, {"loss": 0.5798, "grad_norm": 0.9200088381767273, "learning_rate": 0.0002, "epoch": 4.58886894075404, "step": 63900}, {"loss": 0.6311, "grad_norm": 0.9877251386642456, "learning_rate": 0.0002, "epoch": 4.589587073608618, "step": 63910}, {"loss": 0.5981, "grad_norm": 1.0059093236923218, "learning_rate": 0.0002, "epoch": 4.590305206463196, "step": 63920}, {"loss": 0.6265, "grad_norm": 1.2618095874786377, "learning_rate": 0.0002, "epoch": 4.591023339317774, "step": 63930}, {"loss": 0.583, "grad_norm": 1.1779268980026245, "learning_rate": 0.0002, "epoch": 4.591741472172352, "step": 63940}, {"loss": 0.6232, "grad_norm": 1.2339502573013306, "learning_rate": 0.0002, "epoch": 4.59245960502693, "step": 63950}, {"loss": 0.5985, "grad_norm": 0.7488788366317749, "learning_rate": 0.0002, "epoch": 4.593177737881508, "step": 63960}, {"loss": 0.5991, "grad_norm": 0.8366380929946899, "learning_rate": 0.0002, "epoch": 4.593895870736086, "step": 63970}, {"loss": 0.5864, "grad_norm": 1.0292677879333496, "learning_rate": 0.0002, "epoch": 4.594614003590665, "step": 63980}, {"loss": 0.666, "grad_norm": 0.7938551306724548, "learning_rate": 0.0002, "epoch": 4.595332136445243, "step": 63990}, {"loss": 0.6202, "grad_norm": 0.7958516478538513, "learning_rate": 0.0002, "epoch": 4.596050269299821, "step": 64000}, {"loss": 0.5868, "grad_norm": 0.9613908529281616, "learning_rate": 0.0002, "epoch": 4.596768402154399, "step": 64010}, {"loss": 0.6299, "grad_norm": 1.0253773927688599, "learning_rate": 0.0002, "epoch": 4.597486535008977, "step": 64020}, {"loss": 0.5964, "grad_norm": 1.0560888051986694, "learning_rate": 0.0002, "epoch": 4.598204667863555, "step": 64030}, {"loss": 0.6681, "grad_norm": 1.1093556880950928, "learning_rate": 0.0002, "epoch": 4.598922800718133, "step": 64040}, {"loss": 0.6097, "grad_norm": 0.8492098450660706, "learning_rate": 0.0002, "epoch": 4.599640933572711, "step": 64050}, {"loss": 0.6029, "grad_norm": 1.0070436000823975, "learning_rate": 0.0002, "epoch": 4.6003590664272895, "step": 64060}, {"loss": 0.6392, "grad_norm": 0.9774282574653625, "learning_rate": 0.0002, "epoch": 4.6010771992818675, "step": 64070}, {"loss": 0.6397, "grad_norm": 1.0744960308074951, "learning_rate": 0.0002, "epoch": 4.6017953321364455, "step": 64080}, {"loss": 0.6491, "grad_norm": 1.0101491212844849, "learning_rate": 0.0002, "epoch": 4.6025134649910235, "step": 64090}, {"loss": 0.594, "grad_norm": 1.2306591272354126, "learning_rate": 0.0002, "epoch": 4.6032315978456015, "step": 64100}, {"loss": 0.5783, "grad_norm": 0.9187033176422119, "learning_rate": 0.0002, "epoch": 4.6039497307001795, "step": 64110}, {"loss": 0.5982, "grad_norm": 0.9178676605224609, "learning_rate": 0.0002, "epoch": 4.6046678635547575, "step": 64120}, {"loss": 0.6074, "grad_norm": 1.006374716758728, "learning_rate": 0.0002, "epoch": 4.6053859964093355, "step": 64130}, {"loss": 0.6402, "grad_norm": 1.0774449110031128, "learning_rate": 0.0002, "epoch": 4.6061041292639135, "step": 64140}, {"loss": 0.6076, "grad_norm": 1.0360658168792725, "learning_rate": 0.0002, "epoch": 4.6068222621184916, "step": 64150}, {"loss": 0.6259, "grad_norm": 1.1061090230941772, "learning_rate": 0.0002, "epoch": 4.6075403949730696, "step": 64160}, {"loss": 0.6304, "grad_norm": 1.0320971012115479, "learning_rate": 0.0002, "epoch": 4.608258527827648, "step": 64170}, {"loss": 0.6182, "grad_norm": 0.8596988916397095, "learning_rate": 0.0002, "epoch": 4.6089766606822264, "step": 64180}, {"loss": 0.5646, "grad_norm": 1.1665741205215454, "learning_rate": 0.0002, "epoch": 4.6096947935368044, "step": 64190}, {"loss": 0.6219, "grad_norm": 0.857207715511322, "learning_rate": 0.0002, "epoch": 4.6104129263913824, "step": 64200}, {"loss": 0.6271, "grad_norm": 1.0088987350463867, "learning_rate": 0.0002, "epoch": 4.6111310592459605, "step": 64210}, {"loss": 0.6209, "grad_norm": 1.0985605716705322, "learning_rate": 0.0002, "epoch": 4.6118491921005385, "step": 64220}, {"loss": 0.6455, "grad_norm": 0.9504913687705994, "learning_rate": 0.0002, "epoch": 4.6125673249551165, "step": 64230}, {"loss": 0.6054, "grad_norm": 0.8415018916130066, "learning_rate": 0.0002, "epoch": 4.6132854578096945, "step": 64240}, {"loss": 0.5975, "grad_norm": 0.9857034087181091, "learning_rate": 0.0002, "epoch": 4.614003590664273, "step": 64250}, {"loss": 0.6347, "grad_norm": 1.0164235830307007, "learning_rate": 0.0002, "epoch": 4.614721723518851, "step": 64260}, {"loss": 0.5877, "grad_norm": 0.949481725692749, "learning_rate": 0.0002, "epoch": 4.615439856373429, "step": 64270}, {"loss": 0.5737, "grad_norm": 0.9526455998420715, "learning_rate": 0.0002, "epoch": 4.616157989228007, "step": 64280}, {"loss": 0.6134, "grad_norm": 1.1121242046356201, "learning_rate": 0.0002, "epoch": 4.616876122082585, "step": 64290}, {"loss": 0.6152, "grad_norm": 0.9598871469497681, "learning_rate": 0.0002, "epoch": 4.617594254937163, "step": 64300}, {"loss": 0.6405, "grad_norm": 1.0406304597854614, "learning_rate": 0.0002, "epoch": 4.618312387791741, "step": 64310}, {"loss": 0.5971, "grad_norm": 1.1816964149475098, "learning_rate": 0.0002, "epoch": 4.619030520646319, "step": 64320}, {"loss": 0.6483, "grad_norm": 0.9818326830863953, "learning_rate": 0.0002, "epoch": 4.619748653500897, "step": 64330}, {"loss": 0.6141, "grad_norm": 0.952017605304718, "learning_rate": 0.0002, "epoch": 4.620466786355475, "step": 64340}, {"loss": 0.6146, "grad_norm": 1.1263453960418701, "learning_rate": 0.0002, "epoch": 4.621184919210053, "step": 64350}, {"loss": 0.5973, "grad_norm": 1.1158473491668701, "learning_rate": 0.0002, "epoch": 4.621903052064632, "step": 64360}, {"loss": 0.6029, "grad_norm": 0.9056766033172607, "learning_rate": 0.0002, "epoch": 4.62262118491921, "step": 64370}, {"loss": 0.6488, "grad_norm": 0.8113203048706055, "learning_rate": 0.0002, "epoch": 4.623339317773788, "step": 64380}, {"loss": 0.6391, "grad_norm": 0.8646712899208069, "learning_rate": 0.0002, "epoch": 4.624057450628366, "step": 64390}, {"loss": 0.6191, "grad_norm": 1.0064425468444824, "learning_rate": 0.0002, "epoch": 4.624775583482944, "step": 64400}, {"loss": 0.5826, "grad_norm": 0.9867565631866455, "learning_rate": 0.0002, "epoch": 4.625493716337522, "step": 64410}, {"loss": 0.6409, "grad_norm": 1.018764615058899, "learning_rate": 0.0002, "epoch": 4.6262118491921, "step": 64420}, {"loss": 0.5992, "grad_norm": 1.0607863664627075, "learning_rate": 0.0002, "epoch": 4.626929982046678, "step": 64430}, {"loss": 0.6502, "grad_norm": 1.012825846672058, "learning_rate": 0.0002, "epoch": 4.627648114901257, "step": 64440}, {"loss": 0.6074, "grad_norm": 0.8441653847694397, "learning_rate": 0.0002, "epoch": 4.628366247755835, "step": 64450}, {"loss": 0.6462, "grad_norm": 0.9819194674491882, "learning_rate": 0.0002, "epoch": 4.629084380610413, "step": 64460}, {"loss": 0.5983, "grad_norm": 0.925519585609436, "learning_rate": 0.0002, "epoch": 4.629802513464991, "step": 64470}, {"loss": 0.5959, "grad_norm": 0.9409030079841614, "learning_rate": 0.0002, "epoch": 4.630520646319569, "step": 64480}, {"loss": 0.6265, "grad_norm": 1.148024559020996, "learning_rate": 0.0002, "epoch": 4.631238779174147, "step": 64490}, {"loss": 0.6556, "grad_norm": 0.8225533962249756, "learning_rate": 0.0002, "epoch": 4.631956912028725, "step": 64500}, {"loss": 0.5922, "grad_norm": 0.8806734681129456, "learning_rate": 0.0002, "epoch": 4.632675044883303, "step": 64510}, {"loss": 0.6202, "grad_norm": 0.9656694531440735, "learning_rate": 0.0002, "epoch": 4.633393177737881, "step": 64520}, {"loss": 0.6044, "grad_norm": 0.9977783560752869, "learning_rate": 0.0002, "epoch": 4.634111310592459, "step": 64530}, {"loss": 0.5741, "grad_norm": 0.9259420037269592, "learning_rate": 0.0002, "epoch": 4.634829443447038, "step": 64540}, {"loss": 0.5801, "grad_norm": 1.0215885639190674, "learning_rate": 0.0002, "epoch": 4.635547576301616, "step": 64550}, {"loss": 0.6492, "grad_norm": 1.1082557439804077, "learning_rate": 0.0002, "epoch": 4.636265709156194, "step": 64560}, {"loss": 0.6285, "grad_norm": 1.1183207035064697, "learning_rate": 0.0002, "epoch": 4.636983842010772, "step": 64570}, {"loss": 0.6216, "grad_norm": 0.9914339184761047, "learning_rate": 0.0002, "epoch": 4.63770197486535, "step": 64580}, {"loss": 0.6416, "grad_norm": 0.8065831661224365, "learning_rate": 0.0002, "epoch": 4.638420107719928, "step": 64590}, {"loss": 0.6078, "grad_norm": 1.1546721458435059, "learning_rate": 0.0002, "epoch": 4.639138240574506, "step": 64600}, {"loss": 0.6219, "grad_norm": 1.0395900011062622, "learning_rate": 0.0002, "epoch": 4.639856373429084, "step": 64610}, {"loss": 0.5939, "grad_norm": 0.9957455992698669, "learning_rate": 0.0002, "epoch": 4.640574506283663, "step": 64620}, {"loss": 0.6653, "grad_norm": 1.069557785987854, "learning_rate": 0.0002, "epoch": 4.641292639138241, "step": 64630}, {"loss": 0.6546, "grad_norm": 1.005236268043518, "learning_rate": 0.0002, "epoch": 4.642010771992819, "step": 64640}, {"loss": 0.6262, "grad_norm": 1.0216304063796997, "learning_rate": 0.0002, "epoch": 4.642728904847397, "step": 64650}, {"loss": 0.6756, "grad_norm": 0.8567317128181458, "learning_rate": 0.0002, "epoch": 4.643447037701975, "step": 64660}, {"loss": 0.5997, "grad_norm": 1.0386067628860474, "learning_rate": 0.0002, "epoch": 4.644165170556553, "step": 64670}, {"loss": 0.6471, "grad_norm": 0.9566055536270142, "learning_rate": 0.0002, "epoch": 4.644883303411131, "step": 64680}, {"loss": 0.6601, "grad_norm": 1.0990564823150635, "learning_rate": 0.0002, "epoch": 4.645601436265709, "step": 64690}, {"loss": 0.6418, "grad_norm": 0.9962695240974426, "learning_rate": 0.0002, "epoch": 4.646319569120287, "step": 64700}, {"loss": 0.6442, "grad_norm": 0.9041377305984497, "learning_rate": 0.0002, "epoch": 4.647037701974865, "step": 64710}, {"loss": 0.6276, "grad_norm": 0.8611233234405518, "learning_rate": 0.0002, "epoch": 4.647755834829443, "step": 64720}, {"loss": 0.6015, "grad_norm": 1.1569812297821045, "learning_rate": 0.0002, "epoch": 4.648473967684022, "step": 64730}, {"loss": 0.6169, "grad_norm": 0.7946197390556335, "learning_rate": 0.0002, "epoch": 4.6491921005386, "step": 64740}, {"loss": 0.668, "grad_norm": 0.9612061381340027, "learning_rate": 0.0002, "epoch": 4.649910233393178, "step": 64750}, {"loss": 0.6741, "grad_norm": 0.9669303297996521, "learning_rate": 0.0002, "epoch": 4.650628366247756, "step": 64760}, {"loss": 0.593, "grad_norm": 0.8117775321006775, "learning_rate": 0.0002, "epoch": 4.651346499102334, "step": 64770}, {"loss": 0.6915, "grad_norm": 1.2326241731643677, "learning_rate": 0.0002, "epoch": 4.652064631956912, "step": 64780}, {"loss": 0.6076, "grad_norm": 0.7494568228721619, "learning_rate": 0.0002, "epoch": 4.65278276481149, "step": 64790}, {"loss": 0.58, "grad_norm": 0.8145379424095154, "learning_rate": 0.0002, "epoch": 4.653500897666068, "step": 64800}, {"loss": 0.6351, "grad_norm": 1.0139610767364502, "learning_rate": 0.0002, "epoch": 4.654219030520647, "step": 64810}, {"loss": 0.6575, "grad_norm": 0.9887115359306335, "learning_rate": 0.0002, "epoch": 4.654937163375225, "step": 64820}, {"loss": 0.6338, "grad_norm": 0.9565147161483765, "learning_rate": 0.0002, "epoch": 4.655655296229803, "step": 64830}, {"loss": 0.6212, "grad_norm": 0.9022467136383057, "learning_rate": 0.0002, "epoch": 4.656373429084381, "step": 64840}, {"loss": 0.6395, "grad_norm": 1.075003981590271, "learning_rate": 0.0002, "epoch": 4.657091561938959, "step": 64850}, {"loss": 0.6191, "grad_norm": 0.8705733418464661, "learning_rate": 0.0002, "epoch": 4.657809694793537, "step": 64860}, {"loss": 0.5543, "grad_norm": 1.0826832056045532, "learning_rate": 0.0002, "epoch": 4.658527827648115, "step": 64870}, {"loss": 0.6363, "grad_norm": 1.1056268215179443, "learning_rate": 0.0002, "epoch": 4.659245960502693, "step": 64880}, {"loss": 0.6252, "grad_norm": 0.8664149641990662, "learning_rate": 0.0002, "epoch": 4.659964093357271, "step": 64890}, {"loss": 0.6126, "grad_norm": 0.9487230181694031, "learning_rate": 0.0002, "epoch": 4.660682226211849, "step": 64900}, {"loss": 0.5968, "grad_norm": 1.0357837677001953, "learning_rate": 0.0002, "epoch": 4.661400359066427, "step": 64910}, {"loss": 0.603, "grad_norm": 0.8620632290840149, "learning_rate": 0.0002, "epoch": 4.662118491921006, "step": 64920}, {"loss": 0.6113, "grad_norm": 1.108986735343933, "learning_rate": 0.0002, "epoch": 4.662836624775584, "step": 64930}, {"loss": 0.6115, "grad_norm": 0.8017674684524536, "learning_rate": 0.0002, "epoch": 4.663554757630162, "step": 64940}, {"loss": 0.6268, "grad_norm": 0.882347583770752, "learning_rate": 0.0002, "epoch": 4.66427289048474, "step": 64950}, {"loss": 0.657, "grad_norm": 0.9466867446899414, "learning_rate": 0.0002, "epoch": 4.664991023339318, "step": 64960}, {"loss": 0.645, "grad_norm": 1.1823636293411255, "learning_rate": 0.0002, "epoch": 4.665709156193896, "step": 64970}, {"loss": 0.5889, "grad_norm": 0.9535016417503357, "learning_rate": 0.0002, "epoch": 4.666427289048474, "step": 64980}, {"loss": 0.5986, "grad_norm": 0.9456726312637329, "learning_rate": 0.0002, "epoch": 4.667145421903052, "step": 64990}, {"loss": 0.6334, "grad_norm": 0.7761920690536499, "learning_rate": 0.0002, "epoch": 4.667863554757631, "step": 65000}, {"loss": 0.6645, "grad_norm": 1.060357689857483, "learning_rate": 0.0002, "epoch": 4.668581687612209, "step": 65010}, {"loss": 0.6369, "grad_norm": 0.9083862900733948, "learning_rate": 0.0002, "epoch": 4.669299820466787, "step": 65020}, {"loss": 0.5839, "grad_norm": 0.8745762705802917, "learning_rate": 0.0002, "epoch": 4.670017953321365, "step": 65030}, {"loss": 0.6517, "grad_norm": 0.8715422749519348, "learning_rate": 0.0002, "epoch": 4.670736086175943, "step": 65040}, {"loss": 0.6061, "grad_norm": 0.9407707452774048, "learning_rate": 0.0002, "epoch": 4.671454219030521, "step": 65050}, {"loss": 0.5928, "grad_norm": 0.8998945355415344, "learning_rate": 0.0002, "epoch": 4.672172351885099, "step": 65060}, {"loss": 0.6107, "grad_norm": 0.9147891998291016, "learning_rate": 0.0002, "epoch": 4.672890484739677, "step": 65070}, {"loss": 0.6215, "grad_norm": 1.116614580154419, "learning_rate": 0.0002, "epoch": 4.673608617594255, "step": 65080}, {"loss": 0.641, "grad_norm": 1.0764213800430298, "learning_rate": 0.0002, "epoch": 4.674326750448833, "step": 65090}, {"loss": 0.6353, "grad_norm": 0.9115945100784302, "learning_rate": 0.0002, "epoch": 4.6750448833034115, "step": 65100}, {"loss": 0.6506, "grad_norm": 1.001251459121704, "learning_rate": 0.0002, "epoch": 4.6757630161579895, "step": 65110}, {"loss": 0.6414, "grad_norm": 1.0330020189285278, "learning_rate": 0.0002, "epoch": 4.6764811490125675, "step": 65120}, {"loss": 0.6421, "grad_norm": 0.9083197116851807, "learning_rate": 0.0002, "epoch": 4.6771992818671455, "step": 65130}, {"loss": 0.5905, "grad_norm": 0.9298770427703857, "learning_rate": 0.0002, "epoch": 4.6779174147217235, "step": 65140}, {"loss": 0.633, "grad_norm": 1.0009549856185913, "learning_rate": 0.0002, "epoch": 4.6786355475763015, "step": 65150}, {"loss": 0.661, "grad_norm": 0.951389729976654, "learning_rate": 0.0002, "epoch": 4.6793536804308795, "step": 65160}, {"loss": 0.6282, "grad_norm": 1.151870608329773, "learning_rate": 0.0002, "epoch": 4.6800718132854575, "step": 65170}, {"loss": 0.5944, "grad_norm": 1.0074727535247803, "learning_rate": 0.0002, "epoch": 4.680789946140036, "step": 65180}, {"loss": 0.6539, "grad_norm": 1.0490152835845947, "learning_rate": 0.0002, "epoch": 4.681508078994614, "step": 65190}, {"loss": 0.6604, "grad_norm": 0.8967363834381104, "learning_rate": 0.0002, "epoch": 4.682226211849192, "step": 65200}, {"loss": 0.6582, "grad_norm": 1.2314889430999756, "learning_rate": 0.0002, "epoch": 4.68294434470377, "step": 65210}, {"loss": 0.6104, "grad_norm": 0.7764074802398682, "learning_rate": 0.0002, "epoch": 4.683662477558348, "step": 65220}, {"loss": 0.6401, "grad_norm": 1.0587822198867798, "learning_rate": 0.0002, "epoch": 4.684380610412926, "step": 65230}, {"loss": 0.556, "grad_norm": 0.916114091873169, "learning_rate": 0.0002, "epoch": 4.685098743267504, "step": 65240}, {"loss": 0.5912, "grad_norm": 0.9117472767829895, "learning_rate": 0.0002, "epoch": 4.685816876122082, "step": 65250}, {"loss": 0.6127, "grad_norm": 0.8369293212890625, "learning_rate": 0.0002, "epoch": 4.68653500897666, "step": 65260}, {"loss": 0.5715, "grad_norm": 0.9700121879577637, "learning_rate": 0.0002, "epoch": 4.687253141831238, "step": 65270}, {"loss": 0.6364, "grad_norm": 1.0008411407470703, "learning_rate": 0.0002, "epoch": 4.687971274685816, "step": 65280}, {"loss": 0.5816, "grad_norm": 0.9339549541473389, "learning_rate": 0.0002, "epoch": 4.688689407540395, "step": 65290}, {"loss": 0.6382, "grad_norm": 0.956701934337616, "learning_rate": 0.0002, "epoch": 4.689407540394973, "step": 65300}, {"loss": 0.6368, "grad_norm": 1.2042720317840576, "learning_rate": 0.0002, "epoch": 4.690125673249551, "step": 65310}, {"loss": 0.6138, "grad_norm": 0.8679144382476807, "learning_rate": 0.0002, "epoch": 4.690843806104129, "step": 65320}, {"loss": 0.6619, "grad_norm": 1.2320687770843506, "learning_rate": 0.0002, "epoch": 4.691561938958707, "step": 65330}, {"loss": 0.6212, "grad_norm": 0.8397238850593567, "learning_rate": 0.0002, "epoch": 4.692280071813285, "step": 65340}, {"loss": 0.578, "grad_norm": 0.7850362658500671, "learning_rate": 0.0002, "epoch": 4.692998204667863, "step": 65350}, {"loss": 0.632, "grad_norm": 0.9281290173530579, "learning_rate": 0.0002, "epoch": 4.693716337522441, "step": 65360}, {"loss": 0.6492, "grad_norm": 1.1506335735321045, "learning_rate": 0.0002, "epoch": 4.69443447037702, "step": 65370}, {"loss": 0.6503, "grad_norm": 1.0910584926605225, "learning_rate": 0.0002, "epoch": 4.695152603231598, "step": 65380}, {"loss": 0.66, "grad_norm": 0.8937386274337769, "learning_rate": 0.0002, "epoch": 4.695870736086176, "step": 65390}, {"loss": 0.6425, "grad_norm": 1.0163888931274414, "learning_rate": 0.0002, "epoch": 4.696588868940754, "step": 65400}, {"loss": 0.647, "grad_norm": 1.0290007591247559, "learning_rate": 0.0002, "epoch": 4.697307001795332, "step": 65410}, {"loss": 0.614, "grad_norm": 0.9046576023101807, "learning_rate": 0.0002, "epoch": 4.69802513464991, "step": 65420}, {"loss": 0.5844, "grad_norm": 1.0030237436294556, "learning_rate": 0.0002, "epoch": 4.698743267504488, "step": 65430}, {"loss": 0.6273, "grad_norm": 0.8196740746498108, "learning_rate": 0.0002, "epoch": 4.699461400359066, "step": 65440}, {"loss": 0.6273, "grad_norm": 0.9036651849746704, "learning_rate": 0.0002, "epoch": 4.700179533213644, "step": 65450}, {"loss": 0.6024, "grad_norm": 1.2080141305923462, "learning_rate": 0.0002, "epoch": 4.700897666068222, "step": 65460}, {"loss": 0.6461, "grad_norm": 0.8743635416030884, "learning_rate": 0.0002, "epoch": 4.7016157989228, "step": 65470}, {"loss": 0.6129, "grad_norm": 0.9566192030906677, "learning_rate": 0.0002, "epoch": 4.702333931777379, "step": 65480}, {"loss": 0.6721, "grad_norm": 1.0505144596099854, "learning_rate": 0.0002, "epoch": 4.703052064631957, "step": 65490}, {"loss": 0.6287, "grad_norm": 0.8797298073768616, "learning_rate": 0.0002, "epoch": 4.703770197486535, "step": 65500}, {"loss": 0.6515, "grad_norm": 0.9970770478248596, "learning_rate": 0.0002, "epoch": 4.704488330341113, "step": 65510}, {"loss": 0.6096, "grad_norm": 1.1743851900100708, "learning_rate": 0.0002, "epoch": 4.705206463195691, "step": 65520}, {"loss": 0.5755, "grad_norm": 0.9534381031990051, "learning_rate": 0.0002, "epoch": 4.705924596050269, "step": 65530}, {"loss": 0.6039, "grad_norm": 0.9735581278800964, "learning_rate": 0.0002, "epoch": 4.706642728904847, "step": 65540}, {"loss": 0.6217, "grad_norm": 1.185352087020874, "learning_rate": 0.0002, "epoch": 4.707360861759425, "step": 65550}, {"loss": 0.6398, "grad_norm": 0.9383901357650757, "learning_rate": 0.0002, "epoch": 4.708078994614004, "step": 65560}, {"loss": 0.6654, "grad_norm": 1.0194662809371948, "learning_rate": 0.0002, "epoch": 4.708797127468582, "step": 65570}, {"loss": 0.6008, "grad_norm": 0.8448300361633301, "learning_rate": 0.0002, "epoch": 4.70951526032316, "step": 65580}, {"loss": 0.6608, "grad_norm": 1.1930629014968872, "learning_rate": 0.0002, "epoch": 4.710233393177738, "step": 65590}, {"loss": 0.6082, "grad_norm": 1.0038636922836304, "learning_rate": 0.0002, "epoch": 4.710951526032316, "step": 65600}, {"loss": 0.6613, "grad_norm": 0.8206564784049988, "learning_rate": 0.0002, "epoch": 4.711669658886894, "step": 65610}, {"loss": 0.6142, "grad_norm": 1.0984861850738525, "learning_rate": 0.0002, "epoch": 4.712387791741472, "step": 65620}, {"loss": 0.6368, "grad_norm": 1.2891547679901123, "learning_rate": 0.0002, "epoch": 4.71310592459605, "step": 65630}, {"loss": 0.5857, "grad_norm": 0.927062451839447, "learning_rate": 0.0002, "epoch": 4.713824057450628, "step": 65640}, {"loss": 0.6187, "grad_norm": 0.8647334575653076, "learning_rate": 0.0002, "epoch": 4.714542190305206, "step": 65650}, {"loss": 0.6327, "grad_norm": 1.1017670631408691, "learning_rate": 0.0002, "epoch": 4.715260323159785, "step": 65660}, {"loss": 0.6398, "grad_norm": 0.9589072465896606, "learning_rate": 0.0002, "epoch": 4.715978456014363, "step": 65670}, {"loss": 0.6179, "grad_norm": 0.9496776461601257, "learning_rate": 0.0002, "epoch": 4.716696588868941, "step": 65680}, {"loss": 0.625, "grad_norm": 0.9266180396080017, "learning_rate": 0.0002, "epoch": 4.717414721723519, "step": 65690}, {"loss": 0.637, "grad_norm": 0.8699696063995361, "learning_rate": 0.0002, "epoch": 4.718132854578097, "step": 65700}, {"loss": 0.6402, "grad_norm": 1.0444015264511108, "learning_rate": 0.0002, "epoch": 4.718850987432675, "step": 65710}, {"loss": 0.6526, "grad_norm": 1.0100741386413574, "learning_rate": 0.0002, "epoch": 4.719569120287253, "step": 65720}, {"loss": 0.617, "grad_norm": 1.1442630290985107, "learning_rate": 0.0002, "epoch": 4.720287253141831, "step": 65730}, {"loss": 0.6214, "grad_norm": 0.8937877416610718, "learning_rate": 0.0002, "epoch": 4.721005385996409, "step": 65740}, {"loss": 0.625, "grad_norm": 1.0718764066696167, "learning_rate": 0.0002, "epoch": 4.721723518850988, "step": 65750}, {"loss": 0.6182, "grad_norm": 0.8838587999343872, "learning_rate": 0.0002, "epoch": 4.722441651705566, "step": 65760}, {"loss": 0.6254, "grad_norm": 1.1247940063476562, "learning_rate": 0.0002, "epoch": 4.723159784560144, "step": 65770}, {"loss": 0.5917, "grad_norm": 0.9491105675697327, "learning_rate": 0.0002, "epoch": 4.723877917414722, "step": 65780}, {"loss": 0.6178, "grad_norm": 1.0896921157836914, "learning_rate": 0.0002, "epoch": 4.7245960502693, "step": 65790}, {"loss": 0.5975, "grad_norm": 1.0097380876541138, "learning_rate": 0.0002, "epoch": 4.725314183123878, "step": 65800}, {"loss": 0.592, "grad_norm": 0.911763608455658, "learning_rate": 0.0002, "epoch": 4.726032315978456, "step": 65810}, {"loss": 0.6274, "grad_norm": 1.1295124292373657, "learning_rate": 0.0002, "epoch": 4.726750448833034, "step": 65820}, {"loss": 0.6004, "grad_norm": 0.7637538313865662, "learning_rate": 0.0002, "epoch": 4.727468581687612, "step": 65830}, {"loss": 0.6136, "grad_norm": 0.9255306720733643, "learning_rate": 0.0002, "epoch": 4.72818671454219, "step": 65840}, {"loss": 0.6013, "grad_norm": 0.9847530126571655, "learning_rate": 0.0002, "epoch": 4.728904847396769, "step": 65850}, {"loss": 0.6283, "grad_norm": 0.9036182761192322, "learning_rate": 0.0002, "epoch": 4.729622980251347, "step": 65860}, {"loss": 0.6374, "grad_norm": 0.8284199833869934, "learning_rate": 0.0002, "epoch": 4.730341113105925, "step": 65870}, {"loss": 0.6228, "grad_norm": 1.0142838954925537, "learning_rate": 0.0002, "epoch": 4.731059245960503, "step": 65880}, {"loss": 0.624, "grad_norm": 0.9389033913612366, "learning_rate": 0.0002, "epoch": 4.731777378815081, "step": 65890}, {"loss": 0.6414, "grad_norm": 0.8870056867599487, "learning_rate": 0.0002, "epoch": 4.732495511669659, "step": 65900}, {"loss": 0.6261, "grad_norm": 1.1211678981781006, "learning_rate": 0.0002, "epoch": 4.733213644524237, "step": 65910}, {"loss": 0.6065, "grad_norm": 0.7796614170074463, "learning_rate": 0.0002, "epoch": 4.733931777378815, "step": 65920}, {"loss": 0.6701, "grad_norm": 1.0360451936721802, "learning_rate": 0.0002, "epoch": 4.734649910233394, "step": 65930}, {"loss": 0.68, "grad_norm": 0.8383482098579407, "learning_rate": 0.0002, "epoch": 4.735368043087972, "step": 65940}, {"loss": 0.6014, "grad_norm": 0.7985122799873352, "learning_rate": 0.0002, "epoch": 4.73608617594255, "step": 65950}, {"loss": 0.6431, "grad_norm": 1.0314199924468994, "learning_rate": 0.0002, "epoch": 4.736804308797128, "step": 65960}, {"loss": 0.5894, "grad_norm": 0.9279016852378845, "learning_rate": 0.0002, "epoch": 4.737522441651706, "step": 65970}, {"loss": 0.6327, "grad_norm": 1.1046063899993896, "learning_rate": 0.0002, "epoch": 4.738240574506284, "step": 65980}, {"loss": 0.5778, "grad_norm": 0.9075793623924255, "learning_rate": 0.0002, "epoch": 4.738958707360862, "step": 65990}, {"loss": 0.5832, "grad_norm": 1.0945355892181396, "learning_rate": 0.0002, "epoch": 4.73967684021544, "step": 66000}, {"loss": 0.6256, "grad_norm": 0.8885519504547119, "learning_rate": 0.0002, "epoch": 4.740394973070018, "step": 66010}, {"loss": 0.6283, "grad_norm": 0.9312083125114441, "learning_rate": 0.0002, "epoch": 4.741113105924596, "step": 66020}, {"loss": 0.6328, "grad_norm": 1.1574538946151733, "learning_rate": 0.0002, "epoch": 4.741831238779174, "step": 66030}, {"loss": 0.6693, "grad_norm": 0.9346209168434143, "learning_rate": 0.0002, "epoch": 4.742549371633753, "step": 66040}, {"loss": 0.6252, "grad_norm": 0.8935149312019348, "learning_rate": 0.0002, "epoch": 4.743267504488331, "step": 66050}, {"loss": 0.6137, "grad_norm": 0.8958369493484497, "learning_rate": 0.0002, "epoch": 4.743985637342909, "step": 66060}, {"loss": 0.6088, "grad_norm": 0.9383506774902344, "learning_rate": 0.0002, "epoch": 4.744703770197487, "step": 66070}, {"loss": 0.6323, "grad_norm": 0.9868947863578796, "learning_rate": 0.0002, "epoch": 4.745421903052065, "step": 66080}, {"loss": 0.6426, "grad_norm": 1.3417645692825317, "learning_rate": 0.0002, "epoch": 4.746140035906643, "step": 66090}, {"loss": 0.5417, "grad_norm": 1.070693850517273, "learning_rate": 0.0002, "epoch": 4.746858168761221, "step": 66100}, {"loss": 0.6326, "grad_norm": 0.8841570019721985, "learning_rate": 0.0002, "epoch": 4.747576301615799, "step": 66110}, {"loss": 0.655, "grad_norm": 0.7963120341300964, "learning_rate": 0.0002, "epoch": 4.7482944344703775, "step": 66120}, {"loss": 0.6145, "grad_norm": 0.8145691156387329, "learning_rate": 0.0002, "epoch": 4.7490125673249555, "step": 66130}, {"loss": 0.6081, "grad_norm": 0.9074729681015015, "learning_rate": 0.0002, "epoch": 4.7497307001795335, "step": 66140}, {"loss": 0.5651, "grad_norm": 0.9129886627197266, "learning_rate": 0.0002, "epoch": 4.7504488330341115, "step": 66150}, {"loss": 0.6111, "grad_norm": 0.91527259349823, "learning_rate": 0.0002, "epoch": 4.7511669658886895, "step": 66160}, {"loss": 0.672, "grad_norm": 0.9569419622421265, "learning_rate": 0.0002, "epoch": 4.7518850987432675, "step": 66170}, {"loss": 0.597, "grad_norm": 0.8777104616165161, "learning_rate": 0.0002, "epoch": 4.7526032315978455, "step": 66180}, {"loss": 0.6433, "grad_norm": 0.9673085808753967, "learning_rate": 0.0002, "epoch": 4.7533213644524235, "step": 66190}, {"loss": 0.5783, "grad_norm": 1.0683966875076294, "learning_rate": 0.0002, "epoch": 4.7540394973070015, "step": 66200}, {"loss": 0.6356, "grad_norm": 1.1591907739639282, "learning_rate": 0.0002, "epoch": 4.7547576301615795, "step": 66210}, {"loss": 0.6482, "grad_norm": 1.1973309516906738, "learning_rate": 0.0002, "epoch": 4.755475763016158, "step": 66220}, {"loss": 0.5998, "grad_norm": 0.8472012281417847, "learning_rate": 0.0002, "epoch": 4.756193895870736, "step": 66230}, {"loss": 0.717, "grad_norm": 0.9896261692047119, "learning_rate": 0.0002, "epoch": 4.756912028725314, "step": 66240}, {"loss": 0.6368, "grad_norm": 0.8498432040214539, "learning_rate": 0.0002, "epoch": 4.757630161579892, "step": 66250}, {"loss": 0.5931, "grad_norm": 0.9624166488647461, "learning_rate": 0.0002, "epoch": 4.75834829443447, "step": 66260}, {"loss": 0.645, "grad_norm": 1.0951786041259766, "learning_rate": 0.0002, "epoch": 4.759066427289048, "step": 66270}, {"loss": 0.6092, "grad_norm": 0.9863157868385315, "learning_rate": 0.0002, "epoch": 4.759784560143626, "step": 66280}, {"loss": 0.6682, "grad_norm": 1.0062068700790405, "learning_rate": 0.0002, "epoch": 4.760502692998204, "step": 66290}, {"loss": 0.5704, "grad_norm": 0.8075495958328247, "learning_rate": 0.0002, "epoch": 4.761220825852782, "step": 66300}, {"loss": 0.6297, "grad_norm": 0.9617878198623657, "learning_rate": 0.0002, "epoch": 4.761938958707361, "step": 66310}, {"loss": 0.6141, "grad_norm": 1.097091555595398, "learning_rate": 0.0002, "epoch": 4.762657091561939, "step": 66320}, {"loss": 0.6152, "grad_norm": 1.2713453769683838, "learning_rate": 0.0002, "epoch": 4.763375224416517, "step": 66330}, {"loss": 0.6726, "grad_norm": 0.9473448991775513, "learning_rate": 0.0002, "epoch": 4.764093357271095, "step": 66340}, {"loss": 0.6032, "grad_norm": 1.0176854133605957, "learning_rate": 0.0002, "epoch": 4.764811490125673, "step": 66350}, {"loss": 0.6429, "grad_norm": 1.0486242771148682, "learning_rate": 0.0002, "epoch": 4.765529622980251, "step": 66360}, {"loss": 0.6875, "grad_norm": 1.249985694885254, "learning_rate": 0.0002, "epoch": 4.766247755834829, "step": 66370}, {"loss": 0.6086, "grad_norm": 1.283875584602356, "learning_rate": 0.0002, "epoch": 4.766965888689407, "step": 66380}, {"loss": 0.5997, "grad_norm": 1.0009022951126099, "learning_rate": 0.0002, "epoch": 4.767684021543985, "step": 66390}, {"loss": 0.5782, "grad_norm": 0.9718021750450134, "learning_rate": 0.0002, "epoch": 4.768402154398563, "step": 66400}, {"loss": 0.6292, "grad_norm": 1.0865732431411743, "learning_rate": 0.0002, "epoch": 4.769120287253142, "step": 66410}, {"loss": 0.6038, "grad_norm": 0.9273189306259155, "learning_rate": 0.0002, "epoch": 4.76983842010772, "step": 66420}, {"loss": 0.6244, "grad_norm": 1.067535638809204, "learning_rate": 0.0002, "epoch": 4.770556552962298, "step": 66430}, {"loss": 0.6434, "grad_norm": 1.0551011562347412, "learning_rate": 0.0002, "epoch": 4.771274685816876, "step": 66440}, {"loss": 0.6151, "grad_norm": 1.0336146354675293, "learning_rate": 0.0002, "epoch": 4.771992818671454, "step": 66450}, {"loss": 0.5955, "grad_norm": 0.8738380670547485, "learning_rate": 0.0002, "epoch": 4.772710951526032, "step": 66460}, {"loss": 0.6386, "grad_norm": 1.1048321723937988, "learning_rate": 0.0002, "epoch": 4.77342908438061, "step": 66470}, {"loss": 0.592, "grad_norm": 0.8471167683601379, "learning_rate": 0.0002, "epoch": 4.774147217235188, "step": 66480}, {"loss": 0.6139, "grad_norm": 1.2527031898498535, "learning_rate": 0.0002, "epoch": 4.774865350089767, "step": 66490}, {"loss": 0.579, "grad_norm": 1.0056052207946777, "learning_rate": 0.0002, "epoch": 4.775583482944345, "step": 66500}, {"loss": 0.6448, "grad_norm": 1.142456293106079, "learning_rate": 0.0002, "epoch": 4.776301615798923, "step": 66510}, {"loss": 0.6399, "grad_norm": 1.1813132762908936, "learning_rate": 0.0002, "epoch": 4.777019748653501, "step": 66520}, {"loss": 0.6575, "grad_norm": 0.8683654069900513, "learning_rate": 0.0002, "epoch": 4.777737881508079, "step": 66530}, {"loss": 0.6059, "grad_norm": 1.0577980279922485, "learning_rate": 0.0002, "epoch": 4.778456014362657, "step": 66540}, {"loss": 0.5923, "grad_norm": 1.077438473701477, "learning_rate": 0.0002, "epoch": 4.779174147217235, "step": 66550}, {"loss": 0.5744, "grad_norm": 1.0107938051223755, "learning_rate": 0.0002, "epoch": 4.779892280071813, "step": 66560}, {"loss": 0.6155, "grad_norm": 0.8071168065071106, "learning_rate": 0.0002, "epoch": 4.780610412926391, "step": 66570}, {"loss": 0.6126, "grad_norm": 0.8887564539909363, "learning_rate": 0.0002, "epoch": 4.781328545780969, "step": 66580}, {"loss": 0.6417, "grad_norm": 0.9823092222213745, "learning_rate": 0.0002, "epoch": 4.782046678635547, "step": 66590}, {"loss": 0.6108, "grad_norm": 0.9026784300804138, "learning_rate": 0.0002, "epoch": 4.782764811490126, "step": 66600}, {"loss": 0.6252, "grad_norm": 0.8912792205810547, "learning_rate": 0.0002, "epoch": 4.783482944344704, "step": 66610}, {"loss": 0.6285, "grad_norm": 1.0955979824066162, "learning_rate": 0.0002, "epoch": 4.784201077199282, "step": 66620}, {"loss": 0.6161, "grad_norm": 0.8614793419837952, "learning_rate": 0.0002, "epoch": 4.78491921005386, "step": 66630}, {"loss": 0.6343, "grad_norm": 0.7247269153594971, "learning_rate": 0.0002, "epoch": 4.785637342908438, "step": 66640}, {"loss": 0.5634, "grad_norm": 0.9685400724411011, "learning_rate": 0.0002, "epoch": 4.786355475763016, "step": 66650}, {"loss": 0.6419, "grad_norm": 0.9219905734062195, "learning_rate": 0.0002, "epoch": 4.787073608617594, "step": 66660}, {"loss": 0.6509, "grad_norm": 0.9217489361763, "learning_rate": 0.0002, "epoch": 4.787791741472172, "step": 66670}, {"loss": 0.6151, "grad_norm": 1.13791823387146, "learning_rate": 0.0002, "epoch": 4.788509874326751, "step": 66680}, {"loss": 0.6114, "grad_norm": 0.857542872428894, "learning_rate": 0.0002, "epoch": 4.789228007181329, "step": 66690}, {"loss": 0.6317, "grad_norm": 0.9886694550514221, "learning_rate": 0.0002, "epoch": 4.789946140035907, "step": 66700}, {"loss": 0.6436, "grad_norm": 0.987952470779419, "learning_rate": 0.0002, "epoch": 4.790664272890485, "step": 66710}, {"loss": 0.6284, "grad_norm": 1.051612377166748, "learning_rate": 0.0002, "epoch": 4.791382405745063, "step": 66720}, {"loss": 0.6207, "grad_norm": 0.9816454648971558, "learning_rate": 0.0002, "epoch": 4.792100538599641, "step": 66730}, {"loss": 0.6618, "grad_norm": 1.0953829288482666, "learning_rate": 0.0002, "epoch": 4.792818671454219, "step": 66740}, {"loss": 0.652, "grad_norm": 0.8720369935035706, "learning_rate": 0.0002, "epoch": 4.793536804308797, "step": 66750}, {"loss": 0.569, "grad_norm": 0.8910234570503235, "learning_rate": 0.0002, "epoch": 4.794254937163375, "step": 66760}, {"loss": 0.5814, "grad_norm": 0.8300510048866272, "learning_rate": 0.0002, "epoch": 4.794973070017953, "step": 66770}, {"loss": 0.591, "grad_norm": 0.9380533695220947, "learning_rate": 0.0002, "epoch": 4.795691202872531, "step": 66780}, {"loss": 0.6201, "grad_norm": 0.8361864686012268, "learning_rate": 0.0002, "epoch": 4.79640933572711, "step": 66790}, {"loss": 0.6192, "grad_norm": 1.051262617111206, "learning_rate": 0.0002, "epoch": 4.797127468581688, "step": 66800}, {"loss": 0.6408, "grad_norm": 1.1324400901794434, "learning_rate": 0.0002, "epoch": 4.797845601436266, "step": 66810}, {"loss": 0.6156, "grad_norm": 0.853903591632843, "learning_rate": 0.0002, "epoch": 4.798563734290844, "step": 66820}, {"loss": 0.5923, "grad_norm": 0.9949867725372314, "learning_rate": 0.0002, "epoch": 4.799281867145422, "step": 66830}, {"loss": 0.6453, "grad_norm": 0.9204033017158508, "learning_rate": 0.0002, "epoch": 4.8, "step": 66840}, {"loss": 0.6221, "grad_norm": 0.7461584806442261, "learning_rate": 0.0002, "epoch": 4.800718132854578, "step": 66850}, {"loss": 0.6019, "grad_norm": 1.1019874811172485, "learning_rate": 0.0002, "epoch": 4.801436265709156, "step": 66860}, {"loss": 0.6514, "grad_norm": 1.1695797443389893, "learning_rate": 0.0002, "epoch": 4.802154398563735, "step": 66870}, {"loss": 0.6105, "grad_norm": 1.0902758836746216, "learning_rate": 0.0002, "epoch": 4.802872531418313, "step": 66880}, {"loss": 0.6297, "grad_norm": 0.8778618574142456, "learning_rate": 0.0002, "epoch": 4.803590664272891, "step": 66890}, {"loss": 0.6608, "grad_norm": 0.905505359172821, "learning_rate": 0.0002, "epoch": 4.804308797127469, "step": 66900}, {"loss": 0.6386, "grad_norm": 1.0802056789398193, "learning_rate": 0.0002, "epoch": 4.805026929982047, "step": 66910}, {"loss": 0.5866, "grad_norm": 0.7899449467658997, "learning_rate": 0.0002, "epoch": 4.805745062836625, "step": 66920}, {"loss": 0.6169, "grad_norm": 1.1938519477844238, "learning_rate": 0.0002, "epoch": 4.806463195691203, "step": 66930}, {"loss": 0.5979, "grad_norm": 1.0213780403137207, "learning_rate": 0.0002, "epoch": 4.807181328545781, "step": 66940}, {"loss": 0.6518, "grad_norm": 0.9925506711006165, "learning_rate": 0.0002, "epoch": 4.807899461400359, "step": 66950}, {"loss": 0.6229, "grad_norm": 1.0174424648284912, "learning_rate": 0.0002, "epoch": 4.808617594254937, "step": 66960}, {"loss": 0.5932, "grad_norm": 1.0515072345733643, "learning_rate": 0.0002, "epoch": 4.809335727109516, "step": 66970}, {"loss": 0.6169, "grad_norm": 1.0161492824554443, "learning_rate": 0.0002, "epoch": 4.810053859964094, "step": 66980}, {"loss": 0.5804, "grad_norm": 0.8421840071678162, "learning_rate": 0.0002, "epoch": 4.810771992818672, "step": 66990}, {"loss": 0.6792, "grad_norm": 1.0493539571762085, "learning_rate": 0.0002, "epoch": 4.81149012567325, "step": 67000}, {"loss": 0.5906, "grad_norm": 1.1133309602737427, "learning_rate": 0.0002, "epoch": 4.812208258527828, "step": 67010}, {"loss": 0.5771, "grad_norm": 0.924017071723938, "learning_rate": 0.0002, "epoch": 4.812926391382406, "step": 67020}, {"loss": 0.625, "grad_norm": 1.0568689107894897, "learning_rate": 0.0002, "epoch": 4.813644524236984, "step": 67030}, {"loss": 0.6654, "grad_norm": 0.989414632320404, "learning_rate": 0.0002, "epoch": 4.814362657091562, "step": 67040}, {"loss": 0.6186, "grad_norm": 0.9256827235221863, "learning_rate": 0.0002, "epoch": 4.8150807899461405, "step": 67050}, {"loss": 0.637, "grad_norm": 0.9538901448249817, "learning_rate": 0.0002, "epoch": 4.8157989228007185, "step": 67060}, {"loss": 0.632, "grad_norm": 1.0373849868774414, "learning_rate": 0.0002, "epoch": 4.8165170556552965, "step": 67070}, {"loss": 0.5956, "grad_norm": 1.0019729137420654, "learning_rate": 0.0002, "epoch": 4.8172351885098745, "step": 67080}, {"loss": 0.636, "grad_norm": 0.9930381178855896, "learning_rate": 0.0002, "epoch": 4.8179533213644525, "step": 67090}, {"loss": 0.6106, "grad_norm": 1.0008453130722046, "learning_rate": 0.0002, "epoch": 4.8186714542190305, "step": 67100}, {"loss": 0.5841, "grad_norm": 1.0153851509094238, "learning_rate": 0.0002, "epoch": 4.8193895870736085, "step": 67110}, {"loss": 0.6012, "grad_norm": 1.0193161964416504, "learning_rate": 0.0002, "epoch": 4.8201077199281865, "step": 67120}, {"loss": 0.6602, "grad_norm": 1.0204501152038574, "learning_rate": 0.0002, "epoch": 4.8208258527827645, "step": 67130}, {"loss": 0.6235, "grad_norm": 0.9097670316696167, "learning_rate": 0.0002, "epoch": 4.8215439856373425, "step": 67140}, {"loss": 0.5836, "grad_norm": 0.9288716912269592, "learning_rate": 0.0002, "epoch": 4.8222621184919205, "step": 67150}, {"loss": 0.604, "grad_norm": 0.9975850582122803, "learning_rate": 0.0002, "epoch": 4.822980251346499, "step": 67160}, {"loss": 0.6877, "grad_norm": 0.8502511382102966, "learning_rate": 0.0002, "epoch": 4.823698384201077, "step": 67170}, {"loss": 0.6194, "grad_norm": 1.0129257440567017, "learning_rate": 0.0002, "epoch": 4.824416517055655, "step": 67180}, {"loss": 0.6294, "grad_norm": 1.0009492635726929, "learning_rate": 0.0002, "epoch": 4.825134649910233, "step": 67190}, {"loss": 0.5757, "grad_norm": 0.9273321032524109, "learning_rate": 0.0002, "epoch": 4.825852782764811, "step": 67200}, {"loss": 0.5749, "grad_norm": 1.0438604354858398, "learning_rate": 0.0002, "epoch": 4.8265709156193894, "step": 67210}, {"loss": 0.6273, "grad_norm": 1.119573712348938, "learning_rate": 0.0002, "epoch": 4.8272890484739674, "step": 67220}, {"loss": 0.6284, "grad_norm": 0.9607422351837158, "learning_rate": 0.0002, "epoch": 4.8280071813285454, "step": 67230}, {"loss": 0.6259, "grad_norm": 0.9614062905311584, "learning_rate": 0.0002, "epoch": 4.828725314183124, "step": 67240}, {"loss": 0.5709, "grad_norm": 1.1017652750015259, "learning_rate": 0.0002, "epoch": 4.829443447037702, "step": 67250}, {"loss": 0.6203, "grad_norm": 1.0521706342697144, "learning_rate": 0.0002, "epoch": 4.83016157989228, "step": 67260}, {"loss": 0.6266, "grad_norm": 0.7685959339141846, "learning_rate": 0.0002, "epoch": 4.830879712746858, "step": 67270}, {"loss": 0.5809, "grad_norm": 0.7894896268844604, "learning_rate": 0.0002, "epoch": 4.831597845601436, "step": 67280}, {"loss": 0.6349, "grad_norm": 1.0882996320724487, "learning_rate": 0.0002, "epoch": 4.832315978456014, "step": 67290}, {"loss": 0.6129, "grad_norm": 0.9215409755706787, "learning_rate": 0.0002, "epoch": 4.833034111310592, "step": 67300}, {"loss": 0.6142, "grad_norm": 0.8660635352134705, "learning_rate": 0.0002, "epoch": 4.83375224416517, "step": 67310}, {"loss": 0.6378, "grad_norm": 0.980879008769989, "learning_rate": 0.0002, "epoch": 4.834470377019748, "step": 67320}, {"loss": 0.6291, "grad_norm": 1.0356814861297607, "learning_rate": 0.0002, "epoch": 4.835188509874326, "step": 67330}, {"loss": 0.6271, "grad_norm": 1.0265507698059082, "learning_rate": 0.0002, "epoch": 4.835906642728904, "step": 67340}, {"loss": 0.6009, "grad_norm": 1.0659137964248657, "learning_rate": 0.0002, "epoch": 4.836624775583483, "step": 67350}, {"loss": 0.5946, "grad_norm": 0.9485231637954712, "learning_rate": 0.0002, "epoch": 4.837342908438061, "step": 67360}, {"loss": 0.6338, "grad_norm": 1.0950140953063965, "learning_rate": 0.0002, "epoch": 4.838061041292639, "step": 67370}, {"loss": 0.6314, "grad_norm": 0.8907382488250732, "learning_rate": 0.0002, "epoch": 4.838779174147217, "step": 67380}, {"loss": 0.6066, "grad_norm": 0.9777120351791382, "learning_rate": 0.0002, "epoch": 4.839497307001795, "step": 67390}, {"loss": 0.6258, "grad_norm": 0.8482252955436707, "learning_rate": 0.0002, "epoch": 4.840215439856373, "step": 67400}, {"loss": 0.603, "grad_norm": 0.8505899906158447, "learning_rate": 0.0002, "epoch": 4.840933572710951, "step": 67410}, {"loss": 0.609, "grad_norm": 0.8574482798576355, "learning_rate": 0.0002, "epoch": 4.841651705565529, "step": 67420}, {"loss": 0.6188, "grad_norm": 1.092310905456543, "learning_rate": 0.0002, "epoch": 4.842369838420108, "step": 67430}, {"loss": 0.619, "grad_norm": 0.9418560266494751, "learning_rate": 0.0002, "epoch": 4.843087971274686, "step": 67440}, {"loss": 0.6367, "grad_norm": 1.1310782432556152, "learning_rate": 0.0002, "epoch": 4.843806104129264, "step": 67450}, {"loss": 0.664, "grad_norm": 0.9993671774864197, "learning_rate": 0.0002, "epoch": 4.844524236983842, "step": 67460}, {"loss": 0.6247, "grad_norm": 0.8322528600692749, "learning_rate": 0.0002, "epoch": 4.84524236983842, "step": 67470}, {"loss": 0.5828, "grad_norm": 0.8488435745239258, "learning_rate": 0.0002, "epoch": 4.845960502692998, "step": 67480}, {"loss": 0.6023, "grad_norm": 0.8070611357688904, "learning_rate": 0.0002, "epoch": 4.846678635547576, "step": 67490}, {"loss": 0.6362, "grad_norm": 0.8200163245201111, "learning_rate": 0.0002, "epoch": 4.847396768402154, "step": 67500}, {"loss": 0.612, "grad_norm": 0.91901034116745, "learning_rate": 0.0002, "epoch": 4.848114901256732, "step": 67510}, {"loss": 0.6191, "grad_norm": 1.0938435792922974, "learning_rate": 0.0002, "epoch": 4.84883303411131, "step": 67520}, {"loss": 0.6736, "grad_norm": 0.7926174402236938, "learning_rate": 0.0002, "epoch": 4.849551166965889, "step": 67530}, {"loss": 0.6252, "grad_norm": 0.9914385676383972, "learning_rate": 0.0002, "epoch": 4.850269299820467, "step": 67540}, {"loss": 0.6278, "grad_norm": 1.033065915107727, "learning_rate": 0.0002, "epoch": 4.850987432675045, "step": 67550}, {"loss": 0.6334, "grad_norm": 0.9700239300727844, "learning_rate": 0.0002, "epoch": 4.851705565529623, "step": 67560}, {"loss": 0.6308, "grad_norm": 0.8550103902816772, "learning_rate": 0.0002, "epoch": 4.852423698384201, "step": 67570}, {"loss": 0.6194, "grad_norm": 1.0009654760360718, "learning_rate": 0.0002, "epoch": 4.853141831238779, "step": 67580}, {"loss": 0.5825, "grad_norm": 1.0766186714172363, "learning_rate": 0.0002, "epoch": 4.853859964093357, "step": 67590}, {"loss": 0.6216, "grad_norm": 0.9512220621109009, "learning_rate": 0.0002, "epoch": 4.854578096947935, "step": 67600}, {"loss": 0.6301, "grad_norm": 0.8434456586837769, "learning_rate": 0.0002, "epoch": 4.855296229802514, "step": 67610}, {"loss": 0.6416, "grad_norm": 1.0276665687561035, "learning_rate": 0.0002, "epoch": 4.856014362657092, "step": 67620}, {"loss": 0.6063, "grad_norm": 0.9758516550064087, "learning_rate": 0.0002, "epoch": 4.85673249551167, "step": 67630}, {"loss": 0.622, "grad_norm": 0.8988076448440552, "learning_rate": 0.0002, "epoch": 4.857450628366248, "step": 67640}, {"loss": 0.6516, "grad_norm": 1.0038257837295532, "learning_rate": 0.0002, "epoch": 4.858168761220826, "step": 67650}, {"loss": 0.6322, "grad_norm": 0.9973093867301941, "learning_rate": 0.0002, "epoch": 4.858886894075404, "step": 67660}, {"loss": 0.6065, "grad_norm": 0.9754974246025085, "learning_rate": 0.0002, "epoch": 4.859605026929982, "step": 67670}, {"loss": 0.6191, "grad_norm": 1.1829560995101929, "learning_rate": 0.0002, "epoch": 4.86032315978456, "step": 67680}, {"loss": 0.6267, "grad_norm": 1.1077659130096436, "learning_rate": 0.0002, "epoch": 4.861041292639138, "step": 67690}, {"loss": 0.6312, "grad_norm": 0.9862872958183289, "learning_rate": 0.0002, "epoch": 4.861759425493716, "step": 67700}, {"loss": 0.6281, "grad_norm": 0.9826052188873291, "learning_rate": 0.0002, "epoch": 4.862477558348294, "step": 67710}, {"loss": 0.6227, "grad_norm": 0.940082848072052, "learning_rate": 0.0002, "epoch": 4.863195691202873, "step": 67720}, {"loss": 0.6232, "grad_norm": 0.895434558391571, "learning_rate": 0.0002, "epoch": 4.863913824057451, "step": 67730}, {"loss": 0.6674, "grad_norm": 1.1194682121276855, "learning_rate": 0.0002, "epoch": 4.864631956912029, "step": 67740}, {"loss": 0.5981, "grad_norm": 0.9984544515609741, "learning_rate": 0.0002, "epoch": 4.865350089766607, "step": 67750}, {"loss": 0.6583, "grad_norm": 1.049224615097046, "learning_rate": 0.0002, "epoch": 4.866068222621185, "step": 67760}, {"loss": 0.583, "grad_norm": 1.009515643119812, "learning_rate": 0.0002, "epoch": 4.866786355475763, "step": 67770}, {"loss": 0.6466, "grad_norm": 1.0336902141571045, "learning_rate": 0.0002, "epoch": 4.867504488330341, "step": 67780}, {"loss": 0.6909, "grad_norm": 0.9310635924339294, "learning_rate": 0.0002, "epoch": 4.868222621184919, "step": 67790}, {"loss": 0.7267, "grad_norm": 0.934882640838623, "learning_rate": 0.0002, "epoch": 4.868940754039498, "step": 67800}, {"loss": 0.648, "grad_norm": 0.8663495779037476, "learning_rate": 0.0002, "epoch": 4.869658886894076, "step": 67810}, {"loss": 0.6275, "grad_norm": 1.0085018873214722, "learning_rate": 0.0002, "epoch": 4.870377019748654, "step": 67820}, {"loss": 0.6571, "grad_norm": 0.896507978439331, "learning_rate": 0.0002, "epoch": 4.871095152603232, "step": 67830}, {"loss": 0.6711, "grad_norm": 0.925809919834137, "learning_rate": 0.0002, "epoch": 4.87181328545781, "step": 67840}, {"loss": 0.5917, "grad_norm": 0.8044029474258423, "learning_rate": 0.0002, "epoch": 4.872531418312388, "step": 67850}, {"loss": 0.6671, "grad_norm": 1.0026800632476807, "learning_rate": 0.0002, "epoch": 4.873249551166966, "step": 67860}, {"loss": 0.6175, "grad_norm": 0.9577589631080627, "learning_rate": 0.0002, "epoch": 4.873967684021544, "step": 67870}, {"loss": 0.591, "grad_norm": 0.8225193619728088, "learning_rate": 0.0002, "epoch": 4.874685816876122, "step": 67880}, {"loss": 0.6, "grad_norm": 1.0019139051437378, "learning_rate": 0.0002, "epoch": 4.8754039497307, "step": 67890}, {"loss": 0.6521, "grad_norm": 0.9282827377319336, "learning_rate": 0.0002, "epoch": 4.876122082585278, "step": 67900}, {"loss": 0.6251, "grad_norm": 0.8204836249351501, "learning_rate": 0.0002, "epoch": 4.876840215439857, "step": 67910}, {"loss": 0.6345, "grad_norm": 0.907356321811676, "learning_rate": 0.0002, "epoch": 4.877558348294435, "step": 67920}, {"loss": 0.6438, "grad_norm": 1.12422776222229, "learning_rate": 0.0002, "epoch": 4.878276481149013, "step": 67930}, {"loss": 0.6727, "grad_norm": 0.8230205178260803, "learning_rate": 0.0002, "epoch": 4.878994614003591, "step": 67940}, {"loss": 0.6361, "grad_norm": 1.1588479280471802, "learning_rate": 0.0002, "epoch": 4.879712746858169, "step": 67950}, {"loss": 0.6489, "grad_norm": 1.1064553260803223, "learning_rate": 0.0002, "epoch": 4.880430879712747, "step": 67960}, {"loss": 0.5851, "grad_norm": 0.9311534762382507, "learning_rate": 0.0002, "epoch": 4.881149012567325, "step": 67970}, {"loss": 0.6238, "grad_norm": 0.7575639486312866, "learning_rate": 0.0002, "epoch": 4.881867145421903, "step": 67980}, {"loss": 0.5933, "grad_norm": 0.9201191067695618, "learning_rate": 0.0002, "epoch": 4.882585278276482, "step": 67990}, {"loss": 0.5806, "grad_norm": 0.8487658500671387, "learning_rate": 0.0002, "epoch": 4.88330341113106, "step": 68000}, {"loss": 0.598, "grad_norm": 0.9645208716392517, "learning_rate": 0.0002, "epoch": 4.884021543985638, "step": 68010}, {"loss": 0.6112, "grad_norm": 0.8594469428062439, "learning_rate": 0.0002, "epoch": 4.884739676840216, "step": 68020}, {"loss": 0.6115, "grad_norm": 0.9518412947654724, "learning_rate": 0.0002, "epoch": 4.885457809694794, "step": 68030}, {"loss": 0.6071, "grad_norm": 1.0934258699417114, "learning_rate": 0.0002, "epoch": 4.886175942549372, "step": 68040}, {"loss": 0.6265, "grad_norm": 0.988761842250824, "learning_rate": 0.0002, "epoch": 4.88689407540395, "step": 68050}, {"loss": 0.5981, "grad_norm": 0.7572013735771179, "learning_rate": 0.0002, "epoch": 4.887612208258528, "step": 68060}, {"loss": 0.6286, "grad_norm": 0.8801929950714111, "learning_rate": 0.0002, "epoch": 4.888330341113106, "step": 68070}, {"loss": 0.6503, "grad_norm": 1.0080658197402954, "learning_rate": 0.0002, "epoch": 4.889048473967684, "step": 68080}, {"loss": 0.6064, "grad_norm": 0.9588785171508789, "learning_rate": 0.0002, "epoch": 4.8897666068222625, "step": 68090}, {"loss": 0.6159, "grad_norm": 1.0994032621383667, "learning_rate": 0.0002, "epoch": 4.8904847396768405, "step": 68100}, {"loss": 0.6357, "grad_norm": 0.9851962924003601, "learning_rate": 0.0002, "epoch": 4.8912028725314185, "step": 68110}, {"loss": 0.5999, "grad_norm": 0.9566116333007812, "learning_rate": 0.0002, "epoch": 4.8919210053859965, "step": 68120}, {"loss": 0.6742, "grad_norm": 0.8708083033561707, "learning_rate": 0.0002, "epoch": 4.8926391382405745, "step": 68130}, {"loss": 0.6489, "grad_norm": 1.2182754278182983, "learning_rate": 0.0002, "epoch": 4.8933572710951525, "step": 68140}, {"loss": 0.6442, "grad_norm": 1.047988772392273, "learning_rate": 0.0002, "epoch": 4.8940754039497305, "step": 68150}, {"loss": 0.6176, "grad_norm": 0.8665831685066223, "learning_rate": 0.0002, "epoch": 4.8947935368043085, "step": 68160}, {"loss": 0.5721, "grad_norm": 0.9313908219337463, "learning_rate": 0.0002, "epoch": 4.8955116696588865, "step": 68170}, {"loss": 0.6073, "grad_norm": 0.9568582773208618, "learning_rate": 0.0002, "epoch": 4.896229802513465, "step": 68180}, {"loss": 0.6308, "grad_norm": 1.0427594184875488, "learning_rate": 0.0002, "epoch": 4.896947935368043, "step": 68190}, {"loss": 0.6357, "grad_norm": 0.9132021069526672, "learning_rate": 0.0002, "epoch": 4.897666068222621, "step": 68200}, {"loss": 0.6264, "grad_norm": 0.9597318768501282, "learning_rate": 0.0002, "epoch": 4.898384201077199, "step": 68210}, {"loss": 0.6025, "grad_norm": 1.0736947059631348, "learning_rate": 0.0002, "epoch": 4.899102333931777, "step": 68220}, {"loss": 0.5942, "grad_norm": 0.9318404793739319, "learning_rate": 0.0002, "epoch": 4.899820466786355, "step": 68230}, {"loss": 0.5991, "grad_norm": 0.8594326972961426, "learning_rate": 0.0002, "epoch": 4.900538599640933, "step": 68240}, {"loss": 0.6145, "grad_norm": 1.1437443494796753, "learning_rate": 0.0002, "epoch": 4.901256732495511, "step": 68250}, {"loss": 0.6414, "grad_norm": 1.1599408388137817, "learning_rate": 0.0002, "epoch": 4.901974865350089, "step": 68260}, {"loss": 0.6148, "grad_norm": 1.160628080368042, "learning_rate": 0.0002, "epoch": 4.902692998204667, "step": 68270}, {"loss": 0.613, "grad_norm": 1.0147801637649536, "learning_rate": 0.0002, "epoch": 4.903411131059246, "step": 68280}, {"loss": 0.6502, "grad_norm": 0.8622691631317139, "learning_rate": 0.0002, "epoch": 4.904129263913824, "step": 68290}, {"loss": 0.618, "grad_norm": 0.7179980874061584, "learning_rate": 0.0002, "epoch": 4.904847396768402, "step": 68300}, {"loss": 0.6388, "grad_norm": 1.1705092191696167, "learning_rate": 0.0002, "epoch": 4.90556552962298, "step": 68310}, {"loss": 0.6164, "grad_norm": 1.1687676906585693, "learning_rate": 0.0002, "epoch": 4.906283662477558, "step": 68320}, {"loss": 0.6791, "grad_norm": 1.1621531248092651, "learning_rate": 0.0002, "epoch": 4.907001795332136, "step": 68330}, {"loss": 0.6474, "grad_norm": 1.0241422653198242, "learning_rate": 0.0002, "epoch": 4.907719928186714, "step": 68340}, {"loss": 0.6225, "grad_norm": 0.943354070186615, "learning_rate": 0.0002, "epoch": 4.908438061041292, "step": 68350}, {"loss": 0.6596, "grad_norm": 0.8091703653335571, "learning_rate": 0.0002, "epoch": 4.909156193895871, "step": 68360}, {"loss": 0.6196, "grad_norm": 0.8871228694915771, "learning_rate": 0.0002, "epoch": 4.909874326750449, "step": 68370}, {"loss": 0.5714, "grad_norm": 1.0951069593429565, "learning_rate": 0.0002, "epoch": 4.910592459605027, "step": 68380}, {"loss": 0.6407, "grad_norm": 1.1355193853378296, "learning_rate": 0.0002, "epoch": 4.911310592459605, "step": 68390}, {"loss": 0.6369, "grad_norm": 1.0741122961044312, "learning_rate": 0.0002, "epoch": 4.912028725314183, "step": 68400}, {"loss": 0.6176, "grad_norm": 0.9285269975662231, "learning_rate": 0.0002, "epoch": 4.912746858168761, "step": 68410}, {"loss": 0.6433, "grad_norm": 1.080695390701294, "learning_rate": 0.0002, "epoch": 4.913464991023339, "step": 68420}, {"loss": 0.6505, "grad_norm": 0.921331524848938, "learning_rate": 0.0002, "epoch": 4.914183123877917, "step": 68430}, {"loss": 0.701, "grad_norm": 0.9763174057006836, "learning_rate": 0.0002, "epoch": 4.914901256732495, "step": 68440}, {"loss": 0.6429, "grad_norm": 1.1133354902267456, "learning_rate": 0.0002, "epoch": 4.915619389587073, "step": 68450}, {"loss": 0.6117, "grad_norm": 0.8373502492904663, "learning_rate": 0.0002, "epoch": 4.916337522441651, "step": 68460}, {"loss": 0.5993, "grad_norm": 0.9192346334457397, "learning_rate": 0.0002, "epoch": 4.91705565529623, "step": 68470}, {"loss": 0.626, "grad_norm": 1.0724657773971558, "learning_rate": 0.0002, "epoch": 4.917773788150808, "step": 68480}, {"loss": 0.6339, "grad_norm": 0.9209843873977661, "learning_rate": 0.0002, "epoch": 4.918491921005386, "step": 68490}, {"loss": 0.6427, "grad_norm": 0.9201577305793762, "learning_rate": 0.0002, "epoch": 4.919210053859964, "step": 68500}, {"loss": 0.6686, "grad_norm": 0.8086138963699341, "learning_rate": 0.0002, "epoch": 4.919928186714542, "step": 68510}, {"loss": 0.564, "grad_norm": 1.0917785167694092, "learning_rate": 0.0002, "epoch": 4.92064631956912, "step": 68520}, {"loss": 0.6177, "grad_norm": 0.9287897944450378, "learning_rate": 0.0002, "epoch": 4.921364452423698, "step": 68530}, {"loss": 0.6344, "grad_norm": 0.9830158948898315, "learning_rate": 0.0002, "epoch": 4.922082585278276, "step": 68540}, {"loss": 0.6583, "grad_norm": 0.8674678802490234, "learning_rate": 0.0002, "epoch": 4.922800718132855, "step": 68550}, {"loss": 0.6284, "grad_norm": 0.7996176481246948, "learning_rate": 0.0002, "epoch": 4.923518850987433, "step": 68560}, {"loss": 0.6089, "grad_norm": 1.1284033060073853, "learning_rate": 0.0002, "epoch": 4.924236983842011, "step": 68570}, {"loss": 0.6454, "grad_norm": 0.894339919090271, "learning_rate": 0.0002, "epoch": 4.924955116696589, "step": 68580}, {"loss": 0.6231, "grad_norm": 1.1140280961990356, "learning_rate": 0.0002, "epoch": 4.925673249551167, "step": 68590}, {"loss": 0.6318, "grad_norm": 0.9048344492912292, "learning_rate": 0.0002, "epoch": 4.926391382405745, "step": 68600}, {"loss": 0.5963, "grad_norm": 0.9380471706390381, "learning_rate": 0.0002, "epoch": 4.927109515260323, "step": 68610}, {"loss": 0.6384, "grad_norm": 0.8598429560661316, "learning_rate": 0.0002, "epoch": 4.927827648114901, "step": 68620}, {"loss": 0.6486, "grad_norm": 1.0813355445861816, "learning_rate": 0.0002, "epoch": 4.928545780969479, "step": 68630}, {"loss": 0.6367, "grad_norm": 0.979053795337677, "learning_rate": 0.0002, "epoch": 4.929263913824057, "step": 68640}, {"loss": 0.6084, "grad_norm": 0.8194574117660522, "learning_rate": 0.0002, "epoch": 4.929982046678636, "step": 68650}, {"loss": 0.6469, "grad_norm": 0.8593540787696838, "learning_rate": 0.0002, "epoch": 4.930700179533214, "step": 68660}, {"loss": 0.6465, "grad_norm": 1.0134016275405884, "learning_rate": 0.0002, "epoch": 4.931418312387792, "step": 68670}, {"loss": 0.6221, "grad_norm": 1.060586929321289, "learning_rate": 0.0002, "epoch": 4.93213644524237, "step": 68680}, {"loss": 0.5861, "grad_norm": 0.84132319688797, "learning_rate": 0.0002, "epoch": 4.932854578096948, "step": 68690}, {"loss": 0.6206, "grad_norm": 1.0767526626586914, "learning_rate": 0.0002, "epoch": 4.933572710951526, "step": 68700}, {"loss": 0.6294, "grad_norm": 0.8858519792556763, "learning_rate": 0.0002, "epoch": 4.934290843806104, "step": 68710}, {"loss": 0.6727, "grad_norm": 1.194031000137329, "learning_rate": 0.0002, "epoch": 4.935008976660682, "step": 68720}, {"loss": 0.6231, "grad_norm": 0.8270226120948792, "learning_rate": 0.0002, "epoch": 4.93572710951526, "step": 68730}, {"loss": 0.6538, "grad_norm": 1.0385973453521729, "learning_rate": 0.0002, "epoch": 4.936445242369839, "step": 68740}, {"loss": 0.623, "grad_norm": 0.9062243700027466, "learning_rate": 0.0002, "epoch": 4.937163375224417, "step": 68750}, {"loss": 0.6578, "grad_norm": 1.0526955127716064, "learning_rate": 0.0002, "epoch": 4.937881508078995, "step": 68760}, {"loss": 0.6425, "grad_norm": 0.930604100227356, "learning_rate": 0.0002, "epoch": 4.938599640933573, "step": 68770}, {"loss": 0.6228, "grad_norm": 0.9635265469551086, "learning_rate": 0.0002, "epoch": 4.939317773788151, "step": 68780}, {"loss": 0.6269, "grad_norm": 0.9825171232223511, "learning_rate": 0.0002, "epoch": 4.940035906642729, "step": 68790}, {"loss": 0.6063, "grad_norm": 0.9621182680130005, "learning_rate": 0.0002, "epoch": 4.940754039497307, "step": 68800}, {"loss": 0.6558, "grad_norm": 0.9655307531356812, "learning_rate": 0.0002, "epoch": 4.941472172351885, "step": 68810}, {"loss": 0.6441, "grad_norm": 1.2948180437088013, "learning_rate": 0.0002, "epoch": 4.942190305206463, "step": 68820}, {"loss": 0.6757, "grad_norm": 0.9206728339195251, "learning_rate": 0.0002, "epoch": 4.942908438061041, "step": 68830}, {"loss": 0.6554, "grad_norm": 1.0235631465911865, "learning_rate": 0.0002, "epoch": 4.94362657091562, "step": 68840}, {"loss": 0.6386, "grad_norm": 1.0542538166046143, "learning_rate": 0.0002, "epoch": 4.944344703770198, "step": 68850}, {"loss": 0.6359, "grad_norm": 0.9787087440490723, "learning_rate": 0.0002, "epoch": 4.945062836624776, "step": 68860}, {"loss": 0.659, "grad_norm": 0.9527219533920288, "learning_rate": 0.0002, "epoch": 4.945780969479354, "step": 68870}, {"loss": 0.6504, "grad_norm": 1.1525826454162598, "learning_rate": 0.0002, "epoch": 4.946499102333932, "step": 68880}, {"loss": 0.6345, "grad_norm": 0.8610072731971741, "learning_rate": 0.0002, "epoch": 4.94721723518851, "step": 68890}, {"loss": 0.6029, "grad_norm": 1.1403616666793823, "learning_rate": 0.0002, "epoch": 4.947935368043088, "step": 68900}, {"loss": 0.6476, "grad_norm": 1.10334312915802, "learning_rate": 0.0002, "epoch": 4.948653500897666, "step": 68910}, {"loss": 0.6123, "grad_norm": 0.8633760809898376, "learning_rate": 0.0002, "epoch": 4.949371633752245, "step": 68920}, {"loss": 0.6619, "grad_norm": 1.1291080713272095, "learning_rate": 0.0002, "epoch": 4.950089766606823, "step": 68930}, {"loss": 0.6003, "grad_norm": 1.0176939964294434, "learning_rate": 0.0002, "epoch": 4.950807899461401, "step": 68940}, {"loss": 0.6126, "grad_norm": 0.9207960963249207, "learning_rate": 0.0002, "epoch": 4.951526032315979, "step": 68950}, {"loss": 0.6031, "grad_norm": 0.9815934300422668, "learning_rate": 0.0002, "epoch": 4.952244165170557, "step": 68960}, {"loss": 0.6201, "grad_norm": 0.9725701808929443, "learning_rate": 0.0002, "epoch": 4.952962298025135, "step": 68970}, {"loss": 0.6251, "grad_norm": 0.844926655292511, "learning_rate": 0.0002, "epoch": 4.953680430879713, "step": 68980}, {"loss": 0.6446, "grad_norm": 0.9898511171340942, "learning_rate": 0.0002, "epoch": 4.954398563734291, "step": 68990}, {"loss": 0.629, "grad_norm": 1.1311410665512085, "learning_rate": 0.0002, "epoch": 4.955116696588869, "step": 69000}, {"loss": 0.6525, "grad_norm": 1.218610405921936, "learning_rate": 0.0002, "epoch": 4.955834829443447, "step": 69010}, {"loss": 0.6639, "grad_norm": 1.1536420583724976, "learning_rate": 0.0002, "epoch": 4.956552962298025, "step": 69020}, {"loss": 0.6375, "grad_norm": 1.1857786178588867, "learning_rate": 0.0002, "epoch": 4.957271095152604, "step": 69030}, {"loss": 0.6618, "grad_norm": 0.9969246983528137, "learning_rate": 0.0002, "epoch": 4.957989228007182, "step": 69040}, {"loss": 0.633, "grad_norm": 1.138635277748108, "learning_rate": 0.0002, "epoch": 4.95870736086176, "step": 69050}, {"loss": 0.6344, "grad_norm": 1.110474705696106, "learning_rate": 0.0002, "epoch": 4.959425493716338, "step": 69060}, {"loss": 0.687, "grad_norm": 1.0366318225860596, "learning_rate": 0.0002, "epoch": 4.960143626570916, "step": 69070}, {"loss": 0.6384, "grad_norm": 0.6927996277809143, "learning_rate": 0.0002, "epoch": 4.960861759425494, "step": 69080}, {"loss": 0.6337, "grad_norm": 1.0368026494979858, "learning_rate": 0.0002, "epoch": 4.961579892280072, "step": 69090}, {"loss": 0.6077, "grad_norm": 1.0638312101364136, "learning_rate": 0.0002, "epoch": 4.96229802513465, "step": 69100}, {"loss": 0.6403, "grad_norm": 1.0372415781021118, "learning_rate": 0.0002, "epoch": 4.9630161579892285, "step": 69110}, {"loss": 0.6347, "grad_norm": 0.8257387280464172, "learning_rate": 0.0002, "epoch": 4.9637342908438065, "step": 69120}, {"loss": 0.6405, "grad_norm": 1.0046974420547485, "learning_rate": 0.0002, "epoch": 4.9644524236983845, "step": 69130}, {"loss": 0.623, "grad_norm": 1.0139652490615845, "learning_rate": 0.0002, "epoch": 4.9651705565529625, "step": 69140}, {"loss": 0.5857, "grad_norm": 1.0214691162109375, "learning_rate": 0.0002, "epoch": 4.9658886894075405, "step": 69150}, {"loss": 0.624, "grad_norm": 1.1042424440383911, "learning_rate": 0.0002, "epoch": 4.9666068222621185, "step": 69160}, {"loss": 0.6475, "grad_norm": 0.8749067783355713, "learning_rate": 0.0002, "epoch": 4.9673249551166965, "step": 69170}, {"loss": 0.6734, "grad_norm": 0.9894024133682251, "learning_rate": 0.0002, "epoch": 4.9680430879712745, "step": 69180}, {"loss": 0.5894, "grad_norm": 1.0218034982681274, "learning_rate": 0.0002, "epoch": 4.9687612208258525, "step": 69190}, {"loss": 0.6423, "grad_norm": 0.9782929420471191, "learning_rate": 0.0002, "epoch": 4.9694793536804305, "step": 69200}, {"loss": 0.6455, "grad_norm": 0.9373409748077393, "learning_rate": 0.0002, "epoch": 4.9701974865350085, "step": 69210}, {"loss": 0.6105, "grad_norm": 1.0329546928405762, "learning_rate": 0.0002, "epoch": 4.970915619389587, "step": 69220}, {"loss": 0.6877, "grad_norm": 0.9746108055114746, "learning_rate": 0.0002, "epoch": 4.971633752244165, "step": 69230}, {"loss": 0.6342, "grad_norm": 0.9202073216438293, "learning_rate": 0.0002, "epoch": 4.972351885098743, "step": 69240}, {"loss": 0.6102, "grad_norm": 1.078032374382019, "learning_rate": 0.0002, "epoch": 4.973070017953321, "step": 69250}, {"loss": 0.6349, "grad_norm": 0.8860024809837341, "learning_rate": 0.0002, "epoch": 4.973788150807899, "step": 69260}, {"loss": 0.5971, "grad_norm": 0.915212094783783, "learning_rate": 0.0002, "epoch": 4.974506283662477, "step": 69270}, {"loss": 0.623, "grad_norm": 1.1192166805267334, "learning_rate": 0.0002, "epoch": 4.975224416517055, "step": 69280}, {"loss": 0.6347, "grad_norm": 0.8387445211410522, "learning_rate": 0.0002, "epoch": 4.975942549371633, "step": 69290}, {"loss": 0.6392, "grad_norm": 1.1210044622421265, "learning_rate": 0.0002, "epoch": 4.976660682226212, "step": 69300}, {"loss": 0.6565, "grad_norm": 1.0051207542419434, "learning_rate": 0.0002, "epoch": 4.97737881508079, "step": 69310}, {"loss": 0.5961, "grad_norm": 0.9248682856559753, "learning_rate": 0.0002, "epoch": 4.978096947935368, "step": 69320}, {"loss": 0.6067, "grad_norm": 0.8265128135681152, "learning_rate": 0.0002, "epoch": 4.978815080789946, "step": 69330}, {"loss": 0.6068, "grad_norm": 0.9432681798934937, "learning_rate": 0.0002, "epoch": 4.979533213644524, "step": 69340}, {"loss": 0.627, "grad_norm": 1.0135977268218994, "learning_rate": 0.0002, "epoch": 4.980251346499102, "step": 69350}, {"loss": 0.5882, "grad_norm": 0.9857245683670044, "learning_rate": 0.0002, "epoch": 4.98096947935368, "step": 69360}, {"loss": 0.6396, "grad_norm": 0.9215952157974243, "learning_rate": 0.0002, "epoch": 4.981687612208258, "step": 69370}, {"loss": 0.565, "grad_norm": 1.1518077850341797, "learning_rate": 0.0002, "epoch": 4.982405745062836, "step": 69380}, {"loss": 0.6022, "grad_norm": 0.8836095929145813, "learning_rate": 0.0002, "epoch": 4.983123877917414, "step": 69390}, {"loss": 0.6442, "grad_norm": 0.8082528710365295, "learning_rate": 0.0002, "epoch": 4.983842010771993, "step": 69400}, {"loss": 0.597, "grad_norm": 0.9295604825019836, "learning_rate": 0.0002, "epoch": 4.984560143626571, "step": 69410}, {"loss": 0.5811, "grad_norm": 1.002057433128357, "learning_rate": 0.0002, "epoch": 4.985278276481149, "step": 69420}, {"loss": 0.6275, "grad_norm": 0.8127216100692749, "learning_rate": 0.0002, "epoch": 4.985996409335727, "step": 69430}, {"loss": 0.6223, "grad_norm": 1.058138370513916, "learning_rate": 0.0002, "epoch": 4.986714542190305, "step": 69440}, {"loss": 0.6317, "grad_norm": 0.8451166749000549, "learning_rate": 0.0002, "epoch": 4.987432675044883, "step": 69450}, {"loss": 0.6135, "grad_norm": 0.9687268137931824, "learning_rate": 0.0002, "epoch": 4.988150807899461, "step": 69460}, {"loss": 0.5926, "grad_norm": 1.0342036485671997, "learning_rate": 0.0002, "epoch": 4.988868940754039, "step": 69470}, {"loss": 0.636, "grad_norm": 0.9042398929595947, "learning_rate": 0.0002, "epoch": 4.989587073608618, "step": 69480}, {"loss": 0.6193, "grad_norm": 1.0575438737869263, "learning_rate": 0.0002, "epoch": 4.990305206463196, "step": 69490}, {"loss": 0.5887, "grad_norm": 0.9364935159683228, "learning_rate": 0.0002, "epoch": 4.991023339317774, "step": 69500}, {"loss": 0.6532, "grad_norm": 1.0327378511428833, "learning_rate": 0.0002, "epoch": 4.991741472172352, "step": 69510}, {"loss": 0.6397, "grad_norm": 0.815592885017395, "learning_rate": 0.0002, "epoch": 4.99245960502693, "step": 69520}, {"loss": 0.6776, "grad_norm": 1.0813369750976562, "learning_rate": 0.0002, "epoch": 4.993177737881508, "step": 69530}, {"loss": 0.6964, "grad_norm": 1.0277023315429688, "learning_rate": 0.0002, "epoch": 4.993895870736086, "step": 69540}, {"loss": 0.6369, "grad_norm": 1.0291162729263306, "learning_rate": 0.0002, "epoch": 4.994614003590664, "step": 69550}, {"loss": 0.5842, "grad_norm": 0.8435685634613037, "learning_rate": 0.0002, "epoch": 4.995332136445242, "step": 69560}, {"loss": 0.6146, "grad_norm": 1.1972291469573975, "learning_rate": 0.0002, "epoch": 4.99605026929982, "step": 69570}, {"loss": 0.5977, "grad_norm": 0.8114907741546631, "learning_rate": 0.0002, "epoch": 4.996768402154398, "step": 69580}, {"loss": 0.6137, "grad_norm": 0.8296133875846863, "learning_rate": 0.0002, "epoch": 4.997486535008977, "step": 69590}, {"loss": 0.6273, "grad_norm": 1.1728706359863281, "learning_rate": 0.0002, "epoch": 4.998204667863555, "step": 69600}, {"loss": 0.6579, "grad_norm": 0.9586578607559204, "learning_rate": 0.0002, "epoch": 4.998922800718133, "step": 69610}, {"loss": 0.612, "grad_norm": 0.9725151062011719, "learning_rate": 0.0002, "epoch": 4.999640933572711, "step": 69620}, {"eval_loss": 1.133581519126892, "eval_runtime": 55.2151, "eval_samples_per_second": 13.275, "eval_steps_per_second": 1.666, "epoch": 5.0, "step": 69625}, {"loss": 0.5741, "grad_norm": 0.9312055706977844, "learning_rate": 0.0002, "epoch": 5.000359066427289, "step": 69630}, {"loss": 0.5625, "grad_norm": 1.0534896850585938, "learning_rate": 0.0002, "epoch": 5.001077199281867, "step": 69640}, {"loss": 0.581, "grad_norm": 0.8891698718070984, "learning_rate": 0.0002, "epoch": 5.001795332136445, "step": 69650}, {"loss": 0.554, "grad_norm": 0.7791097164154053, "learning_rate": 0.0002, "epoch": 5.002513464991023, "step": 69660}, {"loss": 0.5146, "grad_norm": 1.2891173362731934, "learning_rate": 0.0002, "epoch": 5.003231597845601, "step": 69670}, {"loss": 0.551, "grad_norm": 0.7909513711929321, "learning_rate": 0.0002, "epoch": 5.00394973070018, "step": 69680}, {"loss": 0.5671, "grad_norm": 0.988648533821106, "learning_rate": 0.0002, "epoch": 5.004667863554758, "step": 69690}, {"loss": 0.5113, "grad_norm": 0.9669296741485596, "learning_rate": 0.0002, "epoch": 5.005385996409336, "step": 69700}, {"loss": 0.5974, "grad_norm": 1.2393349409103394, "learning_rate": 0.0002, "epoch": 5.006104129263914, "step": 69710}, {"loss": 0.5481, "grad_norm": 1.2420750856399536, "learning_rate": 0.0002, "epoch": 5.006822262118492, "step": 69720}, {"loss": 0.5725, "grad_norm": 1.1698096990585327, "learning_rate": 0.0002, "epoch": 5.00754039497307, "step": 69730}, {"loss": 0.5646, "grad_norm": 1.2228301763534546, "learning_rate": 0.0002, "epoch": 5.008258527827648, "step": 69740}, {"loss": 0.6048, "grad_norm": 0.9350621104240417, "learning_rate": 0.0002, "epoch": 5.008976660682226, "step": 69750}, {"loss": 0.5278, "grad_norm": 0.9828507304191589, "learning_rate": 0.0002, "epoch": 5.009694793536804, "step": 69760}, {"loss": 0.5188, "grad_norm": 0.9372149109840393, "learning_rate": 0.0002, "epoch": 5.010412926391383, "step": 69770}, {"loss": 0.5408, "grad_norm": 0.8098477125167847, "learning_rate": 0.0002, "epoch": 5.011131059245961, "step": 69780}, {"loss": 0.533, "grad_norm": 1.0418338775634766, "learning_rate": 0.0002, "epoch": 5.011849192100539, "step": 69790}, {"loss": 0.5423, "grad_norm": 1.0175801515579224, "learning_rate": 0.0002, "epoch": 5.012567324955117, "step": 69800}, {"loss": 0.5389, "grad_norm": 1.2128081321716309, "learning_rate": 0.0002, "epoch": 5.013285457809695, "step": 69810}, {"loss": 0.5307, "grad_norm": 1.001805067062378, "learning_rate": 0.0002, "epoch": 5.014003590664273, "step": 69820}, {"loss": 0.533, "grad_norm": 0.8957470059394836, "learning_rate": 0.0002, "epoch": 5.014721723518851, "step": 69830}, {"loss": 0.6017, "grad_norm": 0.9344548583030701, "learning_rate": 0.0002, "epoch": 5.015439856373429, "step": 69840}, {"loss": 0.6182, "grad_norm": 0.8545927405357361, "learning_rate": 0.0002, "epoch": 5.016157989228007, "step": 69850}, {"loss": 0.5543, "grad_norm": 1.3907777070999146, "learning_rate": 0.0002, "epoch": 5.016876122082586, "step": 69860}, {"loss": 0.5028, "grad_norm": 0.8112093806266785, "learning_rate": 0.0002, "epoch": 5.017594254937164, "step": 69870}, {"loss": 0.5, "grad_norm": 1.0151532888412476, "learning_rate": 0.0002, "epoch": 5.018312387791742, "step": 69880}, {"loss": 0.5622, "grad_norm": 1.249021053314209, "learning_rate": 0.0002, "epoch": 5.01903052064632, "step": 69890}, {"loss": 0.5419, "grad_norm": 0.9310314059257507, "learning_rate": 0.0002, "epoch": 5.019748653500898, "step": 69900}, {"loss": 0.5628, "grad_norm": 0.9444572925567627, "learning_rate": 0.0002, "epoch": 5.020466786355476, "step": 69910}, {"loss": 0.5436, "grad_norm": 1.0952081680297852, "learning_rate": 0.0002, "epoch": 5.021184919210054, "step": 69920}, {"loss": 0.5532, "grad_norm": 1.2106375694274902, "learning_rate": 0.0002, "epoch": 5.021903052064632, "step": 69930}, {"loss": 0.5307, "grad_norm": 1.0179580450057983, "learning_rate": 0.0002, "epoch": 5.02262118491921, "step": 69940}, {"loss": 0.5537, "grad_norm": 1.0865367650985718, "learning_rate": 0.0002, "epoch": 5.023339317773788, "step": 69950}, {"loss": 0.6011, "grad_norm": 1.0965075492858887, "learning_rate": 0.0002, "epoch": 5.024057450628367, "step": 69960}, {"loss": 0.5255, "grad_norm": 0.8879445791244507, "learning_rate": 0.0002, "epoch": 5.024775583482945, "step": 69970}, {"loss": 0.5681, "grad_norm": 1.2588363885879517, "learning_rate": 0.0002, "epoch": 5.025493716337523, "step": 69980}, {"loss": 0.5288, "grad_norm": 0.935705304145813, "learning_rate": 0.0002, "epoch": 5.026211849192101, "step": 69990}, {"loss": 0.4922, "grad_norm": 1.072012186050415, "learning_rate": 0.0002, "epoch": 5.026929982046679, "step": 70000}, {"loss": 0.5729, "grad_norm": 1.286438226699829, "learning_rate": 0.0002, "epoch": 5.027648114901257, "step": 70010}, {"loss": 0.5569, "grad_norm": 1.1165392398834229, "learning_rate": 0.0002, "epoch": 5.028366247755835, "step": 70020}, {"loss": 0.5348, "grad_norm": 0.7998424172401428, "learning_rate": 0.0002, "epoch": 5.029084380610413, "step": 70030}, {"loss": 0.5436, "grad_norm": 1.5669852495193481, "learning_rate": 0.0002, "epoch": 5.029802513464991, "step": 70040}, {"loss": 0.5595, "grad_norm": 0.9780290722846985, "learning_rate": 0.0002, "epoch": 5.0305206463195695, "step": 70050}, {"loss": 0.5612, "grad_norm": 0.9837628602981567, "learning_rate": 0.0002, "epoch": 5.0312387791741475, "step": 70060}, {"loss": 0.5369, "grad_norm": 0.9558916091918945, "learning_rate": 0.0002, "epoch": 5.0319569120287255, "step": 70070}, {"loss": 0.552, "grad_norm": 0.8893155455589294, "learning_rate": 0.0002, "epoch": 5.0326750448833035, "step": 70080}, {"loss": 0.5684, "grad_norm": 1.1403675079345703, "learning_rate": 0.0002, "epoch": 5.0333931777378815, "step": 70090}, {"loss": 0.5352, "grad_norm": 1.0453649759292603, "learning_rate": 0.0002, "epoch": 5.0341113105924595, "step": 70100}, {"loss": 0.5691, "grad_norm": 0.8127498030662537, "learning_rate": 0.0002, "epoch": 5.0348294434470375, "step": 70110}, {"loss": 0.5254, "grad_norm": 0.9344680309295654, "learning_rate": 0.0002, "epoch": 5.0355475763016155, "step": 70120}, {"loss": 0.5385, "grad_norm": 1.0302079916000366, "learning_rate": 0.0002, "epoch": 5.0362657091561935, "step": 70130}, {"loss": 0.5949, "grad_norm": 1.0549713373184204, "learning_rate": 0.0002, "epoch": 5.036983842010772, "step": 70140}, {"loss": 0.4886, "grad_norm": 0.8916767835617065, "learning_rate": 0.0002, "epoch": 5.03770197486535, "step": 70150}, {"loss": 0.5761, "grad_norm": 0.9799798130989075, "learning_rate": 0.0002, "epoch": 5.038420107719928, "step": 70160}, {"loss": 0.5138, "grad_norm": 1.15560781955719, "learning_rate": 0.0002, "epoch": 5.039138240574506, "step": 70170}, {"loss": 0.6075, "grad_norm": 1.0577017068862915, "learning_rate": 0.0002, "epoch": 5.039856373429084, "step": 70180}, {"loss": 0.5316, "grad_norm": 1.027990698814392, "learning_rate": 0.0002, "epoch": 5.040574506283662, "step": 70190}, {"loss": 0.567, "grad_norm": 1.0818232297897339, "learning_rate": 0.0002, "epoch": 5.04129263913824, "step": 70200}, {"loss": 0.5699, "grad_norm": 1.0287196636199951, "learning_rate": 0.0002, "epoch": 5.042010771992818, "step": 70210}, {"loss": 0.5129, "grad_norm": 1.1569273471832275, "learning_rate": 0.0002, "epoch": 5.042728904847396, "step": 70220}, {"loss": 0.5407, "grad_norm": 1.0485484600067139, "learning_rate": 0.0002, "epoch": 5.0434470377019744, "step": 70230}, {"loss": 0.5203, "grad_norm": 0.9244540333747864, "learning_rate": 0.0002, "epoch": 5.044165170556553, "step": 70240}, {"loss": 0.5277, "grad_norm": 0.9576422572135925, "learning_rate": 0.0002, "epoch": 5.044883303411131, "step": 70250}, {"loss": 0.539, "grad_norm": 0.8719421625137329, "learning_rate": 0.0002, "epoch": 5.045601436265709, "step": 70260}, {"loss": 0.5725, "grad_norm": 0.8685409426689148, "learning_rate": 0.0002, "epoch": 5.046319569120287, "step": 70270}, {"loss": 0.5111, "grad_norm": 1.2735247611999512, "learning_rate": 0.0002, "epoch": 5.047037701974865, "step": 70280}, {"loss": 0.5768, "grad_norm": 0.9082128405570984, "learning_rate": 0.0002, "epoch": 5.047755834829443, "step": 70290}, {"loss": 0.5649, "grad_norm": 1.0626471042633057, "learning_rate": 0.0002, "epoch": 5.048473967684021, "step": 70300}, {"loss": 0.5694, "grad_norm": 1.1463991403579712, "learning_rate": 0.0002, "epoch": 5.049192100538599, "step": 70310}, {"loss": 0.5912, "grad_norm": 0.8825355172157288, "learning_rate": 0.0002, "epoch": 5.049910233393177, "step": 70320}, {"loss": 0.5814, "grad_norm": 1.0549408197402954, "learning_rate": 0.0002, "epoch": 5.050628366247756, "step": 70330}, {"loss": 0.5658, "grad_norm": 1.3740944862365723, "learning_rate": 0.0002, "epoch": 5.051346499102334, "step": 70340}, {"loss": 0.5665, "grad_norm": 1.4197895526885986, "learning_rate": 0.0002, "epoch": 5.052064631956912, "step": 70350}, {"loss": 0.5852, "grad_norm": 1.1764925718307495, "learning_rate": 0.0002, "epoch": 5.05278276481149, "step": 70360}, {"loss": 0.5551, "grad_norm": 1.0443403720855713, "learning_rate": 0.0002, "epoch": 5.053500897666068, "step": 70370}, {"loss": 0.5647, "grad_norm": 1.1807527542114258, "learning_rate": 0.0002, "epoch": 5.054219030520646, "step": 70380}, {"loss": 0.5712, "grad_norm": 1.4032433032989502, "learning_rate": 0.0002, "epoch": 5.054937163375224, "step": 70390}, {"loss": 0.5656, "grad_norm": 0.9815662503242493, "learning_rate": 0.0002, "epoch": 5.055655296229802, "step": 70400}, {"loss": 0.5878, "grad_norm": 0.9368446469306946, "learning_rate": 0.0002, "epoch": 5.05637342908438, "step": 70410}, {"loss": 0.5639, "grad_norm": 1.1156736612319946, "learning_rate": 0.0002, "epoch": 5.057091561938959, "step": 70420}, {"loss": 0.5564, "grad_norm": 1.01651132106781, "learning_rate": 0.0002, "epoch": 5.057809694793537, "step": 70430}, {"loss": 0.5276, "grad_norm": 0.9906342029571533, "learning_rate": 0.0002, "epoch": 5.058527827648115, "step": 70440}, {"loss": 0.5533, "grad_norm": 0.8666667938232422, "learning_rate": 0.0002, "epoch": 5.059245960502693, "step": 70450}, {"loss": 0.5253, "grad_norm": 1.0508924722671509, "learning_rate": 0.0002, "epoch": 5.059964093357271, "step": 70460}, {"loss": 0.5456, "grad_norm": 1.2472858428955078, "learning_rate": 0.0002, "epoch": 5.060682226211849, "step": 70470}, {"loss": 0.5836, "grad_norm": 1.019073724746704, "learning_rate": 0.0002, "epoch": 5.061400359066427, "step": 70480}, {"loss": 0.5206, "grad_norm": 0.9745403528213501, "learning_rate": 0.0002, "epoch": 5.062118491921005, "step": 70490}, {"loss": 0.5543, "grad_norm": 1.121208906173706, "learning_rate": 0.0002, "epoch": 5.062836624775583, "step": 70500}, {"loss": 0.54, "grad_norm": 1.0535147190093994, "learning_rate": 0.0002, "epoch": 5.063554757630161, "step": 70510}, {"loss": 0.5601, "grad_norm": 1.0368950366973877, "learning_rate": 0.0002, "epoch": 5.06427289048474, "step": 70520}, {"loss": 0.5495, "grad_norm": 0.948964536190033, "learning_rate": 0.0002, "epoch": 5.064991023339318, "step": 70530}, {"loss": 0.5254, "grad_norm": 1.0289826393127441, "learning_rate": 0.0002, "epoch": 5.065709156193896, "step": 70540}, {"loss": 0.591, "grad_norm": 1.118374228477478, "learning_rate": 0.0002, "epoch": 5.066427289048474, "step": 70550}, {"loss": 0.5874, "grad_norm": 0.8712816834449768, "learning_rate": 0.0002, "epoch": 5.067145421903052, "step": 70560}, {"loss": 0.557, "grad_norm": 0.9057969450950623, "learning_rate": 0.0002, "epoch": 5.06786355475763, "step": 70570}, {"loss": 0.5606, "grad_norm": 0.9292685985565186, "learning_rate": 0.0002, "epoch": 5.068581687612208, "step": 70580}, {"loss": 0.5468, "grad_norm": 0.9159911274909973, "learning_rate": 0.0002, "epoch": 5.069299820466786, "step": 70590}, {"loss": 0.5608, "grad_norm": 0.973848819732666, "learning_rate": 0.0002, "epoch": 5.070017953321364, "step": 70600}, {"loss": 0.5199, "grad_norm": 0.7892279028892517, "learning_rate": 0.0002, "epoch": 5.070736086175943, "step": 70610}, {"loss": 0.6009, "grad_norm": 0.9943311214447021, "learning_rate": 0.0002, "epoch": 5.071454219030521, "step": 70620}, {"loss": 0.5224, "grad_norm": 1.1457926034927368, "learning_rate": 0.0002, "epoch": 5.072172351885099, "step": 70630}, {"loss": 0.5821, "grad_norm": 0.9307738542556763, "learning_rate": 0.0002, "epoch": 5.072890484739677, "step": 70640}, {"loss": 0.5375, "grad_norm": 1.0899816751480103, "learning_rate": 0.0002, "epoch": 5.073608617594255, "step": 70650}, {"loss": 0.5407, "grad_norm": 0.8357672691345215, "learning_rate": 0.0002, "epoch": 5.074326750448833, "step": 70660}, {"loss": 0.5745, "grad_norm": 0.8889468312263489, "learning_rate": 0.0002, "epoch": 5.075044883303411, "step": 70670}, {"loss": 0.5595, "grad_norm": 0.9152118563652039, "learning_rate": 0.0002, "epoch": 5.075763016157989, "step": 70680}, {"loss": 0.5706, "grad_norm": 1.106160044670105, "learning_rate": 0.0002, "epoch": 5.076481149012567, "step": 70690}, {"loss": 0.5659, "grad_norm": 0.8519207835197449, "learning_rate": 0.0002, "epoch": 5.077199281867145, "step": 70700}, {"loss": 0.5312, "grad_norm": 0.9754986763000488, "learning_rate": 0.0002, "epoch": 5.077917414721724, "step": 70710}, {"loss": 0.5602, "grad_norm": 1.167883276939392, "learning_rate": 0.0002, "epoch": 5.078635547576302, "step": 70720}, {"loss": 0.5427, "grad_norm": 0.987622082233429, "learning_rate": 0.0002, "epoch": 5.07935368043088, "step": 70730}, {"loss": 0.5346, "grad_norm": 1.0008184909820557, "learning_rate": 0.0002, "epoch": 5.080071813285458, "step": 70740}, {"loss": 0.5219, "grad_norm": 0.6318819522857666, "learning_rate": 0.0002, "epoch": 5.080789946140036, "step": 70750}, {"loss": 0.5838, "grad_norm": 0.984886884689331, "learning_rate": 0.0002, "epoch": 5.081508078994614, "step": 70760}, {"loss": 0.5775, "grad_norm": 1.0583622455596924, "learning_rate": 0.0002, "epoch": 5.082226211849192, "step": 70770}, {"loss": 0.579, "grad_norm": 0.9730119705200195, "learning_rate": 0.0002, "epoch": 5.08294434470377, "step": 70780}, {"loss": 0.5806, "grad_norm": 1.0201330184936523, "learning_rate": 0.0002, "epoch": 5.083662477558348, "step": 70790}, {"loss": 0.5568, "grad_norm": 1.0479248762130737, "learning_rate": 0.0002, "epoch": 5.084380610412927, "step": 70800}, {"loss": 0.5619, "grad_norm": 0.9185113906860352, "learning_rate": 0.0002, "epoch": 5.085098743267505, "step": 70810}, {"loss": 0.5468, "grad_norm": 0.9326799511909485, "learning_rate": 0.0002, "epoch": 5.085816876122083, "step": 70820}, {"loss": 0.5424, "grad_norm": 0.958739697933197, "learning_rate": 0.0002, "epoch": 5.086535008976661, "step": 70830}, {"loss": 0.6098, "grad_norm": 0.9643770456314087, "learning_rate": 0.0002, "epoch": 5.087253141831239, "step": 70840}, {"loss": 0.5427, "grad_norm": 0.8650234341621399, "learning_rate": 0.0002, "epoch": 5.087971274685817, "step": 70850}, {"loss": 0.5452, "grad_norm": 0.9354105591773987, "learning_rate": 0.0002, "epoch": 5.088689407540395, "step": 70860}, {"loss": 0.5467, "grad_norm": 0.8736345171928406, "learning_rate": 0.0002, "epoch": 5.089407540394973, "step": 70870}, {"loss": 0.5607, "grad_norm": 0.9172632098197937, "learning_rate": 0.0002, "epoch": 5.090125673249551, "step": 70880}, {"loss": 0.5136, "grad_norm": 0.9495565295219421, "learning_rate": 0.0002, "epoch": 5.09084380610413, "step": 70890}, {"loss": 0.5633, "grad_norm": 1.0328829288482666, "learning_rate": 0.0002, "epoch": 5.091561938958708, "step": 70900}, {"loss": 0.566, "grad_norm": 0.9335703253746033, "learning_rate": 0.0002, "epoch": 5.092280071813286, "step": 70910}, {"loss": 0.5393, "grad_norm": 1.0919437408447266, "learning_rate": 0.0002, "epoch": 5.092998204667864, "step": 70920}, {"loss": 0.5931, "grad_norm": 1.03340744972229, "learning_rate": 0.0002, "epoch": 5.093716337522442, "step": 70930}, {"loss": 0.5228, "grad_norm": 1.0501604080200195, "learning_rate": 0.0002, "epoch": 5.09443447037702, "step": 70940}, {"loss": 0.5518, "grad_norm": 0.9442012310028076, "learning_rate": 0.0002, "epoch": 5.095152603231598, "step": 70950}, {"loss": 0.5185, "grad_norm": 1.2592464685440063, "learning_rate": 0.0002, "epoch": 5.095870736086176, "step": 70960}, {"loss": 0.5524, "grad_norm": 1.0961427688598633, "learning_rate": 0.0002, "epoch": 5.096588868940754, "step": 70970}, {"loss": 0.5702, "grad_norm": 1.0472424030303955, "learning_rate": 0.0002, "epoch": 5.097307001795333, "step": 70980}, {"loss": 0.5697, "grad_norm": 0.9489352107048035, "learning_rate": 0.0002, "epoch": 5.098025134649911, "step": 70990}, {"loss": 0.5559, "grad_norm": 1.0499446392059326, "learning_rate": 0.0002, "epoch": 5.098743267504489, "step": 71000}, {"loss": 0.5815, "grad_norm": 1.013005018234253, "learning_rate": 0.0002, "epoch": 5.099461400359067, "step": 71010}, {"loss": 0.5524, "grad_norm": 0.9594261050224304, "learning_rate": 0.0002, "epoch": 5.100179533213645, "step": 71020}, {"loss": 0.5746, "grad_norm": 1.2016123533248901, "learning_rate": 0.0002, "epoch": 5.100897666068223, "step": 71030}, {"loss": 0.5605, "grad_norm": 1.0389765501022339, "learning_rate": 0.0002, "epoch": 5.101615798922801, "step": 71040}, {"loss": 0.5036, "grad_norm": 1.053534746170044, "learning_rate": 0.0002, "epoch": 5.102333931777379, "step": 71050}, {"loss": 0.5764, "grad_norm": 1.1379448175430298, "learning_rate": 0.0002, "epoch": 5.103052064631957, "step": 71060}, {"loss": 0.5487, "grad_norm": 0.8796491622924805, "learning_rate": 0.0002, "epoch": 5.103770197486535, "step": 71070}, {"loss": 0.59, "grad_norm": 1.0591254234313965, "learning_rate": 0.0002, "epoch": 5.1044883303411135, "step": 71080}, {"loss": 0.5591, "grad_norm": 0.9622171521186829, "learning_rate": 0.0002, "epoch": 5.1052064631956915, "step": 71090}, {"loss": 0.5737, "grad_norm": 0.9173060059547424, "learning_rate": 0.0002, "epoch": 5.1059245960502695, "step": 71100}, {"loss": 0.5794, "grad_norm": 0.8363444805145264, "learning_rate": 0.0002, "epoch": 5.1066427289048475, "step": 71110}, {"loss": 0.5689, "grad_norm": 1.1006172895431519, "learning_rate": 0.0002, "epoch": 5.1073608617594255, "step": 71120}, {"loss": 0.5753, "grad_norm": 1.0720574855804443, "learning_rate": 0.0002, "epoch": 5.1080789946140035, "step": 71130}, {"loss": 0.5585, "grad_norm": 1.0560680627822876, "learning_rate": 0.0002, "epoch": 5.1087971274685815, "step": 71140}, {"loss": 0.5535, "grad_norm": 0.8485415577888489, "learning_rate": 0.0002, "epoch": 5.1095152603231595, "step": 71150}, {"loss": 0.545, "grad_norm": 1.109383225440979, "learning_rate": 0.0002, "epoch": 5.1102333931777375, "step": 71160}, {"loss": 0.568, "grad_norm": 0.9296035766601562, "learning_rate": 0.0002, "epoch": 5.110951526032316, "step": 71170}, {"loss": 0.5151, "grad_norm": 1.2855182886123657, "learning_rate": 0.0002, "epoch": 5.111669658886894, "step": 71180}, {"loss": 0.5578, "grad_norm": 1.0313524007797241, "learning_rate": 0.0002, "epoch": 5.112387791741472, "step": 71190}, {"loss": 0.5486, "grad_norm": 1.0436697006225586, "learning_rate": 0.0002, "epoch": 5.11310592459605, "step": 71200}, {"loss": 0.5592, "grad_norm": 0.901333212852478, "learning_rate": 0.0002, "epoch": 5.113824057450628, "step": 71210}, {"loss": 0.5644, "grad_norm": 1.2170051336288452, "learning_rate": 0.0002, "epoch": 5.114542190305206, "step": 71220}, {"loss": 0.5508, "grad_norm": 0.8850961327552795, "learning_rate": 0.0002, "epoch": 5.115260323159784, "step": 71230}, {"loss": 0.5814, "grad_norm": 1.0147113800048828, "learning_rate": 0.0002, "epoch": 5.115978456014362, "step": 71240}, {"loss": 0.5824, "grad_norm": 1.0043506622314453, "learning_rate": 0.0002, "epoch": 5.11669658886894, "step": 71250}, {"loss": 0.5363, "grad_norm": 0.9887113571166992, "learning_rate": 0.0002, "epoch": 5.117414721723518, "step": 71260}, {"loss": 0.5956, "grad_norm": 1.1013392210006714, "learning_rate": 0.0002, "epoch": 5.118132854578097, "step": 71270}, {"loss": 0.5596, "grad_norm": 0.9213799238204956, "learning_rate": 0.0002, "epoch": 5.118850987432675, "step": 71280}, {"loss": 0.5473, "grad_norm": 1.047400712966919, "learning_rate": 0.0002, "epoch": 5.119569120287253, "step": 71290}, {"loss": 0.5866, "grad_norm": 1.030534029006958, "learning_rate": 0.0002, "epoch": 5.120287253141831, "step": 71300}, {"loss": 0.5713, "grad_norm": 0.9464976191520691, "learning_rate": 0.0002, "epoch": 5.121005385996409, "step": 71310}, {"loss": 0.5707, "grad_norm": 0.8610315918922424, "learning_rate": 0.0002, "epoch": 5.121723518850987, "step": 71320}, {"loss": 0.5498, "grad_norm": 1.0824426412582397, "learning_rate": 0.0002, "epoch": 5.122441651705565, "step": 71330}, {"loss": 0.5802, "grad_norm": 0.9382733106613159, "learning_rate": 0.0002, "epoch": 5.123159784560143, "step": 71340}, {"loss": 0.5899, "grad_norm": 0.9364684224128723, "learning_rate": 0.0002, "epoch": 5.123877917414721, "step": 71350}, {"loss": 0.5839, "grad_norm": 0.9583013653755188, "learning_rate": 0.0002, "epoch": 5.1245960502693, "step": 71360}, {"loss": 0.5446, "grad_norm": 1.287533164024353, "learning_rate": 0.0002, "epoch": 5.125314183123878, "step": 71370}, {"loss": 0.5602, "grad_norm": 1.5031169652938843, "learning_rate": 0.0002, "epoch": 5.126032315978456, "step": 71380}, {"loss": 0.5143, "grad_norm": 0.9891406297683716, "learning_rate": 0.0002, "epoch": 5.126750448833034, "step": 71390}, {"loss": 0.5408, "grad_norm": 1.1851537227630615, "learning_rate": 0.0002, "epoch": 5.127468581687612, "step": 71400}, {"loss": 0.586, "grad_norm": 0.9869971871376038, "learning_rate": 0.0002, "epoch": 5.12818671454219, "step": 71410}, {"loss": 0.575, "grad_norm": 0.961662769317627, "learning_rate": 0.0002, "epoch": 5.128904847396768, "step": 71420}, {"loss": 0.5686, "grad_norm": 1.1036419868469238, "learning_rate": 0.0002, "epoch": 5.129622980251346, "step": 71430}, {"loss": 0.5642, "grad_norm": 1.175361156463623, "learning_rate": 0.0002, "epoch": 5.130341113105924, "step": 71440}, {"loss": 0.5294, "grad_norm": 0.9801875948905945, "learning_rate": 0.0002, "epoch": 5.131059245960503, "step": 71450}, {"loss": 0.5123, "grad_norm": 0.9424611330032349, "learning_rate": 0.0002, "epoch": 5.131777378815081, "step": 71460}, {"loss": 0.651, "grad_norm": 1.11662757396698, "learning_rate": 0.0002, "epoch": 5.132495511669659, "step": 71470}, {"loss": 0.5498, "grad_norm": 0.9969366192817688, "learning_rate": 0.0002, "epoch": 5.133213644524237, "step": 71480}, {"loss": 0.5315, "grad_norm": 1.278640866279602, "learning_rate": 0.0002, "epoch": 5.133931777378815, "step": 71490}, {"loss": 0.5525, "grad_norm": 1.1090457439422607, "learning_rate": 0.0002, "epoch": 5.134649910233393, "step": 71500}, {"loss": 0.5307, "grad_norm": 1.01808500289917, "learning_rate": 0.0002, "epoch": 5.135368043087971, "step": 71510}, {"loss": 0.5465, "grad_norm": 1.029135823249817, "learning_rate": 0.0002, "epoch": 5.136086175942549, "step": 71520}, {"loss": 0.588, "grad_norm": 1.1207175254821777, "learning_rate": 0.0002, "epoch": 5.136804308797127, "step": 71530}, {"loss": 0.5451, "grad_norm": 1.0327218770980835, "learning_rate": 0.0002, "epoch": 5.137522441651706, "step": 71540}, {"loss": 0.5944, "grad_norm": 1.042490839958191, "learning_rate": 0.0002, "epoch": 5.138240574506284, "step": 71550}, {"loss": 0.5777, "grad_norm": 1.1800413131713867, "learning_rate": 0.0002, "epoch": 5.138958707360862, "step": 71560}, {"loss": 0.6002, "grad_norm": 1.0748766660690308, "learning_rate": 0.0002, "epoch": 5.13967684021544, "step": 71570}, {"loss": 0.5418, "grad_norm": 0.9983090758323669, "learning_rate": 0.0002, "epoch": 5.140394973070018, "step": 71580}, {"loss": 0.5423, "grad_norm": 1.30636727809906, "learning_rate": 0.0002, "epoch": 5.141113105924596, "step": 71590}, {"loss": 0.5742, "grad_norm": 0.9960222840309143, "learning_rate": 0.0002, "epoch": 5.141831238779174, "step": 71600}, {"loss": 0.5496, "grad_norm": 1.237027645111084, "learning_rate": 0.0002, "epoch": 5.142549371633752, "step": 71610}, {"loss": 0.564, "grad_norm": 1.0913307666778564, "learning_rate": 0.0002, "epoch": 5.14326750448833, "step": 71620}, {"loss": 0.5458, "grad_norm": 0.940657913684845, "learning_rate": 0.0002, "epoch": 5.143985637342908, "step": 71630}, {"loss": 0.5918, "grad_norm": 1.093796730041504, "learning_rate": 0.0002, "epoch": 5.144703770197487, "step": 71640}, {"loss": 0.5519, "grad_norm": 0.9703856110572815, "learning_rate": 0.0002, "epoch": 5.145421903052065, "step": 71650}, {"loss": 0.5859, "grad_norm": 0.9874776005744934, "learning_rate": 0.0002, "epoch": 5.146140035906643, "step": 71660}, {"loss": 0.555, "grad_norm": 0.9723859429359436, "learning_rate": 0.0002, "epoch": 5.146858168761221, "step": 71670}, {"loss": 0.5866, "grad_norm": 0.997107207775116, "learning_rate": 0.0002, "epoch": 5.147576301615799, "step": 71680}, {"loss": 0.5399, "grad_norm": 1.0261175632476807, "learning_rate": 0.0002, "epoch": 5.148294434470377, "step": 71690}, {"loss": 0.5427, "grad_norm": 0.9093905687332153, "learning_rate": 0.0002, "epoch": 5.149012567324955, "step": 71700}, {"loss": 0.557, "grad_norm": 0.9909888505935669, "learning_rate": 0.0002, "epoch": 5.149730700179533, "step": 71710}, {"loss": 0.5343, "grad_norm": 0.9111971259117126, "learning_rate": 0.0002, "epoch": 5.150448833034111, "step": 71720}, {"loss": 0.5717, "grad_norm": 0.9319643974304199, "learning_rate": 0.0002, "epoch": 5.15116696588869, "step": 71730}, {"loss": 0.5676, "grad_norm": 1.0744104385375977, "learning_rate": 0.0002, "epoch": 5.151885098743268, "step": 71740}, {"loss": 0.5914, "grad_norm": 1.1555477380752563, "learning_rate": 0.0002, "epoch": 5.152603231597846, "step": 71750}, {"loss": 0.5859, "grad_norm": 0.9809171557426453, "learning_rate": 0.0002, "epoch": 5.153321364452424, "step": 71760}, {"loss": 0.5663, "grad_norm": 0.7937686443328857, "learning_rate": 0.0002, "epoch": 5.154039497307002, "step": 71770}, {"loss": 0.5637, "grad_norm": 1.1925430297851562, "learning_rate": 0.0002, "epoch": 5.15475763016158, "step": 71780}, {"loss": 0.5759, "grad_norm": 1.077412486076355, "learning_rate": 0.0002, "epoch": 5.155475763016158, "step": 71790}, {"loss": 0.5653, "grad_norm": 0.7992808222770691, "learning_rate": 0.0002, "epoch": 5.156193895870736, "step": 71800}, {"loss": 0.5596, "grad_norm": 1.0938535928726196, "learning_rate": 0.0002, "epoch": 5.156912028725314, "step": 71810}, {"loss": 0.5562, "grad_norm": 0.9458112120628357, "learning_rate": 0.0002, "epoch": 5.157630161579892, "step": 71820}, {"loss": 0.5514, "grad_norm": 0.984940230846405, "learning_rate": 0.0002, "epoch": 5.158348294434471, "step": 71830}, {"loss": 0.5262, "grad_norm": 0.9242565035820007, "learning_rate": 0.0002, "epoch": 5.159066427289049, "step": 71840}, {"loss": 0.5591, "grad_norm": 0.8386720418930054, "learning_rate": 0.0002, "epoch": 5.159784560143627, "step": 71850}, {"loss": 0.5871, "grad_norm": 0.9627357721328735, "learning_rate": 0.0002, "epoch": 5.160502692998205, "step": 71860}, {"loss": 0.6063, "grad_norm": 1.0118762254714966, "learning_rate": 0.0002, "epoch": 5.161220825852783, "step": 71870}, {"loss": 0.5558, "grad_norm": 1.1552608013153076, "learning_rate": 0.0002, "epoch": 5.161938958707361, "step": 71880}, {"loss": 0.5789, "grad_norm": 1.0910389423370361, "learning_rate": 0.0002, "epoch": 5.162657091561939, "step": 71890}, {"loss": 0.5568, "grad_norm": 1.046639084815979, "learning_rate": 0.0002, "epoch": 5.163375224416517, "step": 71900}, {"loss": 0.5646, "grad_norm": 1.0087649822235107, "learning_rate": 0.0002, "epoch": 5.164093357271095, "step": 71910}, {"loss": 0.5663, "grad_norm": 0.9418644309043884, "learning_rate": 0.0002, "epoch": 5.164811490125674, "step": 71920}, {"loss": 0.5668, "grad_norm": 1.1213915348052979, "learning_rate": 0.0002, "epoch": 5.165529622980252, "step": 71930}, {"loss": 0.5979, "grad_norm": 1.043786644935608, "learning_rate": 0.0002, "epoch": 5.16624775583483, "step": 71940}, {"loss": 0.5714, "grad_norm": 1.2150449752807617, "learning_rate": 0.0002, "epoch": 5.166965888689408, "step": 71950}, {"loss": 0.5766, "grad_norm": 1.1214520931243896, "learning_rate": 0.0002, "epoch": 5.167684021543986, "step": 71960}, {"loss": 0.5851, "grad_norm": 0.9235218167304993, "learning_rate": 0.0002, "epoch": 5.168402154398564, "step": 71970}, {"loss": 0.5917, "grad_norm": 0.8736480474472046, "learning_rate": 0.0002, "epoch": 5.169120287253142, "step": 71980}, {"loss": 0.5508, "grad_norm": 0.8723195195198059, "learning_rate": 0.0002, "epoch": 5.16983842010772, "step": 71990}, {"loss": 0.5927, "grad_norm": 1.0873022079467773, "learning_rate": 0.0002, "epoch": 5.170556552962298, "step": 72000}, {"loss": 0.5507, "grad_norm": 0.9196295142173767, "learning_rate": 0.0002, "epoch": 5.1712746858168765, "step": 72010}, {"loss": 0.5416, "grad_norm": 0.9244471192359924, "learning_rate": 0.0002, "epoch": 5.1719928186714546, "step": 72020}, {"loss": 0.5626, "grad_norm": 1.0555505752563477, "learning_rate": 0.0002, "epoch": 5.1727109515260326, "step": 72030}, {"loss": 0.6181, "grad_norm": 1.1527929306030273, "learning_rate": 0.0002, "epoch": 5.1734290843806106, "step": 72040}, {"loss": 0.6129, "grad_norm": 0.9069058895111084, "learning_rate": 0.0002, "epoch": 5.174147217235189, "step": 72050}, {"loss": 0.5597, "grad_norm": 1.1047141551971436, "learning_rate": 0.0002, "epoch": 5.174865350089767, "step": 72060}, {"loss": 0.5307, "grad_norm": 0.9805511832237244, "learning_rate": 0.0002, "epoch": 5.175583482944345, "step": 72070}, {"loss": 0.5672, "grad_norm": 1.1636970043182373, "learning_rate": 0.0002, "epoch": 5.176301615798923, "step": 72080}, {"loss": 0.6424, "grad_norm": 1.0193538665771484, "learning_rate": 0.0002, "epoch": 5.177019748653501, "step": 72090}, {"loss": 0.5722, "grad_norm": 0.8850618600845337, "learning_rate": 0.0002, "epoch": 5.177737881508079, "step": 72100}, {"loss": 0.5938, "grad_norm": 1.042271614074707, "learning_rate": 0.0002, "epoch": 5.1784560143626575, "step": 72110}, {"loss": 0.569, "grad_norm": 1.1405227184295654, "learning_rate": 0.0002, "epoch": 5.1791741472172355, "step": 72120}, {"loss": 0.5762, "grad_norm": 1.0013195276260376, "learning_rate": 0.0002, "epoch": 5.1798922800718135, "step": 72130}, {"loss": 0.5948, "grad_norm": 1.0474903583526611, "learning_rate": 0.0002, "epoch": 5.1806104129263915, "step": 72140}, {"loss": 0.5692, "grad_norm": 1.0384612083435059, "learning_rate": 0.0002, "epoch": 5.1813285457809695, "step": 72150}, {"loss": 0.5588, "grad_norm": 1.145086646080017, "learning_rate": 0.0002, "epoch": 5.1820466786355475, "step": 72160}, {"loss": 0.5294, "grad_norm": 1.0845173597335815, "learning_rate": 0.0002, "epoch": 5.1827648114901255, "step": 72170}, {"loss": 0.5796, "grad_norm": 0.9870346188545227, "learning_rate": 0.0002, "epoch": 5.1834829443447035, "step": 72180}, {"loss": 0.5844, "grad_norm": 1.1098768711090088, "learning_rate": 0.0002, "epoch": 5.1842010771992815, "step": 72190}, {"loss": 0.5536, "grad_norm": 0.9397785067558289, "learning_rate": 0.0002, "epoch": 5.18491921005386, "step": 72200}, {"loss": 0.5847, "grad_norm": 1.0817532539367676, "learning_rate": 0.0002, "epoch": 5.185637342908438, "step": 72210}, {"loss": 0.5492, "grad_norm": 1.0027309656143188, "learning_rate": 0.0002, "epoch": 5.186355475763016, "step": 72220}, {"loss": 0.5685, "grad_norm": 0.8262016773223877, "learning_rate": 0.0002, "epoch": 5.187073608617594, "step": 72230}, {"loss": 0.53, "grad_norm": 0.9968137741088867, "learning_rate": 0.0002, "epoch": 5.187791741472172, "step": 72240}, {"loss": 0.5663, "grad_norm": 0.9072695970535278, "learning_rate": 0.0002, "epoch": 5.18850987432675, "step": 72250}, {"loss": 0.5799, "grad_norm": 1.0388357639312744, "learning_rate": 0.0002, "epoch": 5.189228007181328, "step": 72260}, {"loss": 0.5805, "grad_norm": 0.8883537650108337, "learning_rate": 0.0002, "epoch": 5.189946140035906, "step": 72270}, {"loss": 0.5723, "grad_norm": 1.0161921977996826, "learning_rate": 0.0002, "epoch": 5.190664272890484, "step": 72280}, {"loss": 0.5805, "grad_norm": 0.964936375617981, "learning_rate": 0.0002, "epoch": 5.191382405745063, "step": 72290}, {"loss": 0.5145, "grad_norm": 0.9728496670722961, "learning_rate": 0.0002, "epoch": 5.192100538599641, "step": 72300}, {"loss": 0.552, "grad_norm": 1.2411649227142334, "learning_rate": 0.0002, "epoch": 5.192818671454219, "step": 72310}, {"loss": 0.5482, "grad_norm": 0.9430946111679077, "learning_rate": 0.0002, "epoch": 5.193536804308797, "step": 72320}, {"loss": 0.5007, "grad_norm": 1.1522886753082275, "learning_rate": 0.0002, "epoch": 5.194254937163375, "step": 72330}, {"loss": 0.5013, "grad_norm": 1.0727189779281616, "learning_rate": 0.0002, "epoch": 5.194973070017953, "step": 72340}, {"loss": 0.5157, "grad_norm": 1.2506077289581299, "learning_rate": 0.0002, "epoch": 5.195691202872531, "step": 72350}, {"loss": 0.592, "grad_norm": 1.0949938297271729, "learning_rate": 0.0002, "epoch": 5.196409335727109, "step": 72360}, {"loss": 0.5642, "grad_norm": 1.191125750541687, "learning_rate": 0.0002, "epoch": 5.197127468581687, "step": 72370}, {"loss": 0.5756, "grad_norm": 1.1154223680496216, "learning_rate": 0.0002, "epoch": 5.197845601436265, "step": 72380}, {"loss": 0.5996, "grad_norm": 0.9623886942863464, "learning_rate": 0.0002, "epoch": 5.198563734290844, "step": 72390}, {"loss": 0.5579, "grad_norm": 0.9432680010795593, "learning_rate": 0.0002, "epoch": 5.199281867145422, "step": 72400}, {"loss": 0.6055, "grad_norm": 1.035905122756958, "learning_rate": 0.0002, "epoch": 5.2, "step": 72410}, {"loss": 0.5515, "grad_norm": 0.9044913053512573, "learning_rate": 0.0002, "epoch": 5.200718132854578, "step": 72420}, {"loss": 0.5845, "grad_norm": 1.082187533378601, "learning_rate": 0.0002, "epoch": 5.201436265709156, "step": 72430}, {"loss": 0.6215, "grad_norm": 0.9368400573730469, "learning_rate": 0.0002, "epoch": 5.202154398563734, "step": 72440}, {"loss": 0.5903, "grad_norm": 1.1515194177627563, "learning_rate": 0.0002, "epoch": 5.202872531418312, "step": 72450}, {"loss": 0.5698, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 5.20359066427289, "step": 72460}, {"loss": 0.5534, "grad_norm": 1.0885688066482544, "learning_rate": 0.0002, "epoch": 5.204308797127468, "step": 72470}, {"loss": 0.5459, "grad_norm": 0.8189428448677063, "learning_rate": 0.0002, "epoch": 5.205026929982047, "step": 72480}, {"loss": 0.5981, "grad_norm": 1.0145429372787476, "learning_rate": 0.0002, "epoch": 5.205745062836625, "step": 72490}, {"loss": 0.5451, "grad_norm": 1.132490634918213, "learning_rate": 0.0002, "epoch": 5.206463195691203, "step": 72500}, {"loss": 0.5566, "grad_norm": 0.8866808414459229, "learning_rate": 0.0002, "epoch": 5.207181328545781, "step": 72510}, {"loss": 0.5469, "grad_norm": 0.9681518077850342, "learning_rate": 0.0002, "epoch": 5.207899461400359, "step": 72520}, {"loss": 0.5716, "grad_norm": 0.9992330074310303, "learning_rate": 0.0002, "epoch": 5.208617594254937, "step": 72530}, {"loss": 0.5894, "grad_norm": 1.0767436027526855, "learning_rate": 0.0002, "epoch": 5.209335727109515, "step": 72540}, {"loss": 0.5828, "grad_norm": 1.1362388134002686, "learning_rate": 0.0002, "epoch": 5.210053859964093, "step": 72550}, {"loss": 0.6156, "grad_norm": 0.9741758704185486, "learning_rate": 0.0002, "epoch": 5.210771992818671, "step": 72560}, {"loss": 0.6119, "grad_norm": 0.8216298818588257, "learning_rate": 0.0002, "epoch": 5.211490125673249, "step": 72570}, {"loss": 0.5813, "grad_norm": 0.7500724792480469, "learning_rate": 0.0002, "epoch": 5.212208258527828, "step": 72580}, {"loss": 0.5427, "grad_norm": 0.9152594804763794, "learning_rate": 0.0002, "epoch": 5.212926391382406, "step": 72590}, {"loss": 0.5792, "grad_norm": 1.014940857887268, "learning_rate": 0.0002, "epoch": 5.213644524236984, "step": 72600}, {"loss": 0.5487, "grad_norm": 0.9333099722862244, "learning_rate": 0.0002, "epoch": 5.214362657091562, "step": 72610}, {"loss": 0.5647, "grad_norm": 0.7940610647201538, "learning_rate": 0.0002, "epoch": 5.21508078994614, "step": 72620}, {"loss": 0.5474, "grad_norm": 1.0365521907806396, "learning_rate": 0.0002, "epoch": 5.215798922800718, "step": 72630}, {"loss": 0.6009, "grad_norm": 1.37727952003479, "learning_rate": 0.0002, "epoch": 5.216517055655296, "step": 72640}, {"loss": 0.5389, "grad_norm": 1.2019168138504028, "learning_rate": 0.0002, "epoch": 5.217235188509874, "step": 72650}, {"loss": 0.5593, "grad_norm": 1.1696226596832275, "learning_rate": 0.0002, "epoch": 5.217953321364452, "step": 72660}, {"loss": 0.5507, "grad_norm": 0.9608798623085022, "learning_rate": 0.0002, "epoch": 5.218671454219031, "step": 72670}, {"loss": 0.5502, "grad_norm": 0.9139777421951294, "learning_rate": 0.0002, "epoch": 5.219389587073609, "step": 72680}, {"loss": 0.5955, "grad_norm": 0.9937016367912292, "learning_rate": 0.0002, "epoch": 5.220107719928187, "step": 72690}, {"loss": 0.6031, "grad_norm": 1.2787059545516968, "learning_rate": 0.0002, "epoch": 5.220825852782765, "step": 72700}, {"loss": 0.5601, "grad_norm": 1.0757197141647339, "learning_rate": 0.0002, "epoch": 5.221543985637343, "step": 72710}, {"loss": 0.5556, "grad_norm": 0.8053579926490784, "learning_rate": 0.0002, "epoch": 5.222262118491921, "step": 72720}, {"loss": 0.5655, "grad_norm": 1.0239759683609009, "learning_rate": 0.0002, "epoch": 5.222980251346499, "step": 72730}, {"loss": 0.6153, "grad_norm": 0.9972975850105286, "learning_rate": 0.0002, "epoch": 5.223698384201077, "step": 72740}, {"loss": 0.569, "grad_norm": 1.0504519939422607, "learning_rate": 0.0002, "epoch": 5.224416517055655, "step": 72750}, {"loss": 0.5345, "grad_norm": 1.1793010234832764, "learning_rate": 0.0002, "epoch": 5.225134649910234, "step": 72760}, {"loss": 0.5674, "grad_norm": 1.1098815202713013, "learning_rate": 0.0002, "epoch": 5.225852782764812, "step": 72770}, {"loss": 0.5689, "grad_norm": 1.1078516244888306, "learning_rate": 0.0002, "epoch": 5.22657091561939, "step": 72780}, {"loss": 0.5614, "grad_norm": 0.8684433698654175, "learning_rate": 0.0002, "epoch": 5.227289048473968, "step": 72790}, {"loss": 0.5545, "grad_norm": 1.159390926361084, "learning_rate": 0.0002, "epoch": 5.228007181328546, "step": 72800}, {"loss": 0.5726, "grad_norm": 1.0468506813049316, "learning_rate": 0.0002, "epoch": 5.228725314183124, "step": 72810}, {"loss": 0.5662, "grad_norm": 0.8684625029563904, "learning_rate": 0.0002, "epoch": 5.229443447037702, "step": 72820}, {"loss": 0.6074, "grad_norm": 1.0117321014404297, "learning_rate": 0.0002, "epoch": 5.23016157989228, "step": 72830}, {"loss": 0.5956, "grad_norm": 1.0513219833374023, "learning_rate": 0.0002, "epoch": 5.230879712746858, "step": 72840}, {"loss": 0.5796, "grad_norm": 1.0659555196762085, "learning_rate": 0.0002, "epoch": 5.231597845601437, "step": 72850}, {"loss": 0.5916, "grad_norm": 0.7726831436157227, "learning_rate": 0.0002, "epoch": 5.232315978456015, "step": 72860}, {"loss": 0.557, "grad_norm": 1.0346935987472534, "learning_rate": 0.0002, "epoch": 5.233034111310593, "step": 72870}, {"loss": 0.567, "grad_norm": 0.9112410545349121, "learning_rate": 0.0002, "epoch": 5.233752244165171, "step": 72880}, {"loss": 0.575, "grad_norm": 1.2933332920074463, "learning_rate": 0.0002, "epoch": 5.234470377019749, "step": 72890}, {"loss": 0.5733, "grad_norm": 0.9740806221961975, "learning_rate": 0.0002, "epoch": 5.235188509874327, "step": 72900}, {"loss": 0.5661, "grad_norm": 0.8041712641716003, "learning_rate": 0.0002, "epoch": 5.235906642728905, "step": 72910}, {"loss": 0.5936, "grad_norm": 0.9510180950164795, "learning_rate": 0.0002, "epoch": 5.236624775583483, "step": 72920}, {"loss": 0.6312, "grad_norm": 0.9103419780731201, "learning_rate": 0.0002, "epoch": 5.237342908438061, "step": 72930}, {"loss": 0.5298, "grad_norm": 0.8317763805389404, "learning_rate": 0.0002, "epoch": 5.238061041292639, "step": 72940}, {"loss": 0.5887, "grad_norm": 1.0269867181777954, "learning_rate": 0.0002, "epoch": 5.238779174147218, "step": 72950}, {"loss": 0.6141, "grad_norm": 1.0599713325500488, "learning_rate": 0.0002, "epoch": 5.239497307001796, "step": 72960}, {"loss": 0.5785, "grad_norm": 0.9341228008270264, "learning_rate": 0.0002, "epoch": 5.240215439856374, "step": 72970}, {"loss": 0.5256, "grad_norm": 1.1216323375701904, "learning_rate": 0.0002, "epoch": 5.240933572710952, "step": 72980}, {"loss": 0.5995, "grad_norm": 0.9396152496337891, "learning_rate": 0.0002, "epoch": 5.24165170556553, "step": 72990}, {"loss": 0.6281, "grad_norm": 1.1474549770355225, "learning_rate": 0.0002, "epoch": 5.242369838420108, "step": 73000}, {"loss": 0.5693, "grad_norm": 1.2160102128982544, "learning_rate": 0.0002, "epoch": 5.243087971274686, "step": 73010}, {"loss": 0.5914, "grad_norm": 1.0755409002304077, "learning_rate": 0.0002, "epoch": 5.243806104129264, "step": 73020}, {"loss": 0.5697, "grad_norm": 1.0645225048065186, "learning_rate": 0.0002, "epoch": 5.244524236983842, "step": 73030}, {"loss": 0.5669, "grad_norm": 1.1155469417572021, "learning_rate": 0.0002, "epoch": 5.2452423698384205, "step": 73040}, {"loss": 0.5448, "grad_norm": 1.1631708145141602, "learning_rate": 0.0002, "epoch": 5.2459605026929985, "step": 73050}, {"loss": 0.6034, "grad_norm": 0.8747480511665344, "learning_rate": 0.0002, "epoch": 5.2466786355475765, "step": 73060}, {"loss": 0.5647, "grad_norm": 0.9174497723579407, "learning_rate": 0.0002, "epoch": 5.2473967684021545, "step": 73070}, {"loss": 0.5804, "grad_norm": 1.334018349647522, "learning_rate": 0.0002, "epoch": 5.2481149012567325, "step": 73080}, {"loss": 0.5491, "grad_norm": 1.0842393636703491, "learning_rate": 0.0002, "epoch": 5.2488330341113105, "step": 73090}, {"loss": 0.6078, "grad_norm": 1.0531692504882812, "learning_rate": 0.0002, "epoch": 5.2495511669658885, "step": 73100}, {"loss": 0.5912, "grad_norm": 0.9069980978965759, "learning_rate": 0.0002, "epoch": 5.2502692998204665, "step": 73110}, {"loss": 0.5845, "grad_norm": 1.1319832801818848, "learning_rate": 0.0002, "epoch": 5.2509874326750445, "step": 73120}, {"loss": 0.5921, "grad_norm": 1.0468456745147705, "learning_rate": 0.0002, "epoch": 5.2517055655296225, "step": 73130}, {"loss": 0.5688, "grad_norm": 1.1752768754959106, "learning_rate": 0.0002, "epoch": 5.252423698384201, "step": 73140}, {"loss": 0.5709, "grad_norm": 1.0697909593582153, "learning_rate": 0.0002, "epoch": 5.253141831238779, "step": 73150}, {"loss": 0.6187, "grad_norm": 1.1179429292678833, "learning_rate": 0.0002, "epoch": 5.253859964093357, "step": 73160}, {"loss": 0.6127, "grad_norm": 0.9088113903999329, "learning_rate": 0.0002, "epoch": 5.254578096947935, "step": 73170}, {"loss": 0.629, "grad_norm": 0.8814208507537842, "learning_rate": 0.0002, "epoch": 5.255296229802513, "step": 73180}, {"loss": 0.5881, "grad_norm": 1.026688814163208, "learning_rate": 0.0002, "epoch": 5.256014362657091, "step": 73190}, {"loss": 0.5883, "grad_norm": 0.9974902868270874, "learning_rate": 0.0002, "epoch": 5.256732495511669, "step": 73200}, {"loss": 0.5219, "grad_norm": 0.948743999004364, "learning_rate": 0.0002, "epoch": 5.257450628366247, "step": 73210}, {"loss": 0.5489, "grad_norm": 0.9069591164588928, "learning_rate": 0.0002, "epoch": 5.258168761220825, "step": 73220}, {"loss": 0.5667, "grad_norm": 1.0574030876159668, "learning_rate": 0.0002, "epoch": 5.258886894075404, "step": 73230}, {"loss": 0.5903, "grad_norm": 0.9299649596214294, "learning_rate": 0.0002, "epoch": 5.259605026929982, "step": 73240}, {"loss": 0.5678, "grad_norm": 0.9888820648193359, "learning_rate": 0.0002, "epoch": 5.26032315978456, "step": 73250}, {"loss": 0.5993, "grad_norm": 1.0164920091629028, "learning_rate": 0.0002, "epoch": 5.261041292639138, "step": 73260}, {"loss": 0.5585, "grad_norm": 0.933210551738739, "learning_rate": 0.0002, "epoch": 5.261759425493716, "step": 73270}, {"loss": 0.6061, "grad_norm": 1.1754034757614136, "learning_rate": 0.0002, "epoch": 5.262477558348294, "step": 73280}, {"loss": 0.5727, "grad_norm": 1.1599570512771606, "learning_rate": 0.0002, "epoch": 5.263195691202872, "step": 73290}, {"loss": 0.6252, "grad_norm": 1.0497905015945435, "learning_rate": 0.0002, "epoch": 5.26391382405745, "step": 73300}, {"loss": 0.5861, "grad_norm": 1.3603366613388062, "learning_rate": 0.0002, "epoch": 5.264631956912028, "step": 73310}, {"loss": 0.5713, "grad_norm": 1.0283215045928955, "learning_rate": 0.0002, "epoch": 5.265350089766607, "step": 73320}, {"loss": 0.6048, "grad_norm": 1.1043906211853027, "learning_rate": 0.0002, "epoch": 5.266068222621185, "step": 73330}, {"loss": 0.5383, "grad_norm": 0.9386111497879028, "learning_rate": 0.0002, "epoch": 5.266786355475763, "step": 73340}, {"loss": 0.5826, "grad_norm": 1.3586112260818481, "learning_rate": 0.0002, "epoch": 5.267504488330341, "step": 73350}, {"loss": 0.6213, "grad_norm": 1.034179449081421, "learning_rate": 0.0002, "epoch": 5.268222621184919, "step": 73360}, {"loss": 0.5809, "grad_norm": 0.9645284414291382, "learning_rate": 0.0002, "epoch": 5.268940754039497, "step": 73370}, {"loss": 0.5595, "grad_norm": 1.1078046560287476, "learning_rate": 0.0002, "epoch": 5.269658886894075, "step": 73380}, {"loss": 0.5518, "grad_norm": 0.9737151265144348, "learning_rate": 0.0002, "epoch": 5.270377019748653, "step": 73390}, {"loss": 0.5984, "grad_norm": 1.1911388635635376, "learning_rate": 0.0002, "epoch": 5.271095152603231, "step": 73400}, {"loss": 0.5867, "grad_norm": 0.9089180827140808, "learning_rate": 0.0002, "epoch": 5.27181328545781, "step": 73410}, {"loss": 0.6021, "grad_norm": 1.094515085220337, "learning_rate": 0.0002, "epoch": 5.272531418312388, "step": 73420}, {"loss": 0.652, "grad_norm": 1.2531700134277344, "learning_rate": 0.0002, "epoch": 5.273249551166966, "step": 73430}, {"loss": 0.5616, "grad_norm": 0.9279667139053345, "learning_rate": 0.0002, "epoch": 5.273967684021544, "step": 73440}, {"loss": 0.5378, "grad_norm": 0.9872317314147949, "learning_rate": 0.0002, "epoch": 5.274685816876122, "step": 73450}, {"loss": 0.5732, "grad_norm": 1.0645262002944946, "learning_rate": 0.0002, "epoch": 5.2754039497307, "step": 73460}, {"loss": 0.5331, "grad_norm": 0.9505489468574524, "learning_rate": 0.0002, "epoch": 5.276122082585278, "step": 73470}, {"loss": 0.5826, "grad_norm": 1.0444035530090332, "learning_rate": 0.0002, "epoch": 5.276840215439856, "step": 73480}, {"loss": 0.6267, "grad_norm": 1.1813455820083618, "learning_rate": 0.0002, "epoch": 5.277558348294434, "step": 73490}, {"loss": 0.5645, "grad_norm": 0.782117486000061, "learning_rate": 0.0002, "epoch": 5.278276481149012, "step": 73500}, {"loss": 0.5829, "grad_norm": 0.8837172389030457, "learning_rate": 0.0002, "epoch": 5.278994614003591, "step": 73510}, {"loss": 0.5894, "grad_norm": 0.8320443630218506, "learning_rate": 0.0002, "epoch": 5.279712746858169, "step": 73520}, {"loss": 0.5793, "grad_norm": 1.111466407775879, "learning_rate": 0.0002, "epoch": 5.280430879712747, "step": 73530}, {"loss": 0.5796, "grad_norm": 1.0448017120361328, "learning_rate": 0.0002, "epoch": 5.281149012567325, "step": 73540}, {"loss": 0.5642, "grad_norm": 1.2046639919281006, "learning_rate": 0.0002, "epoch": 5.281867145421903, "step": 73550}, {"loss": 0.5859, "grad_norm": 1.084886074066162, "learning_rate": 0.0002, "epoch": 5.282585278276481, "step": 73560}, {"loss": 0.6055, "grad_norm": 0.8321937918663025, "learning_rate": 0.0002, "epoch": 5.283303411131059, "step": 73570}, {"loss": 0.5735, "grad_norm": 1.172440767288208, "learning_rate": 0.0002, "epoch": 5.284021543985637, "step": 73580}, {"loss": 0.5491, "grad_norm": 0.937133252620697, "learning_rate": 0.0002, "epoch": 5.284739676840215, "step": 73590}, {"loss": 0.5575, "grad_norm": 1.0996583700180054, "learning_rate": 0.0002, "epoch": 5.285457809694794, "step": 73600}, {"loss": 0.5813, "grad_norm": 1.2459958791732788, "learning_rate": 0.0002, "epoch": 5.286175942549372, "step": 73610}, {"loss": 0.6146, "grad_norm": 0.8362332582473755, "learning_rate": 0.0002, "epoch": 5.28689407540395, "step": 73620}, {"loss": 0.5333, "grad_norm": 0.9784061312675476, "learning_rate": 0.0002, "epoch": 5.287612208258528, "step": 73630}, {"loss": 0.6146, "grad_norm": 1.087041974067688, "learning_rate": 0.0002, "epoch": 5.288330341113106, "step": 73640}, {"loss": 0.5775, "grad_norm": 0.8641281723976135, "learning_rate": 0.0002, "epoch": 5.289048473967684, "step": 73650}, {"loss": 0.5592, "grad_norm": 1.030386209487915, "learning_rate": 0.0002, "epoch": 5.289766606822262, "step": 73660}, {"loss": 0.5899, "grad_norm": 1.0551509857177734, "learning_rate": 0.0002, "epoch": 5.29048473967684, "step": 73670}, {"loss": 0.5805, "grad_norm": 0.9969013333320618, "learning_rate": 0.0002, "epoch": 5.291202872531418, "step": 73680}, {"loss": 0.5841, "grad_norm": 0.9566490054130554, "learning_rate": 0.0002, "epoch": 5.291921005385996, "step": 73690}, {"loss": 0.5756, "grad_norm": 1.1376742124557495, "learning_rate": 0.0002, "epoch": 5.292639138240575, "step": 73700}, {"loss": 0.5697, "grad_norm": 1.0127843618392944, "learning_rate": 0.0002, "epoch": 5.293357271095153, "step": 73710}, {"loss": 0.5673, "grad_norm": 0.9500759243965149, "learning_rate": 0.0002, "epoch": 5.294075403949731, "step": 73720}, {"loss": 0.6251, "grad_norm": 0.9597342610359192, "learning_rate": 0.0002, "epoch": 5.294793536804309, "step": 73730}, {"loss": 0.5887, "grad_norm": 1.0982595682144165, "learning_rate": 0.0002, "epoch": 5.295511669658887, "step": 73740}, {"loss": 0.5623, "grad_norm": 0.9007689952850342, "learning_rate": 0.0002, "epoch": 5.296229802513465, "step": 73750}, {"loss": 0.5854, "grad_norm": 0.9329614639282227, "learning_rate": 0.0002, "epoch": 5.296947935368043, "step": 73760}, {"loss": 0.5867, "grad_norm": 1.235142469406128, "learning_rate": 0.0002, "epoch": 5.297666068222621, "step": 73770}, {"loss": 0.6009, "grad_norm": 1.0875943899154663, "learning_rate": 0.0002, "epoch": 5.298384201077199, "step": 73780}, {"loss": 0.6009, "grad_norm": 1.0499054193496704, "learning_rate": 0.0002, "epoch": 5.299102333931778, "step": 73790}, {"loss": 0.625, "grad_norm": 1.117954969406128, "learning_rate": 0.0002, "epoch": 5.299820466786356, "step": 73800}, {"loss": 0.5502, "grad_norm": 0.800291121006012, "learning_rate": 0.0002, "epoch": 5.300538599640934, "step": 73810}, {"loss": 0.5815, "grad_norm": 1.1461842060089111, "learning_rate": 0.0002, "epoch": 5.301256732495512, "step": 73820}, {"loss": 0.6091, "grad_norm": 1.0084760189056396, "learning_rate": 0.0002, "epoch": 5.30197486535009, "step": 73830}, {"loss": 0.5802, "grad_norm": 1.1249386072158813, "learning_rate": 0.0002, "epoch": 5.302692998204668, "step": 73840}, {"loss": 0.55, "grad_norm": 1.0846004486083984, "learning_rate": 0.0002, "epoch": 5.303411131059246, "step": 73850}, {"loss": 0.5923, "grad_norm": 1.1557925939559937, "learning_rate": 0.0002, "epoch": 5.304129263913824, "step": 73860}, {"loss": 0.5904, "grad_norm": 1.2287988662719727, "learning_rate": 0.0002, "epoch": 5.304847396768402, "step": 73870}, {"loss": 0.554, "grad_norm": 0.9618542194366455, "learning_rate": 0.0002, "epoch": 5.30556552962298, "step": 73880}, {"loss": 0.5787, "grad_norm": 0.9429472088813782, "learning_rate": 0.0002, "epoch": 5.306283662477559, "step": 73890}, {"loss": 0.5937, "grad_norm": 0.9032631516456604, "learning_rate": 0.0002, "epoch": 5.307001795332137, "step": 73900}, {"loss": 0.577, "grad_norm": 1.0008580684661865, "learning_rate": 0.0002, "epoch": 5.307719928186715, "step": 73910}, {"loss": 0.5462, "grad_norm": 0.9795624017715454, "learning_rate": 0.0002, "epoch": 5.308438061041293, "step": 73920}, {"loss": 0.582, "grad_norm": 1.1194090843200684, "learning_rate": 0.0002, "epoch": 5.309156193895871, "step": 73930}, {"loss": 0.5859, "grad_norm": 1.1057528257369995, "learning_rate": 0.0002, "epoch": 5.309874326750449, "step": 73940}, {"loss": 0.5503, "grad_norm": 0.7807615995407104, "learning_rate": 0.0002, "epoch": 5.310592459605027, "step": 73950}, {"loss": 0.6128, "grad_norm": 0.9465593099594116, "learning_rate": 0.0002, "epoch": 5.311310592459605, "step": 73960}, {"loss": 0.5831, "grad_norm": 1.104210615158081, "learning_rate": 0.0002, "epoch": 5.312028725314184, "step": 73970}, {"loss": 0.5478, "grad_norm": 1.0452964305877686, "learning_rate": 0.0002, "epoch": 5.312746858168762, "step": 73980}, {"loss": 0.5856, "grad_norm": 1.0314992666244507, "learning_rate": 0.0002, "epoch": 5.31346499102334, "step": 73990}, {"loss": 0.6222, "grad_norm": 0.9187130928039551, "learning_rate": 0.0002, "epoch": 5.314183123877918, "step": 74000}, {"loss": 0.5739, "grad_norm": 0.8660678267478943, "learning_rate": 0.0002, "epoch": 5.314901256732496, "step": 74010}, {"loss": 0.5296, "grad_norm": 0.9470953345298767, "learning_rate": 0.0002, "epoch": 5.315619389587074, "step": 74020}, {"loss": 0.5772, "grad_norm": 1.0028631687164307, "learning_rate": 0.0002, "epoch": 5.316337522441652, "step": 74030}, {"loss": 0.6159, "grad_norm": 1.0237356424331665, "learning_rate": 0.0002, "epoch": 5.31705565529623, "step": 74040}, {"loss": 0.6277, "grad_norm": 1.0299798250198364, "learning_rate": 0.0002, "epoch": 5.317773788150808, "step": 74050}, {"loss": 0.568, "grad_norm": 1.0326799154281616, "learning_rate": 0.0002, "epoch": 5.318491921005386, "step": 74060}, {"loss": 0.5766, "grad_norm": 1.156346082687378, "learning_rate": 0.0002, "epoch": 5.3192100538599645, "step": 74070}, {"loss": 0.598, "grad_norm": 1.1542664766311646, "learning_rate": 0.0002, "epoch": 5.3199281867145425, "step": 74080}, {"loss": 0.5736, "grad_norm": 1.0503013134002686, "learning_rate": 0.0002, "epoch": 5.3206463195691205, "step": 74090}, {"loss": 0.6172, "grad_norm": 1.1088979244232178, "learning_rate": 0.0002, "epoch": 5.3213644524236985, "step": 74100}, {"loss": 0.5536, "grad_norm": 0.9314014911651611, "learning_rate": 0.0002, "epoch": 5.3220825852782765, "step": 74110}, {"loss": 0.6205, "grad_norm": 1.0813525915145874, "learning_rate": 0.0002, "epoch": 5.3228007181328545, "step": 74120}, {"loss": 0.6019, "grad_norm": 0.7824062705039978, "learning_rate": 0.0002, "epoch": 5.3235188509874325, "step": 74130}, {"loss": 0.6183, "grad_norm": 1.0552699565887451, "learning_rate": 0.0002, "epoch": 5.3242369838420105, "step": 74140}, {"loss": 0.5714, "grad_norm": 1.0916554927825928, "learning_rate": 0.0002, "epoch": 5.3249551166965885, "step": 74150}, {"loss": 0.6128, "grad_norm": 1.205618143081665, "learning_rate": 0.0002, "epoch": 5.325673249551167, "step": 74160}, {"loss": 0.616, "grad_norm": 1.2551230192184448, "learning_rate": 0.0002, "epoch": 5.326391382405745, "step": 74170}, {"loss": 0.5467, "grad_norm": 0.7715005278587341, "learning_rate": 0.0002, "epoch": 5.327109515260323, "step": 74180}, {"loss": 0.5793, "grad_norm": 1.1059352159500122, "learning_rate": 0.0002, "epoch": 5.327827648114901, "step": 74190}, {"loss": 0.5768, "grad_norm": 0.9441812634468079, "learning_rate": 0.0002, "epoch": 5.328545780969479, "step": 74200}, {"loss": 0.5708, "grad_norm": 1.0012084245681763, "learning_rate": 0.0002, "epoch": 5.329263913824057, "step": 74210}, {"loss": 0.5289, "grad_norm": 0.8594073057174683, "learning_rate": 0.0002, "epoch": 5.329982046678635, "step": 74220}, {"loss": 0.5933, "grad_norm": 0.8931775093078613, "learning_rate": 0.0002, "epoch": 5.330700179533213, "step": 74230}, {"loss": 0.5722, "grad_norm": 0.967250406742096, "learning_rate": 0.0002, "epoch": 5.331418312387791, "step": 74240}, {"loss": 0.5483, "grad_norm": 0.9776269793510437, "learning_rate": 0.0002, "epoch": 5.332136445242369, "step": 74250}, {"loss": 0.5655, "grad_norm": 0.9393186569213867, "learning_rate": 0.0002, "epoch": 5.332854578096948, "step": 74260}, {"loss": 0.5704, "grad_norm": 1.0081093311309814, "learning_rate": 0.0002, "epoch": 5.333572710951526, "step": 74270}, {"loss": 0.5588, "grad_norm": 0.9002147316932678, "learning_rate": 0.0002, "epoch": 5.334290843806104, "step": 74280}, {"loss": 0.5851, "grad_norm": 0.9237701296806335, "learning_rate": 0.0002, "epoch": 5.335008976660682, "step": 74290}, {"loss": 0.5958, "grad_norm": 1.070694923400879, "learning_rate": 0.0002, "epoch": 5.33572710951526, "step": 74300}, {"loss": 0.5877, "grad_norm": 1.0134668350219727, "learning_rate": 0.0002, "epoch": 5.336445242369838, "step": 74310}, {"loss": 0.5828, "grad_norm": 1.0903294086456299, "learning_rate": 0.0002, "epoch": 5.337163375224416, "step": 74320}, {"loss": 0.5146, "grad_norm": 0.9000239372253418, "learning_rate": 0.0002, "epoch": 5.337881508078994, "step": 74330}, {"loss": 0.5357, "grad_norm": 1.0584321022033691, "learning_rate": 0.0002, "epoch": 5.338599640933572, "step": 74340}, {"loss": 0.5844, "grad_norm": 1.046420931816101, "learning_rate": 0.0002, "epoch": 5.339317773788151, "step": 74350}, {"loss": 0.5489, "grad_norm": 0.8862320184707642, "learning_rate": 0.0002, "epoch": 5.340035906642729, "step": 74360}, {"loss": 0.5923, "grad_norm": 0.8197309970855713, "learning_rate": 0.0002, "epoch": 5.340754039497307, "step": 74370}, {"loss": 0.5408, "grad_norm": 0.9539661407470703, "learning_rate": 0.0002, "epoch": 5.341472172351885, "step": 74380}, {"loss": 0.5943, "grad_norm": 1.481026530265808, "learning_rate": 0.0002, "epoch": 5.342190305206463, "step": 74390}, {"loss": 0.6242, "grad_norm": 1.0685169696807861, "learning_rate": 0.0002, "epoch": 5.342908438061041, "step": 74400}, {"loss": 0.5917, "grad_norm": 1.1468359231948853, "learning_rate": 0.0002, "epoch": 5.343626570915619, "step": 74410}, {"loss": 0.556, "grad_norm": 0.9982373714447021, "learning_rate": 0.0002, "epoch": 5.344344703770197, "step": 74420}, {"loss": 0.6003, "grad_norm": 0.9273471236228943, "learning_rate": 0.0002, "epoch": 5.345062836624775, "step": 74430}, {"loss": 0.5239, "grad_norm": 1.058828592300415, "learning_rate": 0.0002, "epoch": 5.345780969479353, "step": 74440}, {"loss": 0.5434, "grad_norm": 1.0442006587982178, "learning_rate": 0.0002, "epoch": 5.346499102333932, "step": 74450}, {"loss": 0.5614, "grad_norm": 1.0955053567886353, "learning_rate": 0.0002, "epoch": 5.34721723518851, "step": 74460}, {"loss": 0.5992, "grad_norm": 0.9326002597808838, "learning_rate": 0.0002, "epoch": 5.347935368043088, "step": 74470}, {"loss": 0.6173, "grad_norm": 0.9496979117393494, "learning_rate": 0.0002, "epoch": 5.348653500897666, "step": 74480}, {"loss": 0.5483, "grad_norm": 1.1995937824249268, "learning_rate": 0.0002, "epoch": 5.349371633752244, "step": 74490}, {"loss": 0.5759, "grad_norm": 0.8761899471282959, "learning_rate": 0.0002, "epoch": 5.350089766606822, "step": 74500}, {"loss": 0.5866, "grad_norm": 1.2390170097351074, "learning_rate": 0.0002, "epoch": 5.3508078994614, "step": 74510}, {"loss": 0.6065, "grad_norm": 0.9101138114929199, "learning_rate": 0.0002, "epoch": 5.351526032315978, "step": 74520}, {"loss": 0.5908, "grad_norm": 0.925466001033783, "learning_rate": 0.0002, "epoch": 5.352244165170557, "step": 74530}, {"loss": 0.5992, "grad_norm": 0.9483969807624817, "learning_rate": 0.0002, "epoch": 5.352962298025135, "step": 74540}, {"loss": 0.5881, "grad_norm": 1.0530859231948853, "learning_rate": 0.0002, "epoch": 5.353680430879713, "step": 74550}, {"loss": 0.5607, "grad_norm": 1.209647536277771, "learning_rate": 0.0002, "epoch": 5.354398563734291, "step": 74560}, {"loss": 0.5782, "grad_norm": 0.9849331378936768, "learning_rate": 0.0002, "epoch": 5.355116696588869, "step": 74570}, {"loss": 0.6448, "grad_norm": 1.0822848081588745, "learning_rate": 0.0002, "epoch": 5.355834829443447, "step": 74580}, {"loss": 0.631, "grad_norm": 1.1460528373718262, "learning_rate": 0.0002, "epoch": 5.356552962298025, "step": 74590}, {"loss": 0.5634, "grad_norm": 0.9509134292602539, "learning_rate": 0.0002, "epoch": 5.357271095152603, "step": 74600}, {"loss": 0.5492, "grad_norm": 0.9884999394416809, "learning_rate": 0.0002, "epoch": 5.357989228007181, "step": 74610}, {"loss": 0.6096, "grad_norm": 0.9619579911231995, "learning_rate": 0.0002, "epoch": 5.358707360861759, "step": 74620}, {"loss": 0.5686, "grad_norm": 0.8596125245094299, "learning_rate": 0.0002, "epoch": 5.359425493716338, "step": 74630}, {"loss": 0.6112, "grad_norm": 1.16913640499115, "learning_rate": 0.0002, "epoch": 5.360143626570916, "step": 74640}, {"loss": 0.5779, "grad_norm": 0.99276202917099, "learning_rate": 0.0002, "epoch": 5.360861759425494, "step": 74650}, {"loss": 0.5699, "grad_norm": 1.1293696165084839, "learning_rate": 0.0002, "epoch": 5.361579892280072, "step": 74660}, {"loss": 0.5727, "grad_norm": 1.187947154045105, "learning_rate": 0.0002, "epoch": 5.36229802513465, "step": 74670}, {"loss": 0.5574, "grad_norm": 0.8637247681617737, "learning_rate": 0.0002, "epoch": 5.363016157989228, "step": 74680}, {"loss": 0.5738, "grad_norm": 1.1049476861953735, "learning_rate": 0.0002, "epoch": 5.363734290843806, "step": 74690}, {"loss": 0.6082, "grad_norm": 1.1736515760421753, "learning_rate": 0.0002, "epoch": 5.364452423698384, "step": 74700}, {"loss": 0.6238, "grad_norm": 1.0203301906585693, "learning_rate": 0.0002, "epoch": 5.365170556552962, "step": 74710}, {"loss": 0.5612, "grad_norm": 1.15559720993042, "learning_rate": 0.0002, "epoch": 5.365888689407541, "step": 74720}, {"loss": 0.5699, "grad_norm": 1.2008144855499268, "learning_rate": 0.0002, "epoch": 5.366606822262119, "step": 74730}, {"loss": 0.5749, "grad_norm": 1.0385756492614746, "learning_rate": 0.0002, "epoch": 5.367324955116697, "step": 74740}, {"loss": 0.5745, "grad_norm": 0.8964240550994873, "learning_rate": 0.0002, "epoch": 5.368043087971275, "step": 74750}, {"loss": 0.5799, "grad_norm": 0.9824761748313904, "learning_rate": 0.0002, "epoch": 5.368761220825853, "step": 74760}, {"loss": 0.5714, "grad_norm": 0.8815994262695312, "learning_rate": 0.0002, "epoch": 5.369479353680431, "step": 74770}, {"loss": 0.584, "grad_norm": 0.9729493856430054, "learning_rate": 0.0002, "epoch": 5.370197486535009, "step": 74780}, {"loss": 0.5884, "grad_norm": 1.1032123565673828, "learning_rate": 0.0002, "epoch": 5.370915619389587, "step": 74790}, {"loss": 0.5804, "grad_norm": 1.039591908454895, "learning_rate": 0.0002, "epoch": 5.371633752244165, "step": 74800}, {"loss": 0.5693, "grad_norm": 0.9741610884666443, "learning_rate": 0.0002, "epoch": 5.372351885098743, "step": 74810}, {"loss": 0.6225, "grad_norm": 0.9789814949035645, "learning_rate": 0.0002, "epoch": 5.373070017953322, "step": 74820}, {"loss": 0.5765, "grad_norm": 1.0777033567428589, "learning_rate": 0.0002, "epoch": 5.3737881508079, "step": 74830}, {"loss": 0.5553, "grad_norm": 0.9058641195297241, "learning_rate": 0.0002, "epoch": 5.374506283662478, "step": 74840}, {"loss": 0.5733, "grad_norm": 1.2161815166473389, "learning_rate": 0.0002, "epoch": 5.375224416517056, "step": 74850}, {"loss": 0.5679, "grad_norm": 1.1079481840133667, "learning_rate": 0.0002, "epoch": 5.375942549371634, "step": 74860}, {"loss": 0.605, "grad_norm": 0.9494470357894897, "learning_rate": 0.0002, "epoch": 5.376660682226212, "step": 74870}, {"loss": 0.6155, "grad_norm": 1.0116358995437622, "learning_rate": 0.0002, "epoch": 5.37737881508079, "step": 74880}, {"loss": 0.5595, "grad_norm": 0.9382423162460327, "learning_rate": 0.0002, "epoch": 5.378096947935368, "step": 74890}, {"loss": 0.5441, "grad_norm": 1.036151647567749, "learning_rate": 0.0002, "epoch": 5.378815080789946, "step": 74900}, {"loss": 0.5441, "grad_norm": 0.9436623454093933, "learning_rate": 0.0002, "epoch": 5.379533213644525, "step": 74910}, {"loss": 0.5327, "grad_norm": 1.0149152278900146, "learning_rate": 0.0002, "epoch": 5.380251346499103, "step": 74920}, {"loss": 0.5554, "grad_norm": 1.1645641326904297, "learning_rate": 0.0002, "epoch": 5.380969479353681, "step": 74930}, {"loss": 0.5662, "grad_norm": 1.002287745475769, "learning_rate": 0.0002, "epoch": 5.381687612208259, "step": 74940}, {"loss": 0.5602, "grad_norm": 1.1176437139511108, "learning_rate": 0.0002, "epoch": 5.382405745062837, "step": 74950}, {"loss": 0.582, "grad_norm": 0.9210802912712097, "learning_rate": 0.0002, "epoch": 5.383123877917415, "step": 74960}, {"loss": 0.5996, "grad_norm": 1.1873447895050049, "learning_rate": 0.0002, "epoch": 5.383842010771993, "step": 74970}, {"loss": 0.5391, "grad_norm": 0.8372976779937744, "learning_rate": 0.0002, "epoch": 5.384560143626571, "step": 74980}, {"loss": 0.5808, "grad_norm": 0.9220532178878784, "learning_rate": 0.0002, "epoch": 5.385278276481149, "step": 74990}, {"loss": 0.5897, "grad_norm": 0.9196901917457581, "learning_rate": 0.0002, "epoch": 5.385996409335727, "step": 75000}, {"loss": 0.5838, "grad_norm": 0.9325235486030579, "learning_rate": 0.0002, "epoch": 5.3867145421903055, "step": 75010}, {"loss": 0.5652, "grad_norm": 1.0902531147003174, "learning_rate": 0.0002, "epoch": 5.3874326750448835, "step": 75020}, {"loss": 0.581, "grad_norm": 1.049468755722046, "learning_rate": 0.0002, "epoch": 5.3881508078994615, "step": 75030}, {"loss": 0.6184, "grad_norm": 0.9372574687004089, "learning_rate": 0.0002, "epoch": 5.3888689407540395, "step": 75040}, {"loss": 0.6158, "grad_norm": 0.9013437628746033, "learning_rate": 0.0002, "epoch": 5.3895870736086176, "step": 75050}, {"loss": 0.5656, "grad_norm": 1.2111071348190308, "learning_rate": 0.0002, "epoch": 5.3903052064631956, "step": 75060}, {"loss": 0.5983, "grad_norm": 1.0006011724472046, "learning_rate": 0.0002, "epoch": 5.3910233393177736, "step": 75070}, {"loss": 0.5807, "grad_norm": 0.9180546402931213, "learning_rate": 0.0002, "epoch": 5.391741472172352, "step": 75080}, {"loss": 0.5878, "grad_norm": 1.096113920211792, "learning_rate": 0.0002, "epoch": 5.3924596050269304, "step": 75090}, {"loss": 0.5416, "grad_norm": 0.9041603207588196, "learning_rate": 0.0002, "epoch": 5.3931777378815084, "step": 75100}, {"loss": 0.5933, "grad_norm": 0.9675783514976501, "learning_rate": 0.0002, "epoch": 5.3938958707360865, "step": 75110}, {"loss": 0.5813, "grad_norm": 1.0952513217926025, "learning_rate": 0.0002, "epoch": 5.3946140035906645, "step": 75120}, {"loss": 0.5961, "grad_norm": 1.0166294574737549, "learning_rate": 0.0002, "epoch": 5.3953321364452425, "step": 75130}, {"loss": 0.6119, "grad_norm": 1.0892874002456665, "learning_rate": 0.0002, "epoch": 5.3960502692998205, "step": 75140}, {"loss": 0.6036, "grad_norm": 0.9894046187400818, "learning_rate": 0.0002, "epoch": 5.3967684021543985, "step": 75150}, {"loss": 0.5844, "grad_norm": 0.9991754293441772, "learning_rate": 0.0002, "epoch": 5.3974865350089765, "step": 75160}, {"loss": 0.5746, "grad_norm": 1.1027519702911377, "learning_rate": 0.0002, "epoch": 5.3982046678635545, "step": 75170}, {"loss": 0.5464, "grad_norm": 1.0579880475997925, "learning_rate": 0.0002, "epoch": 5.3989228007181325, "step": 75180}, {"loss": 0.5705, "grad_norm": 1.1149101257324219, "learning_rate": 0.0002, "epoch": 5.399640933572711, "step": 75190}, {"loss": 0.579, "grad_norm": 0.8802945017814636, "learning_rate": 0.0002, "epoch": 5.400359066427289, "step": 75200}, {"loss": 0.6117, "grad_norm": 0.9168137907981873, "learning_rate": 0.0002, "epoch": 5.401077199281867, "step": 75210}, {"loss": 0.543, "grad_norm": 1.232630968093872, "learning_rate": 0.0002, "epoch": 5.401795332136445, "step": 75220}, {"loss": 0.5739, "grad_norm": 1.1038591861724854, "learning_rate": 0.0002, "epoch": 5.402513464991023, "step": 75230}, {"loss": 0.5754, "grad_norm": 0.8985993266105652, "learning_rate": 0.0002, "epoch": 5.403231597845601, "step": 75240}, {"loss": 0.5517, "grad_norm": 1.1096316576004028, "learning_rate": 0.0002, "epoch": 5.403949730700179, "step": 75250}, {"loss": 0.5834, "grad_norm": 0.8516051173210144, "learning_rate": 0.0002, "epoch": 5.404667863554757, "step": 75260}, {"loss": 0.5779, "grad_norm": 0.9967356324195862, "learning_rate": 0.0002, "epoch": 5.405385996409335, "step": 75270}, {"loss": 0.6065, "grad_norm": 1.0092874765396118, "learning_rate": 0.0002, "epoch": 5.406104129263914, "step": 75280}, {"loss": 0.59, "grad_norm": 1.049838662147522, "learning_rate": 0.0002, "epoch": 5.406822262118492, "step": 75290}, {"loss": 0.6077, "grad_norm": 1.1491070985794067, "learning_rate": 0.0002, "epoch": 5.40754039497307, "step": 75300}, {"loss": 0.6423, "grad_norm": 0.9348118901252747, "learning_rate": 0.0002, "epoch": 5.408258527827648, "step": 75310}, {"loss": 0.5505, "grad_norm": 1.1226147413253784, "learning_rate": 0.0002, "epoch": 5.408976660682226, "step": 75320}, {"loss": 0.5906, "grad_norm": 0.9042587876319885, "learning_rate": 0.0002, "epoch": 5.409694793536804, "step": 75330}, {"loss": 0.5885, "grad_norm": 1.1212877035140991, "learning_rate": 0.0002, "epoch": 5.410412926391382, "step": 75340}, {"loss": 0.6056, "grad_norm": 0.9805570840835571, "learning_rate": 0.0002, "epoch": 5.41113105924596, "step": 75350}, {"loss": 0.5891, "grad_norm": 0.9803917407989502, "learning_rate": 0.0002, "epoch": 5.411849192100538, "step": 75360}, {"loss": 0.6338, "grad_norm": 1.2139064073562622, "learning_rate": 0.0002, "epoch": 5.412567324955116, "step": 75370}, {"loss": 0.5694, "grad_norm": 0.9510865211486816, "learning_rate": 0.0002, "epoch": 5.413285457809695, "step": 75380}, {"loss": 0.6072, "grad_norm": 1.0752202272415161, "learning_rate": 0.0002, "epoch": 5.414003590664273, "step": 75390}, {"loss": 0.5998, "grad_norm": 1.1144053936004639, "learning_rate": 0.0002, "epoch": 5.414721723518851, "step": 75400}, {"loss": 0.5783, "grad_norm": 1.128998875617981, "learning_rate": 0.0002, "epoch": 5.415439856373429, "step": 75410}, {"loss": 0.6092, "grad_norm": 1.2901849746704102, "learning_rate": 0.0002, "epoch": 5.416157989228007, "step": 75420}, {"loss": 0.5799, "grad_norm": 1.2822786569595337, "learning_rate": 0.0002, "epoch": 5.416876122082585, "step": 75430}, {"loss": 0.5744, "grad_norm": 0.8724783658981323, "learning_rate": 0.0002, "epoch": 5.417594254937163, "step": 75440}, {"loss": 0.5821, "grad_norm": 1.1321152448654175, "learning_rate": 0.0002, "epoch": 5.418312387791741, "step": 75450}, {"loss": 0.6394, "grad_norm": 1.1211779117584229, "learning_rate": 0.0002, "epoch": 5.419030520646319, "step": 75460}, {"loss": 0.584, "grad_norm": 1.0542290210723877, "learning_rate": 0.0002, "epoch": 5.419748653500898, "step": 75470}, {"loss": 0.5472, "grad_norm": 0.9432206153869629, "learning_rate": 0.0002, "epoch": 5.420466786355476, "step": 75480}, {"loss": 0.6053, "grad_norm": 1.2051608562469482, "learning_rate": 0.0002, "epoch": 5.421184919210054, "step": 75490}, {"loss": 0.5698, "grad_norm": 1.188256859779358, "learning_rate": 0.0002, "epoch": 5.421903052064632, "step": 75500}, {"loss": 0.5762, "grad_norm": 1.2768784761428833, "learning_rate": 0.0002, "epoch": 5.42262118491921, "step": 75510}, {"loss": 0.5961, "grad_norm": 0.8228567242622375, "learning_rate": 0.0002, "epoch": 5.423339317773788, "step": 75520}, {"loss": 0.602, "grad_norm": 1.235684871673584, "learning_rate": 0.0002, "epoch": 5.424057450628366, "step": 75530}, {"loss": 0.5923, "grad_norm": 0.8361109495162964, "learning_rate": 0.0002, "epoch": 5.424775583482944, "step": 75540}, {"loss": 0.578, "grad_norm": 1.0450727939605713, "learning_rate": 0.0002, "epoch": 5.425493716337522, "step": 75550}, {"loss": 0.6383, "grad_norm": 0.9942979216575623, "learning_rate": 0.0002, "epoch": 5.4262118491921, "step": 75560}, {"loss": 0.6406, "grad_norm": 0.8162592053413391, "learning_rate": 0.0002, "epoch": 5.426929982046679, "step": 75570}, {"loss": 0.5684, "grad_norm": 0.9193033576011658, "learning_rate": 0.0002, "epoch": 5.427648114901257, "step": 75580}, {"loss": 0.5773, "grad_norm": 1.095130443572998, "learning_rate": 0.0002, "epoch": 5.428366247755835, "step": 75590}, {"loss": 0.6036, "grad_norm": 1.1752824783325195, "learning_rate": 0.0002, "epoch": 5.429084380610413, "step": 75600}, {"loss": 0.5773, "grad_norm": 1.2007960081100464, "learning_rate": 0.0002, "epoch": 5.429802513464991, "step": 75610}, {"loss": 0.5928, "grad_norm": 0.997347354888916, "learning_rate": 0.0002, "epoch": 5.430520646319569, "step": 75620}, {"loss": 0.5798, "grad_norm": 1.3878827095031738, "learning_rate": 0.0002, "epoch": 5.431238779174147, "step": 75630}, {"loss": 0.5954, "grad_norm": 1.1839812994003296, "learning_rate": 0.0002, "epoch": 5.431956912028725, "step": 75640}, {"loss": 0.5789, "grad_norm": 0.9912546873092651, "learning_rate": 0.0002, "epoch": 5.432675044883303, "step": 75650}, {"loss": 0.5916, "grad_norm": 0.9305517673492432, "learning_rate": 0.0002, "epoch": 5.433393177737882, "step": 75660}, {"loss": 0.5869, "grad_norm": 1.0036604404449463, "learning_rate": 0.0002, "epoch": 5.43411131059246, "step": 75670}, {"loss": 0.5797, "grad_norm": 1.2500226497650146, "learning_rate": 0.0002, "epoch": 5.434829443447038, "step": 75680}, {"loss": 0.5923, "grad_norm": 0.9476167559623718, "learning_rate": 0.0002, "epoch": 5.435547576301616, "step": 75690}, {"loss": 0.5426, "grad_norm": 0.9769760370254517, "learning_rate": 0.0002, "epoch": 5.436265709156194, "step": 75700}, {"loss": 0.5397, "grad_norm": 1.1001025438308716, "learning_rate": 0.0002, "epoch": 5.436983842010772, "step": 75710}, {"loss": 0.5832, "grad_norm": 1.1783069372177124, "learning_rate": 0.0002, "epoch": 5.43770197486535, "step": 75720}, {"loss": 0.5961, "grad_norm": 0.887438952922821, "learning_rate": 0.0002, "epoch": 5.438420107719928, "step": 75730}, {"loss": 0.5904, "grad_norm": 0.9631154537200928, "learning_rate": 0.0002, "epoch": 5.439138240574506, "step": 75740}, {"loss": 0.5827, "grad_norm": 1.0824158191680908, "learning_rate": 0.0002, "epoch": 5.439856373429085, "step": 75750}, {"loss": 0.5824, "grad_norm": 1.0108296871185303, "learning_rate": 0.0002, "epoch": 5.440574506283663, "step": 75760}, {"loss": 0.6338, "grad_norm": 1.1728253364562988, "learning_rate": 0.0002, "epoch": 5.441292639138241, "step": 75770}, {"loss": 0.5661, "grad_norm": 1.0904773473739624, "learning_rate": 0.0002, "epoch": 5.442010771992819, "step": 75780}, {"loss": 0.638, "grad_norm": 0.8982957601547241, "learning_rate": 0.0002, "epoch": 5.442728904847397, "step": 75790}, {"loss": 0.583, "grad_norm": 1.0233404636383057, "learning_rate": 0.0002, "epoch": 5.443447037701975, "step": 75800}, {"loss": 0.6279, "grad_norm": 1.0092064142227173, "learning_rate": 0.0002, "epoch": 5.444165170556553, "step": 75810}, {"loss": 0.5673, "grad_norm": 1.2747842073440552, "learning_rate": 0.0002, "epoch": 5.444883303411131, "step": 75820}, {"loss": 0.5604, "grad_norm": 1.0365403890609741, "learning_rate": 0.0002, "epoch": 5.445601436265709, "step": 75830}, {"loss": 0.591, "grad_norm": 1.0413976907730103, "learning_rate": 0.0002, "epoch": 5.446319569120288, "step": 75840}, {"loss": 0.5995, "grad_norm": 0.8858456015586853, "learning_rate": 0.0002, "epoch": 5.447037701974866, "step": 75850}, {"loss": 0.5628, "grad_norm": 0.9823445677757263, "learning_rate": 0.0002, "epoch": 5.447755834829444, "step": 75860}, {"loss": 0.5691, "grad_norm": 0.8515284061431885, "learning_rate": 0.0002, "epoch": 5.448473967684022, "step": 75870}, {"loss": 0.5702, "grad_norm": 1.130850911140442, "learning_rate": 0.0002, "epoch": 5.4491921005386, "step": 75880}, {"loss": 0.5669, "grad_norm": 0.984725832939148, "learning_rate": 0.0002, "epoch": 5.449910233393178, "step": 75890}, {"loss": 0.5658, "grad_norm": 1.1701595783233643, "learning_rate": 0.0002, "epoch": 5.450628366247756, "step": 75900}, {"loss": 0.5555, "grad_norm": 0.8988107442855835, "learning_rate": 0.0002, "epoch": 5.451346499102334, "step": 75910}, {"loss": 0.6669, "grad_norm": 0.9909947514533997, "learning_rate": 0.0002, "epoch": 5.452064631956912, "step": 75920}, {"loss": 0.5528, "grad_norm": 0.8861672282218933, "learning_rate": 0.0002, "epoch": 5.45278276481149, "step": 75930}, {"loss": 0.5826, "grad_norm": 0.9513981938362122, "learning_rate": 0.0002, "epoch": 5.453500897666069, "step": 75940}, {"loss": 0.5827, "grad_norm": 1.0320760011672974, "learning_rate": 0.0002, "epoch": 5.454219030520647, "step": 75950}, {"loss": 0.5816, "grad_norm": 0.9830206632614136, "learning_rate": 0.0002, "epoch": 5.454937163375225, "step": 75960}, {"loss": 0.5228, "grad_norm": 0.9816349148750305, "learning_rate": 0.0002, "epoch": 5.455655296229803, "step": 75970}, {"loss": 0.594, "grad_norm": 0.9741218090057373, "learning_rate": 0.0002, "epoch": 5.456373429084381, "step": 75980}, {"loss": 0.634, "grad_norm": 1.1291148662567139, "learning_rate": 0.0002, "epoch": 5.457091561938959, "step": 75990}, {"loss": 0.5986, "grad_norm": 0.9770109057426453, "learning_rate": 0.0002, "epoch": 5.457809694793537, "step": 76000}, {"loss": 0.5783, "grad_norm": 1.0204377174377441, "learning_rate": 0.0002, "epoch": 5.458527827648115, "step": 76010}, {"loss": 0.5881, "grad_norm": 1.0453336238861084, "learning_rate": 0.0002, "epoch": 5.459245960502693, "step": 76020}, {"loss": 0.5798, "grad_norm": 1.1595505475997925, "learning_rate": 0.0002, "epoch": 5.4599640933572715, "step": 76030}, {"loss": 0.5787, "grad_norm": 1.1686701774597168, "learning_rate": 0.0002, "epoch": 5.4606822262118495, "step": 76040}, {"loss": 0.5746, "grad_norm": 1.14364755153656, "learning_rate": 0.0002, "epoch": 5.4614003590664275, "step": 76050}, {"loss": 0.5925, "grad_norm": 0.9742125868797302, "learning_rate": 0.0002, "epoch": 5.4621184919210055, "step": 76060}, {"loss": 0.6067, "grad_norm": 0.8235608339309692, "learning_rate": 0.0002, "epoch": 5.4628366247755835, "step": 76070}, {"loss": 0.5908, "grad_norm": 0.9801425337791443, "learning_rate": 0.0002, "epoch": 5.4635547576301615, "step": 76080}, {"loss": 0.6126, "grad_norm": 0.9001221060752869, "learning_rate": 0.0002, "epoch": 5.4642728904847395, "step": 76090}, {"loss": 0.6682, "grad_norm": 0.9292157888412476, "learning_rate": 0.0002, "epoch": 5.4649910233393175, "step": 76100}, {"loss": 0.6412, "grad_norm": 1.0024322271347046, "learning_rate": 0.0002, "epoch": 5.4657091561938955, "step": 76110}, {"loss": 0.5398, "grad_norm": 0.8057159781455994, "learning_rate": 0.0002, "epoch": 5.4664272890484735, "step": 76120}, {"loss": 0.5881, "grad_norm": 1.0617927312850952, "learning_rate": 0.0002, "epoch": 5.467145421903052, "step": 76130}, {"loss": 0.598, "grad_norm": 1.003967046737671, "learning_rate": 0.0002, "epoch": 5.46786355475763, "step": 76140}, {"loss": 0.5427, "grad_norm": 0.903408944606781, "learning_rate": 0.0002, "epoch": 5.468581687612208, "step": 76150}, {"loss": 0.5884, "grad_norm": 0.8173895478248596, "learning_rate": 0.0002, "epoch": 5.469299820466786, "step": 76160}, {"loss": 0.5526, "grad_norm": 1.0187482833862305, "learning_rate": 0.0002, "epoch": 5.470017953321364, "step": 76170}, {"loss": 0.5392, "grad_norm": 1.0418041944503784, "learning_rate": 0.0002, "epoch": 5.470736086175942, "step": 76180}, {"loss": 0.5761, "grad_norm": 0.9768357872962952, "learning_rate": 0.0002, "epoch": 5.47145421903052, "step": 76190}, {"loss": 0.5595, "grad_norm": 1.0834382772445679, "learning_rate": 0.0002, "epoch": 5.472172351885098, "step": 76200}, {"loss": 0.5783, "grad_norm": 0.8447439670562744, "learning_rate": 0.0002, "epoch": 5.472890484739676, "step": 76210}, {"loss": 0.5695, "grad_norm": 0.9379050135612488, "learning_rate": 0.0002, "epoch": 5.473608617594255, "step": 76220}, {"loss": 0.6053, "grad_norm": 1.0395485162734985, "learning_rate": 0.0002, "epoch": 5.474326750448833, "step": 76230}, {"loss": 0.5587, "grad_norm": 1.2082624435424805, "learning_rate": 0.0002, "epoch": 5.475044883303411, "step": 76240}, {"loss": 0.5891, "grad_norm": 1.0714443922042847, "learning_rate": 0.0002, "epoch": 5.475763016157989, "step": 76250}, {"loss": 0.5819, "grad_norm": 0.945319414138794, "learning_rate": 0.0002, "epoch": 5.476481149012567, "step": 76260}, {"loss": 0.5791, "grad_norm": 1.1415241956710815, "learning_rate": 0.0002, "epoch": 5.477199281867145, "step": 76270}, {"loss": 0.5586, "grad_norm": 0.9221673011779785, "learning_rate": 0.0002, "epoch": 5.477917414721723, "step": 76280}, {"loss": 0.5999, "grad_norm": 1.0118398666381836, "learning_rate": 0.0002, "epoch": 5.478635547576301, "step": 76290}, {"loss": 0.621, "grad_norm": 1.396807312965393, "learning_rate": 0.0002, "epoch": 5.479353680430879, "step": 76300}, {"loss": 0.5808, "grad_norm": 1.0437991619110107, "learning_rate": 0.0002, "epoch": 5.480071813285457, "step": 76310}, {"loss": 0.5846, "grad_norm": 1.5910401344299316, "learning_rate": 0.0002, "epoch": 5.480789946140036, "step": 76320}, {"loss": 0.6047, "grad_norm": 0.9262010455131531, "learning_rate": 0.0002, "epoch": 5.481508078994614, "step": 76330}, {"loss": 0.6079, "grad_norm": 1.2534247636795044, "learning_rate": 0.0002, "epoch": 5.482226211849192, "step": 76340}, {"loss": 0.5918, "grad_norm": 1.186294674873352, "learning_rate": 0.0002, "epoch": 5.48294434470377, "step": 76350}, {"loss": 0.5957, "grad_norm": 0.9822857975959778, "learning_rate": 0.0002, "epoch": 5.483662477558348, "step": 76360}, {"loss": 0.5427, "grad_norm": 1.0006381273269653, "learning_rate": 0.0002, "epoch": 5.484380610412926, "step": 76370}, {"loss": 0.5893, "grad_norm": 0.8960304260253906, "learning_rate": 0.0002, "epoch": 5.485098743267504, "step": 76380}, {"loss": 0.5515, "grad_norm": 0.7309539914131165, "learning_rate": 0.0002, "epoch": 5.485816876122082, "step": 76390}, {"loss": 0.5796, "grad_norm": 0.9747139811515808, "learning_rate": 0.0002, "epoch": 5.486535008976661, "step": 76400}, {"loss": 0.5898, "grad_norm": 0.9586864113807678, "learning_rate": 0.0002, "epoch": 5.487253141831239, "step": 76410}, {"loss": 0.6236, "grad_norm": 1.0815327167510986, "learning_rate": 0.0002, "epoch": 5.487971274685817, "step": 76420}, {"loss": 0.5923, "grad_norm": 1.1324117183685303, "learning_rate": 0.0002, "epoch": 5.488689407540395, "step": 76430}, {"loss": 0.5904, "grad_norm": 0.8575648069381714, "learning_rate": 0.0002, "epoch": 5.489407540394973, "step": 76440}, {"loss": 0.5477, "grad_norm": 0.9821682572364807, "learning_rate": 0.0002, "epoch": 5.490125673249551, "step": 76450}, {"loss": 0.5821, "grad_norm": 1.1611464023590088, "learning_rate": 0.0002, "epoch": 5.490843806104129, "step": 76460}, {"loss": 0.5227, "grad_norm": 1.0340297222137451, "learning_rate": 0.0002, "epoch": 5.491561938958707, "step": 76470}, {"loss": 0.6143, "grad_norm": 1.0116628408432007, "learning_rate": 0.0002, "epoch": 5.492280071813285, "step": 76480}, {"loss": 0.5968, "grad_norm": 0.9619752764701843, "learning_rate": 0.0002, "epoch": 5.492998204667863, "step": 76490}, {"loss": 0.5898, "grad_norm": 0.9924456477165222, "learning_rate": 0.0002, "epoch": 5.493716337522442, "step": 76500}, {"loss": 0.6041, "grad_norm": 0.9449224472045898, "learning_rate": 0.0002, "epoch": 5.49443447037702, "step": 76510}, {"loss": 0.5902, "grad_norm": 0.9075009822845459, "learning_rate": 0.0002, "epoch": 5.495152603231598, "step": 76520}, {"loss": 0.5602, "grad_norm": 1.3078763484954834, "learning_rate": 0.0002, "epoch": 5.495870736086176, "step": 76530}, {"loss": 0.5474, "grad_norm": 1.3162729740142822, "learning_rate": 0.0002, "epoch": 5.496588868940754, "step": 76540}, {"loss": 0.5938, "grad_norm": 1.144333839416504, "learning_rate": 0.0002, "epoch": 5.497307001795332, "step": 76550}, {"loss": 0.6105, "grad_norm": 0.9332208633422852, "learning_rate": 0.0002, "epoch": 5.49802513464991, "step": 76560}, {"loss": 0.5795, "grad_norm": 0.9660165309906006, "learning_rate": 0.0002, "epoch": 5.498743267504488, "step": 76570}, {"loss": 0.6023, "grad_norm": 1.0954749584197998, "learning_rate": 0.0002, "epoch": 5.499461400359066, "step": 76580}, {"loss": 0.5583, "grad_norm": 1.0537810325622559, "learning_rate": 0.0002, "epoch": 5.500179533213645, "step": 76590}, {"loss": 0.5976, "grad_norm": 0.9944321513175964, "learning_rate": 0.0002, "epoch": 5.500897666068223, "step": 76600}, {"loss": 0.5622, "grad_norm": 1.094462513923645, "learning_rate": 0.0002, "epoch": 5.501615798922801, "step": 76610}, {"loss": 0.6031, "grad_norm": 1.0246481895446777, "learning_rate": 0.0002, "epoch": 5.502333931777379, "step": 76620}, {"loss": 0.6211, "grad_norm": 0.9705453515052795, "learning_rate": 0.0002, "epoch": 5.503052064631957, "step": 76630}, {"loss": 0.6118, "grad_norm": 1.5252249240875244, "learning_rate": 0.0002, "epoch": 5.503770197486535, "step": 76640}, {"loss": 0.6351, "grad_norm": 0.8469606637954712, "learning_rate": 0.0002, "epoch": 5.504488330341113, "step": 76650}, {"loss": 0.6125, "grad_norm": 1.1882504224777222, "learning_rate": 0.0002, "epoch": 5.505206463195691, "step": 76660}, {"loss": 0.612, "grad_norm": 0.8447994589805603, "learning_rate": 0.0002, "epoch": 5.505924596050269, "step": 76670}, {"loss": 0.6233, "grad_norm": 0.9340696930885315, "learning_rate": 0.0002, "epoch": 5.506642728904847, "step": 76680}, {"loss": 0.5655, "grad_norm": 0.9622383713722229, "learning_rate": 0.0002, "epoch": 5.507360861759426, "step": 76690}, {"loss": 0.6346, "grad_norm": 1.1516523361206055, "learning_rate": 0.0002, "epoch": 5.508078994614004, "step": 76700}, {"loss": 0.5675, "grad_norm": 1.207190990447998, "learning_rate": 0.0002, "epoch": 5.508797127468582, "step": 76710}, {"loss": 0.5614, "grad_norm": 1.1244179010391235, "learning_rate": 0.0002, "epoch": 5.50951526032316, "step": 76720}, {"loss": 0.531, "grad_norm": 1.052288293838501, "learning_rate": 0.0002, "epoch": 5.510233393177738, "step": 76730}, {"loss": 0.5977, "grad_norm": 0.9571291208267212, "learning_rate": 0.0002, "epoch": 5.510951526032316, "step": 76740}, {"loss": 0.5974, "grad_norm": 0.9449458122253418, "learning_rate": 0.0002, "epoch": 5.511669658886894, "step": 76750}, {"loss": 0.59, "grad_norm": 1.0140511989593506, "learning_rate": 0.0002, "epoch": 5.512387791741472, "step": 76760}, {"loss": 0.5992, "grad_norm": 1.057715654373169, "learning_rate": 0.0002, "epoch": 5.513105924596051, "step": 76770}, {"loss": 0.5643, "grad_norm": 0.930642306804657, "learning_rate": 0.0002, "epoch": 5.513824057450629, "step": 76780}, {"loss": 0.5695, "grad_norm": 1.1213828325271606, "learning_rate": 0.0002, "epoch": 5.514542190305207, "step": 76790}, {"loss": 0.584, "grad_norm": 0.9147387742996216, "learning_rate": 0.0002, "epoch": 5.515260323159785, "step": 76800}, {"loss": 0.5759, "grad_norm": 1.1786983013153076, "learning_rate": 0.0002, "epoch": 5.515978456014363, "step": 76810}, {"loss": 0.5762, "grad_norm": 1.1022626161575317, "learning_rate": 0.0002, "epoch": 5.516696588868941, "step": 76820}, {"loss": 0.5795, "grad_norm": 1.0389000177383423, "learning_rate": 0.0002, "epoch": 5.517414721723519, "step": 76830}, {"loss": 0.5932, "grad_norm": 1.0750621557235718, "learning_rate": 0.0002, "epoch": 5.518132854578097, "step": 76840}, {"loss": 0.6177, "grad_norm": 1.0372626781463623, "learning_rate": 0.0002, "epoch": 5.518850987432675, "step": 76850}, {"loss": 0.5659, "grad_norm": 1.0989108085632324, "learning_rate": 0.0002, "epoch": 5.519569120287253, "step": 76860}, {"loss": 0.5525, "grad_norm": 1.030346155166626, "learning_rate": 0.0002, "epoch": 5.520287253141831, "step": 76870}, {"loss": 0.6669, "grad_norm": 1.1362419128417969, "learning_rate": 0.0002, "epoch": 5.52100538599641, "step": 76880}, {"loss": 0.5951, "grad_norm": 0.9110873937606812, "learning_rate": 0.0002, "epoch": 5.521723518850988, "step": 76890}, {"loss": 0.6161, "grad_norm": 1.0214358568191528, "learning_rate": 0.0002, "epoch": 5.522441651705566, "step": 76900}, {"loss": 0.6055, "grad_norm": 1.3764830827713013, "learning_rate": 0.0002, "epoch": 5.523159784560144, "step": 76910}, {"loss": 0.5822, "grad_norm": 1.0396335124969482, "learning_rate": 0.0002, "epoch": 5.523877917414722, "step": 76920}, {"loss": 0.6262, "grad_norm": 1.1942898035049438, "learning_rate": 0.0002, "epoch": 5.5245960502693, "step": 76930}, {"loss": 0.5927, "grad_norm": 0.8795760869979858, "learning_rate": 0.0002, "epoch": 5.525314183123878, "step": 76940}, {"loss": 0.5788, "grad_norm": 1.1081048250198364, "learning_rate": 0.0002, "epoch": 5.526032315978456, "step": 76950}, {"loss": 0.6101, "grad_norm": 0.9652274250984192, "learning_rate": 0.0002, "epoch": 5.526750448833035, "step": 76960}, {"loss": 0.6382, "grad_norm": 0.96559739112854, "learning_rate": 0.0002, "epoch": 5.527468581687613, "step": 76970}, {"loss": 0.6412, "grad_norm": 1.0416076183319092, "learning_rate": 0.0002, "epoch": 5.528186714542191, "step": 76980}, {"loss": 0.6027, "grad_norm": 0.9854229092597961, "learning_rate": 0.0002, "epoch": 5.528904847396769, "step": 76990}, {"loss": 0.6306, "grad_norm": 1.0515462160110474, "learning_rate": 0.0002, "epoch": 5.529622980251347, "step": 77000}, {"loss": 0.5783, "grad_norm": 1.0287327766418457, "learning_rate": 0.0002, "epoch": 5.530341113105925, "step": 77010}, {"loss": 0.6038, "grad_norm": 0.9579883217811584, "learning_rate": 0.0002, "epoch": 5.531059245960503, "step": 77020}, {"loss": 0.5908, "grad_norm": 1.0365805625915527, "learning_rate": 0.0002, "epoch": 5.531777378815081, "step": 77030}, {"loss": 0.5564, "grad_norm": 1.1600725650787354, "learning_rate": 0.0002, "epoch": 5.532495511669659, "step": 77040}, {"loss": 0.6147, "grad_norm": 0.8598031401634216, "learning_rate": 0.0002, "epoch": 5.533213644524237, "step": 77050}, {"loss": 0.5648, "grad_norm": 0.8884791731834412, "learning_rate": 0.0002, "epoch": 5.533931777378815, "step": 77060}, {"loss": 0.5559, "grad_norm": 0.900223433971405, "learning_rate": 0.0002, "epoch": 5.5346499102333935, "step": 77070}, {"loss": 0.5725, "grad_norm": 1.0212652683258057, "learning_rate": 0.0002, "epoch": 5.5353680430879715, "step": 77080}, {"loss": 0.6645, "grad_norm": 1.0924701690673828, "learning_rate": 0.0002, "epoch": 5.5360861759425495, "step": 77090}, {"loss": 0.5957, "grad_norm": 1.1955485343933105, "learning_rate": 0.0002, "epoch": 5.5368043087971275, "step": 77100}, {"loss": 0.5855, "grad_norm": 1.2157706022262573, "learning_rate": 0.0002, "epoch": 5.5375224416517055, "step": 77110}, {"loss": 0.6067, "grad_norm": 1.1118255853652954, "learning_rate": 0.0002, "epoch": 5.5382405745062835, "step": 77120}, {"loss": 0.5813, "grad_norm": 1.0146820545196533, "learning_rate": 0.0002, "epoch": 5.5389587073608615, "step": 77130}, {"loss": 0.6004, "grad_norm": 1.0876632928848267, "learning_rate": 0.0002, "epoch": 5.5396768402154395, "step": 77140}, {"loss": 0.5934, "grad_norm": 0.7914495468139648, "learning_rate": 0.0002, "epoch": 5.540394973070018, "step": 77150}, {"loss": 0.5666, "grad_norm": 1.0584027767181396, "learning_rate": 0.0002, "epoch": 5.541113105924596, "step": 77160}, {"loss": 0.523, "grad_norm": 0.9816845059394836, "learning_rate": 0.0002, "epoch": 5.541831238779174, "step": 77170}, {"loss": 0.5487, "grad_norm": 1.219076156616211, "learning_rate": 0.0002, "epoch": 5.542549371633752, "step": 77180}, {"loss": 0.639, "grad_norm": 0.9526635408401489, "learning_rate": 0.0002, "epoch": 5.54326750448833, "step": 77190}, {"loss": 0.5849, "grad_norm": 0.8437230587005615, "learning_rate": 0.0002, "epoch": 5.543985637342908, "step": 77200}, {"loss": 0.5858, "grad_norm": 0.9670451283454895, "learning_rate": 0.0002, "epoch": 5.544703770197486, "step": 77210}, {"loss": 0.559, "grad_norm": 1.015687346458435, "learning_rate": 0.0002, "epoch": 5.545421903052064, "step": 77220}, {"loss": 0.6065, "grad_norm": 0.8280553817749023, "learning_rate": 0.0002, "epoch": 5.546140035906642, "step": 77230}, {"loss": 0.5999, "grad_norm": 1.1320816278457642, "learning_rate": 0.0002, "epoch": 5.54685816876122, "step": 77240}, {"loss": 0.5894, "grad_norm": 1.3338711261749268, "learning_rate": 0.0002, "epoch": 5.547576301615799, "step": 77250}, {"loss": 0.591, "grad_norm": 0.9553194642066956, "learning_rate": 0.0002, "epoch": 5.548294434470377, "step": 77260}, {"loss": 0.6286, "grad_norm": 1.0604912042617798, "learning_rate": 0.0002, "epoch": 5.549012567324955, "step": 77270}, {"loss": 0.6362, "grad_norm": 1.1037590503692627, "learning_rate": 0.0002, "epoch": 5.549730700179533, "step": 77280}, {"loss": 0.6021, "grad_norm": 1.166212558746338, "learning_rate": 0.0002, "epoch": 5.550448833034111, "step": 77290}, {"loss": 0.5624, "grad_norm": 1.0189802646636963, "learning_rate": 0.0002, "epoch": 5.551166965888689, "step": 77300}, {"loss": 0.5998, "grad_norm": 0.9592387080192566, "learning_rate": 0.0002, "epoch": 5.551885098743267, "step": 77310}, {"loss": 0.609, "grad_norm": 0.9533785581588745, "learning_rate": 0.0002, "epoch": 5.552603231597845, "step": 77320}, {"loss": 0.5879, "grad_norm": 0.9666807055473328, "learning_rate": 0.0002, "epoch": 5.553321364452424, "step": 77330}, {"loss": 0.6049, "grad_norm": 0.8827478289604187, "learning_rate": 0.0002, "epoch": 5.554039497307002, "step": 77340}, {"loss": 0.5644, "grad_norm": 0.9574757814407349, "learning_rate": 0.0002, "epoch": 5.55475763016158, "step": 77350}, {"loss": 0.6083, "grad_norm": 1.14597487449646, "learning_rate": 0.0002, "epoch": 5.555475763016158, "step": 77360}, {"loss": 0.6025, "grad_norm": 1.009392499923706, "learning_rate": 0.0002, "epoch": 5.556193895870736, "step": 77370}, {"loss": 0.6141, "grad_norm": 1.115757942199707, "learning_rate": 0.0002, "epoch": 5.556912028725314, "step": 77380}, {"loss": 0.5538, "grad_norm": 0.9907452464103699, "learning_rate": 0.0002, "epoch": 5.557630161579892, "step": 77390}, {"loss": 0.6142, "grad_norm": 1.0667012929916382, "learning_rate": 0.0002, "epoch": 5.55834829443447, "step": 77400}, {"loss": 0.5728, "grad_norm": 0.9301251173019409, "learning_rate": 0.0002, "epoch": 5.559066427289048, "step": 77410}, {"loss": 0.6174, "grad_norm": 1.090384602546692, "learning_rate": 0.0002, "epoch": 5.559784560143626, "step": 77420}, {"loss": 0.5802, "grad_norm": 0.8073469996452332, "learning_rate": 0.0002, "epoch": 5.560502692998204, "step": 77430}, {"loss": 0.5757, "grad_norm": 1.1003652811050415, "learning_rate": 0.0002, "epoch": 5.561220825852783, "step": 77440}, {"loss": 0.5899, "grad_norm": 0.9493791460990906, "learning_rate": 0.0002, "epoch": 5.561938958707361, "step": 77450}, {"loss": 0.6029, "grad_norm": 0.925388514995575, "learning_rate": 0.0002, "epoch": 5.562657091561939, "step": 77460}, {"loss": 0.5893, "grad_norm": 1.0946427583694458, "learning_rate": 0.0002, "epoch": 5.563375224416517, "step": 77470}, {"loss": 0.58, "grad_norm": 0.9791404008865356, "learning_rate": 0.0002, "epoch": 5.564093357271095, "step": 77480}, {"loss": 0.5887, "grad_norm": 1.0534733533859253, "learning_rate": 0.0002, "epoch": 5.564811490125673, "step": 77490}, {"loss": 0.564, "grad_norm": 0.9351776242256165, "learning_rate": 0.0002, "epoch": 5.565529622980251, "step": 77500}, {"loss": 0.5489, "grad_norm": 1.004448413848877, "learning_rate": 0.0002, "epoch": 5.566247755834829, "step": 77510}, {"loss": 0.5717, "grad_norm": 1.0199403762817383, "learning_rate": 0.0002, "epoch": 5.566965888689408, "step": 77520}, {"loss": 0.6358, "grad_norm": 1.0693204402923584, "learning_rate": 0.0002, "epoch": 5.567684021543986, "step": 77530}, {"loss": 0.5896, "grad_norm": 1.0635178089141846, "learning_rate": 0.0002, "epoch": 5.568402154398564, "step": 77540}, {"loss": 0.6399, "grad_norm": 1.1154648065567017, "learning_rate": 0.0002, "epoch": 5.569120287253142, "step": 77550}, {"loss": 0.5748, "grad_norm": 0.999116837978363, "learning_rate": 0.0002, "epoch": 5.56983842010772, "step": 77560}, {"loss": 0.6159, "grad_norm": 0.9967397451400757, "learning_rate": 0.0002, "epoch": 5.570556552962298, "step": 77570}, {"loss": 0.6041, "grad_norm": 0.9684699773788452, "learning_rate": 0.0002, "epoch": 5.571274685816876, "step": 77580}, {"loss": 0.5876, "grad_norm": 1.027213454246521, "learning_rate": 0.0002, "epoch": 5.571992818671454, "step": 77590}, {"loss": 0.6631, "grad_norm": 1.0571194887161255, "learning_rate": 0.0002, "epoch": 5.572710951526032, "step": 77600}, {"loss": 0.5927, "grad_norm": 1.2010499238967896, "learning_rate": 0.0002, "epoch": 5.57342908438061, "step": 77610}, {"loss": 0.5962, "grad_norm": 1.1033680438995361, "learning_rate": 0.0002, "epoch": 5.574147217235188, "step": 77620}, {"loss": 0.5668, "grad_norm": 0.9394578337669373, "learning_rate": 0.0002, "epoch": 5.574865350089767, "step": 77630}, {"loss": 0.6018, "grad_norm": 1.379382610321045, "learning_rate": 0.0002, "epoch": 5.575583482944345, "step": 77640}, {"loss": 0.5921, "grad_norm": 0.9787197709083557, "learning_rate": 0.0002, "epoch": 5.576301615798923, "step": 77650}, {"loss": 0.569, "grad_norm": 0.9680284261703491, "learning_rate": 0.0002, "epoch": 5.577019748653501, "step": 77660}, {"loss": 0.5761, "grad_norm": 1.0449682474136353, "learning_rate": 0.0002, "epoch": 5.577737881508079, "step": 77670}, {"loss": 0.5835, "grad_norm": 1.1243085861206055, "learning_rate": 0.0002, "epoch": 5.578456014362657, "step": 77680}, {"loss": 0.5873, "grad_norm": 0.9228966236114502, "learning_rate": 0.0002, "epoch": 5.579174147217235, "step": 77690}, {"loss": 0.5888, "grad_norm": 1.1349890232086182, "learning_rate": 0.0002, "epoch": 5.579892280071813, "step": 77700}, {"loss": 0.6272, "grad_norm": 1.2248499393463135, "learning_rate": 0.0002, "epoch": 5.580610412926392, "step": 77710}, {"loss": 0.5734, "grad_norm": 1.0066324472427368, "learning_rate": 0.0002, "epoch": 5.58132854578097, "step": 77720}, {"loss": 0.6047, "grad_norm": 1.2642878293991089, "learning_rate": 0.0002, "epoch": 5.582046678635548, "step": 77730}, {"loss": 0.5946, "grad_norm": 1.031591534614563, "learning_rate": 0.0002, "epoch": 5.582764811490126, "step": 77740}, {"loss": 0.5743, "grad_norm": 1.0925929546356201, "learning_rate": 0.0002, "epoch": 5.583482944344704, "step": 77750}, {"loss": 0.6113, "grad_norm": 1.0567110776901245, "learning_rate": 0.0002, "epoch": 5.584201077199282, "step": 77760}, {"loss": 0.5523, "grad_norm": 1.246246099472046, "learning_rate": 0.0002, "epoch": 5.58491921005386, "step": 77770}, {"loss": 0.5934, "grad_norm": 1.2467739582061768, "learning_rate": 0.0002, "epoch": 5.585637342908438, "step": 77780}, {"loss": 0.6211, "grad_norm": 1.2695211172103882, "learning_rate": 0.0002, "epoch": 5.586355475763016, "step": 77790}, {"loss": 0.5824, "grad_norm": 1.0498571395874023, "learning_rate": 0.0002, "epoch": 5.587073608617594, "step": 77800}, {"loss": 0.5545, "grad_norm": 1.0078339576721191, "learning_rate": 0.0002, "epoch": 5.587791741472173, "step": 77810}, {"loss": 0.5995, "grad_norm": 1.108199954032898, "learning_rate": 0.0002, "epoch": 5.588509874326751, "step": 77820}, {"loss": 0.5716, "grad_norm": 1.0577641725540161, "learning_rate": 0.0002, "epoch": 5.589228007181329, "step": 77830}, {"loss": 0.6106, "grad_norm": 1.2169439792633057, "learning_rate": 0.0002, "epoch": 5.589946140035907, "step": 77840}, {"loss": 0.563, "grad_norm": 0.8310868740081787, "learning_rate": 0.0002, "epoch": 5.590664272890485, "step": 77850}, {"loss": 0.5749, "grad_norm": 0.9794082045555115, "learning_rate": 0.0002, "epoch": 5.591382405745063, "step": 77860}, {"loss": 0.6025, "grad_norm": 0.8867404460906982, "learning_rate": 0.0002, "epoch": 5.592100538599641, "step": 77870}, {"loss": 0.5581, "grad_norm": 0.9204208254814148, "learning_rate": 0.0002, "epoch": 5.592818671454219, "step": 77880}, {"loss": 0.5646, "grad_norm": 0.9801714420318604, "learning_rate": 0.0002, "epoch": 5.593536804308797, "step": 77890}, {"loss": 0.6036, "grad_norm": 0.9383925199508667, "learning_rate": 0.0002, "epoch": 5.594254937163376, "step": 77900}, {"loss": 0.6417, "grad_norm": 0.9124664068222046, "learning_rate": 0.0002, "epoch": 5.594973070017954, "step": 77910}, {"loss": 0.559, "grad_norm": 0.9618783593177795, "learning_rate": 0.0002, "epoch": 5.595691202872532, "step": 77920}, {"loss": 0.604, "grad_norm": 0.9575216770172119, "learning_rate": 0.0002, "epoch": 5.59640933572711, "step": 77930}, {"loss": 0.5987, "grad_norm": 1.1223464012145996, "learning_rate": 0.0002, "epoch": 5.597127468581688, "step": 77940}, {"loss": 0.615, "grad_norm": 0.9947475790977478, "learning_rate": 0.0002, "epoch": 5.597845601436266, "step": 77950}, {"loss": 0.5618, "grad_norm": 1.141959309577942, "learning_rate": 0.0002, "epoch": 5.598563734290844, "step": 77960}, {"loss": 0.5966, "grad_norm": 1.095525860786438, "learning_rate": 0.0002, "epoch": 5.599281867145422, "step": 77970}, {"loss": 0.5619, "grad_norm": 0.9396624565124512, "learning_rate": 0.0002, "epoch": 5.6, "step": 77980}, {"loss": 0.5549, "grad_norm": 0.8162274956703186, "learning_rate": 0.0002, "epoch": 5.600718132854578, "step": 77990}, {"loss": 0.5815, "grad_norm": 1.0130535364151, "learning_rate": 0.0002, "epoch": 5.6014362657091565, "step": 78000}, {"loss": 0.5891, "grad_norm": 1.0016634464263916, "learning_rate": 0.0002, "epoch": 5.6021543985637345, "step": 78010}, {"loss": 0.6029, "grad_norm": 0.8936169743537903, "learning_rate": 0.0002, "epoch": 5.6028725314183125, "step": 78020}, {"loss": 0.6284, "grad_norm": 1.169625163078308, "learning_rate": 0.0002, "epoch": 5.6035906642728905, "step": 78030}, {"loss": 0.6038, "grad_norm": 0.8896323442459106, "learning_rate": 0.0002, "epoch": 5.6043087971274685, "step": 78040}, {"loss": 0.6219, "grad_norm": 1.0939475297927856, "learning_rate": 0.0002, "epoch": 5.6050269299820465, "step": 78050}, {"loss": 0.6009, "grad_norm": 1.0880711078643799, "learning_rate": 0.0002, "epoch": 5.6057450628366245, "step": 78060}, {"loss": 0.6416, "grad_norm": 1.1426655054092407, "learning_rate": 0.0002, "epoch": 5.6064631956912026, "step": 78070}, {"loss": 0.6124, "grad_norm": 1.118586540222168, "learning_rate": 0.0002, "epoch": 5.607181328545781, "step": 78080}, {"loss": 0.5791, "grad_norm": 0.8784464597702026, "learning_rate": 0.0002, "epoch": 5.607899461400359, "step": 78090}, {"loss": 0.6385, "grad_norm": 1.137229561805725, "learning_rate": 0.0002, "epoch": 5.608617594254937, "step": 78100}, {"loss": 0.5998, "grad_norm": 1.1041932106018066, "learning_rate": 0.0002, "epoch": 5.6093357271095154, "step": 78110}, {"loss": 0.5985, "grad_norm": 1.0170503854751587, "learning_rate": 0.0002, "epoch": 5.6100538599640934, "step": 78120}, {"loss": 0.6376, "grad_norm": 1.298754334449768, "learning_rate": 0.0002, "epoch": 5.6107719928186714, "step": 78130}, {"loss": 0.6284, "grad_norm": 0.9344905018806458, "learning_rate": 0.0002, "epoch": 5.6114901256732495, "step": 78140}, {"loss": 0.5835, "grad_norm": 0.9467785954475403, "learning_rate": 0.0002, "epoch": 5.6122082585278275, "step": 78150}, {"loss": 0.5661, "grad_norm": 1.0617443323135376, "learning_rate": 0.0002, "epoch": 5.6129263913824055, "step": 78160}, {"loss": 0.5908, "grad_norm": 0.9017760753631592, "learning_rate": 0.0002, "epoch": 5.6136445242369835, "step": 78170}, {"loss": 0.5701, "grad_norm": 1.152601957321167, "learning_rate": 0.0002, "epoch": 5.6143626570915615, "step": 78180}, {"loss": 0.6319, "grad_norm": 0.9889463186264038, "learning_rate": 0.0002, "epoch": 5.61508078994614, "step": 78190}, {"loss": 0.5733, "grad_norm": 1.0367393493652344, "learning_rate": 0.0002, "epoch": 5.615798922800718, "step": 78200}, {"loss": 0.5785, "grad_norm": 0.8466457724571228, "learning_rate": 0.0002, "epoch": 5.616517055655296, "step": 78210}, {"loss": 0.563, "grad_norm": 0.936083197593689, "learning_rate": 0.0002, "epoch": 5.617235188509874, "step": 78220}, {"loss": 0.6077, "grad_norm": 1.018784999847412, "learning_rate": 0.0002, "epoch": 5.617953321364452, "step": 78230}, {"loss": 0.5676, "grad_norm": 0.8527804017066956, "learning_rate": 0.0002, "epoch": 5.61867145421903, "step": 78240}, {"loss": 0.5721, "grad_norm": 1.1873106956481934, "learning_rate": 0.0002, "epoch": 5.619389587073608, "step": 78250}, {"loss": 0.5905, "grad_norm": 0.9401728510856628, "learning_rate": 0.0002, "epoch": 5.620107719928186, "step": 78260}, {"loss": 0.5986, "grad_norm": 1.0801159143447876, "learning_rate": 0.0002, "epoch": 5.620825852782765, "step": 78270}, {"loss": 0.5769, "grad_norm": 1.0053739547729492, "learning_rate": 0.0002, "epoch": 5.621543985637343, "step": 78280}, {"loss": 0.5907, "grad_norm": 0.8599331378936768, "learning_rate": 0.0002, "epoch": 5.622262118491921, "step": 78290}, {"loss": 0.5689, "grad_norm": 2.3157296180725098, "learning_rate": 0.0002, "epoch": 5.622980251346499, "step": 78300}, {"loss": 0.5749, "grad_norm": 1.0027490854263306, "learning_rate": 0.0002, "epoch": 5.623698384201077, "step": 78310}, {"loss": 0.5452, "grad_norm": 0.996688961982727, "learning_rate": 0.0002, "epoch": 5.624416517055655, "step": 78320}, {"loss": 0.5979, "grad_norm": 1.0462113618850708, "learning_rate": 0.0002, "epoch": 5.625134649910233, "step": 78330}, {"loss": 0.5547, "grad_norm": 0.8750988245010376, "learning_rate": 0.0002, "epoch": 5.625852782764811, "step": 78340}, {"loss": 0.6076, "grad_norm": 0.8078145384788513, "learning_rate": 0.0002, "epoch": 5.626570915619389, "step": 78350}, {"loss": 0.6431, "grad_norm": 0.9047532081604004, "learning_rate": 0.0002, "epoch": 5.627289048473967, "step": 78360}, {"loss": 0.6027, "grad_norm": 0.9784479737281799, "learning_rate": 0.0002, "epoch": 5.628007181328546, "step": 78370}, {"loss": 0.6005, "grad_norm": 0.9529541730880737, "learning_rate": 0.0002, "epoch": 5.628725314183124, "step": 78380}, {"loss": 0.6057, "grad_norm": 0.8264740109443665, "learning_rate": 0.0002, "epoch": 5.629443447037702, "step": 78390}, {"loss": 0.5991, "grad_norm": 1.049724817276001, "learning_rate": 0.0002, "epoch": 5.63016157989228, "step": 78400}, {"loss": 0.5637, "grad_norm": 0.9866746068000793, "learning_rate": 0.0002, "epoch": 5.630879712746858, "step": 78410}, {"loss": 0.5622, "grad_norm": 0.897155225276947, "learning_rate": 0.0002, "epoch": 5.631597845601436, "step": 78420}, {"loss": 0.5838, "grad_norm": 1.225464940071106, "learning_rate": 0.0002, "epoch": 5.632315978456014, "step": 78430}, {"loss": 0.5928, "grad_norm": 0.8793753981590271, "learning_rate": 0.0002, "epoch": 5.633034111310592, "step": 78440}, {"loss": 0.6009, "grad_norm": 1.082482099533081, "learning_rate": 0.0002, "epoch": 5.63375224416517, "step": 78450}, {"loss": 0.6546, "grad_norm": 1.054064393043518, "learning_rate": 0.0002, "epoch": 5.634470377019749, "step": 78460}, {"loss": 0.5795, "grad_norm": 1.0032247304916382, "learning_rate": 0.0002, "epoch": 5.635188509874327, "step": 78470}, {"loss": 0.5697, "grad_norm": 0.8544651865959167, "learning_rate": 0.0002, "epoch": 5.635906642728905, "step": 78480}, {"loss": 0.6196, "grad_norm": 0.9475075602531433, "learning_rate": 0.0002, "epoch": 5.636624775583483, "step": 78490}, {"loss": 0.5975, "grad_norm": 1.0814138650894165, "learning_rate": 0.0002, "epoch": 5.637342908438061, "step": 78500}, {"loss": 0.5853, "grad_norm": 1.0813153982162476, "learning_rate": 0.0002, "epoch": 5.638061041292639, "step": 78510}, {"loss": 0.5806, "grad_norm": 1.0225616693496704, "learning_rate": 0.0002, "epoch": 5.638779174147217, "step": 78520}, {"loss": 0.5913, "grad_norm": 1.0777465105056763, "learning_rate": 0.0002, "epoch": 5.639497307001795, "step": 78530}, {"loss": 0.6207, "grad_norm": 1.156148910522461, "learning_rate": 0.0002, "epoch": 5.640215439856373, "step": 78540}, {"loss": 0.5843, "grad_norm": 1.0147465467453003, "learning_rate": 0.0002, "epoch": 5.640933572710951, "step": 78550}, {"loss": 0.6045, "grad_norm": 0.9606683850288391, "learning_rate": 0.0002, "epoch": 5.64165170556553, "step": 78560}, {"loss": 0.6457, "grad_norm": 0.9478723406791687, "learning_rate": 0.0002, "epoch": 5.642369838420108, "step": 78570}, {"loss": 0.5502, "grad_norm": 1.0653880834579468, "learning_rate": 0.0002, "epoch": 5.643087971274686, "step": 78580}, {"loss": 0.5938, "grad_norm": 1.7519923448562622, "learning_rate": 0.0002, "epoch": 5.643806104129264, "step": 78590}, {"loss": 0.6015, "grad_norm": 1.0567299127578735, "learning_rate": 0.0002, "epoch": 5.644524236983842, "step": 78600}, {"loss": 0.6329, "grad_norm": 0.8980287909507751, "learning_rate": 0.0002, "epoch": 5.64524236983842, "step": 78610}, {"loss": 0.6319, "grad_norm": 0.8792264461517334, "learning_rate": 0.0002, "epoch": 5.645960502692998, "step": 78620}, {"loss": 0.6234, "grad_norm": 1.2306275367736816, "learning_rate": 0.0002, "epoch": 5.646678635547576, "step": 78630}, {"loss": 0.5567, "grad_norm": 0.8259932398796082, "learning_rate": 0.0002, "epoch": 5.647396768402155, "step": 78640}, {"loss": 0.5484, "grad_norm": 0.9605076313018799, "learning_rate": 0.0002, "epoch": 5.648114901256733, "step": 78650}, {"loss": 0.5934, "grad_norm": 0.9967419505119324, "learning_rate": 0.0002, "epoch": 5.648833034111311, "step": 78660}, {"loss": 0.5755, "grad_norm": 0.9774024486541748, "learning_rate": 0.0002, "epoch": 5.649551166965889, "step": 78670}, {"loss": 0.6079, "grad_norm": 0.9838066697120667, "learning_rate": 0.0002, "epoch": 5.650269299820467, "step": 78680}, {"loss": 0.5674, "grad_norm": 1.1617798805236816, "learning_rate": 0.0002, "epoch": 5.650987432675045, "step": 78690}, {"loss": 0.6252, "grad_norm": 1.075006365776062, "learning_rate": 0.0002, "epoch": 5.651705565529623, "step": 78700}, {"loss": 0.5404, "grad_norm": 0.8859893679618835, "learning_rate": 0.0002, "epoch": 5.652423698384201, "step": 78710}, {"loss": 0.5657, "grad_norm": 1.0774717330932617, "learning_rate": 0.0002, "epoch": 5.653141831238779, "step": 78720}, {"loss": 0.625, "grad_norm": 1.147273302078247, "learning_rate": 0.0002, "epoch": 5.653859964093357, "step": 78730}, {"loss": 0.5819, "grad_norm": 1.1403213739395142, "learning_rate": 0.0002, "epoch": 5.654578096947935, "step": 78740}, {"loss": 0.5721, "grad_norm": 0.9115353226661682, "learning_rate": 0.0002, "epoch": 5.655296229802514, "step": 78750}, {"loss": 0.5521, "grad_norm": 0.9303002953529358, "learning_rate": 0.0002, "epoch": 5.656014362657092, "step": 78760}, {"loss": 0.6078, "grad_norm": 0.9324957728385925, "learning_rate": 0.0002, "epoch": 5.65673249551167, "step": 78770}, {"loss": 0.589, "grad_norm": 0.9688063859939575, "learning_rate": 0.0002, "epoch": 5.657450628366248, "step": 78780}, {"loss": 0.614, "grad_norm": 0.9019638299942017, "learning_rate": 0.0002, "epoch": 5.658168761220826, "step": 78790}, {"loss": 0.5594, "grad_norm": 0.8236798048019409, "learning_rate": 0.0002, "epoch": 5.658886894075404, "step": 78800}, {"loss": 0.6074, "grad_norm": 1.2702386379241943, "learning_rate": 0.0002, "epoch": 5.659605026929982, "step": 78810}, {"loss": 0.5738, "grad_norm": 1.041077971458435, "learning_rate": 0.0002, "epoch": 5.66032315978456, "step": 78820}, {"loss": 0.5773, "grad_norm": 0.9028838276863098, "learning_rate": 0.0002, "epoch": 5.661041292639139, "step": 78830}, {"loss": 0.5871, "grad_norm": 0.9874144196510315, "learning_rate": 0.0002, "epoch": 5.661759425493717, "step": 78840}, {"loss": 0.6039, "grad_norm": 0.9633761048316956, "learning_rate": 0.0002, "epoch": 5.662477558348295, "step": 78850}, {"loss": 0.5794, "grad_norm": 0.9069564342498779, "learning_rate": 0.0002, "epoch": 5.663195691202873, "step": 78860}, {"loss": 0.5836, "grad_norm": 0.9560621976852417, "learning_rate": 0.0002, "epoch": 5.663913824057451, "step": 78870}, {"loss": 0.579, "grad_norm": 0.9941161870956421, "learning_rate": 0.0002, "epoch": 5.664631956912029, "step": 78880}, {"loss": 0.6184, "grad_norm": 0.920407235622406, "learning_rate": 0.0002, "epoch": 5.665350089766607, "step": 78890}, {"loss": 0.6223, "grad_norm": 0.9909250140190125, "learning_rate": 0.0002, "epoch": 5.666068222621185, "step": 78900}, {"loss": 0.6154, "grad_norm": 0.9528568983078003, "learning_rate": 0.0002, "epoch": 5.666786355475763, "step": 78910}, {"loss": 0.6153, "grad_norm": 1.041440725326538, "learning_rate": 0.0002, "epoch": 5.667504488330341, "step": 78920}, {"loss": 0.609, "grad_norm": 1.0072191953659058, "learning_rate": 0.0002, "epoch": 5.66822262118492, "step": 78930}, {"loss": 0.6136, "grad_norm": 1.0740574598312378, "learning_rate": 0.0002, "epoch": 5.668940754039498, "step": 78940}, {"loss": 0.583, "grad_norm": 0.9168822169303894, "learning_rate": 0.0002, "epoch": 5.669658886894076, "step": 78950}, {"loss": 0.5808, "grad_norm": 1.1818004846572876, "learning_rate": 0.0002, "epoch": 5.670377019748654, "step": 78960}, {"loss": 0.6584, "grad_norm": 1.1925201416015625, "learning_rate": 0.0002, "epoch": 5.671095152603232, "step": 78970}, {"loss": 0.6074, "grad_norm": 0.879940390586853, "learning_rate": 0.0002, "epoch": 5.67181328545781, "step": 78980}, {"loss": 0.5863, "grad_norm": 1.0998331308364868, "learning_rate": 0.0002, "epoch": 5.672531418312388, "step": 78990}, {"loss": 0.5688, "grad_norm": 1.076637625694275, "learning_rate": 0.0002, "epoch": 5.673249551166966, "step": 79000}, {"loss": 0.6183, "grad_norm": 1.076864242553711, "learning_rate": 0.0002, "epoch": 5.673967684021544, "step": 79010}, {"loss": 0.6031, "grad_norm": 1.0206586122512817, "learning_rate": 0.0002, "epoch": 5.6746858168761225, "step": 79020}, {"loss": 0.5658, "grad_norm": 0.8242515325546265, "learning_rate": 0.0002, "epoch": 5.6754039497307005, "step": 79030}, {"loss": 0.5782, "grad_norm": 1.1180634498596191, "learning_rate": 0.0002, "epoch": 5.6761220825852785, "step": 79040}, {"loss": 0.6039, "grad_norm": 1.0155152082443237, "learning_rate": 0.0002, "epoch": 5.6768402154398565, "step": 79050}, {"loss": 0.5877, "grad_norm": 1.0445241928100586, "learning_rate": 0.0002, "epoch": 5.6775583482944345, "step": 79060}, {"loss": 0.5809, "grad_norm": 0.9851725697517395, "learning_rate": 0.0002, "epoch": 5.6782764811490125, "step": 79070}, {"loss": 0.5807, "grad_norm": 0.9979640245437622, "learning_rate": 0.0002, "epoch": 5.6789946140035905, "step": 79080}, {"loss": 0.6049, "grad_norm": 1.0398952960968018, "learning_rate": 0.0002, "epoch": 5.6797127468581685, "step": 79090}, {"loss": 0.6279, "grad_norm": 1.094164252281189, "learning_rate": 0.0002, "epoch": 5.6804308797127465, "step": 79100}, {"loss": 0.6325, "grad_norm": 0.9546816945075989, "learning_rate": 0.0002, "epoch": 5.6811490125673245, "step": 79110}, {"loss": 0.5658, "grad_norm": 1.1635938882827759, "learning_rate": 0.0002, "epoch": 5.681867145421903, "step": 79120}, {"loss": 0.5849, "grad_norm": 1.0260306596755981, "learning_rate": 0.0002, "epoch": 5.682585278276481, "step": 79130}, {"loss": 0.5653, "grad_norm": 0.9900122284889221, "learning_rate": 0.0002, "epoch": 5.683303411131059, "step": 79140}, {"loss": 0.6107, "grad_norm": 1.049688458442688, "learning_rate": 0.0002, "epoch": 5.684021543985637, "step": 79150}, {"loss": 0.5887, "grad_norm": 1.124272108078003, "learning_rate": 0.0002, "epoch": 5.684739676840215, "step": 79160}, {"loss": 0.5695, "grad_norm": 1.1109849214553833, "learning_rate": 0.0002, "epoch": 5.685457809694793, "step": 79170}, {"loss": 0.6014, "grad_norm": 0.739007830619812, "learning_rate": 0.0002, "epoch": 5.686175942549371, "step": 79180}, {"loss": 0.5995, "grad_norm": 1.2063007354736328, "learning_rate": 0.0002, "epoch": 5.686894075403949, "step": 79190}, {"loss": 0.5563, "grad_norm": 1.223317265510559, "learning_rate": 0.0002, "epoch": 5.687612208258528, "step": 79200}, {"loss": 0.6017, "grad_norm": 0.8042855858802795, "learning_rate": 0.0002, "epoch": 5.688330341113106, "step": 79210}, {"loss": 0.5909, "grad_norm": 0.9294175505638123, "learning_rate": 0.0002, "epoch": 5.689048473967684, "step": 79220}, {"loss": 0.6091, "grad_norm": 0.978084146976471, "learning_rate": 0.0002, "epoch": 5.689766606822262, "step": 79230}, {"loss": 0.6094, "grad_norm": 0.9271620512008667, "learning_rate": 0.0002, "epoch": 5.69048473967684, "step": 79240}, {"loss": 0.6454, "grad_norm": 1.158677339553833, "learning_rate": 0.0002, "epoch": 5.691202872531418, "step": 79250}, {"loss": 0.6054, "grad_norm": 0.9468576312065125, "learning_rate": 0.0002, "epoch": 5.691921005385996, "step": 79260}, {"loss": 0.6094, "grad_norm": 1.2025824785232544, "learning_rate": 0.0002, "epoch": 5.692639138240574, "step": 79270}, {"loss": 0.5995, "grad_norm": 1.0167860984802246, "learning_rate": 0.0002, "epoch": 5.693357271095152, "step": 79280}, {"loss": 0.5596, "grad_norm": 0.971199631690979, "learning_rate": 0.0002, "epoch": 5.69407540394973, "step": 79290}, {"loss": 0.6051, "grad_norm": 1.1757864952087402, "learning_rate": 0.0002, "epoch": 5.694793536804308, "step": 79300}, {"loss": 0.5915, "grad_norm": 1.0199662446975708, "learning_rate": 0.0002, "epoch": 5.695511669658887, "step": 79310}, {"loss": 0.5654, "grad_norm": 0.9662485122680664, "learning_rate": 0.0002, "epoch": 5.696229802513465, "step": 79320}, {"loss": 0.5602, "grad_norm": 0.9324414134025574, "learning_rate": 0.0002, "epoch": 5.696947935368043, "step": 79330}, {"loss": 0.5939, "grad_norm": 0.855752170085907, "learning_rate": 0.0002, "epoch": 5.697666068222621, "step": 79340}, {"loss": 0.6202, "grad_norm": 1.2723703384399414, "learning_rate": 0.0002, "epoch": 5.698384201077199, "step": 79350}, {"loss": 0.6028, "grad_norm": 1.0254011154174805, "learning_rate": 0.0002, "epoch": 5.699102333931777, "step": 79360}, {"loss": 0.5853, "grad_norm": 1.0958263874053955, "learning_rate": 0.0002, "epoch": 5.699820466786355, "step": 79370}, {"loss": 0.6292, "grad_norm": 1.0214145183563232, "learning_rate": 0.0002, "epoch": 5.700538599640933, "step": 79380}, {"loss": 0.6576, "grad_norm": 1.1087455749511719, "learning_rate": 0.0002, "epoch": 5.701256732495512, "step": 79390}, {"loss": 0.576, "grad_norm": 0.8885074853897095, "learning_rate": 0.0002, "epoch": 5.70197486535009, "step": 79400}, {"loss": 0.5452, "grad_norm": 0.9854450821876526, "learning_rate": 0.0002, "epoch": 5.702692998204668, "step": 79410}, {"loss": 0.5903, "grad_norm": 0.858744204044342, "learning_rate": 0.0002, "epoch": 5.703411131059246, "step": 79420}, {"loss": 0.5975, "grad_norm": 0.9434788823127747, "learning_rate": 0.0002, "epoch": 5.704129263913824, "step": 79430}, {"loss": 0.648, "grad_norm": 1.1388801336288452, "learning_rate": 0.0002, "epoch": 5.704847396768402, "step": 79440}, {"loss": 0.5895, "grad_norm": 1.0701899528503418, "learning_rate": 0.0002, "epoch": 5.70556552962298, "step": 79450}, {"loss": 0.5697, "grad_norm": 0.9147594571113586, "learning_rate": 0.0002, "epoch": 5.706283662477558, "step": 79460}, {"loss": 0.6043, "grad_norm": 1.055008053779602, "learning_rate": 0.0002, "epoch": 5.707001795332136, "step": 79470}, {"loss": 0.5625, "grad_norm": 0.7841609716415405, "learning_rate": 0.0002, "epoch": 5.707719928186714, "step": 79480}, {"loss": 0.6048, "grad_norm": 1.0334571599960327, "learning_rate": 0.0002, "epoch": 5.708438061041292, "step": 79490}, {"loss": 0.5924, "grad_norm": 1.2841367721557617, "learning_rate": 0.0002, "epoch": 5.709156193895871, "step": 79500}, {"loss": 0.5957, "grad_norm": 1.0296638011932373, "learning_rate": 0.0002, "epoch": 5.709874326750449, "step": 79510}, {"loss": 0.6015, "grad_norm": 0.9161922931671143, "learning_rate": 0.0002, "epoch": 5.710592459605027, "step": 79520}, {"loss": 0.6056, "grad_norm": 1.056856632232666, "learning_rate": 0.0002, "epoch": 5.711310592459605, "step": 79530}, {"loss": 0.5762, "grad_norm": 0.9919893145561218, "learning_rate": 0.0002, "epoch": 5.712028725314183, "step": 79540}, {"loss": 0.5987, "grad_norm": 1.1128891706466675, "learning_rate": 0.0002, "epoch": 5.712746858168761, "step": 79550}, {"loss": 0.5835, "grad_norm": 1.1171997785568237, "learning_rate": 0.0002, "epoch": 5.713464991023339, "step": 79560}, {"loss": 0.6037, "grad_norm": 0.9389346837997437, "learning_rate": 0.0002, "epoch": 5.714183123877917, "step": 79570}, {"loss": 0.5805, "grad_norm": 0.9869245886802673, "learning_rate": 0.0002, "epoch": 5.714901256732496, "step": 79580}, {"loss": 0.5776, "grad_norm": 0.9019966721534729, "learning_rate": 0.0002, "epoch": 5.715619389587074, "step": 79590}, {"loss": 0.567, "grad_norm": 0.9791252017021179, "learning_rate": 0.0002, "epoch": 5.716337522441652, "step": 79600}, {"loss": 0.5817, "grad_norm": 1.0269849300384521, "learning_rate": 0.0002, "epoch": 5.71705565529623, "step": 79610}, {"loss": 0.602, "grad_norm": 1.0340129137039185, "learning_rate": 0.0002, "epoch": 5.717773788150808, "step": 79620}, {"loss": 0.5969, "grad_norm": 0.9742604494094849, "learning_rate": 0.0002, "epoch": 5.718491921005386, "step": 79630}, {"loss": 0.5945, "grad_norm": 1.126868724822998, "learning_rate": 0.0002, "epoch": 5.719210053859964, "step": 79640}, {"loss": 0.601, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 5.719928186714542, "step": 79650}, {"loss": 0.6071, "grad_norm": 0.8300277590751648, "learning_rate": 0.0002, "epoch": 5.72064631956912, "step": 79660}, {"loss": 0.6121, "grad_norm": 0.8482570052146912, "learning_rate": 0.0002, "epoch": 5.721364452423698, "step": 79670}, {"loss": 0.5937, "grad_norm": 1.0777807235717773, "learning_rate": 0.0002, "epoch": 5.722082585278277, "step": 79680}, {"loss": 0.5739, "grad_norm": 1.2682723999023438, "learning_rate": 0.0002, "epoch": 5.722800718132855, "step": 79690}, {"loss": 0.5759, "grad_norm": 0.8742772340774536, "learning_rate": 0.0002, "epoch": 5.723518850987433, "step": 79700}, {"loss": 0.5839, "grad_norm": 0.9218387603759766, "learning_rate": 0.0002, "epoch": 5.724236983842011, "step": 79710}, {"loss": 0.5968, "grad_norm": 0.8977975845336914, "learning_rate": 0.0002, "epoch": 5.724955116696589, "step": 79720}, {"loss": 0.5743, "grad_norm": 1.0873085260391235, "learning_rate": 0.0002, "epoch": 5.725673249551167, "step": 79730}, {"loss": 0.5986, "grad_norm": 0.9811807870864868, "learning_rate": 0.0002, "epoch": 5.726391382405745, "step": 79740}, {"loss": 0.5881, "grad_norm": 0.926764965057373, "learning_rate": 0.0002, "epoch": 5.727109515260323, "step": 79750}, {"loss": 0.5738, "grad_norm": 1.0103713274002075, "learning_rate": 0.0002, "epoch": 5.727827648114902, "step": 79760}, {"loss": 0.5807, "grad_norm": 1.1389189958572388, "learning_rate": 0.0002, "epoch": 5.72854578096948, "step": 79770}, {"loss": 0.636, "grad_norm": 1.1654961109161377, "learning_rate": 0.0002, "epoch": 5.729263913824058, "step": 79780}, {"loss": 0.5863, "grad_norm": 0.7925996780395508, "learning_rate": 0.0002, "epoch": 5.729982046678636, "step": 79790}, {"loss": 0.6005, "grad_norm": 1.3329131603240967, "learning_rate": 0.0002, "epoch": 5.730700179533214, "step": 79800}, {"loss": 0.6295, "grad_norm": 1.158328890800476, "learning_rate": 0.0002, "epoch": 5.731418312387792, "step": 79810}, {"loss": 0.5832, "grad_norm": 0.9904412031173706, "learning_rate": 0.0002, "epoch": 5.73213644524237, "step": 79820}, {"loss": 0.582, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.732854578096948, "step": 79830}, {"loss": 0.6135, "grad_norm": 1.0224473476409912, "learning_rate": 0.0002, "epoch": 5.733572710951526, "step": 79840}, {"loss": 0.6063, "grad_norm": 1.0482215881347656, "learning_rate": 0.0002, "epoch": 5.734290843806104, "step": 79850}, {"loss": 0.5792, "grad_norm": 0.9790018200874329, "learning_rate": 0.0002, "epoch": 5.735008976660682, "step": 79860}, {"loss": 0.6089, "grad_norm": 1.034548044204712, "learning_rate": 0.0002, "epoch": 5.735727109515261, "step": 79870}, {"loss": 0.5676, "grad_norm": 0.799286961555481, "learning_rate": 0.0002, "epoch": 5.736445242369839, "step": 79880}, {"loss": 0.5344, "grad_norm": 1.0119048357009888, "learning_rate": 0.0002, "epoch": 5.737163375224417, "step": 79890}, {"loss": 0.5859, "grad_norm": 0.9742264151573181, "learning_rate": 0.0002, "epoch": 5.737881508078995, "step": 79900}, {"loss": 0.5992, "grad_norm": 1.0408239364624023, "learning_rate": 0.0002, "epoch": 5.738599640933573, "step": 79910}, {"loss": 0.6009, "grad_norm": 0.9165748953819275, "learning_rate": 0.0002, "epoch": 5.739317773788151, "step": 79920}, {"loss": 0.5864, "grad_norm": 1.1859451532363892, "learning_rate": 0.0002, "epoch": 5.740035906642729, "step": 79930}, {"loss": 0.5948, "grad_norm": 0.8772084712982178, "learning_rate": 0.0002, "epoch": 5.740754039497307, "step": 79940}, {"loss": 0.5614, "grad_norm": 1.0123273134231567, "learning_rate": 0.0002, "epoch": 5.741472172351886, "step": 79950}, {"loss": 0.6405, "grad_norm": 1.1873936653137207, "learning_rate": 0.0002, "epoch": 5.742190305206464, "step": 79960}, {"loss": 0.5818, "grad_norm": 0.9065699577331543, "learning_rate": 0.0002, "epoch": 5.742908438061042, "step": 79970}, {"loss": 0.6068, "grad_norm": 1.1626464128494263, "learning_rate": 0.0002, "epoch": 5.74362657091562, "step": 79980}, {"loss": 0.5814, "grad_norm": 1.0311716794967651, "learning_rate": 0.0002, "epoch": 5.744344703770198, "step": 79990}, {"loss": 0.5752, "grad_norm": 1.0865558385849, "learning_rate": 0.0002, "epoch": 5.745062836624776, "step": 80000}, {"loss": 0.6477, "grad_norm": 1.0257176160812378, "learning_rate": 0.0002, "epoch": 5.745780969479354, "step": 80010}, {"loss": 0.6172, "grad_norm": 0.9805439710617065, "learning_rate": 0.0002, "epoch": 5.746499102333932, "step": 80020}, {"loss": 0.5949, "grad_norm": 0.9744977355003357, "learning_rate": 0.0002, "epoch": 5.74721723518851, "step": 80030}, {"loss": 0.5893, "grad_norm": 1.302816390991211, "learning_rate": 0.0002, "epoch": 5.747935368043088, "step": 80040}, {"loss": 0.5653, "grad_norm": 0.8866990208625793, "learning_rate": 0.0002, "epoch": 5.748653500897666, "step": 80050}, {"loss": 0.5648, "grad_norm": 1.0133726596832275, "learning_rate": 0.0002, "epoch": 5.7493716337522445, "step": 80060}, {"loss": 0.6016, "grad_norm": 1.0043569803237915, "learning_rate": 0.0002, "epoch": 5.7500897666068225, "step": 80070}, {"loss": 0.6493, "grad_norm": 0.9100040197372437, "learning_rate": 0.0002, "epoch": 5.7508078994614005, "step": 80080}, {"loss": 0.5469, "grad_norm": 0.7994180917739868, "learning_rate": 0.0002, "epoch": 5.7515260323159785, "step": 80090}, {"loss": 0.6521, "grad_norm": 1.120188593864441, "learning_rate": 0.0002, "epoch": 5.7522441651705565, "step": 80100}, {"loss": 0.5737, "grad_norm": 0.9555420279502869, "learning_rate": 0.0002, "epoch": 5.7529622980251345, "step": 80110}, {"loss": 0.5897, "grad_norm": 1.0305951833724976, "learning_rate": 0.0002, "epoch": 5.7536804308797125, "step": 80120}, {"loss": 0.5821, "grad_norm": 0.9632731676101685, "learning_rate": 0.0002, "epoch": 5.7543985637342905, "step": 80130}, {"loss": 0.5618, "grad_norm": 1.2654297351837158, "learning_rate": 0.0002, "epoch": 5.755116696588869, "step": 80140}, {"loss": 0.6044, "grad_norm": 1.027190089225769, "learning_rate": 0.0002, "epoch": 5.755834829443447, "step": 80150}, {"loss": 0.6131, "grad_norm": 0.9829175472259521, "learning_rate": 0.0002, "epoch": 5.756552962298025, "step": 80160}, {"loss": 0.609, "grad_norm": 1.083803653717041, "learning_rate": 0.0002, "epoch": 5.757271095152603, "step": 80170}, {"loss": 0.6134, "grad_norm": 0.9353913068771362, "learning_rate": 0.0002, "epoch": 5.757989228007181, "step": 80180}, {"loss": 0.6515, "grad_norm": 1.1824370622634888, "learning_rate": 0.0002, "epoch": 5.758707360861759, "step": 80190}, {"loss": 0.6012, "grad_norm": 1.0901048183441162, "learning_rate": 0.0002, "epoch": 5.759425493716337, "step": 80200}, {"loss": 0.5639, "grad_norm": 1.0389254093170166, "learning_rate": 0.0002, "epoch": 5.760143626570915, "step": 80210}, {"loss": 0.6085, "grad_norm": 0.9746400117874146, "learning_rate": 0.0002, "epoch": 5.760861759425493, "step": 80220}, {"loss": 0.5874, "grad_norm": 0.9319248795509338, "learning_rate": 0.0002, "epoch": 5.761579892280071, "step": 80230}, {"loss": 0.5726, "grad_norm": 1.152784824371338, "learning_rate": 0.0002, "epoch": 5.76229802513465, "step": 80240}, {"loss": 0.5998, "grad_norm": 0.9462733864784241, "learning_rate": 0.0002, "epoch": 5.763016157989228, "step": 80250}, {"loss": 0.5755, "grad_norm": 0.8884182572364807, "learning_rate": 0.0002, "epoch": 5.763734290843806, "step": 80260}, {"loss": 0.5864, "grad_norm": 0.8755964636802673, "learning_rate": 0.0002, "epoch": 5.764452423698384, "step": 80270}, {"loss": 0.5659, "grad_norm": 0.8983452320098877, "learning_rate": 0.0002, "epoch": 5.765170556552962, "step": 80280}, {"loss": 0.5799, "grad_norm": 0.8565991520881653, "learning_rate": 0.0002, "epoch": 5.76588868940754, "step": 80290}, {"loss": 0.598, "grad_norm": 1.0557159185409546, "learning_rate": 0.0002, "epoch": 5.766606822262118, "step": 80300}, {"loss": 0.6441, "grad_norm": 1.057214379310608, "learning_rate": 0.0002, "epoch": 5.767324955116696, "step": 80310}, {"loss": 0.6038, "grad_norm": 0.9852516055107117, "learning_rate": 0.0002, "epoch": 5.768043087971275, "step": 80320}, {"loss": 0.5676, "grad_norm": 1.0339698791503906, "learning_rate": 0.0002, "epoch": 5.768761220825853, "step": 80330}, {"loss": 0.5963, "grad_norm": 1.0056889057159424, "learning_rate": 0.0002, "epoch": 5.769479353680431, "step": 80340}, {"loss": 0.5588, "grad_norm": 1.0941663980484009, "learning_rate": 0.0002, "epoch": 5.770197486535009, "step": 80350}, {"loss": 0.5729, "grad_norm": 1.2145589590072632, "learning_rate": 0.0002, "epoch": 5.770915619389587, "step": 80360}, {"loss": 0.5819, "grad_norm": 0.9609606862068176, "learning_rate": 0.0002, "epoch": 5.771633752244165, "step": 80370}, {"loss": 0.6313, "grad_norm": 0.8815773129463196, "learning_rate": 0.0002, "epoch": 5.772351885098743, "step": 80380}, {"loss": 0.6046, "grad_norm": 1.2630987167358398, "learning_rate": 0.0002, "epoch": 5.773070017953321, "step": 80390}, {"loss": 0.5918, "grad_norm": 1.0605450868606567, "learning_rate": 0.0002, "epoch": 5.773788150807899, "step": 80400}, {"loss": 0.6074, "grad_norm": 1.165069341659546, "learning_rate": 0.0002, "epoch": 5.774506283662477, "step": 80410}, {"loss": 0.5683, "grad_norm": 0.9038028717041016, "learning_rate": 0.0002, "epoch": 5.775224416517055, "step": 80420}, {"loss": 0.6024, "grad_norm": 1.0571858882904053, "learning_rate": 0.0002, "epoch": 5.775942549371634, "step": 80430}, {"loss": 0.624, "grad_norm": 1.0388168096542358, "learning_rate": 0.0002, "epoch": 5.776660682226212, "step": 80440}, {"loss": 0.6139, "grad_norm": 1.0552119016647339, "learning_rate": 0.0002, "epoch": 5.77737881508079, "step": 80450}, {"loss": 0.5988, "grad_norm": 1.0610109567642212, "learning_rate": 0.0002, "epoch": 5.778096947935368, "step": 80460}, {"loss": 0.6264, "grad_norm": 0.9906430244445801, "learning_rate": 0.0002, "epoch": 5.778815080789946, "step": 80470}, {"loss": 0.5807, "grad_norm": 1.1511857509613037, "learning_rate": 0.0002, "epoch": 5.779533213644524, "step": 80480}, {"loss": 0.6202, "grad_norm": 1.2738412618637085, "learning_rate": 0.0002, "epoch": 5.780251346499102, "step": 80490}, {"loss": 0.5957, "grad_norm": 0.8945937752723694, "learning_rate": 0.0002, "epoch": 5.78096947935368, "step": 80500}, {"loss": 0.6049, "grad_norm": 1.1105149984359741, "learning_rate": 0.0002, "epoch": 5.781687612208259, "step": 80510}, {"loss": 0.5989, "grad_norm": 0.8432297110557556, "learning_rate": 0.0002, "epoch": 5.782405745062837, "step": 80520}, {"loss": 0.6321, "grad_norm": 0.9257984757423401, "learning_rate": 0.0002, "epoch": 5.783123877917415, "step": 80530}, {"loss": 0.6191, "grad_norm": 1.1708799600601196, "learning_rate": 0.0002, "epoch": 5.783842010771993, "step": 80540}, {"loss": 0.5465, "grad_norm": 0.9969521164894104, "learning_rate": 0.0002, "epoch": 5.784560143626571, "step": 80550}, {"loss": 0.6569, "grad_norm": 1.0361413955688477, "learning_rate": 0.0002, "epoch": 5.785278276481149, "step": 80560}, {"loss": 0.6131, "grad_norm": 0.9876393675804138, "learning_rate": 0.0002, "epoch": 5.785996409335727, "step": 80570}, {"loss": 0.5586, "grad_norm": 1.0356241464614868, "learning_rate": 0.0002, "epoch": 5.786714542190305, "step": 80580}, {"loss": 0.5647, "grad_norm": 1.178865671157837, "learning_rate": 0.0002, "epoch": 5.787432675044883, "step": 80590}, {"loss": 0.578, "grad_norm": 0.8614338636398315, "learning_rate": 0.0002, "epoch": 5.788150807899461, "step": 80600}, {"loss": 0.5916, "grad_norm": 1.020734429359436, "learning_rate": 0.0002, "epoch": 5.788868940754039, "step": 80610}, {"loss": 0.6015, "grad_norm": 1.035951852798462, "learning_rate": 0.0002, "epoch": 5.789587073608618, "step": 80620}, {"loss": 0.5838, "grad_norm": 0.898637592792511, "learning_rate": 0.0002, "epoch": 5.790305206463196, "step": 80630}, {"loss": 0.5894, "grad_norm": 0.9803016781806946, "learning_rate": 0.0002, "epoch": 5.791023339317774, "step": 80640}, {"loss": 0.5806, "grad_norm": 1.2902555465698242, "learning_rate": 0.0002, "epoch": 5.791741472172352, "step": 80650}, {"loss": 0.6136, "grad_norm": 1.3364112377166748, "learning_rate": 0.0002, "epoch": 5.79245960502693, "step": 80660}, {"loss": 0.6071, "grad_norm": 0.8553985953330994, "learning_rate": 0.0002, "epoch": 5.793177737881508, "step": 80670}, {"loss": 0.5853, "grad_norm": 0.8211889863014221, "learning_rate": 0.0002, "epoch": 5.793895870736086, "step": 80680}, {"loss": 0.5732, "grad_norm": 0.9288306832313538, "learning_rate": 0.0002, "epoch": 5.794614003590664, "step": 80690}, {"loss": 0.6241, "grad_norm": 1.0716029405593872, "learning_rate": 0.0002, "epoch": 5.795332136445243, "step": 80700}, {"loss": 0.643, "grad_norm": 0.9957329034805298, "learning_rate": 0.0002, "epoch": 5.796050269299821, "step": 80710}, {"loss": 0.5762, "grad_norm": 0.9691376090049744, "learning_rate": 0.0002, "epoch": 5.796768402154399, "step": 80720}, {"loss": 0.6227, "grad_norm": 1.0590804815292358, "learning_rate": 0.0002, "epoch": 5.797486535008977, "step": 80730}, {"loss": 0.59, "grad_norm": 1.0408968925476074, "learning_rate": 0.0002, "epoch": 5.798204667863555, "step": 80740}, {"loss": 0.5656, "grad_norm": 1.0249526500701904, "learning_rate": 0.0002, "epoch": 5.798922800718133, "step": 80750}, {"loss": 0.5991, "grad_norm": 1.3658806085586548, "learning_rate": 0.0002, "epoch": 5.799640933572711, "step": 80760}, {"loss": 0.5671, "grad_norm": 0.9562603831291199, "learning_rate": 0.0002, "epoch": 5.800359066427289, "step": 80770}, {"loss": 0.5929, "grad_norm": 0.8790915012359619, "learning_rate": 0.0002, "epoch": 5.801077199281867, "step": 80780}, {"loss": 0.5864, "grad_norm": 0.8351004123687744, "learning_rate": 0.0002, "epoch": 5.801795332136445, "step": 80790}, {"loss": 0.5544, "grad_norm": 0.964562714099884, "learning_rate": 0.0002, "epoch": 5.802513464991024, "step": 80800}, {"loss": 0.6388, "grad_norm": 1.0873116254806519, "learning_rate": 0.0002, "epoch": 5.803231597845602, "step": 80810}, {"loss": 0.5891, "grad_norm": 0.9821216464042664, "learning_rate": 0.0002, "epoch": 5.80394973070018, "step": 80820}, {"loss": 0.631, "grad_norm": 1.1158807277679443, "learning_rate": 0.0002, "epoch": 5.804667863554758, "step": 80830}, {"loss": 0.6068, "grad_norm": 1.0098856687545776, "learning_rate": 0.0002, "epoch": 5.805385996409336, "step": 80840}, {"loss": 0.6112, "grad_norm": 0.9628035426139832, "learning_rate": 0.0002, "epoch": 5.806104129263914, "step": 80850}, {"loss": 0.6003, "grad_norm": 1.133800983428955, "learning_rate": 0.0002, "epoch": 5.806822262118492, "step": 80860}, {"loss": 0.5802, "grad_norm": 0.9423992037773132, "learning_rate": 0.0002, "epoch": 5.80754039497307, "step": 80870}, {"loss": 0.5729, "grad_norm": 1.0758612155914307, "learning_rate": 0.0002, "epoch": 5.808258527827648, "step": 80880}, {"loss": 0.586, "grad_norm": 1.232029914855957, "learning_rate": 0.0002, "epoch": 5.808976660682227, "step": 80890}, {"loss": 0.5932, "grad_norm": 1.1063108444213867, "learning_rate": 0.0002, "epoch": 5.809694793536805, "step": 80900}, {"loss": 0.5627, "grad_norm": 0.9759877920150757, "learning_rate": 0.0002, "epoch": 5.810412926391383, "step": 80910}, {"loss": 0.6169, "grad_norm": 0.9180193543434143, "learning_rate": 0.0002, "epoch": 5.811131059245961, "step": 80920}, {"loss": 0.6198, "grad_norm": 1.0818052291870117, "learning_rate": 0.0002, "epoch": 5.811849192100539, "step": 80930}, {"loss": 0.5997, "grad_norm": 0.998986542224884, "learning_rate": 0.0002, "epoch": 5.812567324955117, "step": 80940}, {"loss": 0.6183, "grad_norm": 1.1549060344696045, "learning_rate": 0.0002, "epoch": 5.813285457809695, "step": 80950}, {"loss": 0.5858, "grad_norm": 1.1900213956832886, "learning_rate": 0.0002, "epoch": 5.814003590664273, "step": 80960}, {"loss": 0.6249, "grad_norm": 0.8114368915557861, "learning_rate": 0.0002, "epoch": 5.814721723518851, "step": 80970}, {"loss": 0.6199, "grad_norm": 1.0296406745910645, "learning_rate": 0.0002, "epoch": 5.815439856373429, "step": 80980}, {"loss": 0.6226, "grad_norm": 1.0466746091842651, "learning_rate": 0.0002, "epoch": 5.8161579892280075, "step": 80990}, {"loss": 0.6303, "grad_norm": 1.0524508953094482, "learning_rate": 0.0002, "epoch": 5.8168761220825855, "step": 81000}, {"loss": 0.5708, "grad_norm": 1.1588358879089355, "learning_rate": 0.0002, "epoch": 5.8175942549371635, "step": 81010}, {"loss": 0.5818, "grad_norm": 0.9378601908683777, "learning_rate": 0.0002, "epoch": 5.8183123877917415, "step": 81020}, {"loss": 0.6404, "grad_norm": 0.9486441612243652, "learning_rate": 0.0002, "epoch": 5.8190305206463195, "step": 81030}, {"loss": 0.566, "grad_norm": 0.9805227518081665, "learning_rate": 0.0002, "epoch": 5.8197486535008975, "step": 81040}, {"loss": 0.6025, "grad_norm": 1.1627717018127441, "learning_rate": 0.0002, "epoch": 5.8204667863554755, "step": 81050}, {"loss": 0.5954, "grad_norm": 1.0716841220855713, "learning_rate": 0.0002, "epoch": 5.8211849192100535, "step": 81060}, {"loss": 0.6045, "grad_norm": 1.2398899793624878, "learning_rate": 0.0002, "epoch": 5.821903052064632, "step": 81070}, {"loss": 0.5813, "grad_norm": 1.0934730768203735, "learning_rate": 0.0002, "epoch": 5.82262118491921, "step": 81080}, {"loss": 0.5601, "grad_norm": 0.9701796174049377, "learning_rate": 0.0002, "epoch": 5.823339317773788, "step": 81090}, {"loss": 0.6493, "grad_norm": 1.0218969583511353, "learning_rate": 0.0002, "epoch": 5.824057450628366, "step": 81100}, {"loss": 0.6121, "grad_norm": 1.3066465854644775, "learning_rate": 0.0002, "epoch": 5.824775583482944, "step": 81110}, {"loss": 0.6145, "grad_norm": 1.1067441701889038, "learning_rate": 0.0002, "epoch": 5.825493716337522, "step": 81120}, {"loss": 0.5959, "grad_norm": 0.9750344753265381, "learning_rate": 0.0002, "epoch": 5.8262118491921004, "step": 81130}, {"loss": 0.6192, "grad_norm": 1.129191279411316, "learning_rate": 0.0002, "epoch": 5.8269299820466784, "step": 81140}, {"loss": 0.6191, "grad_norm": 1.05964195728302, "learning_rate": 0.0002, "epoch": 5.8276481149012564, "step": 81150}, {"loss": 0.6353, "grad_norm": 1.1094872951507568, "learning_rate": 0.0002, "epoch": 5.8283662477558345, "step": 81160}, {"loss": 0.5835, "grad_norm": 0.9163196086883545, "learning_rate": 0.0002, "epoch": 5.8290843806104125, "step": 81170}, {"loss": 0.6513, "grad_norm": 1.0035687685012817, "learning_rate": 0.0002, "epoch": 5.829802513464991, "step": 81180}, {"loss": 0.5948, "grad_norm": 1.0353461503982544, "learning_rate": 0.0002, "epoch": 5.830520646319569, "step": 81190}, {"loss": 0.602, "grad_norm": 1.0566555261611938, "learning_rate": 0.0002, "epoch": 5.831238779174147, "step": 81200}, {"loss": 0.6086, "grad_norm": 1.2373290061950684, "learning_rate": 0.0002, "epoch": 5.831956912028725, "step": 81210}, {"loss": 0.6054, "grad_norm": 0.8818837404251099, "learning_rate": 0.0002, "epoch": 5.832675044883303, "step": 81220}, {"loss": 0.604, "grad_norm": 1.1024713516235352, "learning_rate": 0.0002, "epoch": 5.833393177737881, "step": 81230}, {"loss": 0.6649, "grad_norm": 1.2478809356689453, "learning_rate": 0.0002, "epoch": 5.834111310592459, "step": 81240}, {"loss": 0.584, "grad_norm": 0.8647364377975464, "learning_rate": 0.0002, "epoch": 5.834829443447037, "step": 81250}, {"loss": 0.6089, "grad_norm": 1.1106358766555786, "learning_rate": 0.0002, "epoch": 5.835547576301616, "step": 81260}, {"loss": 0.5934, "grad_norm": 0.9432938694953918, "learning_rate": 0.0002, "epoch": 5.836265709156194, "step": 81270}, {"loss": 0.6401, "grad_norm": 1.0283797979354858, "learning_rate": 0.0002, "epoch": 5.836983842010772, "step": 81280}, {"loss": 0.6549, "grad_norm": 1.158918857574463, "learning_rate": 0.0002, "epoch": 5.83770197486535, "step": 81290}, {"loss": 0.5974, "grad_norm": 0.9700069427490234, "learning_rate": 0.0002, "epoch": 5.838420107719928, "step": 81300}, {"loss": 0.5841, "grad_norm": 1.08310866355896, "learning_rate": 0.0002, "epoch": 5.839138240574506, "step": 81310}, {"loss": 0.6234, "grad_norm": 1.05460524559021, "learning_rate": 0.0002, "epoch": 5.839856373429084, "step": 81320}, {"loss": 0.5586, "grad_norm": 0.9849268794059753, "learning_rate": 0.0002, "epoch": 5.840574506283662, "step": 81330}, {"loss": 0.5927, "grad_norm": 0.888306736946106, "learning_rate": 0.0002, "epoch": 5.84129263913824, "step": 81340}, {"loss": 0.6106, "grad_norm": 1.0337001085281372, "learning_rate": 0.0002, "epoch": 5.842010771992818, "step": 81350}, {"loss": 0.5957, "grad_norm": 1.0778567790985107, "learning_rate": 0.0002, "epoch": 5.842728904847397, "step": 81360}, {"loss": 0.5801, "grad_norm": 1.1484156847000122, "learning_rate": 0.0002, "epoch": 5.843447037701975, "step": 81370}, {"loss": 0.6348, "grad_norm": 1.0948245525360107, "learning_rate": 0.0002, "epoch": 5.844165170556553, "step": 81380}, {"loss": 0.5561, "grad_norm": 0.9363969564437866, "learning_rate": 0.0002, "epoch": 5.844883303411131, "step": 81390}, {"loss": 0.6336, "grad_norm": 1.0151013135910034, "learning_rate": 0.0002, "epoch": 5.845601436265709, "step": 81400}, {"loss": 0.6063, "grad_norm": 0.9925733804702759, "learning_rate": 0.0002, "epoch": 5.846319569120287, "step": 81410}, {"loss": 0.6512, "grad_norm": 1.0356744527816772, "learning_rate": 0.0002, "epoch": 5.847037701974865, "step": 81420}, {"loss": 0.5947, "grad_norm": 1.0633001327514648, "learning_rate": 0.0002, "epoch": 5.847755834829443, "step": 81430}, {"loss": 0.5851, "grad_norm": 0.9900460839271545, "learning_rate": 0.0002, "epoch": 5.848473967684021, "step": 81440}, {"loss": 0.6216, "grad_norm": 1.2677979469299316, "learning_rate": 0.0002, "epoch": 5.8491921005386, "step": 81450}, {"loss": 0.5633, "grad_norm": 0.8174138069152832, "learning_rate": 0.0002, "epoch": 5.849910233393178, "step": 81460}, {"loss": 0.6283, "grad_norm": 1.1986393928527832, "learning_rate": 0.0002, "epoch": 5.850628366247756, "step": 81470}, {"loss": 0.6056, "grad_norm": 1.1009358167648315, "learning_rate": 0.0002, "epoch": 5.851346499102334, "step": 81480}, {"loss": 0.6244, "grad_norm": 0.966446578502655, "learning_rate": 0.0002, "epoch": 5.852064631956912, "step": 81490}, {"loss": 0.5687, "grad_norm": 0.9657767415046692, "learning_rate": 0.0002, "epoch": 5.85278276481149, "step": 81500}, {"loss": 0.547, "grad_norm": 1.0480058193206787, "learning_rate": 0.0002, "epoch": 5.853500897666068, "step": 81510}, {"loss": 0.5737, "grad_norm": 1.2003830671310425, "learning_rate": 0.0002, "epoch": 5.854219030520646, "step": 81520}, {"loss": 0.602, "grad_norm": 0.8683754205703735, "learning_rate": 0.0002, "epoch": 5.854937163375224, "step": 81530}, {"loss": 0.5923, "grad_norm": 1.0860967636108398, "learning_rate": 0.0002, "epoch": 5.855655296229802, "step": 81540}, {"loss": 0.5959, "grad_norm": 1.0415282249450684, "learning_rate": 0.0002, "epoch": 5.856373429084381, "step": 81550}, {"loss": 0.6017, "grad_norm": 0.9897454380989075, "learning_rate": 0.0002, "epoch": 5.857091561938959, "step": 81560}, {"loss": 0.5588, "grad_norm": 1.173884630203247, "learning_rate": 0.0002, "epoch": 5.857809694793537, "step": 81570}, {"loss": 0.5715, "grad_norm": 1.2426209449768066, "learning_rate": 0.0002, "epoch": 5.858527827648115, "step": 81580}, {"loss": 0.6079, "grad_norm": 0.9390465021133423, "learning_rate": 0.0002, "epoch": 5.859245960502693, "step": 81590}, {"loss": 0.5896, "grad_norm": 1.1387195587158203, "learning_rate": 0.0002, "epoch": 5.859964093357271, "step": 81600}, {"loss": 0.6025, "grad_norm": 0.9902143478393555, "learning_rate": 0.0002, "epoch": 5.860682226211849, "step": 81610}, {"loss": 0.6197, "grad_norm": 0.8328776359558105, "learning_rate": 0.0002, "epoch": 5.861400359066427, "step": 81620}, {"loss": 0.6586, "grad_norm": 0.9837837815284729, "learning_rate": 0.0002, "epoch": 5.862118491921006, "step": 81630}, {"loss": 0.5793, "grad_norm": 1.0013370513916016, "learning_rate": 0.0002, "epoch": 5.862836624775584, "step": 81640}, {"loss": 0.6129, "grad_norm": 0.9408028721809387, "learning_rate": 0.0002, "epoch": 5.863554757630162, "step": 81650}, {"loss": 0.572, "grad_norm": 1.093140959739685, "learning_rate": 0.0002, "epoch": 5.86427289048474, "step": 81660}, {"loss": 0.6037, "grad_norm": 0.9554300904273987, "learning_rate": 0.0002, "epoch": 5.864991023339318, "step": 81670}, {"loss": 0.6136, "grad_norm": 1.1276485919952393, "learning_rate": 0.0002, "epoch": 5.865709156193896, "step": 81680}, {"loss": 0.6072, "grad_norm": 0.9628785252571106, "learning_rate": 0.0002, "epoch": 5.866427289048474, "step": 81690}, {"loss": 0.5962, "grad_norm": 0.9844689965248108, "learning_rate": 0.0002, "epoch": 5.867145421903052, "step": 81700}, {"loss": 0.5883, "grad_norm": 0.9679856896400452, "learning_rate": 0.0002, "epoch": 5.86786355475763, "step": 81710}, {"loss": 0.6244, "grad_norm": 1.0225571393966675, "learning_rate": 0.0002, "epoch": 5.868581687612208, "step": 81720}, {"loss": 0.6132, "grad_norm": 0.9330390691757202, "learning_rate": 0.0002, "epoch": 5.869299820466786, "step": 81730}, {"loss": 0.5895, "grad_norm": 1.0584566593170166, "learning_rate": 0.0002, "epoch": 5.870017953321365, "step": 81740}, {"loss": 0.5618, "grad_norm": 0.781548023223877, "learning_rate": 0.0002, "epoch": 5.870736086175943, "step": 81750}, {"loss": 0.5651, "grad_norm": 0.8906106352806091, "learning_rate": 0.0002, "epoch": 5.871454219030521, "step": 81760}, {"loss": 0.6258, "grad_norm": 1.1402281522750854, "learning_rate": 0.0002, "epoch": 5.872172351885099, "step": 81770}, {"loss": 0.5943, "grad_norm": 0.9991076588630676, "learning_rate": 0.0002, "epoch": 5.872890484739677, "step": 81780}, {"loss": 0.6095, "grad_norm": 1.0120140314102173, "learning_rate": 0.0002, "epoch": 5.873608617594255, "step": 81790}, {"loss": 0.6114, "grad_norm": 0.8857715725898743, "learning_rate": 0.0002, "epoch": 5.874326750448833, "step": 81800}, {"loss": 0.6027, "grad_norm": 0.8531954288482666, "learning_rate": 0.0002, "epoch": 5.875044883303411, "step": 81810}, {"loss": 0.6468, "grad_norm": 1.1601015329360962, "learning_rate": 0.0002, "epoch": 5.87576301615799, "step": 81820}, {"loss": 0.643, "grad_norm": 1.1435350179672241, "learning_rate": 0.0002, "epoch": 5.876481149012568, "step": 81830}, {"loss": 0.6195, "grad_norm": 0.9526153802871704, "learning_rate": 0.0002, "epoch": 5.877199281867146, "step": 81840}, {"loss": 0.648, "grad_norm": 1.06845223903656, "learning_rate": 0.0002, "epoch": 5.877917414721724, "step": 81850}, {"loss": 0.5963, "grad_norm": 0.9239344596862793, "learning_rate": 0.0002, "epoch": 5.878635547576302, "step": 81860}, {"loss": 0.5669, "grad_norm": 0.8632398247718811, "learning_rate": 0.0002, "epoch": 5.87935368043088, "step": 81870}, {"loss": 0.5904, "grad_norm": 0.9148443341255188, "learning_rate": 0.0002, "epoch": 5.880071813285458, "step": 81880}, {"loss": 0.5554, "grad_norm": 0.9910652041435242, "learning_rate": 0.0002, "epoch": 5.880789946140036, "step": 81890}, {"loss": 0.6132, "grad_norm": 0.8335179090499878, "learning_rate": 0.0002, "epoch": 5.881508078994614, "step": 81900}, {"loss": 0.6106, "grad_norm": 0.9921387434005737, "learning_rate": 0.0002, "epoch": 5.882226211849192, "step": 81910}, {"loss": 0.6327, "grad_norm": 1.0532517433166504, "learning_rate": 0.0002, "epoch": 5.88294434470377, "step": 81920}, {"loss": 0.6071, "grad_norm": 1.026400089263916, "learning_rate": 0.0002, "epoch": 5.883662477558349, "step": 81930}, {"loss": 0.6759, "grad_norm": 1.019195318222046, "learning_rate": 0.0002, "epoch": 5.884380610412927, "step": 81940}, {"loss": 0.5922, "grad_norm": 0.987238347530365, "learning_rate": 0.0002, "epoch": 5.885098743267505, "step": 81950}, {"loss": 0.5864, "grad_norm": 1.1714487075805664, "learning_rate": 0.0002, "epoch": 5.885816876122083, "step": 81960}, {"loss": 0.6006, "grad_norm": 1.0854483842849731, "learning_rate": 0.0002, "epoch": 5.886535008976661, "step": 81970}, {"loss": 0.588, "grad_norm": 1.0678396224975586, "learning_rate": 0.0002, "epoch": 5.887253141831239, "step": 81980}, {"loss": 0.6061, "grad_norm": 1.1009471416473389, "learning_rate": 0.0002, "epoch": 5.887971274685817, "step": 81990}, {"loss": 0.6397, "grad_norm": 1.2056844234466553, "learning_rate": 0.0002, "epoch": 5.888689407540395, "step": 82000}, {"loss": 0.6018, "grad_norm": 1.131302833557129, "learning_rate": 0.0002, "epoch": 5.8894075403949735, "step": 82010}, {"loss": 0.5822, "grad_norm": 1.4466036558151245, "learning_rate": 0.0002, "epoch": 5.8901256732495515, "step": 82020}, {"loss": 0.6295, "grad_norm": 1.051228404045105, "learning_rate": 0.0002, "epoch": 5.8908438061041295, "step": 82030}, {"loss": 0.5567, "grad_norm": 1.0010617971420288, "learning_rate": 0.0002, "epoch": 5.8915619389587075, "step": 82040}, {"loss": 0.5674, "grad_norm": 0.9095138311386108, "learning_rate": 0.0002, "epoch": 5.8922800718132855, "step": 82050}, {"loss": 0.5947, "grad_norm": 1.0237005949020386, "learning_rate": 0.0002, "epoch": 5.8929982046678635, "step": 82060}, {"loss": 0.6258, "grad_norm": 1.035122036933899, "learning_rate": 0.0002, "epoch": 5.8937163375224415, "step": 82070}, {"loss": 0.5866, "grad_norm": 1.0271964073181152, "learning_rate": 0.0002, "epoch": 5.8944344703770195, "step": 82080}, {"loss": 0.637, "grad_norm": 1.2044503688812256, "learning_rate": 0.0002, "epoch": 5.8951526032315975, "step": 82090}, {"loss": 0.6356, "grad_norm": 1.0275284051895142, "learning_rate": 0.0002, "epoch": 5.8958707360861755, "step": 82100}, {"loss": 0.6216, "grad_norm": 0.9974840879440308, "learning_rate": 0.0002, "epoch": 5.896588868940754, "step": 82110}, {"loss": 0.572, "grad_norm": 1.009968638420105, "learning_rate": 0.0002, "epoch": 5.897307001795332, "step": 82120}, {"loss": 0.6432, "grad_norm": 0.8396142721176147, "learning_rate": 0.0002, "epoch": 5.89802513464991, "step": 82130}, {"loss": 0.5671, "grad_norm": 1.002354621887207, "learning_rate": 0.0002, "epoch": 5.898743267504488, "step": 82140}, {"loss": 0.565, "grad_norm": 0.9998893737792969, "learning_rate": 0.0002, "epoch": 5.899461400359066, "step": 82150}, {"loss": 0.5836, "grad_norm": 1.1027010679244995, "learning_rate": 0.0002, "epoch": 5.900179533213644, "step": 82160}, {"loss": 0.6069, "grad_norm": 1.2028530836105347, "learning_rate": 0.0002, "epoch": 5.900897666068222, "step": 82170}, {"loss": 0.6184, "grad_norm": 1.0018759965896606, "learning_rate": 0.0002, "epoch": 5.9016157989228, "step": 82180}, {"loss": 0.5866, "grad_norm": 0.8911277055740356, "learning_rate": 0.0002, "epoch": 5.902333931777379, "step": 82190}, {"loss": 0.5638, "grad_norm": 1.0172009468078613, "learning_rate": 0.0002, "epoch": 5.903052064631957, "step": 82200}, {"loss": 0.6181, "grad_norm": 1.1664029359817505, "learning_rate": 0.0002, "epoch": 5.903770197486535, "step": 82210}, {"loss": 0.5863, "grad_norm": 1.0620089769363403, "learning_rate": 0.0002, "epoch": 5.904488330341113, "step": 82220}, {"loss": 0.6175, "grad_norm": 1.0756114721298218, "learning_rate": 0.0002, "epoch": 5.905206463195691, "step": 82230}, {"loss": 0.6223, "grad_norm": 1.1727497577667236, "learning_rate": 0.0002, "epoch": 5.905924596050269, "step": 82240}, {"loss": 0.5777, "grad_norm": 0.9833515882492065, "learning_rate": 0.0002, "epoch": 5.906642728904847, "step": 82250}, {"loss": 0.6344, "grad_norm": 0.9236368536949158, "learning_rate": 0.0002, "epoch": 5.907360861759425, "step": 82260}, {"loss": 0.6301, "grad_norm": 0.9773947596549988, "learning_rate": 0.0002, "epoch": 5.908078994614003, "step": 82270}, {"loss": 0.6255, "grad_norm": 1.1427783966064453, "learning_rate": 0.0002, "epoch": 5.908797127468581, "step": 82280}, {"loss": 0.6359, "grad_norm": 1.0215164422988892, "learning_rate": 0.0002, "epoch": 5.909515260323159, "step": 82290}, {"loss": 0.631, "grad_norm": 1.1157845258712769, "learning_rate": 0.0002, "epoch": 5.910233393177738, "step": 82300}, {"loss": 0.5706, "grad_norm": 1.1490662097930908, "learning_rate": 0.0002, "epoch": 5.910951526032316, "step": 82310}, {"loss": 0.5932, "grad_norm": 0.7233976125717163, "learning_rate": 0.0002, "epoch": 5.911669658886894, "step": 82320}, {"loss": 0.6199, "grad_norm": 1.0053865909576416, "learning_rate": 0.0002, "epoch": 5.912387791741472, "step": 82330}, {"loss": 0.6283, "grad_norm": 0.9764766097068787, "learning_rate": 0.0002, "epoch": 5.91310592459605, "step": 82340}, {"loss": 0.5981, "grad_norm": 0.9492928385734558, "learning_rate": 0.0002, "epoch": 5.913824057450628, "step": 82350}, {"loss": 0.6234, "grad_norm": 0.9538891315460205, "learning_rate": 0.0002, "epoch": 5.914542190305206, "step": 82360}, {"loss": 0.6717, "grad_norm": 1.2620314359664917, "learning_rate": 0.0002, "epoch": 5.915260323159784, "step": 82370}, {"loss": 0.5956, "grad_norm": 0.9913349151611328, "learning_rate": 0.0002, "epoch": 5.915978456014363, "step": 82380}, {"loss": 0.5877, "grad_norm": 0.9712074995040894, "learning_rate": 0.0002, "epoch": 5.916696588868941, "step": 82390}, {"loss": 0.5935, "grad_norm": 1.1554654836654663, "learning_rate": 0.0002, "epoch": 5.917414721723519, "step": 82400}, {"loss": 0.5881, "grad_norm": 1.1418904066085815, "learning_rate": 0.0002, "epoch": 5.918132854578097, "step": 82410}, {"loss": 0.5472, "grad_norm": 0.9405845999717712, "learning_rate": 0.0002, "epoch": 5.918850987432675, "step": 82420}, {"loss": 0.606, "grad_norm": 1.0801819562911987, "learning_rate": 0.0002, "epoch": 5.919569120287253, "step": 82430}, {"loss": 0.5953, "grad_norm": 0.8643896579742432, "learning_rate": 0.0002, "epoch": 5.920287253141831, "step": 82440}, {"loss": 0.6042, "grad_norm": 1.106025218963623, "learning_rate": 0.0002, "epoch": 5.921005385996409, "step": 82450}, {"loss": 0.5879, "grad_norm": 1.0338234901428223, "learning_rate": 0.0002, "epoch": 5.921723518850987, "step": 82460}, {"loss": 0.6733, "grad_norm": 1.0648493766784668, "learning_rate": 0.0002, "epoch": 5.922441651705565, "step": 82470}, {"loss": 0.6233, "grad_norm": 1.1950433254241943, "learning_rate": 0.0002, "epoch": 5.923159784560143, "step": 82480}, {"loss": 0.6148, "grad_norm": 0.8730897903442383, "learning_rate": 0.0002, "epoch": 5.923877917414722, "step": 82490}, {"loss": 0.6138, "grad_norm": 1.2262312173843384, "learning_rate": 0.0002, "epoch": 5.9245960502693, "step": 82500}, {"loss": 0.616, "grad_norm": 0.9526116251945496, "learning_rate": 0.0002, "epoch": 5.925314183123878, "step": 82510}, {"loss": 0.6372, "grad_norm": 1.0540224313735962, "learning_rate": 0.0002, "epoch": 5.926032315978456, "step": 82520}, {"loss": 0.6102, "grad_norm": 1.0537306070327759, "learning_rate": 0.0002, "epoch": 5.926750448833034, "step": 82530}, {"loss": 0.5789, "grad_norm": 1.134207844734192, "learning_rate": 0.0002, "epoch": 5.927468581687612, "step": 82540}, {"loss": 0.622, "grad_norm": 0.9042250514030457, "learning_rate": 0.0002, "epoch": 5.92818671454219, "step": 82550}, {"loss": 0.6207, "grad_norm": 1.0424834489822388, "learning_rate": 0.0002, "epoch": 5.928904847396768, "step": 82560}, {"loss": 0.5334, "grad_norm": 1.1571602821350098, "learning_rate": 0.0002, "epoch": 5.929622980251347, "step": 82570}, {"loss": 0.6549, "grad_norm": 1.1033377647399902, "learning_rate": 0.0002, "epoch": 5.930341113105925, "step": 82580}, {"loss": 0.5819, "grad_norm": 0.9211772680282593, "learning_rate": 0.0002, "epoch": 5.931059245960503, "step": 82590}, {"loss": 0.591, "grad_norm": 1.0566459894180298, "learning_rate": 0.0002, "epoch": 5.931777378815081, "step": 82600}, {"loss": 0.6318, "grad_norm": 1.1773834228515625, "learning_rate": 0.0002, "epoch": 5.932495511669659, "step": 82610}, {"loss": 0.6067, "grad_norm": 1.193396806716919, "learning_rate": 0.0002, "epoch": 5.933213644524237, "step": 82620}, {"loss": 0.6105, "grad_norm": 1.1101785898208618, "learning_rate": 0.0002, "epoch": 5.933931777378815, "step": 82630}, {"loss": 0.5742, "grad_norm": 0.6988118886947632, "learning_rate": 0.0002, "epoch": 5.934649910233393, "step": 82640}, {"loss": 0.626, "grad_norm": 0.9590985774993896, "learning_rate": 0.0002, "epoch": 5.935368043087971, "step": 82650}, {"loss": 0.5909, "grad_norm": 0.8512062430381775, "learning_rate": 0.0002, "epoch": 5.936086175942549, "step": 82660}, {"loss": 0.539, "grad_norm": 1.0381710529327393, "learning_rate": 0.0002, "epoch": 5.936804308797128, "step": 82670}, {"loss": 0.5608, "grad_norm": 1.0816296339035034, "learning_rate": 0.0002, "epoch": 5.937522441651706, "step": 82680}, {"loss": 0.6087, "grad_norm": 1.0592364072799683, "learning_rate": 0.0002, "epoch": 5.938240574506284, "step": 82690}, {"loss": 0.5792, "grad_norm": 0.737452507019043, "learning_rate": 0.0002, "epoch": 5.938958707360862, "step": 82700}, {"loss": 0.6031, "grad_norm": 0.9019039869308472, "learning_rate": 0.0002, "epoch": 5.93967684021544, "step": 82710}, {"loss": 0.6153, "grad_norm": 1.0049666166305542, "learning_rate": 0.0002, "epoch": 5.940394973070018, "step": 82720}, {"loss": 0.619, "grad_norm": 1.0016309022903442, "learning_rate": 0.0002, "epoch": 5.941113105924596, "step": 82730}, {"loss": 0.5796, "grad_norm": 0.7967594861984253, "learning_rate": 0.0002, "epoch": 5.941831238779174, "step": 82740}, {"loss": 0.6418, "grad_norm": 0.8978520631790161, "learning_rate": 0.0002, "epoch": 5.942549371633753, "step": 82750}, {"loss": 0.6234, "grad_norm": 1.0101654529571533, "learning_rate": 0.0002, "epoch": 5.943267504488331, "step": 82760}, {"loss": 0.5813, "grad_norm": 1.1515586376190186, "learning_rate": 0.0002, "epoch": 5.943985637342909, "step": 82770}, {"loss": 0.6031, "grad_norm": 0.8666134476661682, "learning_rate": 0.0002, "epoch": 5.944703770197487, "step": 82780}, {"loss": 0.565, "grad_norm": 1.1365231275558472, "learning_rate": 0.0002, "epoch": 5.945421903052065, "step": 82790}, {"loss": 0.6122, "grad_norm": 1.211229920387268, "learning_rate": 0.0002, "epoch": 5.946140035906643, "step": 82800}, {"loss": 0.5815, "grad_norm": 0.9900869727134705, "learning_rate": 0.0002, "epoch": 5.946858168761221, "step": 82810}, {"loss": 0.5973, "grad_norm": 0.9555928111076355, "learning_rate": 0.0002, "epoch": 5.947576301615799, "step": 82820}, {"loss": 0.5667, "grad_norm": 0.8468470573425293, "learning_rate": 0.0002, "epoch": 5.948294434470377, "step": 82830}, {"loss": 0.5895, "grad_norm": 1.0280319452285767, "learning_rate": 0.0002, "epoch": 5.949012567324955, "step": 82840}, {"loss": 0.5663, "grad_norm": 0.930145800113678, "learning_rate": 0.0002, "epoch": 5.949730700179533, "step": 82850}, {"loss": 0.5482, "grad_norm": 1.0677028894424438, "learning_rate": 0.0002, "epoch": 5.950448833034112, "step": 82860}, {"loss": 0.6009, "grad_norm": 1.2035255432128906, "learning_rate": 0.0002, "epoch": 5.95116696588869, "step": 82870}, {"loss": 0.6207, "grad_norm": 0.897537887096405, "learning_rate": 0.0002, "epoch": 5.951885098743268, "step": 82880}, {"loss": 0.6383, "grad_norm": 1.2858690023422241, "learning_rate": 0.0002, "epoch": 5.952603231597846, "step": 82890}, {"loss": 0.6111, "grad_norm": 1.0300413370132446, "learning_rate": 0.0002, "epoch": 5.953321364452424, "step": 82900}, {"loss": 0.6469, "grad_norm": 0.9873301982879639, "learning_rate": 0.0002, "epoch": 5.954039497307002, "step": 82910}, {"loss": 0.6173, "grad_norm": 1.0315600633621216, "learning_rate": 0.0002, "epoch": 5.95475763016158, "step": 82920}, {"loss": 0.5566, "grad_norm": 1.0631790161132812, "learning_rate": 0.0002, "epoch": 5.955475763016158, "step": 82930}, {"loss": 0.6067, "grad_norm": 1.035544514656067, "learning_rate": 0.0002, "epoch": 5.9561938958707366, "step": 82940}, {"loss": 0.6311, "grad_norm": 1.0162041187286377, "learning_rate": 0.0002, "epoch": 5.956912028725315, "step": 82950}, {"loss": 0.6005, "grad_norm": 0.7858892679214478, "learning_rate": 0.0002, "epoch": 5.957630161579893, "step": 82960}, {"loss": 0.5961, "grad_norm": 1.0359784364700317, "learning_rate": 0.0002, "epoch": 5.958348294434471, "step": 82970}, {"loss": 0.5704, "grad_norm": 1.057173252105713, "learning_rate": 0.0002, "epoch": 5.959066427289049, "step": 82980}, {"loss": 0.6127, "grad_norm": 1.1017464399337769, "learning_rate": 0.0002, "epoch": 5.959784560143627, "step": 82990}, {"loss": 0.5455, "grad_norm": 1.0688945055007935, "learning_rate": 0.0002, "epoch": 5.960502692998205, "step": 83000}, {"loss": 0.5429, "grad_norm": 1.048864483833313, "learning_rate": 0.0002, "epoch": 5.961220825852783, "step": 83010}, {"loss": 0.5559, "grad_norm": 1.057308316230774, "learning_rate": 0.0002, "epoch": 5.961938958707361, "step": 83020}, {"loss": 0.5703, "grad_norm": 0.9014604687690735, "learning_rate": 0.0002, "epoch": 5.962657091561939, "step": 83030}, {"loss": 0.6029, "grad_norm": 0.9899709224700928, "learning_rate": 0.0002, "epoch": 5.963375224416517, "step": 83040}, {"loss": 0.6403, "grad_norm": 1.0675519704818726, "learning_rate": 0.0002, "epoch": 5.9640933572710955, "step": 83050}, {"loss": 0.6016, "grad_norm": 0.9497889876365662, "learning_rate": 0.0002, "epoch": 5.9648114901256735, "step": 83060}, {"loss": 0.5997, "grad_norm": 0.9149549603462219, "learning_rate": 0.0002, "epoch": 5.9655296229802515, "step": 83070}, {"loss": 0.6105, "grad_norm": 1.329373836517334, "learning_rate": 0.0002, "epoch": 5.9662477558348295, "step": 83080}, {"loss": 0.6077, "grad_norm": 1.0731712579727173, "learning_rate": 0.0002, "epoch": 5.9669658886894075, "step": 83090}, {"loss": 0.6269, "grad_norm": 0.9498835802078247, "learning_rate": 0.0002, "epoch": 5.9676840215439855, "step": 83100}, {"loss": 0.6196, "grad_norm": 1.1222829818725586, "learning_rate": 0.0002, "epoch": 5.9684021543985635, "step": 83110}, {"loss": 0.5784, "grad_norm": 0.9923429489135742, "learning_rate": 0.0002, "epoch": 5.9691202872531415, "step": 83120}, {"loss": 0.6223, "grad_norm": 0.9046645164489746, "learning_rate": 0.0002, "epoch": 5.96983842010772, "step": 83130}, {"loss": 0.6252, "grad_norm": 0.9259500503540039, "learning_rate": 0.0002, "epoch": 5.970556552962298, "step": 83140}, {"loss": 0.5849, "grad_norm": 1.0604174137115479, "learning_rate": 0.0002, "epoch": 5.971274685816876, "step": 83150}, {"loss": 0.5789, "grad_norm": 1.0391676425933838, "learning_rate": 0.0002, "epoch": 5.971992818671454, "step": 83160}, {"loss": 0.5861, "grad_norm": 0.8825796246528625, "learning_rate": 0.0002, "epoch": 5.972710951526032, "step": 83170}, {"loss": 0.6164, "grad_norm": 0.9687952399253845, "learning_rate": 0.0002, "epoch": 5.97342908438061, "step": 83180}, {"loss": 0.6127, "grad_norm": 0.9401392340660095, "learning_rate": 0.0002, "epoch": 5.974147217235188, "step": 83190}, {"loss": 0.572, "grad_norm": 1.0526834726333618, "learning_rate": 0.0002, "epoch": 5.974865350089766, "step": 83200}, {"loss": 0.6047, "grad_norm": 1.1882060766220093, "learning_rate": 0.0002, "epoch": 5.975583482944344, "step": 83210}, {"loss": 0.5731, "grad_norm": 0.9182824492454529, "learning_rate": 0.0002, "epoch": 5.976301615798922, "step": 83220}, {"loss": 0.6092, "grad_norm": 1.344875454902649, "learning_rate": 0.0002, "epoch": 5.977019748653501, "step": 83230}, {"loss": 0.6198, "grad_norm": 1.3868434429168701, "learning_rate": 0.0002, "epoch": 5.977737881508079, "step": 83240}, {"loss": 0.6187, "grad_norm": 1.2702280282974243, "learning_rate": 0.0002, "epoch": 5.978456014362657, "step": 83250}, {"loss": 0.6271, "grad_norm": 0.9808234572410583, "learning_rate": 0.0002, "epoch": 5.979174147217235, "step": 83260}, {"loss": 0.6027, "grad_norm": 0.9225142598152161, "learning_rate": 0.0002, "epoch": 5.979892280071813, "step": 83270}, {"loss": 0.626, "grad_norm": 1.1095874309539795, "learning_rate": 0.0002, "epoch": 5.980610412926391, "step": 83280}, {"loss": 0.5994, "grad_norm": 1.2650344371795654, "learning_rate": 0.0002, "epoch": 5.981328545780969, "step": 83290}, {"loss": 0.5808, "grad_norm": 0.8230084180831909, "learning_rate": 0.0002, "epoch": 5.982046678635547, "step": 83300}, {"loss": 0.6399, "grad_norm": 1.171427607536316, "learning_rate": 0.0002, "epoch": 5.982764811490125, "step": 83310}, {"loss": 0.6033, "grad_norm": 0.7458868026733398, "learning_rate": 0.0002, "epoch": 5.983482944344704, "step": 83320}, {"loss": 0.6235, "grad_norm": 0.9238616228103638, "learning_rate": 0.0002, "epoch": 5.984201077199282, "step": 83330}, {"loss": 0.6316, "grad_norm": 1.027495265007019, "learning_rate": 0.0002, "epoch": 5.98491921005386, "step": 83340}, {"loss": 0.6202, "grad_norm": 1.0694037675857544, "learning_rate": 0.0002, "epoch": 5.985637342908438, "step": 83350}, {"loss": 0.5883, "grad_norm": 0.9498767256736755, "learning_rate": 0.0002, "epoch": 5.986355475763016, "step": 83360}, {"loss": 0.6022, "grad_norm": 1.0524284839630127, "learning_rate": 0.0002, "epoch": 5.987073608617594, "step": 83370}, {"loss": 0.5695, "grad_norm": 1.07961905002594, "learning_rate": 0.0002, "epoch": 5.987791741472172, "step": 83380}, {"loss": 0.5835, "grad_norm": 1.1436965465545654, "learning_rate": 0.0002, "epoch": 5.98850987432675, "step": 83390}, {"loss": 0.5835, "grad_norm": 1.2610782384872437, "learning_rate": 0.0002, "epoch": 5.989228007181328, "step": 83400}, {"loss": 0.6018, "grad_norm": 1.1105682849884033, "learning_rate": 0.0002, "epoch": 5.989946140035906, "step": 83410}, {"loss": 0.5989, "grad_norm": 0.9900349378585815, "learning_rate": 0.0002, "epoch": 5.990664272890485, "step": 83420}, {"loss": 0.6492, "grad_norm": 0.8766723275184631, "learning_rate": 0.0002, "epoch": 5.991382405745063, "step": 83430}, {"loss": 0.5944, "grad_norm": 0.9532597661018372, "learning_rate": 0.0002, "epoch": 5.992100538599641, "step": 83440}, {"loss": 0.5903, "grad_norm": 1.016831398010254, "learning_rate": 0.0002, "epoch": 5.992818671454219, "step": 83450}, {"loss": 0.6159, "grad_norm": 0.9884716272354126, "learning_rate": 0.0002, "epoch": 5.993536804308797, "step": 83460}, {"loss": 0.5559, "grad_norm": 0.9415417909622192, "learning_rate": 0.0002, "epoch": 5.994254937163375, "step": 83470}, {"loss": 0.5644, "grad_norm": 0.8629752397537231, "learning_rate": 0.0002, "epoch": 5.994973070017953, "step": 83480}, {"loss": 0.5961, "grad_norm": 1.061378002166748, "learning_rate": 0.0002, "epoch": 5.995691202872531, "step": 83490}, {"loss": 0.6117, "grad_norm": 0.907195508480072, "learning_rate": 0.0002, "epoch": 5.99640933572711, "step": 83500}, {"loss": 0.6584, "grad_norm": 1.023658037185669, "learning_rate": 0.0002, "epoch": 5.997127468581688, "step": 83510}, {"loss": 0.6009, "grad_norm": 0.9893278479576111, "learning_rate": 0.0002, "epoch": 5.997845601436266, "step": 83520}, {"loss": 0.609, "grad_norm": 1.1909127235412598, "learning_rate": 0.0002, "epoch": 5.998563734290844, "step": 83530}, {"loss": 0.5507, "grad_norm": 1.1800892353057861, "learning_rate": 0.0002, "epoch": 5.999281867145422, "step": 83540}, {"loss": 0.605, "grad_norm": 1.0822563171386719, "learning_rate": 0.0002, "epoch": 6.0, "step": 83550}, {"eval_loss": 1.1494214534759521, "eval_runtime": 55.1809, "eval_samples_per_second": 13.284, "eval_steps_per_second": 1.667, "epoch": 6.0, "step": 83550}, {"loss": 0.529, "grad_norm": 0.8760911226272583, "learning_rate": 0.0002, "epoch": 6.000718132854578, "step": 83560}, {"loss": 0.524, "grad_norm": 1.0037305355072021, "learning_rate": 0.0002, "epoch": 6.001436265709156, "step": 83570}, {"loss": 0.5622, "grad_norm": 1.0550320148468018, "learning_rate": 0.0002, "epoch": 6.002154398563734, "step": 83580}, {"loss": 0.5498, "grad_norm": 0.7841113805770874, "learning_rate": 0.0002, "epoch": 6.002872531418312, "step": 83590}, {"loss": 0.5332, "grad_norm": 1.1221094131469727, "learning_rate": 0.0002, "epoch": 6.003590664272891, "step": 83600}, {"loss": 0.5521, "grad_norm": 1.174143671989441, "learning_rate": 0.0002, "epoch": 6.004308797127469, "step": 83610}, {"loss": 0.514, "grad_norm": 1.1316391229629517, "learning_rate": 0.0002, "epoch": 6.005026929982047, "step": 83620}, {"loss": 0.5221, "grad_norm": 0.9318140745162964, "learning_rate": 0.0002, "epoch": 6.005745062836625, "step": 83630}, {"loss": 0.5133, "grad_norm": 1.1589723825454712, "learning_rate": 0.0002, "epoch": 6.006463195691203, "step": 83640}, {"loss": 0.509, "grad_norm": 0.7452214360237122, "learning_rate": 0.0002, "epoch": 6.007181328545781, "step": 83650}, {"loss": 0.5522, "grad_norm": 1.205767035484314, "learning_rate": 0.0002, "epoch": 6.007899461400359, "step": 83660}, {"loss": 0.4888, "grad_norm": 0.8741596341133118, "learning_rate": 0.0002, "epoch": 6.008617594254937, "step": 83670}, {"loss": 0.5653, "grad_norm": 1.152982234954834, "learning_rate": 0.0002, "epoch": 6.009335727109515, "step": 83680}, {"loss": 0.5286, "grad_norm": 1.2438874244689941, "learning_rate": 0.0002, "epoch": 6.010053859964093, "step": 83690}, {"loss": 0.5455, "grad_norm": 1.142795443534851, "learning_rate": 0.0002, "epoch": 6.010771992818672, "step": 83700}, {"loss": 0.5678, "grad_norm": 1.1999919414520264, "learning_rate": 0.0002, "epoch": 6.01149012567325, "step": 83710}, {"loss": 0.5233, "grad_norm": 1.1839698553085327, "learning_rate": 0.0002, "epoch": 6.012208258527828, "step": 83720}, {"loss": 0.5483, "grad_norm": 1.1131623983383179, "learning_rate": 0.0002, "epoch": 6.012926391382406, "step": 83730}, {"loss": 0.5086, "grad_norm": 0.8436203598976135, "learning_rate": 0.0002, "epoch": 6.013644524236984, "step": 83740}, {"loss": 0.4991, "grad_norm": 0.9938826560974121, "learning_rate": 0.0002, "epoch": 6.014362657091562, "step": 83750}, {"loss": 0.5767, "grad_norm": 1.1624900102615356, "learning_rate": 0.0002, "epoch": 6.01508078994614, "step": 83760}, {"loss": 0.5116, "grad_norm": 1.0212476253509521, "learning_rate": 0.0002, "epoch": 6.015798922800718, "step": 83770}, {"loss": 0.5247, "grad_norm": 0.8108501434326172, "learning_rate": 0.0002, "epoch": 6.016517055655296, "step": 83780}, {"loss": 0.5325, "grad_norm": 1.3106935024261475, "learning_rate": 0.0002, "epoch": 6.017235188509875, "step": 83790}, {"loss": 0.5336, "grad_norm": 1.3103147745132446, "learning_rate": 0.0002, "epoch": 6.017953321364453, "step": 83800}, {"loss": 0.5224, "grad_norm": 0.7501855492591858, "learning_rate": 0.0002, "epoch": 6.018671454219031, "step": 83810}, {"loss": 0.5079, "grad_norm": 0.9246482253074646, "learning_rate": 0.0002, "epoch": 6.019389587073609, "step": 83820}, {"loss": 0.5038, "grad_norm": 1.0305052995681763, "learning_rate": 0.0002, "epoch": 6.020107719928187, "step": 83830}, {"loss": 0.5314, "grad_norm": 1.0912569761276245, "learning_rate": 0.0002, "epoch": 6.020825852782765, "step": 83840}, {"loss": 0.5268, "grad_norm": 0.9320057034492493, "learning_rate": 0.0002, "epoch": 6.021543985637343, "step": 83850}, {"loss": 0.4795, "grad_norm": 1.160483479499817, "learning_rate": 0.0002, "epoch": 6.022262118491921, "step": 83860}, {"loss": 0.5014, "grad_norm": 1.0211237668991089, "learning_rate": 0.0002, "epoch": 6.022980251346499, "step": 83870}, {"loss": 0.5515, "grad_norm": 0.8101710081100464, "learning_rate": 0.0002, "epoch": 6.023698384201078, "step": 83880}, {"loss": 0.509, "grad_norm": 1.0671406984329224, "learning_rate": 0.0002, "epoch": 6.024416517055656, "step": 83890}, {"loss": 0.5573, "grad_norm": 1.3084125518798828, "learning_rate": 0.0002, "epoch": 6.025134649910234, "step": 83900}, {"loss": 0.5046, "grad_norm": 1.0144813060760498, "learning_rate": 0.0002, "epoch": 6.025852782764812, "step": 83910}, {"loss": 0.5184, "grad_norm": 1.134848952293396, "learning_rate": 0.0002, "epoch": 6.02657091561939, "step": 83920}, {"loss": 0.5241, "grad_norm": 1.183115005493164, "learning_rate": 0.0002, "epoch": 6.027289048473968, "step": 83930}, {"loss": 0.5097, "grad_norm": 0.961912989616394, "learning_rate": 0.0002, "epoch": 6.028007181328546, "step": 83940}, {"loss": 0.524, "grad_norm": 0.9033881425857544, "learning_rate": 0.0002, "epoch": 6.028725314183124, "step": 83950}, {"loss": 0.4978, "grad_norm": 1.0272901058197021, "learning_rate": 0.0002, "epoch": 6.029443447037702, "step": 83960}, {"loss": 0.5218, "grad_norm": 1.0007939338684082, "learning_rate": 0.0002, "epoch": 6.03016157989228, "step": 83970}, {"loss": 0.5215, "grad_norm": 1.0941389799118042, "learning_rate": 0.0002, "epoch": 6.0308797127468585, "step": 83980}, {"loss": 0.4881, "grad_norm": 0.9068517088890076, "learning_rate": 0.0002, "epoch": 6.0315978456014365, "step": 83990}, {"loss": 0.5352, "grad_norm": 0.8636500835418701, "learning_rate": 0.0002, "epoch": 6.0323159784560145, "step": 84000}, {"loss": 0.5668, "grad_norm": 1.352675437927246, "learning_rate": 0.0002, "epoch": 6.0330341113105925, "step": 84010}, {"loss": 0.5201, "grad_norm": 1.0889637470245361, "learning_rate": 0.0002, "epoch": 6.0337522441651705, "step": 84020}, {"loss": 0.5143, "grad_norm": 0.9063141345977783, "learning_rate": 0.0002, "epoch": 6.0344703770197485, "step": 84030}, {"loss": 0.5089, "grad_norm": 1.317254900932312, "learning_rate": 0.0002, "epoch": 6.0351885098743265, "step": 84040}, {"loss": 0.5198, "grad_norm": 1.1001603603363037, "learning_rate": 0.0002, "epoch": 6.0359066427289045, "step": 84050}, {"loss": 0.5167, "grad_norm": 0.8041839003562927, "learning_rate": 0.0002, "epoch": 6.0366247755834825, "step": 84060}, {"loss": 0.5157, "grad_norm": 1.125082015991211, "learning_rate": 0.0002, "epoch": 6.037342908438061, "step": 84070}, {"loss": 0.5023, "grad_norm": 0.8926277160644531, "learning_rate": 0.0002, "epoch": 6.038061041292639, "step": 84080}, {"loss": 0.4888, "grad_norm": 1.0548304319381714, "learning_rate": 0.0002, "epoch": 6.038779174147217, "step": 84090}, {"loss": 0.5216, "grad_norm": 1.2299435138702393, "learning_rate": 0.0002, "epoch": 6.039497307001795, "step": 84100}, {"loss": 0.5243, "grad_norm": 0.7348281741142273, "learning_rate": 0.0002, "epoch": 6.040215439856373, "step": 84110}, {"loss": 0.5598, "grad_norm": 1.032209873199463, "learning_rate": 0.0002, "epoch": 6.040933572710951, "step": 84120}, {"loss": 0.5448, "grad_norm": 0.925134003162384, "learning_rate": 0.0002, "epoch": 6.041651705565529, "step": 84130}, {"loss": 0.5153, "grad_norm": 1.1078300476074219, "learning_rate": 0.0002, "epoch": 6.042369838420107, "step": 84140}, {"loss": 0.5407, "grad_norm": 0.9045702815055847, "learning_rate": 0.0002, "epoch": 6.043087971274685, "step": 84150}, {"loss": 0.5188, "grad_norm": 0.8836823105812073, "learning_rate": 0.0002, "epoch": 6.043806104129264, "step": 84160}, {"loss": 0.5242, "grad_norm": 0.8083572387695312, "learning_rate": 0.0002, "epoch": 6.044524236983842, "step": 84170}, {"loss": 0.5203, "grad_norm": 0.8744190335273743, "learning_rate": 0.0002, "epoch": 6.04524236983842, "step": 84180}, {"loss": 0.5372, "grad_norm": 1.1944562196731567, "learning_rate": 0.0002, "epoch": 6.045960502692998, "step": 84190}, {"loss": 0.5648, "grad_norm": 1.3782621622085571, "learning_rate": 0.0002, "epoch": 6.046678635547576, "step": 84200}, {"loss": 0.5744, "grad_norm": 1.2800641059875488, "learning_rate": 0.0002, "epoch": 6.047396768402154, "step": 84210}, {"loss": 0.5513, "grad_norm": 1.1035456657409668, "learning_rate": 0.0002, "epoch": 6.048114901256732, "step": 84220}, {"loss": 0.5428, "grad_norm": 1.243274211883545, "learning_rate": 0.0002, "epoch": 6.04883303411131, "step": 84230}, {"loss": 0.55, "grad_norm": 0.8821795582771301, "learning_rate": 0.0002, "epoch": 6.049551166965888, "step": 84240}, {"loss": 0.5563, "grad_norm": 0.8730825185775757, "learning_rate": 0.0002, "epoch": 6.050269299820466, "step": 84250}, {"loss": 0.5755, "grad_norm": 0.9874304533004761, "learning_rate": 0.0002, "epoch": 6.050987432675045, "step": 84260}, {"loss": 0.5261, "grad_norm": 1.3245618343353271, "learning_rate": 0.0002, "epoch": 6.051705565529623, "step": 84270}, {"loss": 0.5172, "grad_norm": 1.04741370677948, "learning_rate": 0.0002, "epoch": 6.052423698384201, "step": 84280}, {"loss": 0.511, "grad_norm": 1.1984949111938477, "learning_rate": 0.0002, "epoch": 6.053141831238779, "step": 84290}, {"loss": 0.5148, "grad_norm": 0.9603039622306824, "learning_rate": 0.0002, "epoch": 6.053859964093357, "step": 84300}, {"loss": 0.54, "grad_norm": 1.178102731704712, "learning_rate": 0.0002, "epoch": 6.054578096947935, "step": 84310}, {"loss": 0.554, "grad_norm": 1.135046124458313, "learning_rate": 0.0002, "epoch": 6.055296229802513, "step": 84320}, {"loss": 0.517, "grad_norm": 0.9682887196540833, "learning_rate": 0.0002, "epoch": 6.056014362657091, "step": 84330}, {"loss": 0.5089, "grad_norm": 0.9676550030708313, "learning_rate": 0.0002, "epoch": 6.056732495511669, "step": 84340}, {"loss": 0.5472, "grad_norm": 1.0987977981567383, "learning_rate": 0.0002, "epoch": 6.057450628366248, "step": 84350}, {"loss": 0.5414, "grad_norm": 0.9808574914932251, "learning_rate": 0.0002, "epoch": 6.058168761220826, "step": 84360}, {"loss": 0.4836, "grad_norm": 1.0585200786590576, "learning_rate": 0.0002, "epoch": 6.058886894075404, "step": 84370}, {"loss": 0.5177, "grad_norm": 0.9592017531394958, "learning_rate": 0.0002, "epoch": 6.059605026929982, "step": 84380}, {"loss": 0.5352, "grad_norm": 0.9652285575866699, "learning_rate": 0.0002, "epoch": 6.06032315978456, "step": 84390}, {"loss": 0.5237, "grad_norm": 1.1223928928375244, "learning_rate": 0.0002, "epoch": 6.061041292639138, "step": 84400}, {"loss": 0.5515, "grad_norm": 1.0554455518722534, "learning_rate": 0.0002, "epoch": 6.061759425493716, "step": 84410}, {"loss": 0.5652, "grad_norm": 1.4566363096237183, "learning_rate": 0.0002, "epoch": 6.062477558348294, "step": 84420}, {"loss": 0.5219, "grad_norm": 1.0793368816375732, "learning_rate": 0.0002, "epoch": 6.063195691202872, "step": 84430}, {"loss": 0.5532, "grad_norm": 1.1032981872558594, "learning_rate": 0.0002, "epoch": 6.063913824057451, "step": 84440}, {"loss": 0.5257, "grad_norm": 1.0701037645339966, "learning_rate": 0.0002, "epoch": 6.064631956912029, "step": 84450}, {"loss": 0.5505, "grad_norm": 0.9359426498413086, "learning_rate": 0.0002, "epoch": 6.065350089766607, "step": 84460}, {"loss": 0.5363, "grad_norm": 1.0277773141860962, "learning_rate": 0.0002, "epoch": 6.066068222621185, "step": 84470}, {"loss": 0.5082, "grad_norm": 1.029319405555725, "learning_rate": 0.0002, "epoch": 6.066786355475763, "step": 84480}, {"loss": 0.4949, "grad_norm": 1.3563756942749023, "learning_rate": 0.0002, "epoch": 6.067504488330341, "step": 84490}, {"loss": 0.55, "grad_norm": 0.9577816128730774, "learning_rate": 0.0002, "epoch": 6.068222621184919, "step": 84500}, {"loss": 0.51, "grad_norm": 0.9856799840927124, "learning_rate": 0.0002, "epoch": 6.068940754039497, "step": 84510}, {"loss": 0.5527, "grad_norm": 1.3285183906555176, "learning_rate": 0.0002, "epoch": 6.069658886894075, "step": 84520}, {"loss": 0.517, "grad_norm": 1.0407335758209229, "learning_rate": 0.0002, "epoch": 6.070377019748653, "step": 84530}, {"loss": 0.5083, "grad_norm": 1.3125360012054443, "learning_rate": 0.0002, "epoch": 6.071095152603232, "step": 84540}, {"loss": 0.4791, "grad_norm": 1.0198888778686523, "learning_rate": 0.0002, "epoch": 6.07181328545781, "step": 84550}, {"loss": 0.5629, "grad_norm": 1.198135256767273, "learning_rate": 0.0002, "epoch": 6.072531418312388, "step": 84560}, {"loss": 0.5213, "grad_norm": 1.1547776460647583, "learning_rate": 0.0002, "epoch": 6.073249551166966, "step": 84570}, {"loss": 0.5503, "grad_norm": 1.1667766571044922, "learning_rate": 0.0002, "epoch": 6.073967684021544, "step": 84580}, {"loss": 0.5465, "grad_norm": 0.945159375667572, "learning_rate": 0.0002, "epoch": 6.074685816876122, "step": 84590}, {"loss": 0.5451, "grad_norm": 1.0362721681594849, "learning_rate": 0.0002, "epoch": 6.0754039497307, "step": 84600}, {"loss": 0.5538, "grad_norm": 1.1442973613739014, "learning_rate": 0.0002, "epoch": 6.076122082585278, "step": 84610}, {"loss": 0.5285, "grad_norm": 1.2077388763427734, "learning_rate": 0.0002, "epoch": 6.076840215439856, "step": 84620}, {"loss": 0.5581, "grad_norm": 1.1404398679733276, "learning_rate": 0.0002, "epoch": 6.077558348294435, "step": 84630}, {"loss": 0.5522, "grad_norm": 1.0291249752044678, "learning_rate": 0.0002, "epoch": 6.078276481149013, "step": 84640}, {"loss": 0.5227, "grad_norm": 1.2045460939407349, "learning_rate": 0.0002, "epoch": 6.078994614003591, "step": 84650}, {"loss": 0.5475, "grad_norm": 0.9492267966270447, "learning_rate": 0.0002, "epoch": 6.079712746858169, "step": 84660}, {"loss": 0.5664, "grad_norm": 0.9108620285987854, "learning_rate": 0.0002, "epoch": 6.080430879712747, "step": 84670}, {"loss": 0.517, "grad_norm": 1.0403251647949219, "learning_rate": 0.0002, "epoch": 6.081149012567325, "step": 84680}, {"loss": 0.5245, "grad_norm": 0.8537648916244507, "learning_rate": 0.0002, "epoch": 6.081867145421903, "step": 84690}, {"loss": 0.5572, "grad_norm": 0.8450568914413452, "learning_rate": 0.0002, "epoch": 6.082585278276481, "step": 84700}, {"loss": 0.5424, "grad_norm": 0.9770439267158508, "learning_rate": 0.0002, "epoch": 6.083303411131059, "step": 84710}, {"loss": 0.5268, "grad_norm": 0.7480165958404541, "learning_rate": 0.0002, "epoch": 6.084021543985638, "step": 84720}, {"loss": 0.5565, "grad_norm": 1.0038665533065796, "learning_rate": 0.0002, "epoch": 6.084739676840216, "step": 84730}, {"loss": 0.5779, "grad_norm": 1.2631266117095947, "learning_rate": 0.0002, "epoch": 6.085457809694794, "step": 84740}, {"loss": 0.5282, "grad_norm": 1.0285290479660034, "learning_rate": 0.0002, "epoch": 6.086175942549372, "step": 84750}, {"loss": 0.5393, "grad_norm": 0.8775458335876465, "learning_rate": 0.0002, "epoch": 6.08689407540395, "step": 84760}, {"loss": 0.5046, "grad_norm": 1.105391263961792, "learning_rate": 0.0002, "epoch": 6.087612208258528, "step": 84770}, {"loss": 0.5349, "grad_norm": 0.9214589595794678, "learning_rate": 0.0002, "epoch": 6.088330341113106, "step": 84780}, {"loss": 0.5076, "grad_norm": 1.1920515298843384, "learning_rate": 0.0002, "epoch": 6.089048473967684, "step": 84790}, {"loss": 0.5481, "grad_norm": 1.0314369201660156, "learning_rate": 0.0002, "epoch": 6.089766606822262, "step": 84800}, {"loss": 0.5553, "grad_norm": 1.1323022842407227, "learning_rate": 0.0002, "epoch": 6.09048473967684, "step": 84810}, {"loss": 0.554, "grad_norm": 0.9882907271385193, "learning_rate": 0.0002, "epoch": 6.091202872531419, "step": 84820}, {"loss": 0.5038, "grad_norm": 0.9372309446334839, "learning_rate": 0.0002, "epoch": 6.091921005385997, "step": 84830}, {"loss": 0.547, "grad_norm": 0.9904384016990662, "learning_rate": 0.0002, "epoch": 6.092639138240575, "step": 84840}, {"loss": 0.6083, "grad_norm": 1.1983239650726318, "learning_rate": 0.0002, "epoch": 6.093357271095153, "step": 84850}, {"loss": 0.5018, "grad_norm": 1.0157414674758911, "learning_rate": 0.0002, "epoch": 6.094075403949731, "step": 84860}, {"loss": 0.5264, "grad_norm": 1.1213963031768799, "learning_rate": 0.0002, "epoch": 6.094793536804309, "step": 84870}, {"loss": 0.5351, "grad_norm": 0.9863889813423157, "learning_rate": 0.0002, "epoch": 6.095511669658887, "step": 84880}, {"loss": 0.5816, "grad_norm": 1.2265585660934448, "learning_rate": 0.0002, "epoch": 6.096229802513465, "step": 84890}, {"loss": 0.5176, "grad_norm": 0.9000206589698792, "learning_rate": 0.0002, "epoch": 6.096947935368043, "step": 84900}, {"loss": 0.5849, "grad_norm": 0.9284350872039795, "learning_rate": 0.0002, "epoch": 6.097666068222622, "step": 84910}, {"loss": 0.535, "grad_norm": 0.8180069923400879, "learning_rate": 0.0002, "epoch": 6.0983842010772, "step": 84920}, {"loss": 0.5082, "grad_norm": 1.0313721895217896, "learning_rate": 0.0002, "epoch": 6.099102333931778, "step": 84930}, {"loss": 0.5233, "grad_norm": 0.9959180355072021, "learning_rate": 0.0002, "epoch": 6.099820466786356, "step": 84940}, {"loss": 0.554, "grad_norm": 1.1720712184906006, "learning_rate": 0.0002, "epoch": 6.100538599640934, "step": 84950}, {"loss": 0.5286, "grad_norm": 1.1033729314804077, "learning_rate": 0.0002, "epoch": 6.101256732495512, "step": 84960}, {"loss": 0.5303, "grad_norm": 1.2325657606124878, "learning_rate": 0.0002, "epoch": 6.10197486535009, "step": 84970}, {"loss": 0.5135, "grad_norm": 1.204935073852539, "learning_rate": 0.0002, "epoch": 6.102692998204668, "step": 84980}, {"loss": 0.4999, "grad_norm": 0.9543479084968567, "learning_rate": 0.0002, "epoch": 6.103411131059246, "step": 84990}, {"loss": 0.5488, "grad_norm": 1.0036866664886475, "learning_rate": 0.0002, "epoch": 6.1041292639138245, "step": 85000}, {"loss": 0.5224, "grad_norm": 1.0862882137298584, "learning_rate": 0.0002, "epoch": 6.1048473967684025, "step": 85010}, {"loss": 0.5399, "grad_norm": 1.052764892578125, "learning_rate": 0.0002, "epoch": 6.1055655296229805, "step": 85020}, {"loss": 0.5517, "grad_norm": 1.1948769092559814, "learning_rate": 0.0002, "epoch": 6.1062836624775585, "step": 85030}, {"loss": 0.5384, "grad_norm": 1.0291588306427002, "learning_rate": 0.0002, "epoch": 6.1070017953321365, "step": 85040}, {"loss": 0.5456, "grad_norm": 1.2162322998046875, "learning_rate": 0.0002, "epoch": 6.1077199281867145, "step": 85050}, {"loss": 0.5143, "grad_norm": 1.2867375612258911, "learning_rate": 0.0002, "epoch": 6.1084380610412925, "step": 85060}, {"loss": 0.5903, "grad_norm": 0.9639427661895752, "learning_rate": 0.0002, "epoch": 6.1091561938958705, "step": 85070}, {"loss": 0.5671, "grad_norm": 1.0775039196014404, "learning_rate": 0.0002, "epoch": 6.1098743267504485, "step": 85080}, {"loss": 0.5223, "grad_norm": 1.0423188209533691, "learning_rate": 0.0002, "epoch": 6.1105924596050265, "step": 85090}, {"loss": 0.5737, "grad_norm": 0.9388473033905029, "learning_rate": 0.0002, "epoch": 6.111310592459605, "step": 85100}, {"loss": 0.5676, "grad_norm": 1.0761773586273193, "learning_rate": 0.0002, "epoch": 6.112028725314183, "step": 85110}, {"loss": 0.5144, "grad_norm": 1.0886104106903076, "learning_rate": 0.0002, "epoch": 6.112746858168761, "step": 85120}, {"loss": 0.4909, "grad_norm": 0.8716141581535339, "learning_rate": 0.0002, "epoch": 6.113464991023339, "step": 85130}, {"loss": 0.5598, "grad_norm": 1.5060595273971558, "learning_rate": 0.0002, "epoch": 6.114183123877917, "step": 85140}, {"loss": 0.5431, "grad_norm": 1.2417129278182983, "learning_rate": 0.0002, "epoch": 6.114901256732495, "step": 85150}, {"loss": 0.5405, "grad_norm": 1.063604712486267, "learning_rate": 0.0002, "epoch": 6.115619389587073, "step": 85160}, {"loss": 0.5832, "grad_norm": 1.1341352462768555, "learning_rate": 0.0002, "epoch": 6.116337522441651, "step": 85170}, {"loss": 0.5708, "grad_norm": 1.011865258216858, "learning_rate": 0.0002, "epoch": 6.117055655296229, "step": 85180}, {"loss": 0.5472, "grad_norm": 1.0746972560882568, "learning_rate": 0.0002, "epoch": 6.117773788150808, "step": 85190}, {"loss": 0.5301, "grad_norm": 0.9522349238395691, "learning_rate": 0.0002, "epoch": 6.118491921005386, "step": 85200}, {"loss": 0.5952, "grad_norm": 1.091785192489624, "learning_rate": 0.0002, "epoch": 6.119210053859964, "step": 85210}, {"loss": 0.5474, "grad_norm": 1.1013420820236206, "learning_rate": 0.0002, "epoch": 6.119928186714542, "step": 85220}, {"loss": 0.5498, "grad_norm": 0.9477053880691528, "learning_rate": 0.0002, "epoch": 6.12064631956912, "step": 85230}, {"loss": 0.5594, "grad_norm": 1.1278045177459717, "learning_rate": 0.0002, "epoch": 6.121364452423698, "step": 85240}, {"loss": 0.5266, "grad_norm": 1.0343154668807983, "learning_rate": 0.0002, "epoch": 6.122082585278276, "step": 85250}, {"loss": 0.5581, "grad_norm": 0.9023236036300659, "learning_rate": 0.0002, "epoch": 6.122800718132854, "step": 85260}, {"loss": 0.5282, "grad_norm": 1.1085705757141113, "learning_rate": 0.0002, "epoch": 6.123518850987432, "step": 85270}, {"loss": 0.5482, "grad_norm": 1.2945729494094849, "learning_rate": 0.0002, "epoch": 6.124236983842011, "step": 85280}, {"loss": 0.5331, "grad_norm": 1.0367915630340576, "learning_rate": 0.0002, "epoch": 6.124955116696589, "step": 85290}, {"loss": 0.5546, "grad_norm": 0.9990636706352234, "learning_rate": 0.0002, "epoch": 6.125673249551167, "step": 85300}, {"loss": 0.5182, "grad_norm": 0.9737518429756165, "learning_rate": 0.0002, "epoch": 6.126391382405745, "step": 85310}, {"loss": 0.5826, "grad_norm": 1.0211181640625, "learning_rate": 0.0002, "epoch": 6.127109515260323, "step": 85320}, {"loss": 0.5153, "grad_norm": 0.9609670042991638, "learning_rate": 0.0002, "epoch": 6.127827648114901, "step": 85330}, {"loss": 0.582, "grad_norm": 1.124629259109497, "learning_rate": 0.0002, "epoch": 6.128545780969479, "step": 85340}, {"loss": 0.56, "grad_norm": 0.9436500072479248, "learning_rate": 0.0002, "epoch": 6.129263913824057, "step": 85350}, {"loss": 0.5568, "grad_norm": 1.3075382709503174, "learning_rate": 0.0002, "epoch": 6.129982046678635, "step": 85360}, {"loss": 0.543, "grad_norm": 0.9185589551925659, "learning_rate": 0.0002, "epoch": 6.130700179533213, "step": 85370}, {"loss": 0.5418, "grad_norm": 1.1051443815231323, "learning_rate": 0.0002, "epoch": 6.131418312387792, "step": 85380}, {"loss": 0.5727, "grad_norm": 1.185263752937317, "learning_rate": 0.0002, "epoch": 6.13213644524237, "step": 85390}, {"loss": 0.5448, "grad_norm": 1.0959895849227905, "learning_rate": 0.0002, "epoch": 6.132854578096948, "step": 85400}, {"loss": 0.4946, "grad_norm": 0.9279834032058716, "learning_rate": 0.0002, "epoch": 6.133572710951526, "step": 85410}, {"loss": 0.5524, "grad_norm": 1.36788010597229, "learning_rate": 0.0002, "epoch": 6.134290843806104, "step": 85420}, {"loss": 0.5122, "grad_norm": 1.0156842470169067, "learning_rate": 0.0002, "epoch": 6.135008976660682, "step": 85430}, {"loss": 0.5287, "grad_norm": 0.9998385906219482, "learning_rate": 0.0002, "epoch": 6.13572710951526, "step": 85440}, {"loss": 0.5205, "grad_norm": 1.21120285987854, "learning_rate": 0.0002, "epoch": 6.136445242369838, "step": 85450}, {"loss": 0.561, "grad_norm": 1.1198976039886475, "learning_rate": 0.0002, "epoch": 6.137163375224416, "step": 85460}, {"loss": 0.5527, "grad_norm": 0.8551197648048401, "learning_rate": 0.0002, "epoch": 6.137881508078995, "step": 85470}, {"loss": 0.5501, "grad_norm": 1.378423810005188, "learning_rate": 0.0002, "epoch": 6.138599640933573, "step": 85480}, {"loss": 0.5584, "grad_norm": 1.0602139234542847, "learning_rate": 0.0002, "epoch": 6.139317773788151, "step": 85490}, {"loss": 0.5656, "grad_norm": 0.9416277408599854, "learning_rate": 0.0002, "epoch": 6.140035906642729, "step": 85500}, {"loss": 0.5461, "grad_norm": 0.9356902241706848, "learning_rate": 0.0002, "epoch": 6.140754039497307, "step": 85510}, {"loss": 0.5405, "grad_norm": 1.1635851860046387, "learning_rate": 0.0002, "epoch": 6.141472172351885, "step": 85520}, {"loss": 0.5026, "grad_norm": 0.7880265712738037, "learning_rate": 0.0002, "epoch": 6.142190305206463, "step": 85530}, {"loss": 0.6164, "grad_norm": 1.0618375539779663, "learning_rate": 0.0002, "epoch": 6.142908438061041, "step": 85540}, {"loss": 0.5202, "grad_norm": 0.8438394665718079, "learning_rate": 0.0002, "epoch": 6.143626570915619, "step": 85550}, {"loss": 0.5651, "grad_norm": 1.0630128383636475, "learning_rate": 0.0002, "epoch": 6.144344703770198, "step": 85560}, {"loss": 0.5128, "grad_norm": 1.027308464050293, "learning_rate": 0.0002, "epoch": 6.145062836624776, "step": 85570}, {"loss": 0.5519, "grad_norm": 1.0832568407058716, "learning_rate": 0.0002, "epoch": 6.145780969479354, "step": 85580}, {"loss": 0.5484, "grad_norm": 0.9134858250617981, "learning_rate": 0.0002, "epoch": 6.146499102333932, "step": 85590}, {"loss": 0.5539, "grad_norm": 1.2738041877746582, "learning_rate": 0.0002, "epoch": 6.14721723518851, "step": 85600}, {"loss": 0.5141, "grad_norm": 0.9961518049240112, "learning_rate": 0.0002, "epoch": 6.147935368043088, "step": 85610}, {"loss": 0.5173, "grad_norm": 0.8851816654205322, "learning_rate": 0.0002, "epoch": 6.148653500897666, "step": 85620}, {"loss": 0.5478, "grad_norm": 0.96479731798172, "learning_rate": 0.0002, "epoch": 6.149371633752244, "step": 85630}, {"loss": 0.536, "grad_norm": 0.903256893157959, "learning_rate": 0.0002, "epoch": 6.150089766606822, "step": 85640}, {"loss": 0.5263, "grad_norm": 1.065151333808899, "learning_rate": 0.0002, "epoch": 6.1508078994614, "step": 85650}, {"loss": 0.5495, "grad_norm": 0.9824285507202148, "learning_rate": 0.0002, "epoch": 6.151526032315979, "step": 85660}, {"loss": 0.5724, "grad_norm": 1.1620386838912964, "learning_rate": 0.0002, "epoch": 6.152244165170557, "step": 85670}, {"loss": 0.5706, "grad_norm": 1.134757161140442, "learning_rate": 0.0002, "epoch": 6.152962298025135, "step": 85680}, {"loss": 0.5532, "grad_norm": 1.165537714958191, "learning_rate": 0.0002, "epoch": 6.153680430879713, "step": 85690}, {"loss": 0.5293, "grad_norm": 0.9486454129219055, "learning_rate": 0.0002, "epoch": 6.154398563734291, "step": 85700}, {"loss": 0.5219, "grad_norm": 0.9379110932350159, "learning_rate": 0.0002, "epoch": 6.155116696588869, "step": 85710}, {"loss": 0.5623, "grad_norm": 1.0051493644714355, "learning_rate": 0.0002, "epoch": 6.155834829443447, "step": 85720}, {"loss": 0.5389, "grad_norm": 0.9311991333961487, "learning_rate": 0.0002, "epoch": 6.156552962298025, "step": 85730}, {"loss": 0.5365, "grad_norm": 1.2071181535720825, "learning_rate": 0.0002, "epoch": 6.157271095152603, "step": 85740}, {"loss": 0.6081, "grad_norm": 1.2609243392944336, "learning_rate": 0.0002, "epoch": 6.157989228007182, "step": 85750}, {"loss": 0.5238, "grad_norm": 1.0485966205596924, "learning_rate": 0.0002, "epoch": 6.15870736086176, "step": 85760}, {"loss": 0.5221, "grad_norm": 0.9949250817298889, "learning_rate": 0.0002, "epoch": 6.159425493716338, "step": 85770}, {"loss": 0.5401, "grad_norm": 0.8191118836402893, "learning_rate": 0.0002, "epoch": 6.160143626570916, "step": 85780}, {"loss": 0.5283, "grad_norm": 0.96427983045578, "learning_rate": 0.0002, "epoch": 6.160861759425494, "step": 85790}, {"loss": 0.5597, "grad_norm": 1.0336496829986572, "learning_rate": 0.0002, "epoch": 6.161579892280072, "step": 85800}, {"loss": 0.5069, "grad_norm": 1.0699222087860107, "learning_rate": 0.0002, "epoch": 6.16229802513465, "step": 85810}, {"loss": 0.5433, "grad_norm": 1.2340054512023926, "learning_rate": 0.0002, "epoch": 6.163016157989228, "step": 85820}, {"loss": 0.5233, "grad_norm": 0.981848955154419, "learning_rate": 0.0002, "epoch": 6.163734290843806, "step": 85830}, {"loss": 0.5393, "grad_norm": 1.2059850692749023, "learning_rate": 0.0002, "epoch": 6.164452423698384, "step": 85840}, {"loss": 0.5358, "grad_norm": 1.0239924192428589, "learning_rate": 0.0002, "epoch": 6.165170556552963, "step": 85850}, {"loss": 0.5715, "grad_norm": 0.8601624369621277, "learning_rate": 0.0002, "epoch": 6.165888689407541, "step": 85860}, {"loss": 0.5442, "grad_norm": 1.1900125741958618, "learning_rate": 0.0002, "epoch": 6.166606822262119, "step": 85870}, {"loss": 0.5193, "grad_norm": 0.9747354388237, "learning_rate": 0.0002, "epoch": 6.167324955116697, "step": 85880}, {"loss": 0.5226, "grad_norm": 1.1277778148651123, "learning_rate": 0.0002, "epoch": 6.168043087971275, "step": 85890}, {"loss": 0.5554, "grad_norm": 1.1270111799240112, "learning_rate": 0.0002, "epoch": 6.168761220825853, "step": 85900}, {"loss": 0.5345, "grad_norm": 1.1610701084136963, "learning_rate": 0.0002, "epoch": 6.169479353680431, "step": 85910}, {"loss": 0.5524, "grad_norm": 0.873607873916626, "learning_rate": 0.0002, "epoch": 6.170197486535009, "step": 85920}, {"loss": 0.5021, "grad_norm": 1.040145993232727, "learning_rate": 0.0002, "epoch": 6.170915619389587, "step": 85930}, {"loss": 0.5072, "grad_norm": 1.0139122009277344, "learning_rate": 0.0002, "epoch": 6.1716337522441655, "step": 85940}, {"loss": 0.5674, "grad_norm": 1.0575451850891113, "learning_rate": 0.0002, "epoch": 6.1723518850987436, "step": 85950}, {"loss": 0.5517, "grad_norm": 1.100884199142456, "learning_rate": 0.0002, "epoch": 6.1730700179533216, "step": 85960}, {"loss": 0.5165, "grad_norm": 1.1741244792938232, "learning_rate": 0.0002, "epoch": 6.1737881508078996, "step": 85970}, {"loss": 0.526, "grad_norm": 0.9446555376052856, "learning_rate": 0.0002, "epoch": 6.174506283662478, "step": 85980}, {"loss": 0.493, "grad_norm": 0.9297952055931091, "learning_rate": 0.0002, "epoch": 6.175224416517056, "step": 85990}, {"loss": 0.5059, "grad_norm": 1.196361780166626, "learning_rate": 0.0002, "epoch": 6.175942549371634, "step": 86000}, {"loss": 0.5541, "grad_norm": 1.0719913244247437, "learning_rate": 0.0002, "epoch": 6.176660682226212, "step": 86010}, {"loss": 0.5613, "grad_norm": 1.0942085981369019, "learning_rate": 0.0002, "epoch": 6.17737881508079, "step": 86020}, {"loss": 0.5632, "grad_norm": 0.8989787697792053, "learning_rate": 0.0002, "epoch": 6.1780969479353685, "step": 86030}, {"loss": 0.5778, "grad_norm": 1.071344017982483, "learning_rate": 0.0002, "epoch": 6.1788150807899465, "step": 86040}, {"loss": 0.4885, "grad_norm": 0.9686782360076904, "learning_rate": 0.0002, "epoch": 6.1795332136445245, "step": 86050}, {"loss": 0.5727, "grad_norm": 1.0769884586334229, "learning_rate": 0.0002, "epoch": 6.1802513464991025, "step": 86060}, {"loss": 0.5356, "grad_norm": 0.9761241674423218, "learning_rate": 0.0002, "epoch": 6.1809694793536805, "step": 86070}, {"loss": 0.5736, "grad_norm": 1.0531808137893677, "learning_rate": 0.0002, "epoch": 6.1816876122082585, "step": 86080}, {"loss": 0.5899, "grad_norm": 1.0523570775985718, "learning_rate": 0.0002, "epoch": 6.1824057450628365, "step": 86090}, {"loss": 0.5941, "grad_norm": 1.2155946493148804, "learning_rate": 0.0002, "epoch": 6.1831238779174145, "step": 86100}, {"loss": 0.5315, "grad_norm": 1.1012920141220093, "learning_rate": 0.0002, "epoch": 6.1838420107719925, "step": 86110}, {"loss": 0.555, "grad_norm": 0.8764983415603638, "learning_rate": 0.0002, "epoch": 6.184560143626571, "step": 86120}, {"loss": 0.5219, "grad_norm": 0.950320303440094, "learning_rate": 0.0002, "epoch": 6.185278276481149, "step": 86130}, {"loss": 0.5275, "grad_norm": 1.1183594465255737, "learning_rate": 0.0002, "epoch": 6.185996409335727, "step": 86140}, {"loss": 0.4953, "grad_norm": 1.1919164657592773, "learning_rate": 0.0002, "epoch": 6.186714542190305, "step": 86150}, {"loss": 0.5121, "grad_norm": 1.1478904485702515, "learning_rate": 0.0002, "epoch": 6.187432675044883, "step": 86160}, {"loss": 0.5482, "grad_norm": 1.0764135122299194, "learning_rate": 0.0002, "epoch": 6.188150807899461, "step": 86170}, {"loss": 0.5448, "grad_norm": 1.195090889930725, "learning_rate": 0.0002, "epoch": 6.188868940754039, "step": 86180}, {"loss": 0.5461, "grad_norm": 1.089442253112793, "learning_rate": 0.0002, "epoch": 6.189587073608617, "step": 86190}, {"loss": 0.5415, "grad_norm": 0.9705546498298645, "learning_rate": 0.0002, "epoch": 6.190305206463195, "step": 86200}, {"loss": 0.5575, "grad_norm": 1.164642333984375, "learning_rate": 0.0002, "epoch": 6.191023339317773, "step": 86210}, {"loss": 0.5354, "grad_norm": 0.9551387429237366, "learning_rate": 0.0002, "epoch": 6.191741472172352, "step": 86220}, {"loss": 0.5237, "grad_norm": 1.0483227968215942, "learning_rate": 0.0002, "epoch": 6.19245960502693, "step": 86230}, {"loss": 0.5519, "grad_norm": 1.0068920850753784, "learning_rate": 0.0002, "epoch": 6.193177737881508, "step": 86240}, {"loss": 0.6136, "grad_norm": 1.142656683921814, "learning_rate": 0.0002, "epoch": 6.193895870736086, "step": 86250}, {"loss": 0.5722, "grad_norm": 1.1186467409133911, "learning_rate": 0.0002, "epoch": 6.194614003590664, "step": 86260}, {"loss": 0.5721, "grad_norm": 1.1664706468582153, "learning_rate": 0.0002, "epoch": 6.195332136445242, "step": 86270}, {"loss": 0.5397, "grad_norm": 1.2658511400222778, "learning_rate": 0.0002, "epoch": 6.19605026929982, "step": 86280}, {"loss": 0.5593, "grad_norm": 1.122759222984314, "learning_rate": 0.0002, "epoch": 6.196768402154398, "step": 86290}, {"loss": 0.5874, "grad_norm": 1.1611319780349731, "learning_rate": 0.0002, "epoch": 6.197486535008976, "step": 86300}, {"loss": 0.531, "grad_norm": 1.0476176738739014, "learning_rate": 0.0002, "epoch": 6.198204667863555, "step": 86310}, {"loss": 0.5455, "grad_norm": 1.2284801006317139, "learning_rate": 0.0002, "epoch": 6.198922800718133, "step": 86320}, {"loss": 0.5052, "grad_norm": 1.1340757608413696, "learning_rate": 0.0002, "epoch": 6.199640933572711, "step": 86330}, {"loss": 0.5651, "grad_norm": 1.045088768005371, "learning_rate": 0.0002, "epoch": 6.200359066427289, "step": 86340}, {"loss": 0.5606, "grad_norm": 1.1200770139694214, "learning_rate": 0.0002, "epoch": 6.201077199281867, "step": 86350}, {"loss": 0.5554, "grad_norm": 1.1879554986953735, "learning_rate": 0.0002, "epoch": 6.201795332136445, "step": 86360}, {"loss": 0.5442, "grad_norm": 1.1146271228790283, "learning_rate": 0.0002, "epoch": 6.202513464991023, "step": 86370}, {"loss": 0.5472, "grad_norm": 0.8934822678565979, "learning_rate": 0.0002, "epoch": 6.203231597845601, "step": 86380}, {"loss": 0.5663, "grad_norm": 1.21973717212677, "learning_rate": 0.0002, "epoch": 6.203949730700179, "step": 86390}, {"loss": 0.5351, "grad_norm": 0.9424970746040344, "learning_rate": 0.0002, "epoch": 6.204667863554757, "step": 86400}, {"loss": 0.5291, "grad_norm": 1.0036219358444214, "learning_rate": 0.0002, "epoch": 6.205385996409336, "step": 86410}, {"loss": 0.5117, "grad_norm": 0.9319575428962708, "learning_rate": 0.0002, "epoch": 6.206104129263914, "step": 86420}, {"loss": 0.5608, "grad_norm": 1.0548789501190186, "learning_rate": 0.0002, "epoch": 6.206822262118492, "step": 86430}, {"loss": 0.5556, "grad_norm": 0.9361019730567932, "learning_rate": 0.0002, "epoch": 6.20754039497307, "step": 86440}, {"loss": 0.5765, "grad_norm": 0.9350554347038269, "learning_rate": 0.0002, "epoch": 6.208258527827648, "step": 86450}, {"loss": 0.5616, "grad_norm": 1.291595458984375, "learning_rate": 0.0002, "epoch": 6.208976660682226, "step": 86460}, {"loss": 0.584, "grad_norm": 1.0414642095565796, "learning_rate": 0.0002, "epoch": 6.209694793536804, "step": 86470}, {"loss": 0.5282, "grad_norm": 1.1983444690704346, "learning_rate": 0.0002, "epoch": 6.210412926391382, "step": 86480}, {"loss": 0.493, "grad_norm": 0.9444540739059448, "learning_rate": 0.0002, "epoch": 6.21113105924596, "step": 86490}, {"loss": 0.5533, "grad_norm": 1.072526216506958, "learning_rate": 0.0002, "epoch": 6.211849192100539, "step": 86500}, {"loss": 0.5509, "grad_norm": 1.0109381675720215, "learning_rate": 0.0002, "epoch": 6.212567324955117, "step": 86510}, {"loss": 0.5244, "grad_norm": 1.1661816835403442, "learning_rate": 0.0002, "epoch": 6.213285457809695, "step": 86520}, {"loss": 0.5192, "grad_norm": 1.0434976816177368, "learning_rate": 0.0002, "epoch": 6.214003590664273, "step": 86530}, {"loss": 0.5732, "grad_norm": 1.1290796995162964, "learning_rate": 0.0002, "epoch": 6.214721723518851, "step": 86540}, {"loss": 0.5276, "grad_norm": 0.746512234210968, "learning_rate": 0.0002, "epoch": 6.215439856373429, "step": 86550}, {"loss": 0.5412, "grad_norm": 1.0346291065216064, "learning_rate": 0.0002, "epoch": 6.216157989228007, "step": 86560}, {"loss": 0.5452, "grad_norm": 1.2428497076034546, "learning_rate": 0.0002, "epoch": 6.216876122082585, "step": 86570}, {"loss": 0.4906, "grad_norm": 1.0040535926818848, "learning_rate": 0.0002, "epoch": 6.217594254937163, "step": 86580}, {"loss": 0.5368, "grad_norm": 0.9300616383552551, "learning_rate": 0.0002, "epoch": 6.218312387791742, "step": 86590}, {"loss": 0.51, "grad_norm": 1.0006635189056396, "learning_rate": 0.0002, "epoch": 6.21903052064632, "step": 86600}, {"loss": 0.573, "grad_norm": 1.1402281522750854, "learning_rate": 0.0002, "epoch": 6.219748653500898, "step": 86610}, {"loss": 0.5324, "grad_norm": 1.1543347835540771, "learning_rate": 0.0002, "epoch": 6.220466786355476, "step": 86620}, {"loss": 0.4904, "grad_norm": 1.1074384450912476, "learning_rate": 0.0002, "epoch": 6.221184919210054, "step": 86630}, {"loss": 0.5291, "grad_norm": 0.9032864570617676, "learning_rate": 0.0002, "epoch": 6.221903052064632, "step": 86640}, {"loss": 0.5651, "grad_norm": 1.094516396522522, "learning_rate": 0.0002, "epoch": 6.22262118491921, "step": 86650}, {"loss": 0.5723, "grad_norm": 1.2248685359954834, "learning_rate": 0.0002, "epoch": 6.223339317773788, "step": 86660}, {"loss": 0.5873, "grad_norm": 1.0211371183395386, "learning_rate": 0.0002, "epoch": 6.224057450628366, "step": 86670}, {"loss": 0.5459, "grad_norm": 1.0956611633300781, "learning_rate": 0.0002, "epoch": 6.224775583482945, "step": 86680}, {"loss": 0.5615, "grad_norm": 1.1494320631027222, "learning_rate": 0.0002, "epoch": 6.225493716337523, "step": 86690}, {"loss": 0.4953, "grad_norm": 0.968108594417572, "learning_rate": 0.0002, "epoch": 6.226211849192101, "step": 86700}, {"loss": 0.5349, "grad_norm": 1.376665711402893, "learning_rate": 0.0002, "epoch": 6.226929982046679, "step": 86710}, {"loss": 0.5285, "grad_norm": 1.2121574878692627, "learning_rate": 0.0002, "epoch": 6.227648114901257, "step": 86720}, {"loss": 0.534, "grad_norm": 1.001272439956665, "learning_rate": 0.0002, "epoch": 6.228366247755835, "step": 86730}, {"loss": 0.5684, "grad_norm": 0.9023162722587585, "learning_rate": 0.0002, "epoch": 6.229084380610413, "step": 86740}, {"loss": 0.5304, "grad_norm": 1.2660632133483887, "learning_rate": 0.0002, "epoch": 6.229802513464991, "step": 86750}, {"loss": 0.52, "grad_norm": 1.0549668073654175, "learning_rate": 0.0002, "epoch": 6.230520646319569, "step": 86760}, {"loss": 0.5268, "grad_norm": 1.0364645719528198, "learning_rate": 0.0002, "epoch": 6.231238779174147, "step": 86770}, {"loss": 0.5543, "grad_norm": 1.2197567224502563, "learning_rate": 0.0002, "epoch": 6.231956912028726, "step": 86780}, {"loss": 0.5675, "grad_norm": 0.8866947889328003, "learning_rate": 0.0002, "epoch": 6.232675044883304, "step": 86790}, {"loss": 0.5666, "grad_norm": 1.1795434951782227, "learning_rate": 0.0002, "epoch": 6.233393177737882, "step": 86800}, {"loss": 0.5309, "grad_norm": 1.0882378816604614, "learning_rate": 0.0002, "epoch": 6.23411131059246, "step": 86810}, {"loss": 0.5903, "grad_norm": 1.181888222694397, "learning_rate": 0.0002, "epoch": 6.234829443447038, "step": 86820}, {"loss": 0.5847, "grad_norm": 1.031209111213684, "learning_rate": 0.0002, "epoch": 6.235547576301616, "step": 86830}, {"loss": 0.5283, "grad_norm": 1.2889492511749268, "learning_rate": 0.0002, "epoch": 6.236265709156194, "step": 86840}, {"loss": 0.5409, "grad_norm": 0.874086856842041, "learning_rate": 0.0002, "epoch": 6.236983842010772, "step": 86850}, {"loss": 0.546, "grad_norm": 1.1912312507629395, "learning_rate": 0.0002, "epoch": 6.23770197486535, "step": 86860}, {"loss": 0.5446, "grad_norm": 1.0963071584701538, "learning_rate": 0.0002, "epoch": 6.238420107719929, "step": 86870}, {"loss": 0.5917, "grad_norm": 1.028746485710144, "learning_rate": 0.0002, "epoch": 6.239138240574507, "step": 86880}, {"loss": 0.5851, "grad_norm": 1.0736430883407593, "learning_rate": 0.0002, "epoch": 6.239856373429085, "step": 86890}, {"loss": 0.5773, "grad_norm": 0.9559927582740784, "learning_rate": 0.0002, "epoch": 6.240574506283663, "step": 86900}, {"loss": 0.5694, "grad_norm": 0.9696667790412903, "learning_rate": 0.0002, "epoch": 6.241292639138241, "step": 86910}, {"loss": 0.564, "grad_norm": 1.0710713863372803, "learning_rate": 0.0002, "epoch": 6.242010771992819, "step": 86920}, {"loss": 0.5557, "grad_norm": 1.0459970235824585, "learning_rate": 0.0002, "epoch": 6.242728904847397, "step": 86930}, {"loss": 0.5845, "grad_norm": 1.212083339691162, "learning_rate": 0.0002, "epoch": 6.243447037701975, "step": 86940}, {"loss": 0.5503, "grad_norm": 1.0369303226470947, "learning_rate": 0.0002, "epoch": 6.244165170556553, "step": 86950}, {"loss": 0.5468, "grad_norm": 1.180519700050354, "learning_rate": 0.0002, "epoch": 6.244883303411131, "step": 86960}, {"loss": 0.5969, "grad_norm": 1.0670114755630493, "learning_rate": 0.0002, "epoch": 6.2456014362657095, "step": 86970}, {"loss": 0.5712, "grad_norm": 1.072209119796753, "learning_rate": 0.0002, "epoch": 6.2463195691202875, "step": 86980}, {"loss": 0.5554, "grad_norm": 0.9642090201377869, "learning_rate": 0.0002, "epoch": 6.2470377019748655, "step": 86990}, {"loss": 0.5351, "grad_norm": 1.077467918395996, "learning_rate": 0.0002, "epoch": 6.2477558348294435, "step": 87000}, {"loss": 0.5434, "grad_norm": 1.1081476211547852, "learning_rate": 0.0002, "epoch": 6.2484739676840215, "step": 87010}, {"loss": 0.5692, "grad_norm": 0.8815084099769592, "learning_rate": 0.0002, "epoch": 6.2491921005385995, "step": 87020}, {"loss": 0.5649, "grad_norm": 0.8562555313110352, "learning_rate": 0.0002, "epoch": 6.2499102333931775, "step": 87030}, {"loss": 0.5305, "grad_norm": 0.8729159235954285, "learning_rate": 0.0002, "epoch": 6.2506283662477555, "step": 87040}, {"loss": 0.5179, "grad_norm": 1.005082368850708, "learning_rate": 0.0002, "epoch": 6.2513464991023335, "step": 87050}, {"loss": 0.5326, "grad_norm": 1.3991386890411377, "learning_rate": 0.0002, "epoch": 6.252064631956912, "step": 87060}, {"loss": 0.563, "grad_norm": 1.090180516242981, "learning_rate": 0.0002, "epoch": 6.25278276481149, "step": 87070}, {"loss": 0.6074, "grad_norm": 1.08149254322052, "learning_rate": 0.0002, "epoch": 6.253500897666068, "step": 87080}, {"loss": 0.5663, "grad_norm": 1.1021103858947754, "learning_rate": 0.0002, "epoch": 6.254219030520646, "step": 87090}, {"loss": 0.5744, "grad_norm": 1.2393771409988403, "learning_rate": 0.0002, "epoch": 6.254937163375224, "step": 87100}, {"loss": 0.5379, "grad_norm": 0.9702037572860718, "learning_rate": 0.0002, "epoch": 6.255655296229802, "step": 87110}, {"loss": 0.546, "grad_norm": 1.203088641166687, "learning_rate": 0.0002, "epoch": 6.25637342908438, "step": 87120}, {"loss": 0.5315, "grad_norm": 0.9722330570220947, "learning_rate": 0.0002, "epoch": 6.257091561938958, "step": 87130}, {"loss": 0.5864, "grad_norm": 0.9802384376525879, "learning_rate": 0.0002, "epoch": 6.257809694793536, "step": 87140}, {"loss": 0.5751, "grad_norm": 0.9991751909255981, "learning_rate": 0.0002, "epoch": 6.258527827648114, "step": 87150}, {"loss": 0.5574, "grad_norm": 1.1102324724197388, "learning_rate": 0.0002, "epoch": 6.259245960502693, "step": 87160}, {"loss": 0.545, "grad_norm": 1.1357909440994263, "learning_rate": 0.0002, "epoch": 6.259964093357271, "step": 87170}, {"loss": 0.5066, "grad_norm": 1.1128548383712769, "learning_rate": 0.0002, "epoch": 6.260682226211849, "step": 87180}, {"loss": 0.6394, "grad_norm": 1.1135061979293823, "learning_rate": 0.0002, "epoch": 6.261400359066427, "step": 87190}, {"loss": 0.4923, "grad_norm": 0.9545563459396362, "learning_rate": 0.0002, "epoch": 6.262118491921005, "step": 87200}, {"loss": 0.555, "grad_norm": 1.3011159896850586, "learning_rate": 0.0002, "epoch": 6.262836624775583, "step": 87210}, {"loss": 0.5517, "grad_norm": 1.217691421508789, "learning_rate": 0.0002, "epoch": 6.263554757630161, "step": 87220}, {"loss": 0.5316, "grad_norm": 0.9615218043327332, "learning_rate": 0.0002, "epoch": 6.264272890484739, "step": 87230}, {"loss": 0.5702, "grad_norm": 0.9935932159423828, "learning_rate": 0.0002, "epoch": 6.264991023339318, "step": 87240}, {"loss": 0.5313, "grad_norm": 1.01247239112854, "learning_rate": 0.0002, "epoch": 6.265709156193896, "step": 87250}, {"loss": 0.5723, "grad_norm": 1.1960358619689941, "learning_rate": 0.0002, "epoch": 6.266427289048474, "step": 87260}, {"loss": 0.5381, "grad_norm": 1.053942322731018, "learning_rate": 0.0002, "epoch": 6.267145421903052, "step": 87270}, {"loss": 0.5679, "grad_norm": 1.2450612783432007, "learning_rate": 0.0002, "epoch": 6.26786355475763, "step": 87280}, {"loss": 0.5149, "grad_norm": 0.7816058397293091, "learning_rate": 0.0002, "epoch": 6.268581687612208, "step": 87290}, {"loss": 0.549, "grad_norm": 1.014817237854004, "learning_rate": 0.0002, "epoch": 6.269299820466786, "step": 87300}, {"loss": 0.5787, "grad_norm": 1.1871070861816406, "learning_rate": 0.0002, "epoch": 6.270017953321364, "step": 87310}, {"loss": 0.5103, "grad_norm": 1.0170562267303467, "learning_rate": 0.0002, "epoch": 6.270736086175942, "step": 87320}, {"loss": 0.555, "grad_norm": 1.216288685798645, "learning_rate": 0.0002, "epoch": 6.27145421903052, "step": 87330}, {"loss": 0.5648, "grad_norm": 0.8846057653427124, "learning_rate": 0.0002, "epoch": 6.272172351885099, "step": 87340}, {"loss": 0.5781, "grad_norm": 1.181233286857605, "learning_rate": 0.0002, "epoch": 6.272890484739677, "step": 87350}, {"loss": 0.5359, "grad_norm": 1.0051873922348022, "learning_rate": 0.0002, "epoch": 6.273608617594255, "step": 87360}, {"loss": 0.5674, "grad_norm": 1.1179516315460205, "learning_rate": 0.0002, "epoch": 6.274326750448833, "step": 87370}, {"loss": 0.5935, "grad_norm": 1.0118002891540527, "learning_rate": 0.0002, "epoch": 6.275044883303411, "step": 87380}, {"loss": 0.5789, "grad_norm": 1.0948026180267334, "learning_rate": 0.0002, "epoch": 6.275763016157989, "step": 87390}, {"loss": 0.5277, "grad_norm": 1.0836515426635742, "learning_rate": 0.0002, "epoch": 6.276481149012567, "step": 87400}, {"loss": 0.5663, "grad_norm": 0.9548853039741516, "learning_rate": 0.0002, "epoch": 6.277199281867145, "step": 87410}, {"loss": 0.58, "grad_norm": 1.2531564235687256, "learning_rate": 0.0002, "epoch": 6.277917414721723, "step": 87420}, {"loss": 0.5651, "grad_norm": 1.010250449180603, "learning_rate": 0.0002, "epoch": 6.278635547576302, "step": 87430}, {"loss": 0.6222, "grad_norm": 1.3306254148483276, "learning_rate": 0.0002, "epoch": 6.27935368043088, "step": 87440}, {"loss": 0.5397, "grad_norm": 0.9485062956809998, "learning_rate": 0.0002, "epoch": 6.280071813285458, "step": 87450}, {"loss": 0.5441, "grad_norm": 0.9938563704490662, "learning_rate": 0.0002, "epoch": 6.280789946140036, "step": 87460}, {"loss": 0.5546, "grad_norm": 1.1747362613677979, "learning_rate": 0.0002, "epoch": 6.281508078994614, "step": 87470}, {"loss": 0.566, "grad_norm": 1.1712254285812378, "learning_rate": 0.0002, "epoch": 6.282226211849192, "step": 87480}, {"loss": 0.6165, "grad_norm": 1.1453865766525269, "learning_rate": 0.0002, "epoch": 6.28294434470377, "step": 87490}, {"loss": 0.535, "grad_norm": 0.974902331829071, "learning_rate": 0.0002, "epoch": 6.283662477558348, "step": 87500}, {"loss": 0.5354, "grad_norm": 1.1181912422180176, "learning_rate": 0.0002, "epoch": 6.284380610412926, "step": 87510}, {"loss": 0.5276, "grad_norm": 1.047453761100769, "learning_rate": 0.0002, "epoch": 6.285098743267504, "step": 87520}, {"loss": 0.5689, "grad_norm": 1.185815453529358, "learning_rate": 0.0002, "epoch": 6.285816876122083, "step": 87530}, {"loss": 0.5531, "grad_norm": 1.1126786470413208, "learning_rate": 0.0002, "epoch": 6.286535008976661, "step": 87540}, {"loss": 0.5619, "grad_norm": 1.0931676626205444, "learning_rate": 0.0002, "epoch": 6.287253141831239, "step": 87550}, {"loss": 0.5625, "grad_norm": 0.9930597543716431, "learning_rate": 0.0002, "epoch": 6.287971274685817, "step": 87560}, {"loss": 0.5637, "grad_norm": 0.9909583926200867, "learning_rate": 0.0002, "epoch": 6.288689407540395, "step": 87570}, {"loss": 0.5462, "grad_norm": 1.3766822814941406, "learning_rate": 0.0002, "epoch": 6.289407540394973, "step": 87580}, {"loss": 0.5544, "grad_norm": 1.0137864351272583, "learning_rate": 0.0002, "epoch": 6.290125673249551, "step": 87590}, {"loss": 0.5678, "grad_norm": 0.8761594295501709, "learning_rate": 0.0002, "epoch": 6.290843806104129, "step": 87600}, {"loss": 0.5393, "grad_norm": 1.155881404876709, "learning_rate": 0.0002, "epoch": 6.291561938958707, "step": 87610}, {"loss": 0.5606, "grad_norm": 0.9972963333129883, "learning_rate": 0.0002, "epoch": 6.292280071813286, "step": 87620}, {"loss": 0.5776, "grad_norm": 1.195021152496338, "learning_rate": 0.0002, "epoch": 6.292998204667864, "step": 87630}, {"loss": 0.5567, "grad_norm": 0.9872829914093018, "learning_rate": 0.0002, "epoch": 6.293716337522442, "step": 87640}, {"loss": 0.588, "grad_norm": 1.3643794059753418, "learning_rate": 0.0002, "epoch": 6.29443447037702, "step": 87650}, {"loss": 0.5181, "grad_norm": 0.9389668703079224, "learning_rate": 0.0002, "epoch": 6.295152603231598, "step": 87660}, {"loss": 0.5284, "grad_norm": 1.379319429397583, "learning_rate": 0.0002, "epoch": 6.295870736086176, "step": 87670}, {"loss": 0.5091, "grad_norm": 1.1253849267959595, "learning_rate": 0.0002, "epoch": 6.296588868940754, "step": 87680}, {"loss": 0.5383, "grad_norm": 1.2402328252792358, "learning_rate": 0.0002, "epoch": 6.297307001795332, "step": 87690}, {"loss": 0.5803, "grad_norm": 1.085004210472107, "learning_rate": 0.0002, "epoch": 6.29802513464991, "step": 87700}, {"loss": 0.5705, "grad_norm": 1.0939021110534668, "learning_rate": 0.0002, "epoch": 6.298743267504488, "step": 87710}, {"loss": 0.5391, "grad_norm": 1.0350301265716553, "learning_rate": 0.0002, "epoch": 6.299461400359067, "step": 87720}, {"loss": 0.5269, "grad_norm": 0.9862944483757019, "learning_rate": 0.0002, "epoch": 6.300179533213645, "step": 87730}, {"loss": 0.5378, "grad_norm": 0.990942656993866, "learning_rate": 0.0002, "epoch": 6.300897666068223, "step": 87740}, {"loss": 0.4843, "grad_norm": 0.9287887215614319, "learning_rate": 0.0002, "epoch": 6.301615798922801, "step": 87750}, {"loss": 0.5602, "grad_norm": 1.225714087486267, "learning_rate": 0.0002, "epoch": 6.302333931777379, "step": 87760}, {"loss": 0.5513, "grad_norm": 1.0181951522827148, "learning_rate": 0.0002, "epoch": 6.303052064631957, "step": 87770}, {"loss": 0.563, "grad_norm": 0.9808282256126404, "learning_rate": 0.0002, "epoch": 6.303770197486535, "step": 87780}, {"loss": 0.5738, "grad_norm": 1.1413379907608032, "learning_rate": 0.0002, "epoch": 6.304488330341113, "step": 87790}, {"loss": 0.5548, "grad_norm": 1.1188091039657593, "learning_rate": 0.0002, "epoch": 6.305206463195692, "step": 87800}, {"loss": 0.497, "grad_norm": 1.297154188156128, "learning_rate": 0.0002, "epoch": 6.30592459605027, "step": 87810}, {"loss": 0.5481, "grad_norm": 1.0723271369934082, "learning_rate": 0.0002, "epoch": 6.306642728904848, "step": 87820}, {"loss": 0.567, "grad_norm": 1.067265510559082, "learning_rate": 0.0002, "epoch": 6.307360861759426, "step": 87830}, {"loss": 0.5893, "grad_norm": 1.01328444480896, "learning_rate": 0.0002, "epoch": 6.308078994614004, "step": 87840}, {"loss": 0.5169, "grad_norm": 1.092671513557434, "learning_rate": 0.0002, "epoch": 6.308797127468582, "step": 87850}, {"loss": 0.6079, "grad_norm": 1.168721079826355, "learning_rate": 0.0002, "epoch": 6.30951526032316, "step": 87860}, {"loss": 0.5355, "grad_norm": 1.165495753288269, "learning_rate": 0.0002, "epoch": 6.310233393177738, "step": 87870}, {"loss": 0.6015, "grad_norm": 1.10816490650177, "learning_rate": 0.0002, "epoch": 6.310951526032316, "step": 87880}, {"loss": 0.5259, "grad_norm": 0.9667611718177795, "learning_rate": 0.0002, "epoch": 6.311669658886894, "step": 87890}, {"loss": 0.589, "grad_norm": 1.22564697265625, "learning_rate": 0.0002, "epoch": 6.312387791741473, "step": 87900}, {"loss": 0.5574, "grad_norm": 1.1156506538391113, "learning_rate": 0.0002, "epoch": 6.313105924596051, "step": 87910}, {"loss": 0.5324, "grad_norm": 1.03804349899292, "learning_rate": 0.0002, "epoch": 6.313824057450629, "step": 87920}, {"loss": 0.5577, "grad_norm": 0.9424136281013489, "learning_rate": 0.0002, "epoch": 6.314542190305207, "step": 87930}, {"loss": 0.5654, "grad_norm": 1.2243257761001587, "learning_rate": 0.0002, "epoch": 6.315260323159785, "step": 87940}, {"loss": 0.5884, "grad_norm": 1.0930471420288086, "learning_rate": 0.0002, "epoch": 6.315978456014363, "step": 87950}, {"loss": 0.5227, "grad_norm": 1.096875548362732, "learning_rate": 0.0002, "epoch": 6.316696588868941, "step": 87960}, {"loss": 0.5514, "grad_norm": 1.0606242418289185, "learning_rate": 0.0002, "epoch": 6.317414721723519, "step": 87970}, {"loss": 0.5409, "grad_norm": 0.8657089471817017, "learning_rate": 0.0002, "epoch": 6.318132854578097, "step": 87980}, {"loss": 0.5496, "grad_norm": 0.9751629829406738, "learning_rate": 0.0002, "epoch": 6.3188509874326755, "step": 87990}, {"loss": 0.5677, "grad_norm": 1.0751961469650269, "learning_rate": 0.0002, "epoch": 6.3195691202872535, "step": 88000}, {"loss": 0.5408, "grad_norm": 1.0679874420166016, "learning_rate": 0.0002, "epoch": 6.3202872531418315, "step": 88010}, {"loss": 0.5695, "grad_norm": 1.4102588891983032, "learning_rate": 0.0002, "epoch": 6.3210053859964095, "step": 88020}, {"loss": 0.5744, "grad_norm": 0.8747799396514893, "learning_rate": 0.0002, "epoch": 6.3217235188509875, "step": 88030}, {"loss": 0.6024, "grad_norm": 1.0866155624389648, "learning_rate": 0.0002, "epoch": 6.3224416517055655, "step": 88040}, {"loss": 0.5964, "grad_norm": 1.2255747318267822, "learning_rate": 0.0002, "epoch": 6.3231597845601435, "step": 88050}, {"loss": 0.5536, "grad_norm": 1.031588077545166, "learning_rate": 0.0002, "epoch": 6.3238779174147215, "step": 88060}, {"loss": 0.5631, "grad_norm": 1.1994154453277588, "learning_rate": 0.0002, "epoch": 6.3245960502692995, "step": 88070}, {"loss": 0.5644, "grad_norm": 0.9172461032867432, "learning_rate": 0.0002, "epoch": 6.3253141831238775, "step": 88080}, {"loss": 0.5739, "grad_norm": 0.8762667775154114, "learning_rate": 0.0002, "epoch": 6.326032315978456, "step": 88090}, {"loss": 0.558, "grad_norm": 1.166225790977478, "learning_rate": 0.0002, "epoch": 6.326750448833034, "step": 88100}, {"loss": 0.5688, "grad_norm": 1.014858365058899, "learning_rate": 0.0002, "epoch": 6.327468581687612, "step": 88110}, {"loss": 0.5783, "grad_norm": 1.1080266237258911, "learning_rate": 0.0002, "epoch": 6.32818671454219, "step": 88120}, {"loss": 0.6146, "grad_norm": 0.9775443077087402, "learning_rate": 0.0002, "epoch": 6.328904847396768, "step": 88130}, {"loss": 0.5658, "grad_norm": 0.9032314419746399, "learning_rate": 0.0002, "epoch": 6.329622980251346, "step": 88140}, {"loss": 0.5139, "grad_norm": 1.0170091390609741, "learning_rate": 0.0002, "epoch": 6.330341113105924, "step": 88150}, {"loss": 0.5155, "grad_norm": 0.9412024617195129, "learning_rate": 0.0002, "epoch": 6.331059245960502, "step": 88160}, {"loss": 0.5454, "grad_norm": 0.9090259671211243, "learning_rate": 0.0002, "epoch": 6.33177737881508, "step": 88170}, {"loss": 0.5564, "grad_norm": 0.8896998167037964, "learning_rate": 0.0002, "epoch": 6.332495511669659, "step": 88180}, {"loss": 0.5536, "grad_norm": 1.1648571491241455, "learning_rate": 0.0002, "epoch": 6.333213644524237, "step": 88190}, {"loss": 0.5439, "grad_norm": 1.13261878490448, "learning_rate": 0.0002, "epoch": 6.333931777378815, "step": 88200}, {"loss": 0.5367, "grad_norm": 0.9561943411827087, "learning_rate": 0.0002, "epoch": 6.334649910233393, "step": 88210}, {"loss": 0.548, "grad_norm": 1.3076379299163818, "learning_rate": 0.0002, "epoch": 6.335368043087971, "step": 88220}, {"loss": 0.5706, "grad_norm": 0.9788665175437927, "learning_rate": 0.0002, "epoch": 6.336086175942549, "step": 88230}, {"loss": 0.5439, "grad_norm": 1.2843645811080933, "learning_rate": 0.0002, "epoch": 6.336804308797127, "step": 88240}, {"loss": 0.5174, "grad_norm": 1.1531981229782104, "learning_rate": 0.0002, "epoch": 6.337522441651705, "step": 88250}, {"loss": 0.5746, "grad_norm": 1.1946183443069458, "learning_rate": 0.0002, "epoch": 6.338240574506283, "step": 88260}, {"loss": 0.5778, "grad_norm": 1.1190218925476074, "learning_rate": 0.0002, "epoch": 6.338958707360861, "step": 88270}, {"loss": 0.5175, "grad_norm": 1.0605140924453735, "learning_rate": 0.0002, "epoch": 6.33967684021544, "step": 88280}, {"loss": 0.5435, "grad_norm": 1.0237314701080322, "learning_rate": 0.0002, "epoch": 6.340394973070018, "step": 88290}, {"loss": 0.5595, "grad_norm": 1.1268457174301147, "learning_rate": 0.0002, "epoch": 6.341113105924596, "step": 88300}, {"loss": 0.5706, "grad_norm": 1.0750062465667725, "learning_rate": 0.0002, "epoch": 6.341831238779174, "step": 88310}, {"loss": 0.5334, "grad_norm": 1.2356536388397217, "learning_rate": 0.0002, "epoch": 6.342549371633752, "step": 88320}, {"loss": 0.5143, "grad_norm": 1.0375114679336548, "learning_rate": 0.0002, "epoch": 6.34326750448833, "step": 88330}, {"loss": 0.5583, "grad_norm": 1.063388705253601, "learning_rate": 0.0002, "epoch": 6.343985637342908, "step": 88340}, {"loss": 0.5301, "grad_norm": 0.9182760715484619, "learning_rate": 0.0002, "epoch": 6.344703770197486, "step": 88350}, {"loss": 0.5896, "grad_norm": 0.9787414073944092, "learning_rate": 0.0002, "epoch": 6.345421903052064, "step": 88360}, {"loss": 0.579, "grad_norm": 1.295432448387146, "learning_rate": 0.0002, "epoch": 6.346140035906643, "step": 88370}, {"loss": 0.5737, "grad_norm": 0.9269146919250488, "learning_rate": 0.0002, "epoch": 6.346858168761221, "step": 88380}, {"loss": 0.5551, "grad_norm": 0.9076777696609497, "learning_rate": 0.0002, "epoch": 6.347576301615799, "step": 88390}, {"loss": 0.5542, "grad_norm": 1.1186468601226807, "learning_rate": 0.0002, "epoch": 6.348294434470377, "step": 88400}, {"loss": 0.5806, "grad_norm": 1.1021504402160645, "learning_rate": 0.0002, "epoch": 6.349012567324955, "step": 88410}, {"loss": 0.5717, "grad_norm": 1.2439358234405518, "learning_rate": 0.0002, "epoch": 6.349730700179533, "step": 88420}, {"loss": 0.5384, "grad_norm": 1.1228888034820557, "learning_rate": 0.0002, "epoch": 6.350448833034111, "step": 88430}, {"loss": 0.5634, "grad_norm": 1.226587176322937, "learning_rate": 0.0002, "epoch": 6.351166965888689, "step": 88440}, {"loss": 0.5676, "grad_norm": 1.2813525199890137, "learning_rate": 0.0002, "epoch": 6.351885098743267, "step": 88450}, {"loss": 0.544, "grad_norm": 1.411405086517334, "learning_rate": 0.0002, "epoch": 6.352603231597846, "step": 88460}, {"loss": 0.5349, "grad_norm": 1.3659696578979492, "learning_rate": 0.0002, "epoch": 6.353321364452424, "step": 88470}, {"loss": 0.5453, "grad_norm": 1.1398485898971558, "learning_rate": 0.0002, "epoch": 6.354039497307002, "step": 88480}, {"loss": 0.5628, "grad_norm": 1.2088590860366821, "learning_rate": 0.0002, "epoch": 6.35475763016158, "step": 88490}, {"loss": 0.4978, "grad_norm": 0.9191108345985413, "learning_rate": 0.0002, "epoch": 6.355475763016158, "step": 88500}, {"loss": 0.5091, "grad_norm": 0.9855144619941711, "learning_rate": 0.0002, "epoch": 6.356193895870736, "step": 88510}, {"loss": 0.5635, "grad_norm": 1.0576577186584473, "learning_rate": 0.0002, "epoch": 6.356912028725314, "step": 88520}, {"loss": 0.5081, "grad_norm": 1.0213230848312378, "learning_rate": 0.0002, "epoch": 6.357630161579892, "step": 88530}, {"loss": 0.6141, "grad_norm": 1.2086849212646484, "learning_rate": 0.0002, "epoch": 6.35834829443447, "step": 88540}, {"loss": 0.5477, "grad_norm": 1.05294930934906, "learning_rate": 0.0002, "epoch": 6.359066427289049, "step": 88550}, {"loss": 0.5991, "grad_norm": 1.1798300743103027, "learning_rate": 0.0002, "epoch": 6.359784560143627, "step": 88560}, {"loss": 0.551, "grad_norm": 1.088749885559082, "learning_rate": 0.0002, "epoch": 6.360502692998205, "step": 88570}, {"loss": 0.5299, "grad_norm": 1.0071386098861694, "learning_rate": 0.0002, "epoch": 6.361220825852783, "step": 88580}, {"loss": 0.5691, "grad_norm": 1.2080132961273193, "learning_rate": 0.0002, "epoch": 6.361938958707361, "step": 88590}, {"loss": 0.5637, "grad_norm": 0.9784366488456726, "learning_rate": 0.0002, "epoch": 6.362657091561939, "step": 88600}, {"loss": 0.5499, "grad_norm": 0.9475322961807251, "learning_rate": 0.0002, "epoch": 6.363375224416517, "step": 88610}, {"loss": 0.5467, "grad_norm": 0.8267584443092346, "learning_rate": 0.0002, "epoch": 6.364093357271095, "step": 88620}, {"loss": 0.591, "grad_norm": 1.05606210231781, "learning_rate": 0.0002, "epoch": 6.364811490125673, "step": 88630}, {"loss": 0.5859, "grad_norm": 1.2059335708618164, "learning_rate": 0.0002, "epoch": 6.365529622980251, "step": 88640}, {"loss": 0.5992, "grad_norm": 1.1900845766067505, "learning_rate": 0.0002, "epoch": 6.36624775583483, "step": 88650}, {"loss": 0.5618, "grad_norm": 1.0271358489990234, "learning_rate": 0.0002, "epoch": 6.366965888689408, "step": 88660}, {"loss": 0.5363, "grad_norm": 1.1839162111282349, "learning_rate": 0.0002, "epoch": 6.367684021543986, "step": 88670}, {"loss": 0.5508, "grad_norm": 0.9042913317680359, "learning_rate": 0.0002, "epoch": 6.368402154398564, "step": 88680}, {"loss": 0.5253, "grad_norm": 1.079893946647644, "learning_rate": 0.0002, "epoch": 6.369120287253142, "step": 88690}, {"loss": 0.5414, "grad_norm": 1.0999629497528076, "learning_rate": 0.0002, "epoch": 6.36983842010772, "step": 88700}, {"loss": 0.57, "grad_norm": 1.0618157386779785, "learning_rate": 0.0002, "epoch": 6.370556552962298, "step": 88710}, {"loss": 0.5559, "grad_norm": 0.9567645788192749, "learning_rate": 0.0002, "epoch": 6.371274685816876, "step": 88720}, {"loss": 0.5547, "grad_norm": 1.0342025756835938, "learning_rate": 0.0002, "epoch": 6.371992818671454, "step": 88730}, {"loss": 0.5302, "grad_norm": 1.0789190530776978, "learning_rate": 0.0002, "epoch": 6.372710951526033, "step": 88740}, {"loss": 0.5394, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 6.373429084380611, "step": 88750}, {"loss": 0.5739, "grad_norm": 0.9103280305862427, "learning_rate": 0.0002, "epoch": 6.374147217235189, "step": 88760}, {"loss": 0.5313, "grad_norm": 0.9856002330780029, "learning_rate": 0.0002, "epoch": 6.374865350089767, "step": 88770}, {"loss": 0.5482, "grad_norm": 1.1801226139068604, "learning_rate": 0.0002, "epoch": 6.375583482944345, "step": 88780}, {"loss": 0.584, "grad_norm": 0.9876776933670044, "learning_rate": 0.0002, "epoch": 6.376301615798923, "step": 88790}, {"loss": 0.5633, "grad_norm": 1.0169886350631714, "learning_rate": 0.0002, "epoch": 6.377019748653501, "step": 88800}, {"loss": 0.5525, "grad_norm": 1.0118076801300049, "learning_rate": 0.0002, "epoch": 6.377737881508079, "step": 88810}, {"loss": 0.5205, "grad_norm": 1.0641456842422485, "learning_rate": 0.0002, "epoch": 6.378456014362657, "step": 88820}, {"loss": 0.5816, "grad_norm": 1.1138534545898438, "learning_rate": 0.0002, "epoch": 6.379174147217235, "step": 88830}, {"loss": 0.5979, "grad_norm": 1.1518962383270264, "learning_rate": 0.0002, "epoch": 6.379892280071814, "step": 88840}, {"loss": 0.5644, "grad_norm": 1.3662128448486328, "learning_rate": 0.0002, "epoch": 6.380610412926392, "step": 88850}, {"loss": 0.5662, "grad_norm": 0.9544311761856079, "learning_rate": 0.0002, "epoch": 6.38132854578097, "step": 88860}, {"loss": 0.5721, "grad_norm": 0.9747556447982788, "learning_rate": 0.0002, "epoch": 6.382046678635548, "step": 88870}, {"loss": 0.5458, "grad_norm": 1.1651948690414429, "learning_rate": 0.0002, "epoch": 6.382764811490126, "step": 88880}, {"loss": 0.5644, "grad_norm": 1.4048396348953247, "learning_rate": 0.0002, "epoch": 6.383482944344704, "step": 88890}, {"loss": 0.5686, "grad_norm": 1.1144068241119385, "learning_rate": 0.0002, "epoch": 6.384201077199282, "step": 88900}, {"loss": 0.5572, "grad_norm": 1.2978034019470215, "learning_rate": 0.0002, "epoch": 6.38491921005386, "step": 88910}, {"loss": 0.5279, "grad_norm": 1.1776132583618164, "learning_rate": 0.0002, "epoch": 6.385637342908438, "step": 88920}, {"loss": 0.5844, "grad_norm": 0.8849034905433655, "learning_rate": 0.0002, "epoch": 6.3863554757630165, "step": 88930}, {"loss": 0.5566, "grad_norm": 1.1207057237625122, "learning_rate": 0.0002, "epoch": 6.3870736086175945, "step": 88940}, {"loss": 0.5889, "grad_norm": 0.9364172220230103, "learning_rate": 0.0002, "epoch": 6.3877917414721725, "step": 88950}, {"loss": 0.5788, "grad_norm": 1.1731317043304443, "learning_rate": 0.0002, "epoch": 6.3885098743267505, "step": 88960}, {"loss": 0.5743, "grad_norm": 1.0411573648452759, "learning_rate": 0.0002, "epoch": 6.3892280071813286, "step": 88970}, {"loss": 0.557, "grad_norm": 1.0817447900772095, "learning_rate": 0.0002, "epoch": 6.3899461400359066, "step": 88980}, {"loss": 0.5715, "grad_norm": 1.0037593841552734, "learning_rate": 0.0002, "epoch": 6.3906642728904846, "step": 88990}, {"loss": 0.562, "grad_norm": 1.1684437990188599, "learning_rate": 0.0002, "epoch": 6.391382405745063, "step": 89000}, {"loss": 0.5544, "grad_norm": 1.0237388610839844, "learning_rate": 0.0002, "epoch": 6.392100538599641, "step": 89010}, {"loss": 0.607, "grad_norm": 1.24791419506073, "learning_rate": 0.0002, "epoch": 6.392818671454219, "step": 89020}, {"loss": 0.5139, "grad_norm": 0.842664897441864, "learning_rate": 0.0002, "epoch": 6.3935368043087974, "step": 89030}, {"loss": 0.5606, "grad_norm": 1.1692326068878174, "learning_rate": 0.0002, "epoch": 6.3942549371633755, "step": 89040}, {"loss": 0.5656, "grad_norm": 1.0786939859390259, "learning_rate": 0.0002, "epoch": 6.3949730700179535, "step": 89050}, {"loss": 0.5901, "grad_norm": 1.1315077543258667, "learning_rate": 0.0002, "epoch": 6.3956912028725315, "step": 89060}, {"loss": 0.5642, "grad_norm": 0.9949214458465576, "learning_rate": 0.0002, "epoch": 6.3964093357271095, "step": 89070}, {"loss": 0.5367, "grad_norm": 1.0302025079727173, "learning_rate": 0.0002, "epoch": 6.3971274685816875, "step": 89080}, {"loss": 0.5453, "grad_norm": 0.9664030075073242, "learning_rate": 0.0002, "epoch": 6.3978456014362655, "step": 89090}, {"loss": 0.5496, "grad_norm": 1.1251037120819092, "learning_rate": 0.0002, "epoch": 6.3985637342908435, "step": 89100}, {"loss": 0.56, "grad_norm": 1.1103272438049316, "learning_rate": 0.0002, "epoch": 6.399281867145422, "step": 89110}, {"loss": 0.5703, "grad_norm": 0.9192888736724854, "learning_rate": 0.0002, "epoch": 6.4, "step": 89120}, {"loss": 0.5436, "grad_norm": 1.027806043624878, "learning_rate": 0.0002, "epoch": 6.400718132854578, "step": 89130}, {"loss": 0.608, "grad_norm": 1.1219452619552612, "learning_rate": 0.0002, "epoch": 6.401436265709156, "step": 89140}, {"loss": 0.5488, "grad_norm": 1.1703979969024658, "learning_rate": 0.0002, "epoch": 6.402154398563734, "step": 89150}, {"loss": 0.5251, "grad_norm": 1.025874376296997, "learning_rate": 0.0002, "epoch": 6.402872531418312, "step": 89160}, {"loss": 0.5476, "grad_norm": 1.070225715637207, "learning_rate": 0.0002, "epoch": 6.40359066427289, "step": 89170}, {"loss": 0.5539, "grad_norm": 1.1915208101272583, "learning_rate": 0.0002, "epoch": 6.404308797127468, "step": 89180}, {"loss": 0.5504, "grad_norm": 1.1954079866409302, "learning_rate": 0.0002, "epoch": 6.405026929982046, "step": 89190}, {"loss": 0.558, "grad_norm": 1.035910964012146, "learning_rate": 0.0002, "epoch": 6.405745062836624, "step": 89200}, {"loss": 0.586, "grad_norm": 1.1363351345062256, "learning_rate": 0.0002, "epoch": 6.406463195691203, "step": 89210}, {"loss": 0.5594, "grad_norm": 1.2086843252182007, "learning_rate": 0.0002, "epoch": 6.407181328545781, "step": 89220}, {"loss": 0.5928, "grad_norm": 1.3492387533187866, "learning_rate": 0.0002, "epoch": 6.407899461400359, "step": 89230}, {"loss": 0.5679, "grad_norm": 0.8746330738067627, "learning_rate": 0.0002, "epoch": 6.408617594254937, "step": 89240}, {"loss": 0.5818, "grad_norm": 1.0165427923202515, "learning_rate": 0.0002, "epoch": 6.409335727109515, "step": 89250}, {"loss": 0.5437, "grad_norm": 1.0314675569534302, "learning_rate": 0.0002, "epoch": 6.410053859964093, "step": 89260}, {"loss": 0.5741, "grad_norm": 1.2128242254257202, "learning_rate": 0.0002, "epoch": 6.410771992818671, "step": 89270}, {"loss": 0.59, "grad_norm": 0.9496060013771057, "learning_rate": 0.0002, "epoch": 6.411490125673249, "step": 89280}, {"loss": 0.5949, "grad_norm": 1.1838264465332031, "learning_rate": 0.0002, "epoch": 6.412208258527827, "step": 89290}, {"loss": 0.543, "grad_norm": 1.1700918674468994, "learning_rate": 0.0002, "epoch": 6.412926391382406, "step": 89300}, {"loss": 0.5185, "grad_norm": 1.2102051973342896, "learning_rate": 0.0002, "epoch": 6.413644524236984, "step": 89310}, {"loss": 0.5516, "grad_norm": 0.9485594630241394, "learning_rate": 0.0002, "epoch": 6.414362657091562, "step": 89320}, {"loss": 0.5516, "grad_norm": 1.041496753692627, "learning_rate": 0.0002, "epoch": 6.41508078994614, "step": 89330}, {"loss": 0.545, "grad_norm": 1.0785019397735596, "learning_rate": 0.0002, "epoch": 6.415798922800718, "step": 89340}, {"loss": 0.5553, "grad_norm": 0.9527593851089478, "learning_rate": 0.0002, "epoch": 6.416517055655296, "step": 89350}, {"loss": 0.5624, "grad_norm": 0.9879035353660583, "learning_rate": 0.0002, "epoch": 6.417235188509874, "step": 89360}, {"loss": 0.5614, "grad_norm": 0.9143751263618469, "learning_rate": 0.0002, "epoch": 6.417953321364452, "step": 89370}, {"loss": 0.6034, "grad_norm": 0.9145408272743225, "learning_rate": 0.0002, "epoch": 6.41867145421903, "step": 89380}, {"loss": 0.5355, "grad_norm": 1.0128624439239502, "learning_rate": 0.0002, "epoch": 6.419389587073608, "step": 89390}, {"loss": 0.5581, "grad_norm": 0.9454543590545654, "learning_rate": 0.0002, "epoch": 6.420107719928187, "step": 89400}, {"loss": 0.6192, "grad_norm": 1.0659215450286865, "learning_rate": 0.0002, "epoch": 6.420825852782765, "step": 89410}, {"loss": 0.5645, "grad_norm": 1.1622642278671265, "learning_rate": 0.0002, "epoch": 6.421543985637343, "step": 89420}, {"loss": 0.5868, "grad_norm": 0.9805575013160706, "learning_rate": 0.0002, "epoch": 6.422262118491921, "step": 89430}, {"loss": 0.5743, "grad_norm": 0.871903121471405, "learning_rate": 0.0002, "epoch": 6.422980251346499, "step": 89440}, {"loss": 0.5537, "grad_norm": 0.992355227470398, "learning_rate": 0.0002, "epoch": 6.423698384201077, "step": 89450}, {"loss": 0.5453, "grad_norm": 1.4055765867233276, "learning_rate": 0.0002, "epoch": 6.424416517055655, "step": 89460}, {"loss": 0.5472, "grad_norm": 1.0447325706481934, "learning_rate": 0.0002, "epoch": 6.425134649910233, "step": 89470}, {"loss": 0.5782, "grad_norm": 1.1162594556808472, "learning_rate": 0.0002, "epoch": 6.425852782764811, "step": 89480}, {"loss": 0.5644, "grad_norm": 1.0767697095870972, "learning_rate": 0.0002, "epoch": 6.42657091561939, "step": 89490}, {"loss": 0.5828, "grad_norm": 1.2253819704055786, "learning_rate": 0.0002, "epoch": 6.427289048473968, "step": 89500}, {"loss": 0.6364, "grad_norm": 1.0623136758804321, "learning_rate": 0.0002, "epoch": 6.428007181328546, "step": 89510}, {"loss": 0.5714, "grad_norm": 1.3238742351531982, "learning_rate": 0.0002, "epoch": 6.428725314183124, "step": 89520}, {"loss": 0.5303, "grad_norm": 1.2376916408538818, "learning_rate": 0.0002, "epoch": 6.429443447037702, "step": 89530}, {"loss": 0.5281, "grad_norm": 1.197453260421753, "learning_rate": 0.0002, "epoch": 6.43016157989228, "step": 89540}, {"loss": 0.5624, "grad_norm": 1.0539700984954834, "learning_rate": 0.0002, "epoch": 6.430879712746858, "step": 89550}, {"loss": 0.5327, "grad_norm": 1.0659761428833008, "learning_rate": 0.0002, "epoch": 6.431597845601436, "step": 89560}, {"loss": 0.5295, "grad_norm": 1.0186322927474976, "learning_rate": 0.0002, "epoch": 6.432315978456014, "step": 89570}, {"loss": 0.5333, "grad_norm": 1.232337474822998, "learning_rate": 0.0002, "epoch": 6.433034111310592, "step": 89580}, {"loss": 0.559, "grad_norm": 1.1512500047683716, "learning_rate": 0.0002, "epoch": 6.433752244165171, "step": 89590}, {"loss": 0.5223, "grad_norm": 1.0068955421447754, "learning_rate": 0.0002, "epoch": 6.434470377019749, "step": 89600}, {"loss": 0.5363, "grad_norm": 1.1359424591064453, "learning_rate": 0.0002, "epoch": 6.435188509874327, "step": 89610}, {"loss": 0.553, "grad_norm": 1.4369128942489624, "learning_rate": 0.0002, "epoch": 6.435906642728905, "step": 89620}, {"loss": 0.5427, "grad_norm": 0.9382445216178894, "learning_rate": 0.0002, "epoch": 6.436624775583483, "step": 89630}, {"loss": 0.5781, "grad_norm": 0.8607977628707886, "learning_rate": 0.0002, "epoch": 6.437342908438061, "step": 89640}, {"loss": 0.5283, "grad_norm": 0.9498276114463806, "learning_rate": 0.0002, "epoch": 6.438061041292639, "step": 89650}, {"loss": 0.554, "grad_norm": 1.4109948873519897, "learning_rate": 0.0002, "epoch": 6.438779174147217, "step": 89660}, {"loss": 0.5723, "grad_norm": 1.106134295463562, "learning_rate": 0.0002, "epoch": 6.439497307001796, "step": 89670}, {"loss": 0.5782, "grad_norm": 1.128963589668274, "learning_rate": 0.0002, "epoch": 6.440215439856374, "step": 89680}, {"loss": 0.5638, "grad_norm": 1.1370604038238525, "learning_rate": 0.0002, "epoch": 6.440933572710952, "step": 89690}, {"loss": 0.5459, "grad_norm": 1.380922794342041, "learning_rate": 0.0002, "epoch": 6.44165170556553, "step": 89700}, {"loss": 0.5775, "grad_norm": 0.9597383737564087, "learning_rate": 0.0002, "epoch": 6.442369838420108, "step": 89710}, {"loss": 0.5504, "grad_norm": 1.1491756439208984, "learning_rate": 0.0002, "epoch": 6.443087971274686, "step": 89720}, {"loss": 0.5584, "grad_norm": 1.1313573122024536, "learning_rate": 0.0002, "epoch": 6.443806104129264, "step": 89730}, {"loss": 0.5743, "grad_norm": 1.1081135272979736, "learning_rate": 0.0002, "epoch": 6.444524236983842, "step": 89740}, {"loss": 0.5648, "grad_norm": 1.0297505855560303, "learning_rate": 0.0002, "epoch": 6.44524236983842, "step": 89750}, {"loss": 0.5743, "grad_norm": 1.0534520149230957, "learning_rate": 0.0002, "epoch": 6.445960502692998, "step": 89760}, {"loss": 0.5503, "grad_norm": 1.218485951423645, "learning_rate": 0.0002, "epoch": 6.446678635547577, "step": 89770}, {"loss": 0.543, "grad_norm": 0.9336987137794495, "learning_rate": 0.0002, "epoch": 6.447396768402155, "step": 89780}, {"loss": 0.5485, "grad_norm": 0.9854478240013123, "learning_rate": 0.0002, "epoch": 6.448114901256733, "step": 89790}, {"loss": 0.5718, "grad_norm": 1.1036708354949951, "learning_rate": 0.0002, "epoch": 6.448833034111311, "step": 89800}, {"loss": 0.5362, "grad_norm": 1.2220509052276611, "learning_rate": 0.0002, "epoch": 6.449551166965889, "step": 89810}, {"loss": 0.577, "grad_norm": 0.9955567121505737, "learning_rate": 0.0002, "epoch": 6.450269299820467, "step": 89820}, {"loss": 0.5458, "grad_norm": 1.0350912809371948, "learning_rate": 0.0002, "epoch": 6.450987432675045, "step": 89830}, {"loss": 0.5957, "grad_norm": 1.156080722808838, "learning_rate": 0.0002, "epoch": 6.451705565529623, "step": 89840}, {"loss": 0.588, "grad_norm": 0.8922389149665833, "learning_rate": 0.0002, "epoch": 6.452423698384201, "step": 89850}, {"loss": 0.5676, "grad_norm": 0.9318913221359253, "learning_rate": 0.0002, "epoch": 6.45314183123878, "step": 89860}, {"loss": 0.5778, "grad_norm": 0.9420756101608276, "learning_rate": 0.0002, "epoch": 6.453859964093358, "step": 89870}, {"loss": 0.5624, "grad_norm": 1.0303646326065063, "learning_rate": 0.0002, "epoch": 6.454578096947936, "step": 89880}, {"loss": 0.5304, "grad_norm": 1.070806860923767, "learning_rate": 0.0002, "epoch": 6.455296229802514, "step": 89890}, {"loss": 0.5682, "grad_norm": 0.9890686869621277, "learning_rate": 0.0002, "epoch": 6.456014362657092, "step": 89900}, {"loss": 0.5533, "grad_norm": 1.1254929304122925, "learning_rate": 0.0002, "epoch": 6.45673249551167, "step": 89910}, {"loss": 0.5717, "grad_norm": 1.0023183822631836, "learning_rate": 0.0002, "epoch": 6.457450628366248, "step": 89920}, {"loss": 0.5624, "grad_norm": 1.118721604347229, "learning_rate": 0.0002, "epoch": 6.458168761220826, "step": 89930}, {"loss": 0.5667, "grad_norm": 1.2170203924179077, "learning_rate": 0.0002, "epoch": 6.458886894075404, "step": 89940}, {"loss": 0.5523, "grad_norm": 1.0662257671356201, "learning_rate": 0.0002, "epoch": 6.459605026929982, "step": 89950}, {"loss": 0.537, "grad_norm": 0.8912546634674072, "learning_rate": 0.0002, "epoch": 6.4603231597845605, "step": 89960}, {"loss": 0.5646, "grad_norm": 1.0346225500106812, "learning_rate": 0.0002, "epoch": 6.4610412926391385, "step": 89970}, {"loss": 0.5827, "grad_norm": 1.239388346672058, "learning_rate": 0.0002, "epoch": 6.4617594254937165, "step": 89980}, {"loss": 0.5728, "grad_norm": 1.0100152492523193, "learning_rate": 0.0002, "epoch": 6.4624775583482945, "step": 89990}, {"loss": 0.5288, "grad_norm": 1.1496137380599976, "learning_rate": 0.0002, "epoch": 6.4631956912028725, "step": 90000}, {"loss": 0.5464, "grad_norm": 0.9652666449546814, "learning_rate": 0.0002, "epoch": 6.4639138240574505, "step": 90010}, {"loss": 0.5714, "grad_norm": 1.459730863571167, "learning_rate": 0.0002, "epoch": 6.4646319569120285, "step": 90020}, {"loss": 0.5684, "grad_norm": 0.9096665978431702, "learning_rate": 0.0002, "epoch": 6.4653500897666065, "step": 90030}, {"loss": 0.5784, "grad_norm": 1.1356233358383179, "learning_rate": 0.0002, "epoch": 6.4660682226211845, "step": 90040}, {"loss": 0.5605, "grad_norm": 1.0192385911941528, "learning_rate": 0.0002, "epoch": 6.466786355475763, "step": 90050}, {"loss": 0.5549, "grad_norm": 0.9494831562042236, "learning_rate": 0.0002, "epoch": 6.467504488330341, "step": 90060}, {"loss": 0.5732, "grad_norm": 0.9784388542175293, "learning_rate": 0.0002, "epoch": 6.468222621184919, "step": 90070}, {"loss": 0.5597, "grad_norm": 1.0754846334457397, "learning_rate": 0.0002, "epoch": 6.468940754039497, "step": 90080}, {"loss": 0.5571, "grad_norm": 0.9019646644592285, "learning_rate": 0.0002, "epoch": 6.469658886894075, "step": 90090}, {"loss": 0.5652, "grad_norm": 1.1848793029785156, "learning_rate": 0.0002, "epoch": 6.470377019748653, "step": 90100}, {"loss": 0.6054, "grad_norm": 1.1312837600708008, "learning_rate": 0.0002, "epoch": 6.471095152603231, "step": 90110}, {"loss": 0.5333, "grad_norm": 0.9868128299713135, "learning_rate": 0.0002, "epoch": 6.471813285457809, "step": 90120}, {"loss": 0.5627, "grad_norm": 0.894279956817627, "learning_rate": 0.0002, "epoch": 6.472531418312387, "step": 90130}, {"loss": 0.5898, "grad_norm": 1.1206544637680054, "learning_rate": 0.0002, "epoch": 6.473249551166965, "step": 90140}, {"loss": 0.6155, "grad_norm": 1.048126220703125, "learning_rate": 0.0002, "epoch": 6.473967684021544, "step": 90150}, {"loss": 0.5501, "grad_norm": 0.9624786972999573, "learning_rate": 0.0002, "epoch": 6.474685816876122, "step": 90160}, {"loss": 0.5311, "grad_norm": 1.3301671743392944, "learning_rate": 0.0002, "epoch": 6.4754039497307, "step": 90170}, {"loss": 0.5668, "grad_norm": 1.1016923189163208, "learning_rate": 0.0002, "epoch": 6.476122082585278, "step": 90180}, {"loss": 0.6371, "grad_norm": 1.084158182144165, "learning_rate": 0.0002, "epoch": 6.476840215439856, "step": 90190}, {"loss": 0.6117, "grad_norm": 1.0704890489578247, "learning_rate": 0.0002, "epoch": 6.477558348294434, "step": 90200}, {"loss": 0.5813, "grad_norm": 1.0849730968475342, "learning_rate": 0.0002, "epoch": 6.478276481149012, "step": 90210}, {"loss": 0.5624, "grad_norm": 1.0671768188476562, "learning_rate": 0.0002, "epoch": 6.47899461400359, "step": 90220}, {"loss": 0.6028, "grad_norm": 1.1208873987197876, "learning_rate": 0.0002, "epoch": 6.479712746858169, "step": 90230}, {"loss": 0.6087, "grad_norm": 1.1958850622177124, "learning_rate": 0.0002, "epoch": 6.480430879712747, "step": 90240}, {"loss": 0.5699, "grad_norm": 1.2102761268615723, "learning_rate": 0.0002, "epoch": 6.481149012567325, "step": 90250}, {"loss": 0.5859, "grad_norm": 1.0813510417938232, "learning_rate": 0.0002, "epoch": 6.481867145421903, "step": 90260}, {"loss": 0.548, "grad_norm": 0.8553891777992249, "learning_rate": 0.0002, "epoch": 6.482585278276481, "step": 90270}, {"loss": 0.6162, "grad_norm": 1.0855463743209839, "learning_rate": 0.0002, "epoch": 6.483303411131059, "step": 90280}, {"loss": 0.5456, "grad_norm": 1.1179498434066772, "learning_rate": 0.0002, "epoch": 6.484021543985637, "step": 90290}, {"loss": 0.62, "grad_norm": 1.1268035173416138, "learning_rate": 0.0002, "epoch": 6.484739676840215, "step": 90300}, {"loss": 0.5721, "grad_norm": 1.0755188465118408, "learning_rate": 0.0002, "epoch": 6.485457809694793, "step": 90310}, {"loss": 0.5267, "grad_norm": 1.0469547510147095, "learning_rate": 0.0002, "epoch": 6.486175942549371, "step": 90320}, {"loss": 0.5674, "grad_norm": 0.8739270567893982, "learning_rate": 0.0002, "epoch": 6.48689407540395, "step": 90330}, {"loss": 0.5725, "grad_norm": 1.2452377080917358, "learning_rate": 0.0002, "epoch": 6.487612208258528, "step": 90340}, {"loss": 0.6005, "grad_norm": 1.1576505899429321, "learning_rate": 0.0002, "epoch": 6.488330341113106, "step": 90350}, {"loss": 0.566, "grad_norm": 1.0247524976730347, "learning_rate": 0.0002, "epoch": 6.489048473967684, "step": 90360}, {"loss": 0.5997, "grad_norm": 1.1306205987930298, "learning_rate": 0.0002, "epoch": 6.489766606822262, "step": 90370}, {"loss": 0.5458, "grad_norm": 1.0545839071273804, "learning_rate": 0.0002, "epoch": 6.49048473967684, "step": 90380}, {"loss": 0.5779, "grad_norm": 1.281407117843628, "learning_rate": 0.0002, "epoch": 6.491202872531418, "step": 90390}, {"loss": 0.5774, "grad_norm": 1.2330801486968994, "learning_rate": 0.0002, "epoch": 6.491921005385996, "step": 90400}, {"loss": 0.5507, "grad_norm": 0.8966873288154602, "learning_rate": 0.0002, "epoch": 6.492639138240574, "step": 90410}, {"loss": 0.6008, "grad_norm": 0.9748067259788513, "learning_rate": 0.0002, "epoch": 6.493357271095153, "step": 90420}, {"loss": 0.5784, "grad_norm": 0.9285972118377686, "learning_rate": 0.0002, "epoch": 6.494075403949731, "step": 90430}, {"loss": 0.5635, "grad_norm": 1.123449444770813, "learning_rate": 0.0002, "epoch": 6.494793536804309, "step": 90440}, {"loss": 0.5686, "grad_norm": 1.4190359115600586, "learning_rate": 0.0002, "epoch": 6.495511669658887, "step": 90450}, {"loss": 0.6073, "grad_norm": 0.9877263307571411, "learning_rate": 0.0002, "epoch": 6.496229802513465, "step": 90460}, {"loss": 0.5527, "grad_norm": 0.9850174188613892, "learning_rate": 0.0002, "epoch": 6.496947935368043, "step": 90470}, {"loss": 0.5777, "grad_norm": 1.3609496355056763, "learning_rate": 0.0002, "epoch": 6.497666068222621, "step": 90480}, {"loss": 0.5405, "grad_norm": 0.8299460411071777, "learning_rate": 0.0002, "epoch": 6.498384201077199, "step": 90490}, {"loss": 0.5881, "grad_norm": 1.3359589576721191, "learning_rate": 0.0002, "epoch": 6.499102333931777, "step": 90500}, {"loss": 0.61, "grad_norm": 1.1211248636245728, "learning_rate": 0.0002, "epoch": 6.499820466786355, "step": 90510}, {"loss": 0.5582, "grad_norm": 1.1070419549942017, "learning_rate": 0.0002, "epoch": 6.500538599640934, "step": 90520}, {"loss": 0.5814, "grad_norm": 1.1590572595596313, "learning_rate": 0.0002, "epoch": 6.501256732495512, "step": 90530}, {"loss": 0.5486, "grad_norm": 0.9865858554840088, "learning_rate": 0.0002, "epoch": 6.50197486535009, "step": 90540}, {"loss": 0.6033, "grad_norm": 0.9752925634384155, "learning_rate": 0.0002, "epoch": 6.502692998204668, "step": 90550}, {"loss": 0.5409, "grad_norm": 1.2411525249481201, "learning_rate": 0.0002, "epoch": 6.503411131059246, "step": 90560}, {"loss": 0.554, "grad_norm": 1.1538971662521362, "learning_rate": 0.0002, "epoch": 6.504129263913824, "step": 90570}, {"loss": 0.584, "grad_norm": 1.2818700075149536, "learning_rate": 0.0002, "epoch": 6.504847396768402, "step": 90580}, {"loss": 0.543, "grad_norm": 1.2787950038909912, "learning_rate": 0.0002, "epoch": 6.50556552962298, "step": 90590}, {"loss": 0.5897, "grad_norm": 1.1357126235961914, "learning_rate": 0.0002, "epoch": 6.506283662477558, "step": 90600}, {"loss": 0.5506, "grad_norm": 1.0781097412109375, "learning_rate": 0.0002, "epoch": 6.507001795332137, "step": 90610}, {"loss": 0.5516, "grad_norm": 0.9754705429077148, "learning_rate": 0.0002, "epoch": 6.507719928186715, "step": 90620}, {"loss": 0.5955, "grad_norm": 1.018410563468933, "learning_rate": 0.0002, "epoch": 6.508438061041293, "step": 90630}, {"loss": 0.562, "grad_norm": 1.0382000207901, "learning_rate": 0.0002, "epoch": 6.509156193895871, "step": 90640}, {"loss": 0.5243, "grad_norm": 0.9059327840805054, "learning_rate": 0.0002, "epoch": 6.509874326750449, "step": 90650}, {"loss": 0.5628, "grad_norm": 1.2049181461334229, "learning_rate": 0.0002, "epoch": 6.510592459605027, "step": 90660}, {"loss": 0.6158, "grad_norm": 1.1005393266677856, "learning_rate": 0.0002, "epoch": 6.511310592459605, "step": 90670}, {"loss": 0.563, "grad_norm": 1.0504072904586792, "learning_rate": 0.0002, "epoch": 6.512028725314183, "step": 90680}, {"loss": 0.5792, "grad_norm": 1.2491340637207031, "learning_rate": 0.0002, "epoch": 6.512746858168761, "step": 90690}, {"loss": 0.5851, "grad_norm": 0.9971826672554016, "learning_rate": 0.0002, "epoch": 6.513464991023339, "step": 90700}, {"loss": 0.5597, "grad_norm": 1.0228981971740723, "learning_rate": 0.0002, "epoch": 6.514183123877918, "step": 90710}, {"loss": 0.5453, "grad_norm": 1.1531293392181396, "learning_rate": 0.0002, "epoch": 6.514901256732496, "step": 90720}, {"loss": 0.5501, "grad_norm": 0.9401963949203491, "learning_rate": 0.0002, "epoch": 6.515619389587074, "step": 90730}, {"loss": 0.5727, "grad_norm": 1.3876653909683228, "learning_rate": 0.0002, "epoch": 6.516337522441652, "step": 90740}, {"loss": 0.5978, "grad_norm": 1.3111445903778076, "learning_rate": 0.0002, "epoch": 6.51705565529623, "step": 90750}, {"loss": 0.6003, "grad_norm": 0.8705055713653564, "learning_rate": 0.0002, "epoch": 6.517773788150808, "step": 90760}, {"loss": 0.5418, "grad_norm": 1.213295340538025, "learning_rate": 0.0002, "epoch": 6.518491921005386, "step": 90770}, {"loss": 0.6073, "grad_norm": 1.2075343132019043, "learning_rate": 0.0002, "epoch": 6.519210053859964, "step": 90780}, {"loss": 0.6203, "grad_norm": 0.9814115166664124, "learning_rate": 0.0002, "epoch": 6.519928186714543, "step": 90790}, {"loss": 0.5708, "grad_norm": 1.0937272310256958, "learning_rate": 0.0002, "epoch": 6.520646319569121, "step": 90800}, {"loss": 0.5635, "grad_norm": 1.0839916467666626, "learning_rate": 0.0002, "epoch": 6.521364452423699, "step": 90810}, {"loss": 0.6166, "grad_norm": 1.1918399333953857, "learning_rate": 0.0002, "epoch": 6.522082585278277, "step": 90820}, {"loss": 0.5531, "grad_norm": 1.1677868366241455, "learning_rate": 0.0002, "epoch": 6.522800718132855, "step": 90830}, {"loss": 0.5826, "grad_norm": 1.0840870141983032, "learning_rate": 0.0002, "epoch": 6.523518850987433, "step": 90840}, {"loss": 0.56, "grad_norm": 1.10408353805542, "learning_rate": 0.0002, "epoch": 6.524236983842011, "step": 90850}, {"loss": 0.5729, "grad_norm": 1.056705355644226, "learning_rate": 0.0002, "epoch": 6.524955116696589, "step": 90860}, {"loss": 0.5819, "grad_norm": 1.0552406311035156, "learning_rate": 0.0002, "epoch": 6.525673249551167, "step": 90870}, {"loss": 0.5631, "grad_norm": 1.000816822052002, "learning_rate": 0.0002, "epoch": 6.526391382405745, "step": 90880}, {"loss": 0.5871, "grad_norm": 1.1465239524841309, "learning_rate": 0.0002, "epoch": 6.527109515260323, "step": 90890}, {"loss": 0.5652, "grad_norm": 0.9380449652671814, "learning_rate": 0.0002, "epoch": 6.527827648114902, "step": 90900}, {"loss": 0.5291, "grad_norm": 0.9572200179100037, "learning_rate": 0.0002, "epoch": 6.52854578096948, "step": 90910}, {"loss": 0.5819, "grad_norm": 1.0058002471923828, "learning_rate": 0.0002, "epoch": 6.529263913824058, "step": 90920}, {"loss": 0.584, "grad_norm": 1.0932626724243164, "learning_rate": 0.0002, "epoch": 6.529982046678636, "step": 90930}, {"loss": 0.5448, "grad_norm": 0.9283126592636108, "learning_rate": 0.0002, "epoch": 6.530700179533214, "step": 90940}, {"loss": 0.5916, "grad_norm": 1.1347819566726685, "learning_rate": 0.0002, "epoch": 6.531418312387792, "step": 90950}, {"loss": 0.5485, "grad_norm": 1.4964616298675537, "learning_rate": 0.0002, "epoch": 6.53213644524237, "step": 90960}, {"loss": 0.5567, "grad_norm": 1.1725877523422241, "learning_rate": 0.0002, "epoch": 6.532854578096948, "step": 90970}, {"loss": 0.6339, "grad_norm": 1.185640811920166, "learning_rate": 0.0002, "epoch": 6.5335727109515265, "step": 90980}, {"loss": 0.6021, "grad_norm": 1.0598312616348267, "learning_rate": 0.0002, "epoch": 6.5342908438061045, "step": 90990}, {"loss": 0.5666, "grad_norm": 1.389320731163025, "learning_rate": 0.0002, "epoch": 6.5350089766606825, "step": 91000}, {"loss": 0.5572, "grad_norm": 1.102960467338562, "learning_rate": 0.0002, "epoch": 6.5357271095152605, "step": 91010}, {"loss": 0.5624, "grad_norm": 1.2482284307479858, "learning_rate": 0.0002, "epoch": 6.5364452423698385, "step": 91020}, {"loss": 0.5927, "grad_norm": 1.213861346244812, "learning_rate": 0.0002, "epoch": 6.5371633752244165, "step": 91030}, {"loss": 0.5876, "grad_norm": 1.1872318983078003, "learning_rate": 0.0002, "epoch": 6.5378815080789945, "step": 91040}, {"loss": 0.5713, "grad_norm": 1.0767916440963745, "learning_rate": 0.0002, "epoch": 6.5385996409335725, "step": 91050}, {"loss": 0.5619, "grad_norm": 1.0610442161560059, "learning_rate": 0.0002, "epoch": 6.5393177737881505, "step": 91060}, {"loss": 0.5661, "grad_norm": 1.0161356925964355, "learning_rate": 0.0002, "epoch": 6.5400359066427285, "step": 91070}, {"loss": 0.5421, "grad_norm": 1.373284101486206, "learning_rate": 0.0002, "epoch": 6.540754039497307, "step": 91080}, {"loss": 0.603, "grad_norm": 1.1611387729644775, "learning_rate": 0.0002, "epoch": 6.541472172351885, "step": 91090}, {"loss": 0.5632, "grad_norm": 1.1980092525482178, "learning_rate": 0.0002, "epoch": 6.542190305206463, "step": 91100}, {"loss": 0.5313, "grad_norm": 1.1174312829971313, "learning_rate": 0.0002, "epoch": 6.542908438061041, "step": 91110}, {"loss": 0.5435, "grad_norm": 1.1376914978027344, "learning_rate": 0.0002, "epoch": 6.543626570915619, "step": 91120}, {"loss": 0.5549, "grad_norm": 1.0551620721817017, "learning_rate": 0.0002, "epoch": 6.544344703770197, "step": 91130}, {"loss": 0.5796, "grad_norm": 1.2839815616607666, "learning_rate": 0.0002, "epoch": 6.545062836624775, "step": 91140}, {"loss": 0.5267, "grad_norm": 0.7656933665275574, "learning_rate": 0.0002, "epoch": 6.545780969479353, "step": 91150}, {"loss": 0.5431, "grad_norm": 1.1079483032226562, "learning_rate": 0.0002, "epoch": 6.546499102333931, "step": 91160}, {"loss": 0.5814, "grad_norm": 1.4870734214782715, "learning_rate": 0.0002, "epoch": 6.54721723518851, "step": 91170}, {"loss": 0.5978, "grad_norm": 1.1784024238586426, "learning_rate": 0.0002, "epoch": 6.547935368043088, "step": 91180}, {"loss": 0.542, "grad_norm": 1.3510793447494507, "learning_rate": 0.0002, "epoch": 6.548653500897666, "step": 91190}, {"loss": 0.5435, "grad_norm": 1.0237789154052734, "learning_rate": 0.0002, "epoch": 6.549371633752244, "step": 91200}, {"loss": 0.5321, "grad_norm": 1.0721405744552612, "learning_rate": 0.0002, "epoch": 6.550089766606822, "step": 91210}, {"loss": 0.5234, "grad_norm": 0.9794955253601074, "learning_rate": 0.0002, "epoch": 6.5508078994614, "step": 91220}, {"loss": 0.5291, "grad_norm": 1.1046847105026245, "learning_rate": 0.0002, "epoch": 6.551526032315978, "step": 91230}, {"loss": 0.5627, "grad_norm": 0.9706982374191284, "learning_rate": 0.0002, "epoch": 6.552244165170556, "step": 91240}, {"loss": 0.5801, "grad_norm": 0.9466179609298706, "learning_rate": 0.0002, "epoch": 6.552962298025134, "step": 91250}, {"loss": 0.589, "grad_norm": 1.126806616783142, "learning_rate": 0.0002, "epoch": 6.553680430879712, "step": 91260}, {"loss": 0.5529, "grad_norm": 0.9713812470436096, "learning_rate": 0.0002, "epoch": 6.554398563734291, "step": 91270}, {"loss": 0.5654, "grad_norm": 0.8955506682395935, "learning_rate": 0.0002, "epoch": 6.555116696588869, "step": 91280}, {"loss": 0.6102, "grad_norm": 1.2066279649734497, "learning_rate": 0.0002, "epoch": 6.555834829443447, "step": 91290}, {"loss": 0.5442, "grad_norm": 0.957999587059021, "learning_rate": 0.0002, "epoch": 6.556552962298025, "step": 91300}, {"loss": 0.554, "grad_norm": 1.253709077835083, "learning_rate": 0.0002, "epoch": 6.557271095152603, "step": 91310}, {"loss": 0.5588, "grad_norm": 1.0075397491455078, "learning_rate": 0.0002, "epoch": 6.557989228007181, "step": 91320}, {"loss": 0.5265, "grad_norm": 0.9356904029846191, "learning_rate": 0.0002, "epoch": 6.558707360861759, "step": 91330}, {"loss": 0.5799, "grad_norm": 1.1555782556533813, "learning_rate": 0.0002, "epoch": 6.559425493716337, "step": 91340}, {"loss": 0.5787, "grad_norm": 0.9786396026611328, "learning_rate": 0.0002, "epoch": 6.560143626570916, "step": 91350}, {"loss": 0.5417, "grad_norm": 1.156374454498291, "learning_rate": 0.0002, "epoch": 6.560861759425494, "step": 91360}, {"loss": 0.5836, "grad_norm": 1.0572668313980103, "learning_rate": 0.0002, "epoch": 6.561579892280072, "step": 91370}, {"loss": 0.5632, "grad_norm": 1.4248497486114502, "learning_rate": 0.0002, "epoch": 6.56229802513465, "step": 91380}, {"loss": 0.5868, "grad_norm": 1.1191383600234985, "learning_rate": 0.0002, "epoch": 6.563016157989228, "step": 91390}, {"loss": 0.5919, "grad_norm": 0.9622306227684021, "learning_rate": 0.0002, "epoch": 6.563734290843806, "step": 91400}, {"loss": 0.557, "grad_norm": 1.3683338165283203, "learning_rate": 0.0002, "epoch": 6.564452423698384, "step": 91410}, {"loss": 0.5844, "grad_norm": 1.0363010168075562, "learning_rate": 0.0002, "epoch": 6.565170556552962, "step": 91420}, {"loss": 0.5718, "grad_norm": 1.2861888408660889, "learning_rate": 0.0002, "epoch": 6.56588868940754, "step": 91430}, {"loss": 0.5844, "grad_norm": 1.0330547094345093, "learning_rate": 0.0002, "epoch": 6.566606822262118, "step": 91440}, {"loss": 0.5748, "grad_norm": 1.044992446899414, "learning_rate": 0.0002, "epoch": 6.567324955116696, "step": 91450}, {"loss": 0.5853, "grad_norm": 1.0722706317901611, "learning_rate": 0.0002, "epoch": 6.568043087971275, "step": 91460}, {"loss": 0.5819, "grad_norm": 1.1327447891235352, "learning_rate": 0.0002, "epoch": 6.568761220825853, "step": 91470}, {"loss": 0.5706, "grad_norm": 1.2709840536117554, "learning_rate": 0.0002, "epoch": 6.569479353680431, "step": 91480}, {"loss": 0.5815, "grad_norm": 1.0964101552963257, "learning_rate": 0.0002, "epoch": 6.570197486535009, "step": 91490}, {"loss": 0.5556, "grad_norm": 0.9897898435592651, "learning_rate": 0.0002, "epoch": 6.570915619389587, "step": 91500}, {"loss": 0.5295, "grad_norm": 1.0143952369689941, "learning_rate": 0.0002, "epoch": 6.571633752244165, "step": 91510}, {"loss": 0.5527, "grad_norm": 0.923865020275116, "learning_rate": 0.0002, "epoch": 6.572351885098743, "step": 91520}, {"loss": 0.5749, "grad_norm": 1.144390344619751, "learning_rate": 0.0002, "epoch": 6.573070017953321, "step": 91530}, {"loss": 0.6356, "grad_norm": 1.0636180639266968, "learning_rate": 0.0002, "epoch": 6.5737881508079, "step": 91540}, {"loss": 0.5174, "grad_norm": 1.0699774026870728, "learning_rate": 0.0002, "epoch": 6.574506283662478, "step": 91550}, {"loss": 0.568, "grad_norm": 1.2139345407485962, "learning_rate": 0.0002, "epoch": 6.575224416517056, "step": 91560}, {"loss": 0.5151, "grad_norm": 1.4551644325256348, "learning_rate": 0.0002, "epoch": 6.575942549371634, "step": 91570}, {"loss": 0.5936, "grad_norm": 1.2388415336608887, "learning_rate": 0.0002, "epoch": 6.576660682226212, "step": 91580}, {"loss": 0.5711, "grad_norm": 0.9303404688835144, "learning_rate": 0.0002, "epoch": 6.57737881508079, "step": 91590}, {"loss": 0.6162, "grad_norm": 0.932905912399292, "learning_rate": 0.0002, "epoch": 6.578096947935368, "step": 91600}, {"loss": 0.5594, "grad_norm": 1.0726542472839355, "learning_rate": 0.0002, "epoch": 6.578815080789946, "step": 91610}, {"loss": 0.5879, "grad_norm": 1.138890266418457, "learning_rate": 0.0002, "epoch": 6.579533213644524, "step": 91620}, {"loss": 0.5669, "grad_norm": 1.087165355682373, "learning_rate": 0.0002, "epoch": 6.580251346499102, "step": 91630}, {"loss": 0.572, "grad_norm": 1.0526753664016724, "learning_rate": 0.0002, "epoch": 6.580969479353681, "step": 91640}, {"loss": 0.5872, "grad_norm": 1.068217158317566, "learning_rate": 0.0002, "epoch": 6.581687612208259, "step": 91650}, {"loss": 0.5817, "grad_norm": 1.09737229347229, "learning_rate": 0.0002, "epoch": 6.582405745062837, "step": 91660}, {"loss": 0.588, "grad_norm": 0.9466586112976074, "learning_rate": 0.0002, "epoch": 6.583123877917415, "step": 91670}, {"loss": 0.6083, "grad_norm": 1.2311620712280273, "learning_rate": 0.0002, "epoch": 6.583842010771993, "step": 91680}, {"loss": 0.5629, "grad_norm": 1.2385680675506592, "learning_rate": 0.0002, "epoch": 6.584560143626571, "step": 91690}, {"loss": 0.6515, "grad_norm": 0.947889506816864, "learning_rate": 0.0002, "epoch": 6.585278276481149, "step": 91700}, {"loss": 0.5928, "grad_norm": 0.9600529670715332, "learning_rate": 0.0002, "epoch": 6.585996409335727, "step": 91710}, {"loss": 0.6032, "grad_norm": 1.3595638275146484, "learning_rate": 0.0002, "epoch": 6.586714542190305, "step": 91720}, {"loss": 0.5658, "grad_norm": 1.0087260007858276, "learning_rate": 0.0002, "epoch": 6.587432675044884, "step": 91730}, {"loss": 0.558, "grad_norm": 1.0008373260498047, "learning_rate": 0.0002, "epoch": 6.588150807899462, "step": 91740}, {"loss": 0.5799, "grad_norm": 1.0367980003356934, "learning_rate": 0.0002, "epoch": 6.58886894075404, "step": 91750}, {"loss": 0.5834, "grad_norm": 1.1934503316879272, "learning_rate": 0.0002, "epoch": 6.589587073608618, "step": 91760}, {"loss": 0.5837, "grad_norm": 1.0295839309692383, "learning_rate": 0.0002, "epoch": 6.590305206463196, "step": 91770}, {"loss": 0.5663, "grad_norm": 0.926913857460022, "learning_rate": 0.0002, "epoch": 6.591023339317774, "step": 91780}, {"loss": 0.6089, "grad_norm": 1.055837631225586, "learning_rate": 0.0002, "epoch": 6.591741472172352, "step": 91790}, {"loss": 0.5597, "grad_norm": 1.006401777267456, "learning_rate": 0.0002, "epoch": 6.59245960502693, "step": 91800}, {"loss": 0.5726, "grad_norm": 1.1368589401245117, "learning_rate": 0.0002, "epoch": 6.593177737881508, "step": 91810}, {"loss": 0.5896, "grad_norm": 0.8494837880134583, "learning_rate": 0.0002, "epoch": 6.593895870736086, "step": 91820}, {"loss": 0.6145, "grad_norm": 1.3219822645187378, "learning_rate": 0.0002, "epoch": 6.594614003590665, "step": 91830}, {"loss": 0.5967, "grad_norm": 1.0583800077438354, "learning_rate": 0.0002, "epoch": 6.595332136445243, "step": 91840}, {"loss": 0.5942, "grad_norm": 1.0579098463058472, "learning_rate": 0.0002, "epoch": 6.596050269299821, "step": 91850}, {"loss": 0.5828, "grad_norm": 1.0618008375167847, "learning_rate": 0.0002, "epoch": 6.596768402154399, "step": 91860}, {"loss": 0.587, "grad_norm": 0.9425104260444641, "learning_rate": 0.0002, "epoch": 6.597486535008977, "step": 91870}, {"loss": 0.5478, "grad_norm": 0.9130632281303406, "learning_rate": 0.0002, "epoch": 6.598204667863555, "step": 91880}, {"loss": 0.5769, "grad_norm": 1.126438856124878, "learning_rate": 0.0002, "epoch": 6.598922800718133, "step": 91890}, {"loss": 0.5621, "grad_norm": 0.9135168194770813, "learning_rate": 0.0002, "epoch": 6.599640933572711, "step": 91900}, {"loss": 0.5544, "grad_norm": 1.1640992164611816, "learning_rate": 0.0002, "epoch": 6.6003590664272895, "step": 91910}, {"loss": 0.5595, "grad_norm": 1.2641936540603638, "learning_rate": 0.0002, "epoch": 6.6010771992818675, "step": 91920}, {"loss": 0.6329, "grad_norm": 1.1252738237380981, "learning_rate": 0.0002, "epoch": 6.6017953321364455, "step": 91930}, {"loss": 0.5466, "grad_norm": 1.0307750701904297, "learning_rate": 0.0002, "epoch": 6.6025134649910235, "step": 91940}, {"loss": 0.581, "grad_norm": 0.978972315788269, "learning_rate": 0.0002, "epoch": 6.6032315978456015, "step": 91950}, {"loss": 0.5485, "grad_norm": 1.1350890398025513, "learning_rate": 0.0002, "epoch": 6.6039497307001795, "step": 91960}, {"loss": 0.6263, "grad_norm": 0.9177488088607788, "learning_rate": 0.0002, "epoch": 6.6046678635547575, "step": 91970}, {"loss": 0.5833, "grad_norm": 1.0381031036376953, "learning_rate": 0.0002, "epoch": 6.6053859964093355, "step": 91980}, {"loss": 0.5793, "grad_norm": 1.1706395149230957, "learning_rate": 0.0002, "epoch": 6.6061041292639135, "step": 91990}, {"loss": 0.5899, "grad_norm": 1.1102650165557861, "learning_rate": 0.0002, "epoch": 6.6068222621184916, "step": 92000}, {"loss": 0.5712, "grad_norm": 0.9234306812286377, "learning_rate": 0.0002, "epoch": 6.6075403949730696, "step": 92010}, {"loss": 0.6152, "grad_norm": 1.2014371156692505, "learning_rate": 0.0002, "epoch": 6.608258527827648, "step": 92020}, {"loss": 0.5284, "grad_norm": 0.9392209053039551, "learning_rate": 0.0002, "epoch": 6.6089766606822264, "step": 92030}, {"loss": 0.5818, "grad_norm": 1.0882072448730469, "learning_rate": 0.0002, "epoch": 6.6096947935368044, "step": 92040}, {"loss": 0.5984, "grad_norm": 1.032155156135559, "learning_rate": 0.0002, "epoch": 6.6104129263913824, "step": 92050}, {"loss": 0.5498, "grad_norm": 0.913979172706604, "learning_rate": 0.0002, "epoch": 6.6111310592459605, "step": 92060}, {"loss": 0.5683, "grad_norm": 1.205101490020752, "learning_rate": 0.0002, "epoch": 6.6118491921005385, "step": 92070}, {"loss": 0.5816, "grad_norm": 1.0713984966278076, "learning_rate": 0.0002, "epoch": 6.6125673249551165, "step": 92080}, {"loss": 0.5729, "grad_norm": 0.9191082715988159, "learning_rate": 0.0002, "epoch": 6.6132854578096945, "step": 92090}, {"loss": 0.6036, "grad_norm": 0.9553678631782532, "learning_rate": 0.0002, "epoch": 6.614003590664273, "step": 92100}, {"loss": 0.6329, "grad_norm": 1.333262324333191, "learning_rate": 0.0002, "epoch": 6.614721723518851, "step": 92110}, {"loss": 0.5624, "grad_norm": 1.030739426612854, "learning_rate": 0.0002, "epoch": 6.615439856373429, "step": 92120}, {"loss": 0.5998, "grad_norm": 0.8777900338172913, "learning_rate": 0.0002, "epoch": 6.616157989228007, "step": 92130}, {"loss": 0.5239, "grad_norm": 1.071578860282898, "learning_rate": 0.0002, "epoch": 6.616876122082585, "step": 92140}, {"loss": 0.517, "grad_norm": 1.1931039094924927, "learning_rate": 0.0002, "epoch": 6.617594254937163, "step": 92150}, {"loss": 0.5849, "grad_norm": 1.2041425704956055, "learning_rate": 0.0002, "epoch": 6.618312387791741, "step": 92160}, {"loss": 0.5544, "grad_norm": 0.8523036241531372, "learning_rate": 0.0002, "epoch": 6.619030520646319, "step": 92170}, {"loss": 0.5857, "grad_norm": 1.1914807558059692, "learning_rate": 0.0002, "epoch": 6.619748653500897, "step": 92180}, {"loss": 0.5795, "grad_norm": 1.1336464881896973, "learning_rate": 0.0002, "epoch": 6.620466786355475, "step": 92190}, {"loss": 0.5566, "grad_norm": 1.2282923460006714, "learning_rate": 0.0002, "epoch": 6.621184919210053, "step": 92200}, {"loss": 0.5627, "grad_norm": 1.1887043714523315, "learning_rate": 0.0002, "epoch": 6.621903052064632, "step": 92210}, {"loss": 0.5739, "grad_norm": 0.9654178619384766, "learning_rate": 0.0002, "epoch": 6.62262118491921, "step": 92220}, {"loss": 0.5307, "grad_norm": 0.7957702875137329, "learning_rate": 0.0002, "epoch": 6.623339317773788, "step": 92230}, {"loss": 0.5668, "grad_norm": 0.8697461485862732, "learning_rate": 0.0002, "epoch": 6.624057450628366, "step": 92240}, {"loss": 0.5391, "grad_norm": 1.0392963886260986, "learning_rate": 0.0002, "epoch": 6.624775583482944, "step": 92250}, {"loss": 0.5867, "grad_norm": 1.1502392292022705, "learning_rate": 0.0002, "epoch": 6.625493716337522, "step": 92260}, {"loss": 0.5577, "grad_norm": 1.2818870544433594, "learning_rate": 0.0002, "epoch": 6.6262118491921, "step": 92270}, {"loss": 0.5864, "grad_norm": 0.8769828081130981, "learning_rate": 0.0002, "epoch": 6.626929982046678, "step": 92280}, {"loss": 0.5892, "grad_norm": 1.2273039817810059, "learning_rate": 0.0002, "epoch": 6.627648114901257, "step": 92290}, {"loss": 0.5568, "grad_norm": 0.8619378805160522, "learning_rate": 0.0002, "epoch": 6.628366247755835, "step": 92300}, {"loss": 0.589, "grad_norm": 0.9501098990440369, "learning_rate": 0.0002, "epoch": 6.629084380610413, "step": 92310}, {"loss": 0.6012, "grad_norm": 1.0698163509368896, "learning_rate": 0.0002, "epoch": 6.629802513464991, "step": 92320}, {"loss": 0.5766, "grad_norm": 1.0689377784729004, "learning_rate": 0.0002, "epoch": 6.630520646319569, "step": 92330}, {"loss": 0.5487, "grad_norm": 1.2086275815963745, "learning_rate": 0.0002, "epoch": 6.631238779174147, "step": 92340}, {"loss": 0.563, "grad_norm": 1.1256859302520752, "learning_rate": 0.0002, "epoch": 6.631956912028725, "step": 92350}, {"loss": 0.5542, "grad_norm": 0.9717738032341003, "learning_rate": 0.0002, "epoch": 6.632675044883303, "step": 92360}, {"loss": 0.6, "grad_norm": 0.9784330725669861, "learning_rate": 0.0002, "epoch": 6.633393177737881, "step": 92370}, {"loss": 0.5571, "grad_norm": 1.2600007057189941, "learning_rate": 0.0002, "epoch": 6.634111310592459, "step": 92380}, {"loss": 0.5852, "grad_norm": 0.889910101890564, "learning_rate": 0.0002, "epoch": 6.634829443447038, "step": 92390}, {"loss": 0.5635, "grad_norm": 1.010524868965149, "learning_rate": 0.0002, "epoch": 6.635547576301616, "step": 92400}, {"loss": 0.5806, "grad_norm": 1.325664758682251, "learning_rate": 0.0002, "epoch": 6.636265709156194, "step": 92410}, {"loss": 0.6149, "grad_norm": 1.3910914659500122, "learning_rate": 0.0002, "epoch": 6.636983842010772, "step": 92420}, {"loss": 0.5964, "grad_norm": 0.8858863115310669, "learning_rate": 0.0002, "epoch": 6.63770197486535, "step": 92430}, {"loss": 0.6007, "grad_norm": 1.1841683387756348, "learning_rate": 0.0002, "epoch": 6.638420107719928, "step": 92440}, {"loss": 0.584, "grad_norm": 1.2783559560775757, "learning_rate": 0.0002, "epoch": 6.639138240574506, "step": 92450}, {"loss": 0.5683, "grad_norm": 0.9154769778251648, "learning_rate": 0.0002, "epoch": 6.639856373429084, "step": 92460}, {"loss": 0.6238, "grad_norm": 1.003371000289917, "learning_rate": 0.0002, "epoch": 6.640574506283663, "step": 92470}, {"loss": 0.5537, "grad_norm": 0.9700522422790527, "learning_rate": 0.0002, "epoch": 6.641292639138241, "step": 92480}, {"loss": 0.5263, "grad_norm": 1.273629069328308, "learning_rate": 0.0002, "epoch": 6.642010771992819, "step": 92490}, {"loss": 0.5773, "grad_norm": 1.2746435403823853, "learning_rate": 0.0002, "epoch": 6.642728904847397, "step": 92500}, {"loss": 0.5778, "grad_norm": 1.0184870958328247, "learning_rate": 0.0002, "epoch": 6.643447037701975, "step": 92510}, {"loss": 0.5438, "grad_norm": 0.9988235831260681, "learning_rate": 0.0002, "epoch": 6.644165170556553, "step": 92520}, {"loss": 0.5275, "grad_norm": 1.075997233390808, "learning_rate": 0.0002, "epoch": 6.644883303411131, "step": 92530}, {"loss": 0.5927, "grad_norm": 1.180784821510315, "learning_rate": 0.0002, "epoch": 6.645601436265709, "step": 92540}, {"loss": 0.5641, "grad_norm": 1.0889579057693481, "learning_rate": 0.0002, "epoch": 6.646319569120287, "step": 92550}, {"loss": 0.5745, "grad_norm": 1.0069187879562378, "learning_rate": 0.0002, "epoch": 6.647037701974865, "step": 92560}, {"loss": 0.5706, "grad_norm": 1.110495686531067, "learning_rate": 0.0002, "epoch": 6.647755834829443, "step": 92570}, {"loss": 0.6124, "grad_norm": 1.0540684461593628, "learning_rate": 0.0002, "epoch": 6.648473967684022, "step": 92580}, {"loss": 0.5718, "grad_norm": 1.0917930603027344, "learning_rate": 0.0002, "epoch": 6.6491921005386, "step": 92590}, {"loss": 0.5556, "grad_norm": 1.225898027420044, "learning_rate": 0.0002, "epoch": 6.649910233393178, "step": 92600}, {"loss": 0.5663, "grad_norm": 0.9372484087944031, "learning_rate": 0.0002, "epoch": 6.650628366247756, "step": 92610}, {"loss": 0.5476, "grad_norm": 0.98685622215271, "learning_rate": 0.0002, "epoch": 6.651346499102334, "step": 92620}, {"loss": 0.6096, "grad_norm": 1.1148556470870972, "learning_rate": 0.0002, "epoch": 6.652064631956912, "step": 92630}, {"loss": 0.5371, "grad_norm": 1.1483707427978516, "learning_rate": 0.0002, "epoch": 6.65278276481149, "step": 92640}, {"loss": 0.5524, "grad_norm": 1.092708706855774, "learning_rate": 0.0002, "epoch": 6.653500897666068, "step": 92650}, {"loss": 0.5959, "grad_norm": 1.0641281604766846, "learning_rate": 0.0002, "epoch": 6.654219030520647, "step": 92660}, {"loss": 0.5478, "grad_norm": 0.9953374862670898, "learning_rate": 0.0002, "epoch": 6.654937163375225, "step": 92670}, {"loss": 0.5787, "grad_norm": 0.9792306423187256, "learning_rate": 0.0002, "epoch": 6.655655296229803, "step": 92680}, {"loss": 0.5945, "grad_norm": 1.1209690570831299, "learning_rate": 0.0002, "epoch": 6.656373429084381, "step": 92690}, {"loss": 0.5531, "grad_norm": 0.8281117677688599, "learning_rate": 0.0002, "epoch": 6.657091561938959, "step": 92700}, {"loss": 0.5315, "grad_norm": 0.9189280867576599, "learning_rate": 0.0002, "epoch": 6.657809694793537, "step": 92710}, {"loss": 0.6032, "grad_norm": 1.1859153509140015, "learning_rate": 0.0002, "epoch": 6.658527827648115, "step": 92720}, {"loss": 0.5201, "grad_norm": 0.9750476479530334, "learning_rate": 0.0002, "epoch": 6.659245960502693, "step": 92730}, {"loss": 0.5516, "grad_norm": 0.9973570704460144, "learning_rate": 0.0002, "epoch": 6.659964093357271, "step": 92740}, {"loss": 0.6042, "grad_norm": 1.0170378684997559, "learning_rate": 0.0002, "epoch": 6.660682226211849, "step": 92750}, {"loss": 0.6065, "grad_norm": 1.352283239364624, "learning_rate": 0.0002, "epoch": 6.661400359066427, "step": 92760}, {"loss": 0.5577, "grad_norm": 1.1020066738128662, "learning_rate": 0.0002, "epoch": 6.662118491921006, "step": 92770}, {"loss": 0.5748, "grad_norm": 1.0750092267990112, "learning_rate": 0.0002, "epoch": 6.662836624775584, "step": 92780}, {"loss": 0.5624, "grad_norm": 1.1006640195846558, "learning_rate": 0.0002, "epoch": 6.663554757630162, "step": 92790}, {"loss": 0.5383, "grad_norm": 1.2372384071350098, "learning_rate": 0.0002, "epoch": 6.66427289048474, "step": 92800}, {"loss": 0.5914, "grad_norm": 1.084846019744873, "learning_rate": 0.0002, "epoch": 6.664991023339318, "step": 92810}, {"loss": 0.5951, "grad_norm": 1.1738693714141846, "learning_rate": 0.0002, "epoch": 6.665709156193896, "step": 92820}, {"loss": 0.5825, "grad_norm": 1.159678339958191, "learning_rate": 0.0002, "epoch": 6.666427289048474, "step": 92830}, {"loss": 0.5483, "grad_norm": 0.9957766532897949, "learning_rate": 0.0002, "epoch": 6.667145421903052, "step": 92840}, {"loss": 0.5585, "grad_norm": 1.1403744220733643, "learning_rate": 0.0002, "epoch": 6.667863554757631, "step": 92850}, {"loss": 0.6091, "grad_norm": 1.0120519399642944, "learning_rate": 0.0002, "epoch": 6.668581687612209, "step": 92860}, {"loss": 0.5857, "grad_norm": 1.0876718759536743, "learning_rate": 0.0002, "epoch": 6.669299820466787, "step": 92870}, {"loss": 0.5876, "grad_norm": 1.175749659538269, "learning_rate": 0.0002, "epoch": 6.670017953321365, "step": 92880}, {"loss": 0.5365, "grad_norm": 0.9808473587036133, "learning_rate": 0.0002, "epoch": 6.670736086175943, "step": 92890}, {"loss": 0.578, "grad_norm": 1.121573805809021, "learning_rate": 0.0002, "epoch": 6.671454219030521, "step": 92900}, {"loss": 0.5745, "grad_norm": 0.9749727249145508, "learning_rate": 0.0002, "epoch": 6.672172351885099, "step": 92910}, {"loss": 0.588, "grad_norm": 1.0969820022583008, "learning_rate": 0.0002, "epoch": 6.672890484739677, "step": 92920}, {"loss": 0.5792, "grad_norm": 1.0777957439422607, "learning_rate": 0.0002, "epoch": 6.673608617594255, "step": 92930}, {"loss": 0.598, "grad_norm": 1.2342437505722046, "learning_rate": 0.0002, "epoch": 6.674326750448833, "step": 92940}, {"loss": 0.6069, "grad_norm": 1.18901789188385, "learning_rate": 0.0002, "epoch": 6.6750448833034115, "step": 92950}, {"loss": 0.6148, "grad_norm": 1.2212412357330322, "learning_rate": 0.0002, "epoch": 6.6757630161579895, "step": 92960}, {"loss": 0.5583, "grad_norm": 1.0007524490356445, "learning_rate": 0.0002, "epoch": 6.6764811490125675, "step": 92970}, {"loss": 0.5821, "grad_norm": 1.1012821197509766, "learning_rate": 0.0002, "epoch": 6.6771992818671455, "step": 92980}, {"loss": 0.5694, "grad_norm": 0.9446989893913269, "learning_rate": 0.0002, "epoch": 6.6779174147217235, "step": 92990}, {"loss": 0.5987, "grad_norm": 1.5307164192199707, "learning_rate": 0.0002, "epoch": 6.6786355475763015, "step": 93000}, {"loss": 0.6015, "grad_norm": 1.4290575981140137, "learning_rate": 0.0002, "epoch": 6.6793536804308795, "step": 93010}, {"loss": 0.5843, "grad_norm": 1.2367054224014282, "learning_rate": 0.0002, "epoch": 6.6800718132854575, "step": 93020}, {"loss": 0.5915, "grad_norm": 0.874568521976471, "learning_rate": 0.0002, "epoch": 6.680789946140036, "step": 93030}, {"loss": 0.5684, "grad_norm": 1.152861475944519, "learning_rate": 0.0002, "epoch": 6.681508078994614, "step": 93040}, {"loss": 0.5995, "grad_norm": 0.9524891972541809, "learning_rate": 0.0002, "epoch": 6.682226211849192, "step": 93050}, {"loss": 0.548, "grad_norm": 0.8084558844566345, "learning_rate": 0.0002, "epoch": 6.68294434470377, "step": 93060}, {"loss": 0.6002, "grad_norm": 1.1458806991577148, "learning_rate": 0.0002, "epoch": 6.683662477558348, "step": 93070}, {"loss": 0.5733, "grad_norm": 1.1427397727966309, "learning_rate": 0.0002, "epoch": 6.684380610412926, "step": 93080}, {"loss": 0.5721, "grad_norm": 1.1136237382888794, "learning_rate": 0.0002, "epoch": 6.685098743267504, "step": 93090}, {"loss": 0.5173, "grad_norm": 1.0270767211914062, "learning_rate": 0.0002, "epoch": 6.685816876122082, "step": 93100}, {"loss": 0.5594, "grad_norm": 0.9473410844802856, "learning_rate": 0.0002, "epoch": 6.68653500897666, "step": 93110}, {"loss": 0.6255, "grad_norm": 1.011011004447937, "learning_rate": 0.0002, "epoch": 6.687253141831238, "step": 93120}, {"loss": 0.5662, "grad_norm": 0.9286965131759644, "learning_rate": 0.0002, "epoch": 6.687971274685816, "step": 93130}, {"loss": 0.5729, "grad_norm": 1.226515293121338, "learning_rate": 0.0002, "epoch": 6.688689407540395, "step": 93140}, {"loss": 0.5821, "grad_norm": 0.9131909608840942, "learning_rate": 0.0002, "epoch": 6.689407540394973, "step": 93150}, {"loss": 0.5328, "grad_norm": 1.2111890316009521, "learning_rate": 0.0002, "epoch": 6.690125673249551, "step": 93160}, {"loss": 0.5939, "grad_norm": 0.9296384453773499, "learning_rate": 0.0002, "epoch": 6.690843806104129, "step": 93170}, {"loss": 0.5661, "grad_norm": 0.9636726975440979, "learning_rate": 0.0002, "epoch": 6.691561938958707, "step": 93180}, {"loss": 0.5998, "grad_norm": 1.0116214752197266, "learning_rate": 0.0002, "epoch": 6.692280071813285, "step": 93190}, {"loss": 0.5925, "grad_norm": 1.2671175003051758, "learning_rate": 0.0002, "epoch": 6.692998204667863, "step": 93200}, {"loss": 0.5982, "grad_norm": 1.0676039457321167, "learning_rate": 0.0002, "epoch": 6.693716337522441, "step": 93210}, {"loss": 0.5815, "grad_norm": 1.3277634382247925, "learning_rate": 0.0002, "epoch": 6.69443447037702, "step": 93220}, {"loss": 0.5621, "grad_norm": 0.9312936663627625, "learning_rate": 0.0002, "epoch": 6.695152603231598, "step": 93230}, {"loss": 0.5727, "grad_norm": 1.410414457321167, "learning_rate": 0.0002, "epoch": 6.695870736086176, "step": 93240}, {"loss": 0.5793, "grad_norm": 1.014519453048706, "learning_rate": 0.0002, "epoch": 6.696588868940754, "step": 93250}, {"loss": 0.5801, "grad_norm": 0.9211319088935852, "learning_rate": 0.0002, "epoch": 6.697307001795332, "step": 93260}, {"loss": 0.5472, "grad_norm": 1.1027755737304688, "learning_rate": 0.0002, "epoch": 6.69802513464991, "step": 93270}, {"loss": 0.5908, "grad_norm": 1.0538618564605713, "learning_rate": 0.0002, "epoch": 6.698743267504488, "step": 93280}, {"loss": 0.5694, "grad_norm": 1.159927248954773, "learning_rate": 0.0002, "epoch": 6.699461400359066, "step": 93290}, {"loss": 0.601, "grad_norm": 1.1329137086868286, "learning_rate": 0.0002, "epoch": 6.700179533213644, "step": 93300}, {"loss": 0.5702, "grad_norm": 0.9797694683074951, "learning_rate": 0.0002, "epoch": 6.700897666068222, "step": 93310}, {"loss": 0.6145, "grad_norm": 1.0968587398529053, "learning_rate": 0.0002, "epoch": 6.7016157989228, "step": 93320}, {"loss": 0.5737, "grad_norm": 0.9620516896247864, "learning_rate": 0.0002, "epoch": 6.702333931777379, "step": 93330}, {"loss": 0.5469, "grad_norm": 1.048879623413086, "learning_rate": 0.0002, "epoch": 6.703052064631957, "step": 93340}, {"loss": 0.5641, "grad_norm": 1.086421012878418, "learning_rate": 0.0002, "epoch": 6.703770197486535, "step": 93350}, {"loss": 0.5905, "grad_norm": 1.1045429706573486, "learning_rate": 0.0002, "epoch": 6.704488330341113, "step": 93360}, {"loss": 0.5602, "grad_norm": 1.081629991531372, "learning_rate": 0.0002, "epoch": 6.705206463195691, "step": 93370}, {"loss": 0.5644, "grad_norm": 0.9947898387908936, "learning_rate": 0.0002, "epoch": 6.705924596050269, "step": 93380}, {"loss": 0.5624, "grad_norm": 0.8837184309959412, "learning_rate": 0.0002, "epoch": 6.706642728904847, "step": 93390}, {"loss": 0.6168, "grad_norm": 1.1838666200637817, "learning_rate": 0.0002, "epoch": 6.707360861759425, "step": 93400}, {"loss": 0.5586, "grad_norm": 0.9221062064170837, "learning_rate": 0.0002, "epoch": 6.708078994614004, "step": 93410}, {"loss": 0.5481, "grad_norm": 1.0049937963485718, "learning_rate": 0.0002, "epoch": 6.708797127468582, "step": 93420}, {"loss": 0.5608, "grad_norm": 0.8895014524459839, "learning_rate": 0.0002, "epoch": 6.70951526032316, "step": 93430}, {"loss": 0.6043, "grad_norm": 1.2572799921035767, "learning_rate": 0.0002, "epoch": 6.710233393177738, "step": 93440}, {"loss": 0.5763, "grad_norm": 1.082982063293457, "learning_rate": 0.0002, "epoch": 6.710951526032316, "step": 93450}, {"loss": 0.5326, "grad_norm": 1.1520570516586304, "learning_rate": 0.0002, "epoch": 6.711669658886894, "step": 93460}, {"loss": 0.6059, "grad_norm": 1.0604512691497803, "learning_rate": 0.0002, "epoch": 6.712387791741472, "step": 93470}, {"loss": 0.5683, "grad_norm": 0.9887481331825256, "learning_rate": 0.0002, "epoch": 6.71310592459605, "step": 93480}, {"loss": 0.5741, "grad_norm": 1.0163664817810059, "learning_rate": 0.0002, "epoch": 6.713824057450628, "step": 93490}, {"loss": 0.5704, "grad_norm": 1.187687873840332, "learning_rate": 0.0002, "epoch": 6.714542190305206, "step": 93500}, {"loss": 0.5841, "grad_norm": 0.8770190477371216, "learning_rate": 0.0002, "epoch": 6.715260323159785, "step": 93510}, {"loss": 0.5758, "grad_norm": 1.1552737951278687, "learning_rate": 0.0002, "epoch": 6.715978456014363, "step": 93520}, {"loss": 0.5708, "grad_norm": 1.168770432472229, "learning_rate": 0.0002, "epoch": 6.716696588868941, "step": 93530}, {"loss": 0.5653, "grad_norm": 1.1071383953094482, "learning_rate": 0.0002, "epoch": 6.717414721723519, "step": 93540}, {"loss": 0.5813, "grad_norm": 0.8549296259880066, "learning_rate": 0.0002, "epoch": 6.718132854578097, "step": 93550}, {"loss": 0.6108, "grad_norm": 1.1576329469680786, "learning_rate": 0.0002, "epoch": 6.718850987432675, "step": 93560}, {"loss": 0.5605, "grad_norm": 1.1610777378082275, "learning_rate": 0.0002, "epoch": 6.719569120287253, "step": 93570}, {"loss": 0.6055, "grad_norm": 1.0316133499145508, "learning_rate": 0.0002, "epoch": 6.720287253141831, "step": 93580}, {"loss": 0.5889, "grad_norm": 1.1048495769500732, "learning_rate": 0.0002, "epoch": 6.721005385996409, "step": 93590}, {"loss": 0.5431, "grad_norm": 1.1212984323501587, "learning_rate": 0.0002, "epoch": 6.721723518850988, "step": 93600}, {"loss": 0.5971, "grad_norm": 1.1465938091278076, "learning_rate": 0.0002, "epoch": 6.722441651705566, "step": 93610}, {"loss": 0.5881, "grad_norm": 0.8978183269500732, "learning_rate": 0.0002, "epoch": 6.723159784560144, "step": 93620}, {"loss": 0.5292, "grad_norm": 1.0475369691848755, "learning_rate": 0.0002, "epoch": 6.723877917414722, "step": 93630}, {"loss": 0.5565, "grad_norm": 1.0717675685882568, "learning_rate": 0.0002, "epoch": 6.7245960502693, "step": 93640}, {"loss": 0.5594, "grad_norm": 1.2429792881011963, "learning_rate": 0.0002, "epoch": 6.725314183123878, "step": 93650}, {"loss": 0.5939, "grad_norm": 1.0333678722381592, "learning_rate": 0.0002, "epoch": 6.726032315978456, "step": 93660}, {"loss": 0.5264, "grad_norm": 1.211590051651001, "learning_rate": 0.0002, "epoch": 6.726750448833034, "step": 93670}, {"loss": 0.6022, "grad_norm": 1.0022165775299072, "learning_rate": 0.0002, "epoch": 6.727468581687612, "step": 93680}, {"loss": 0.5909, "grad_norm": 1.0192183256149292, "learning_rate": 0.0002, "epoch": 6.72818671454219, "step": 93690}, {"loss": 0.5283, "grad_norm": 0.9370006322860718, "learning_rate": 0.0002, "epoch": 6.728904847396769, "step": 93700}, {"loss": 0.5796, "grad_norm": 0.7869033813476562, "learning_rate": 0.0002, "epoch": 6.729622980251347, "step": 93710}, {"loss": 0.5481, "grad_norm": 0.899703860282898, "learning_rate": 0.0002, "epoch": 6.730341113105925, "step": 93720}, {"loss": 0.623, "grad_norm": 1.1216487884521484, "learning_rate": 0.0002, "epoch": 6.731059245960503, "step": 93730}, {"loss": 0.5974, "grad_norm": 0.9117740988731384, "learning_rate": 0.0002, "epoch": 6.731777378815081, "step": 93740}, {"loss": 0.6382, "grad_norm": 1.070947289466858, "learning_rate": 0.0002, "epoch": 6.732495511669659, "step": 93750}, {"loss": 0.6014, "grad_norm": 1.0529371500015259, "learning_rate": 0.0002, "epoch": 6.733213644524237, "step": 93760}, {"loss": 0.5177, "grad_norm": 0.7950748801231384, "learning_rate": 0.0002, "epoch": 6.733931777378815, "step": 93770}, {"loss": 0.6239, "grad_norm": 1.0469520092010498, "learning_rate": 0.0002, "epoch": 6.734649910233394, "step": 93780}, {"loss": 0.6177, "grad_norm": 1.4734543561935425, "learning_rate": 0.0002, "epoch": 6.735368043087972, "step": 93790}, {"loss": 0.583, "grad_norm": 0.8239574432373047, "learning_rate": 0.0002, "epoch": 6.73608617594255, "step": 93800}, {"loss": 0.557, "grad_norm": 1.1228505373001099, "learning_rate": 0.0002, "epoch": 6.736804308797128, "step": 93810}, {"loss": 0.5162, "grad_norm": 1.0902183055877686, "learning_rate": 0.0002, "epoch": 6.737522441651706, "step": 93820}, {"loss": 0.6094, "grad_norm": 1.220467209815979, "learning_rate": 0.0002, "epoch": 6.738240574506284, "step": 93830}, {"loss": 0.5963, "grad_norm": 1.199582815170288, "learning_rate": 0.0002, "epoch": 6.738958707360862, "step": 93840}, {"loss": 0.6004, "grad_norm": 1.1008597612380981, "learning_rate": 0.0002, "epoch": 6.73967684021544, "step": 93850}, {"loss": 0.5582, "grad_norm": 0.8596068620681763, "learning_rate": 0.0002, "epoch": 6.740394973070018, "step": 93860}, {"loss": 0.5661, "grad_norm": 1.220947027206421, "learning_rate": 0.0002, "epoch": 6.741113105924596, "step": 93870}, {"loss": 0.5425, "grad_norm": 1.2840452194213867, "learning_rate": 0.0002, "epoch": 6.741831238779174, "step": 93880}, {"loss": 0.5713, "grad_norm": 1.1923094987869263, "learning_rate": 0.0002, "epoch": 6.742549371633753, "step": 93890}, {"loss": 0.5523, "grad_norm": 1.1287206411361694, "learning_rate": 0.0002, "epoch": 6.743267504488331, "step": 93900}, {"loss": 0.5473, "grad_norm": 0.9465082287788391, "learning_rate": 0.0002, "epoch": 6.743985637342909, "step": 93910}, {"loss": 0.5795, "grad_norm": 0.9888480305671692, "learning_rate": 0.0002, "epoch": 6.744703770197487, "step": 93920}, {"loss": 0.5968, "grad_norm": 1.1438485383987427, "learning_rate": 0.0002, "epoch": 6.745421903052065, "step": 93930}, {"loss": 0.5711, "grad_norm": 0.8203039169311523, "learning_rate": 0.0002, "epoch": 6.746140035906643, "step": 93940}, {"loss": 0.5787, "grad_norm": 1.217855453491211, "learning_rate": 0.0002, "epoch": 6.746858168761221, "step": 93950}, {"loss": 0.5488, "grad_norm": 1.245977520942688, "learning_rate": 0.0002, "epoch": 6.747576301615799, "step": 93960}, {"loss": 0.5849, "grad_norm": 1.240097165107727, "learning_rate": 0.0002, "epoch": 6.7482944344703775, "step": 93970}, {"loss": 0.5717, "grad_norm": 0.9436663389205933, "learning_rate": 0.0002, "epoch": 6.7490125673249555, "step": 93980}, {"loss": 0.5717, "grad_norm": 0.9331963062286377, "learning_rate": 0.0002, "epoch": 6.7497307001795335, "step": 93990}, {"loss": 0.5777, "grad_norm": 0.9809562563896179, "learning_rate": 0.0002, "epoch": 6.7504488330341115, "step": 94000}, {"loss": 0.6237, "grad_norm": 1.1596009731292725, "learning_rate": 0.0002, "epoch": 6.7511669658886895, "step": 94010}, {"loss": 0.61, "grad_norm": 1.082684874534607, "learning_rate": 0.0002, "epoch": 6.7518850987432675, "step": 94020}, {"loss": 0.6285, "grad_norm": 0.9931458234786987, "learning_rate": 0.0002, "epoch": 6.7526032315978455, "step": 94030}, {"loss": 0.5606, "grad_norm": 0.8717518448829651, "learning_rate": 0.0002, "epoch": 6.7533213644524235, "step": 94040}, {"loss": 0.5504, "grad_norm": 0.9379602074623108, "learning_rate": 0.0002, "epoch": 6.7540394973070015, "step": 94050}, {"loss": 0.5942, "grad_norm": 0.8819605708122253, "learning_rate": 0.0002, "epoch": 6.7547576301615795, "step": 94060}, {"loss": 0.5989, "grad_norm": 1.111547589302063, "learning_rate": 0.0002, "epoch": 6.755475763016158, "step": 94070}, {"loss": 0.5898, "grad_norm": 1.0755881071090698, "learning_rate": 0.0002, "epoch": 6.756193895870736, "step": 94080}, {"loss": 0.5494, "grad_norm": 1.0734093189239502, "learning_rate": 0.0002, "epoch": 6.756912028725314, "step": 94090}, {"loss": 0.5979, "grad_norm": 1.0390300750732422, "learning_rate": 0.0002, "epoch": 6.757630161579892, "step": 94100}, {"loss": 0.5478, "grad_norm": 0.9557124972343445, "learning_rate": 0.0002, "epoch": 6.75834829443447, "step": 94110}, {"loss": 0.5613, "grad_norm": 1.0970680713653564, "learning_rate": 0.0002, "epoch": 6.759066427289048, "step": 94120}, {"loss": 0.5828, "grad_norm": 1.0715644359588623, "learning_rate": 0.0002, "epoch": 6.759784560143626, "step": 94130}, {"loss": 0.5424, "grad_norm": 1.1311662197113037, "learning_rate": 0.0002, "epoch": 6.760502692998204, "step": 94140}, {"loss": 0.6033, "grad_norm": 0.9891370534896851, "learning_rate": 0.0002, "epoch": 6.761220825852782, "step": 94150}, {"loss": 0.577, "grad_norm": 0.9472686648368835, "learning_rate": 0.0002, "epoch": 6.761938958707361, "step": 94160}, {"loss": 0.5935, "grad_norm": 1.1044381856918335, "learning_rate": 0.0002, "epoch": 6.762657091561939, "step": 94170}, {"loss": 0.6254, "grad_norm": 1.2088780403137207, "learning_rate": 0.0002, "epoch": 6.763375224416517, "step": 94180}, {"loss": 0.554, "grad_norm": 0.9210726618766785, "learning_rate": 0.0002, "epoch": 6.764093357271095, "step": 94190}, {"loss": 0.54, "grad_norm": 1.0969771146774292, "learning_rate": 0.0002, "epoch": 6.764811490125673, "step": 94200}, {"loss": 0.5414, "grad_norm": 1.1030265092849731, "learning_rate": 0.0002, "epoch": 6.765529622980251, "step": 94210}, {"loss": 0.5973, "grad_norm": 0.9451745748519897, "learning_rate": 0.0002, "epoch": 6.766247755834829, "step": 94220}, {"loss": 0.616, "grad_norm": 1.0216296911239624, "learning_rate": 0.0002, "epoch": 6.766965888689407, "step": 94230}, {"loss": 0.5402, "grad_norm": 1.4021092653274536, "learning_rate": 0.0002, "epoch": 6.767684021543985, "step": 94240}, {"loss": 0.5991, "grad_norm": 1.2341269254684448, "learning_rate": 0.0002, "epoch": 6.768402154398563, "step": 94250}, {"loss": 0.5743, "grad_norm": 1.1086686849594116, "learning_rate": 0.0002, "epoch": 6.769120287253142, "step": 94260}, {"loss": 0.551, "grad_norm": 0.8565682172775269, "learning_rate": 0.0002, "epoch": 6.76983842010772, "step": 94270}, {"loss": 0.6026, "grad_norm": 0.9314411878585815, "learning_rate": 0.0002, "epoch": 6.770556552962298, "step": 94280}, {"loss": 0.5972, "grad_norm": 1.0592315196990967, "learning_rate": 0.0002, "epoch": 6.771274685816876, "step": 94290}, {"loss": 0.5947, "grad_norm": 1.086379885673523, "learning_rate": 0.0002, "epoch": 6.771992818671454, "step": 94300}, {"loss": 0.5484, "grad_norm": 1.13401198387146, "learning_rate": 0.0002, "epoch": 6.772710951526032, "step": 94310}, {"loss": 0.5738, "grad_norm": 1.0137985944747925, "learning_rate": 0.0002, "epoch": 6.77342908438061, "step": 94320}, {"loss": 0.5972, "grad_norm": 1.0459709167480469, "learning_rate": 0.0002, "epoch": 6.774147217235188, "step": 94330}, {"loss": 0.6279, "grad_norm": 1.2213165760040283, "learning_rate": 0.0002, "epoch": 6.774865350089767, "step": 94340}, {"loss": 0.5522, "grad_norm": 1.099478006362915, "learning_rate": 0.0002, "epoch": 6.775583482944345, "step": 94350}, {"loss": 0.5694, "grad_norm": 1.124526858329773, "learning_rate": 0.0002, "epoch": 6.776301615798923, "step": 94360}, {"loss": 0.6393, "grad_norm": 1.0199998617172241, "learning_rate": 0.0002, "epoch": 6.777019748653501, "step": 94370}, {"loss": 0.5662, "grad_norm": 1.1849408149719238, "learning_rate": 0.0002, "epoch": 6.777737881508079, "step": 94380}, {"loss": 0.5856, "grad_norm": 1.2265552282333374, "learning_rate": 0.0002, "epoch": 6.778456014362657, "step": 94390}, {"loss": 0.5817, "grad_norm": 0.7576864361763, "learning_rate": 0.0002, "epoch": 6.779174147217235, "step": 94400}, {"loss": 0.5495, "grad_norm": 0.8172970414161682, "learning_rate": 0.0002, "epoch": 6.779892280071813, "step": 94410}, {"loss": 0.5902, "grad_norm": 1.1105220317840576, "learning_rate": 0.0002, "epoch": 6.780610412926391, "step": 94420}, {"loss": 0.5918, "grad_norm": 1.0542421340942383, "learning_rate": 0.0002, "epoch": 6.781328545780969, "step": 94430}, {"loss": 0.5911, "grad_norm": 1.0088121891021729, "learning_rate": 0.0002, "epoch": 6.782046678635547, "step": 94440}, {"loss": 0.5866, "grad_norm": 0.9872488379478455, "learning_rate": 0.0002, "epoch": 6.782764811490126, "step": 94450}, {"loss": 0.5524, "grad_norm": 1.2545148134231567, "learning_rate": 0.0002, "epoch": 6.783482944344704, "step": 94460}, {"loss": 0.5365, "grad_norm": 0.8847712278366089, "learning_rate": 0.0002, "epoch": 6.784201077199282, "step": 94470}, {"loss": 0.5999, "grad_norm": 0.7758765816688538, "learning_rate": 0.0002, "epoch": 6.78491921005386, "step": 94480}, {"loss": 0.5654, "grad_norm": 1.0454037189483643, "learning_rate": 0.0002, "epoch": 6.785637342908438, "step": 94490}, {"loss": 0.5943, "grad_norm": 1.1336725950241089, "learning_rate": 0.0002, "epoch": 6.786355475763016, "step": 94500}, {"loss": 0.6091, "grad_norm": 1.081356406211853, "learning_rate": 0.0002, "epoch": 6.787073608617594, "step": 94510}, {"loss": 0.5634, "grad_norm": 1.126288890838623, "learning_rate": 0.0002, "epoch": 6.787791741472172, "step": 94520}, {"loss": 0.5771, "grad_norm": 1.1156792640686035, "learning_rate": 0.0002, "epoch": 6.788509874326751, "step": 94530}, {"loss": 0.599, "grad_norm": 1.0243451595306396, "learning_rate": 0.0002, "epoch": 6.789228007181329, "step": 94540}, {"loss": 0.5949, "grad_norm": 0.9778338670730591, "learning_rate": 0.0002, "epoch": 6.789946140035907, "step": 94550}, {"loss": 0.6, "grad_norm": 0.9668094515800476, "learning_rate": 0.0002, "epoch": 6.790664272890485, "step": 94560}, {"loss": 0.6285, "grad_norm": 1.121848464012146, "learning_rate": 0.0002, "epoch": 6.791382405745063, "step": 94570}, {"loss": 0.5878, "grad_norm": 1.105825662612915, "learning_rate": 0.0002, "epoch": 6.792100538599641, "step": 94580}, {"loss": 0.5478, "grad_norm": 1.1236833333969116, "learning_rate": 0.0002, "epoch": 6.792818671454219, "step": 94590}, {"loss": 0.5854, "grad_norm": 1.0655126571655273, "learning_rate": 0.0002, "epoch": 6.793536804308797, "step": 94600}, {"loss": 0.5271, "grad_norm": 0.9249289631843567, "learning_rate": 0.0002, "epoch": 6.794254937163375, "step": 94610}, {"loss": 0.5767, "grad_norm": 1.0177690982818604, "learning_rate": 0.0002, "epoch": 6.794973070017953, "step": 94620}, {"loss": 0.6323, "grad_norm": 1.1961153745651245, "learning_rate": 0.0002, "epoch": 6.795691202872531, "step": 94630}, {"loss": 0.5623, "grad_norm": 1.0987505912780762, "learning_rate": 0.0002, "epoch": 6.79640933572711, "step": 94640}, {"loss": 0.5672, "grad_norm": 1.0165259838104248, "learning_rate": 0.0002, "epoch": 6.797127468581688, "step": 94650}, {"loss": 0.5777, "grad_norm": 1.1336601972579956, "learning_rate": 0.0002, "epoch": 6.797845601436266, "step": 94660}, {"loss": 0.6252, "grad_norm": 1.0786010026931763, "learning_rate": 0.0002, "epoch": 6.798563734290844, "step": 94670}, {"loss": 0.5755, "grad_norm": 1.2896602153778076, "learning_rate": 0.0002, "epoch": 6.799281867145422, "step": 94680}, {"loss": 0.5858, "grad_norm": 1.0934168100357056, "learning_rate": 0.0002, "epoch": 6.8, "step": 94690}, {"loss": 0.5381, "grad_norm": 1.1080414056777954, "learning_rate": 0.0002, "epoch": 6.800718132854578, "step": 94700}, {"loss": 0.5896, "grad_norm": 1.1141704320907593, "learning_rate": 0.0002, "epoch": 6.801436265709156, "step": 94710}, {"loss": 0.5487, "grad_norm": 0.9571144580841064, "learning_rate": 0.0002, "epoch": 6.802154398563735, "step": 94720}, {"loss": 0.5487, "grad_norm": 0.8907591700553894, "learning_rate": 0.0002, "epoch": 6.802872531418313, "step": 94730}, {"loss": 0.5551, "grad_norm": 1.0547759532928467, "learning_rate": 0.0002, "epoch": 6.803590664272891, "step": 94740}, {"loss": 0.5799, "grad_norm": 0.973573625087738, "learning_rate": 0.0002, "epoch": 6.804308797127469, "step": 94750}, {"loss": 0.6073, "grad_norm": 0.7889130711555481, "learning_rate": 0.0002, "epoch": 6.805026929982047, "step": 94760}, {"loss": 0.6004, "grad_norm": 0.9414647221565247, "learning_rate": 0.0002, "epoch": 6.805745062836625, "step": 94770}, {"loss": 0.5533, "grad_norm": 0.9452534317970276, "learning_rate": 0.0002, "epoch": 6.806463195691203, "step": 94780}, {"loss": 0.5379, "grad_norm": 1.2215145826339722, "learning_rate": 0.0002, "epoch": 6.807181328545781, "step": 94790}, {"loss": 0.6045, "grad_norm": 1.116302490234375, "learning_rate": 0.0002, "epoch": 6.807899461400359, "step": 94800}, {"loss": 0.5595, "grad_norm": 0.850916862487793, "learning_rate": 0.0002, "epoch": 6.808617594254937, "step": 94810}, {"loss": 0.5411, "grad_norm": 0.8699719905853271, "learning_rate": 0.0002, "epoch": 6.809335727109516, "step": 94820}, {"loss": 0.5334, "grad_norm": 1.0958143472671509, "learning_rate": 0.0002, "epoch": 6.810053859964094, "step": 94830}, {"loss": 0.5687, "grad_norm": 1.128580927848816, "learning_rate": 0.0002, "epoch": 6.810771992818672, "step": 94840}, {"loss": 0.5622, "grad_norm": 0.9490674138069153, "learning_rate": 0.0002, "epoch": 6.81149012567325, "step": 94850}, {"loss": 0.5779, "grad_norm": 0.9294022917747498, "learning_rate": 0.0002, "epoch": 6.812208258527828, "step": 94860}, {"loss": 0.5738, "grad_norm": 1.048378348350525, "learning_rate": 0.0002, "epoch": 6.812926391382406, "step": 94870}, {"loss": 0.5634, "grad_norm": 1.1972805261611938, "learning_rate": 0.0002, "epoch": 6.813644524236984, "step": 94880}, {"loss": 0.5732, "grad_norm": 0.7709503769874573, "learning_rate": 0.0002, "epoch": 6.814362657091562, "step": 94890}, {"loss": 0.5854, "grad_norm": 1.0244873762130737, "learning_rate": 0.0002, "epoch": 6.8150807899461405, "step": 94900}, {"loss": 0.581, "grad_norm": 1.0576984882354736, "learning_rate": 0.0002, "epoch": 6.8157989228007185, "step": 94910}, {"loss": 0.5812, "grad_norm": 1.3478775024414062, "learning_rate": 0.0002, "epoch": 6.8165170556552965, "step": 94920}, {"loss": 0.597, "grad_norm": 0.982311487197876, "learning_rate": 0.0002, "epoch": 6.8172351885098745, "step": 94930}, {"loss": 0.5703, "grad_norm": 1.1846535205841064, "learning_rate": 0.0002, "epoch": 6.8179533213644525, "step": 94940}, {"loss": 0.578, "grad_norm": 0.9255896210670471, "learning_rate": 0.0002, "epoch": 6.8186714542190305, "step": 94950}, {"loss": 0.5255, "grad_norm": 0.9418646693229675, "learning_rate": 0.0002, "epoch": 6.8193895870736085, "step": 94960}, {"loss": 0.6163, "grad_norm": 1.189335584640503, "learning_rate": 0.0002, "epoch": 6.8201077199281865, "step": 94970}, {"loss": 0.5646, "grad_norm": 1.1003406047821045, "learning_rate": 0.0002, "epoch": 6.8208258527827645, "step": 94980}, {"loss": 0.5677, "grad_norm": 0.9203724265098572, "learning_rate": 0.0002, "epoch": 6.8215439856373425, "step": 94990}, {"loss": 0.5862, "grad_norm": 1.093252182006836, "learning_rate": 0.0002, "epoch": 6.8222621184919205, "step": 95000}, {"loss": 0.6286, "grad_norm": 1.2737812995910645, "learning_rate": 0.0002, "epoch": 6.822980251346499, "step": 95010}, {"loss": 0.5726, "grad_norm": 1.1859848499298096, "learning_rate": 0.0002, "epoch": 6.823698384201077, "step": 95020}, {"loss": 0.5936, "grad_norm": 0.9591164588928223, "learning_rate": 0.0002, "epoch": 6.824416517055655, "step": 95030}, {"loss": 0.5401, "grad_norm": 1.0144239664077759, "learning_rate": 0.0002, "epoch": 6.825134649910233, "step": 95040}, {"loss": 0.6106, "grad_norm": 1.2520356178283691, "learning_rate": 0.0002, "epoch": 6.825852782764811, "step": 95050}, {"loss": 0.6206, "grad_norm": 1.003438115119934, "learning_rate": 0.0002, "epoch": 6.8265709156193894, "step": 95060}, {"loss": 0.5507, "grad_norm": 0.9512312412261963, "learning_rate": 0.0002, "epoch": 6.8272890484739674, "step": 95070}, {"loss": 0.5874, "grad_norm": 0.9984938502311707, "learning_rate": 0.0002, "epoch": 6.8280071813285454, "step": 95080}, {"loss": 0.5654, "grad_norm": 0.9630827307701111, "learning_rate": 0.0002, "epoch": 6.828725314183124, "step": 95090}, {"loss": 0.5749, "grad_norm": 0.8859394192695618, "learning_rate": 0.0002, "epoch": 6.829443447037702, "step": 95100}, {"loss": 0.5888, "grad_norm": 0.9082155227661133, "learning_rate": 0.0002, "epoch": 6.83016157989228, "step": 95110}, {"loss": 0.5773, "grad_norm": 1.0707300901412964, "learning_rate": 0.0002, "epoch": 6.830879712746858, "step": 95120}, {"loss": 0.5663, "grad_norm": 1.2023502588272095, "learning_rate": 0.0002, "epoch": 6.831597845601436, "step": 95130}, {"loss": 0.5843, "grad_norm": 1.0189216136932373, "learning_rate": 0.0002, "epoch": 6.832315978456014, "step": 95140}, {"loss": 0.5881, "grad_norm": 1.1216851472854614, "learning_rate": 0.0002, "epoch": 6.833034111310592, "step": 95150}, {"loss": 0.5852, "grad_norm": 1.124589204788208, "learning_rate": 0.0002, "epoch": 6.83375224416517, "step": 95160}, {"loss": 0.5374, "grad_norm": 1.1183217763900757, "learning_rate": 0.0002, "epoch": 6.834470377019748, "step": 95170}, {"loss": 0.6106, "grad_norm": 1.0307188034057617, "learning_rate": 0.0002, "epoch": 6.835188509874326, "step": 95180}, {"loss": 0.5978, "grad_norm": 1.2438706159591675, "learning_rate": 0.0002, "epoch": 6.835906642728904, "step": 95190}, {"loss": 0.5935, "grad_norm": 1.117887258529663, "learning_rate": 0.0002, "epoch": 6.836624775583483, "step": 95200}, {"loss": 0.5965, "grad_norm": 0.8934445381164551, "learning_rate": 0.0002, "epoch": 6.837342908438061, "step": 95210}, {"loss": 0.5384, "grad_norm": 1.097379207611084, "learning_rate": 0.0002, "epoch": 6.838061041292639, "step": 95220}, {"loss": 0.5792, "grad_norm": 1.1034258604049683, "learning_rate": 0.0002, "epoch": 6.838779174147217, "step": 95230}, {"loss": 0.5846, "grad_norm": 1.052120327949524, "learning_rate": 0.0002, "epoch": 6.839497307001795, "step": 95240}, {"loss": 0.5812, "grad_norm": 1.0844687223434448, "learning_rate": 0.0002, "epoch": 6.840215439856373, "step": 95250}, {"loss": 0.5746, "grad_norm": 1.1553566455841064, "learning_rate": 0.0002, "epoch": 6.840933572710951, "step": 95260}, {"loss": 0.5881, "grad_norm": 1.1977533102035522, "learning_rate": 0.0002, "epoch": 6.841651705565529, "step": 95270}, {"loss": 0.5562, "grad_norm": 0.9635998010635376, "learning_rate": 0.0002, "epoch": 6.842369838420108, "step": 95280}, {"loss": 0.6043, "grad_norm": 1.0867844820022583, "learning_rate": 0.0002, "epoch": 6.843087971274686, "step": 95290}, {"loss": 0.618, "grad_norm": 1.1252882480621338, "learning_rate": 0.0002, "epoch": 6.843806104129264, "step": 95300}, {"loss": 0.5468, "grad_norm": 1.1130266189575195, "learning_rate": 0.0002, "epoch": 6.844524236983842, "step": 95310}, {"loss": 0.6368, "grad_norm": 1.058863878250122, "learning_rate": 0.0002, "epoch": 6.84524236983842, "step": 95320}, {"loss": 0.6138, "grad_norm": 1.173840880393982, "learning_rate": 0.0002, "epoch": 6.845960502692998, "step": 95330}, {"loss": 0.5904, "grad_norm": 1.09446120262146, "learning_rate": 0.0002, "epoch": 6.846678635547576, "step": 95340}, {"loss": 0.5658, "grad_norm": 1.0762465000152588, "learning_rate": 0.0002, "epoch": 6.847396768402154, "step": 95350}, {"loss": 0.5601, "grad_norm": 1.0056897401809692, "learning_rate": 0.0002, "epoch": 6.848114901256732, "step": 95360}, {"loss": 0.6129, "grad_norm": 0.929190456867218, "learning_rate": 0.0002, "epoch": 6.84883303411131, "step": 95370}, {"loss": 0.5996, "grad_norm": 1.1152058839797974, "learning_rate": 0.0002, "epoch": 6.849551166965889, "step": 95380}, {"loss": 0.5939, "grad_norm": 1.0163987874984741, "learning_rate": 0.0002, "epoch": 6.850269299820467, "step": 95390}, {"loss": 0.56, "grad_norm": 1.1169452667236328, "learning_rate": 0.0002, "epoch": 6.850987432675045, "step": 95400}, {"loss": 0.5376, "grad_norm": 1.2225226163864136, "learning_rate": 0.0002, "epoch": 6.851705565529623, "step": 95410}, {"loss": 0.5937, "grad_norm": 1.0833172798156738, "learning_rate": 0.0002, "epoch": 6.852423698384201, "step": 95420}, {"loss": 0.5551, "grad_norm": 1.0159578323364258, "learning_rate": 0.0002, "epoch": 6.853141831238779, "step": 95430}, {"loss": 0.5599, "grad_norm": 1.1164990663528442, "learning_rate": 0.0002, "epoch": 6.853859964093357, "step": 95440}, {"loss": 0.6329, "grad_norm": 1.1340656280517578, "learning_rate": 0.0002, "epoch": 6.854578096947935, "step": 95450}, {"loss": 0.5686, "grad_norm": 1.1228697299957275, "learning_rate": 0.0002, "epoch": 6.855296229802514, "step": 95460}, {"loss": 0.6323, "grad_norm": 1.0189276933670044, "learning_rate": 0.0002, "epoch": 6.856014362657092, "step": 95470}, {"loss": 0.5366, "grad_norm": 1.1692779064178467, "learning_rate": 0.0002, "epoch": 6.85673249551167, "step": 95480}, {"loss": 0.5634, "grad_norm": 1.0779703855514526, "learning_rate": 0.0002, "epoch": 6.857450628366248, "step": 95490}, {"loss": 0.6031, "grad_norm": 1.0127906799316406, "learning_rate": 0.0002, "epoch": 6.858168761220826, "step": 95500}, {"loss": 0.5264, "grad_norm": 1.2124756574630737, "learning_rate": 0.0002, "epoch": 6.858886894075404, "step": 95510}, {"loss": 0.6361, "grad_norm": 1.0948219299316406, "learning_rate": 0.0002, "epoch": 6.859605026929982, "step": 95520}, {"loss": 0.5874, "grad_norm": 0.8796268701553345, "learning_rate": 0.0002, "epoch": 6.86032315978456, "step": 95530}, {"loss": 0.5824, "grad_norm": 1.0725175142288208, "learning_rate": 0.0002, "epoch": 6.861041292639138, "step": 95540}, {"loss": 0.5748, "grad_norm": 0.9067171812057495, "learning_rate": 0.0002, "epoch": 6.861759425493716, "step": 95550}, {"loss": 0.5882, "grad_norm": 1.0576670169830322, "learning_rate": 0.0002, "epoch": 6.862477558348294, "step": 95560}, {"loss": 0.5742, "grad_norm": 0.9622264504432678, "learning_rate": 0.0002, "epoch": 6.863195691202873, "step": 95570}, {"loss": 0.5824, "grad_norm": 1.0197248458862305, "learning_rate": 0.0002, "epoch": 6.863913824057451, "step": 95580}, {"loss": 0.5842, "grad_norm": 0.9197335243225098, "learning_rate": 0.0002, "epoch": 6.864631956912029, "step": 95590}, {"loss": 0.5768, "grad_norm": 1.0169627666473389, "learning_rate": 0.0002, "epoch": 6.865350089766607, "step": 95600}, {"loss": 0.5475, "grad_norm": 0.9868543744087219, "learning_rate": 0.0002, "epoch": 6.866068222621185, "step": 95610}, {"loss": 0.5702, "grad_norm": 0.9861942529678345, "learning_rate": 0.0002, "epoch": 6.866786355475763, "step": 95620}, {"loss": 0.5753, "grad_norm": 1.0906847715377808, "learning_rate": 0.0002, "epoch": 6.867504488330341, "step": 95630}, {"loss": 0.5492, "grad_norm": 1.2462674379348755, "learning_rate": 0.0002, "epoch": 6.868222621184919, "step": 95640}, {"loss": 0.5849, "grad_norm": 0.9801536202430725, "learning_rate": 0.0002, "epoch": 6.868940754039498, "step": 95650}, {"loss": 0.5849, "grad_norm": 1.0568761825561523, "learning_rate": 0.0002, "epoch": 6.869658886894076, "step": 95660}, {"loss": 0.5467, "grad_norm": 0.8431015014648438, "learning_rate": 0.0002, "epoch": 6.870377019748654, "step": 95670}, {"loss": 0.5887, "grad_norm": 1.2253447771072388, "learning_rate": 0.0002, "epoch": 6.871095152603232, "step": 95680}, {"loss": 0.594, "grad_norm": 0.8862479329109192, "learning_rate": 0.0002, "epoch": 6.87181328545781, "step": 95690}, {"loss": 0.6266, "grad_norm": 1.0733704566955566, "learning_rate": 0.0002, "epoch": 6.872531418312388, "step": 95700}, {"loss": 0.5816, "grad_norm": 0.9327288269996643, "learning_rate": 0.0002, "epoch": 6.873249551166966, "step": 95710}, {"loss": 0.5686, "grad_norm": 0.9877831339836121, "learning_rate": 0.0002, "epoch": 6.873967684021544, "step": 95720}, {"loss": 0.5423, "grad_norm": 0.9772239327430725, "learning_rate": 0.0002, "epoch": 6.874685816876122, "step": 95730}, {"loss": 0.5942, "grad_norm": 0.9799681901931763, "learning_rate": 0.0002, "epoch": 6.8754039497307, "step": 95740}, {"loss": 0.5667, "grad_norm": 1.0650758743286133, "learning_rate": 0.0002, "epoch": 6.876122082585278, "step": 95750}, {"loss": 0.5787, "grad_norm": 1.068557858467102, "learning_rate": 0.0002, "epoch": 6.876840215439857, "step": 95760}, {"loss": 0.5863, "grad_norm": 1.1335437297821045, "learning_rate": 0.0002, "epoch": 6.877558348294435, "step": 95770}, {"loss": 0.5496, "grad_norm": 0.8993158936500549, "learning_rate": 0.0002, "epoch": 6.878276481149013, "step": 95780}, {"loss": 0.5581, "grad_norm": 1.0593502521514893, "learning_rate": 0.0002, "epoch": 6.878994614003591, "step": 95790}, {"loss": 0.5691, "grad_norm": 1.2181397676467896, "learning_rate": 0.0002, "epoch": 6.879712746858169, "step": 95800}, {"loss": 0.5762, "grad_norm": 0.9614198207855225, "learning_rate": 0.0002, "epoch": 6.880430879712747, "step": 95810}, {"loss": 0.5893, "grad_norm": 1.021591067314148, "learning_rate": 0.0002, "epoch": 6.881149012567325, "step": 95820}, {"loss": 0.6063, "grad_norm": 1.3752840757369995, "learning_rate": 0.0002, "epoch": 6.881867145421903, "step": 95830}, {"loss": 0.5758, "grad_norm": 1.236355185508728, "learning_rate": 0.0002, "epoch": 6.882585278276482, "step": 95840}, {"loss": 0.5714, "grad_norm": 1.1957523822784424, "learning_rate": 0.0002, "epoch": 6.88330341113106, "step": 95850}, {"loss": 0.5738, "grad_norm": 0.8793587684631348, "learning_rate": 0.0002, "epoch": 6.884021543985638, "step": 95860}, {"loss": 0.6482, "grad_norm": 1.202054738998413, "learning_rate": 0.0002, "epoch": 6.884739676840216, "step": 95870}, {"loss": 0.5713, "grad_norm": 0.8061116337776184, "learning_rate": 0.0002, "epoch": 6.885457809694794, "step": 95880}, {"loss": 0.6138, "grad_norm": 1.0037956237792969, "learning_rate": 0.0002, "epoch": 6.886175942549372, "step": 95890}, {"loss": 0.5756, "grad_norm": 1.006435751914978, "learning_rate": 0.0002, "epoch": 6.88689407540395, "step": 95900}, {"loss": 0.6145, "grad_norm": 1.141200304031372, "learning_rate": 0.0002, "epoch": 6.887612208258528, "step": 95910}, {"loss": 0.6168, "grad_norm": 0.9017927050590515, "learning_rate": 0.0002, "epoch": 6.888330341113106, "step": 95920}, {"loss": 0.5843, "grad_norm": 0.9288154244422913, "learning_rate": 0.0002, "epoch": 6.889048473967684, "step": 95930}, {"loss": 0.564, "grad_norm": 1.2263801097869873, "learning_rate": 0.0002, "epoch": 6.8897666068222625, "step": 95940}, {"loss": 0.5884, "grad_norm": 1.2005410194396973, "learning_rate": 0.0002, "epoch": 6.8904847396768405, "step": 95950}, {"loss": 0.5625, "grad_norm": 1.0801531076431274, "learning_rate": 0.0002, "epoch": 6.8912028725314185, "step": 95960}, {"loss": 0.5671, "grad_norm": 1.1115456819534302, "learning_rate": 0.0002, "epoch": 6.8919210053859965, "step": 95970}, {"loss": 0.5774, "grad_norm": 1.062920093536377, "learning_rate": 0.0002, "epoch": 6.8926391382405745, "step": 95980}, {"loss": 0.5542, "grad_norm": 0.9343897700309753, "learning_rate": 0.0002, "epoch": 6.8933572710951525, "step": 95990}, {"loss": 0.5774, "grad_norm": 1.0236390829086304, "learning_rate": 0.0002, "epoch": 6.8940754039497305, "step": 96000}, {"loss": 0.6062, "grad_norm": 1.0680996179580688, "learning_rate": 0.0002, "epoch": 6.8947935368043085, "step": 96010}, {"loss": 0.563, "grad_norm": 1.1796760559082031, "learning_rate": 0.0002, "epoch": 6.8955116696588865, "step": 96020}, {"loss": 0.5401, "grad_norm": 0.9805570840835571, "learning_rate": 0.0002, "epoch": 6.896229802513465, "step": 96030}, {"loss": 0.5848, "grad_norm": 1.245386004447937, "learning_rate": 0.0002, "epoch": 6.896947935368043, "step": 96040}, {"loss": 0.578, "grad_norm": 1.0306174755096436, "learning_rate": 0.0002, "epoch": 6.897666068222621, "step": 96050}, {"loss": 0.6198, "grad_norm": 1.0599836111068726, "learning_rate": 0.0002, "epoch": 6.898384201077199, "step": 96060}, {"loss": 0.6029, "grad_norm": 1.1438795328140259, "learning_rate": 0.0002, "epoch": 6.899102333931777, "step": 96070}, {"loss": 0.5611, "grad_norm": 0.9044751524925232, "learning_rate": 0.0002, "epoch": 6.899820466786355, "step": 96080}, {"loss": 0.5623, "grad_norm": 0.9689591526985168, "learning_rate": 0.0002, "epoch": 6.900538599640933, "step": 96090}, {"loss": 0.5645, "grad_norm": 1.003217339515686, "learning_rate": 0.0002, "epoch": 6.901256732495511, "step": 96100}, {"loss": 0.5999, "grad_norm": 1.1630250215530396, "learning_rate": 0.0002, "epoch": 6.901974865350089, "step": 96110}, {"loss": 0.5661, "grad_norm": 1.0304425954818726, "learning_rate": 0.0002, "epoch": 6.902692998204667, "step": 96120}, {"loss": 0.5584, "grad_norm": 1.0148587226867676, "learning_rate": 0.0002, "epoch": 6.903411131059246, "step": 96130}, {"loss": 0.6235, "grad_norm": 1.3722255229949951, "learning_rate": 0.0002, "epoch": 6.904129263913824, "step": 96140}, {"loss": 0.6124, "grad_norm": 1.1518549919128418, "learning_rate": 0.0002, "epoch": 6.904847396768402, "step": 96150}, {"loss": 0.5388, "grad_norm": 1.0342949628829956, "learning_rate": 0.0002, "epoch": 6.90556552962298, "step": 96160}, {"loss": 0.5691, "grad_norm": 1.0178996324539185, "learning_rate": 0.0002, "epoch": 6.906283662477558, "step": 96170}, {"loss": 0.6578, "grad_norm": 1.3429099321365356, "learning_rate": 0.0002, "epoch": 6.907001795332136, "step": 96180}, {"loss": 0.5263, "grad_norm": 1.2281367778778076, "learning_rate": 0.0002, "epoch": 6.907719928186714, "step": 96190}, {"loss": 0.6072, "grad_norm": 0.8190469145774841, "learning_rate": 0.0002, "epoch": 6.908438061041292, "step": 96200}, {"loss": 0.5929, "grad_norm": 1.1344635486602783, "learning_rate": 0.0002, "epoch": 6.909156193895871, "step": 96210}, {"loss": 0.5793, "grad_norm": 1.0540097951889038, "learning_rate": 0.0002, "epoch": 6.909874326750449, "step": 96220}, {"loss": 0.5575, "grad_norm": 1.044974446296692, "learning_rate": 0.0002, "epoch": 6.910592459605027, "step": 96230}, {"loss": 0.5782, "grad_norm": 0.6890087723731995, "learning_rate": 0.0002, "epoch": 6.911310592459605, "step": 96240}, {"loss": 0.5615, "grad_norm": 1.1266905069351196, "learning_rate": 0.0002, "epoch": 6.912028725314183, "step": 96250}, {"loss": 0.5922, "grad_norm": 1.3173121213912964, "learning_rate": 0.0002, "epoch": 6.912746858168761, "step": 96260}, {"loss": 0.5336, "grad_norm": 1.0043895244598389, "learning_rate": 0.0002, "epoch": 6.913464991023339, "step": 96270}, {"loss": 0.5642, "grad_norm": 1.0634605884552002, "learning_rate": 0.0002, "epoch": 6.914183123877917, "step": 96280}, {"loss": 0.5241, "grad_norm": 1.234516978263855, "learning_rate": 0.0002, "epoch": 6.914901256732495, "step": 96290}, {"loss": 0.5791, "grad_norm": 1.042026162147522, "learning_rate": 0.0002, "epoch": 6.915619389587073, "step": 96300}, {"loss": 0.5396, "grad_norm": 1.063632845878601, "learning_rate": 0.0002, "epoch": 6.916337522441651, "step": 96310}, {"loss": 0.6265, "grad_norm": 1.0733225345611572, "learning_rate": 0.0002, "epoch": 6.91705565529623, "step": 96320}, {"loss": 0.6003, "grad_norm": 1.4382662773132324, "learning_rate": 0.0002, "epoch": 6.917773788150808, "step": 96330}, {"loss": 0.5732, "grad_norm": 1.19964599609375, "learning_rate": 0.0002, "epoch": 6.918491921005386, "step": 96340}, {"loss": 0.6177, "grad_norm": 0.9012235403060913, "learning_rate": 0.0002, "epoch": 6.919210053859964, "step": 96350}, {"loss": 0.6113, "grad_norm": 0.8663099408149719, "learning_rate": 0.0002, "epoch": 6.919928186714542, "step": 96360}, {"loss": 0.5164, "grad_norm": 0.8944193124771118, "learning_rate": 0.0002, "epoch": 6.92064631956912, "step": 96370}, {"loss": 0.5556, "grad_norm": 1.1201437711715698, "learning_rate": 0.0002, "epoch": 6.921364452423698, "step": 96380}, {"loss": 0.6219, "grad_norm": 1.0434664487838745, "learning_rate": 0.0002, "epoch": 6.922082585278276, "step": 96390}, {"loss": 0.5978, "grad_norm": 1.2666915655136108, "learning_rate": 0.0002, "epoch": 6.922800718132855, "step": 96400}, {"loss": 0.6231, "grad_norm": 0.9610332250595093, "learning_rate": 0.0002, "epoch": 6.923518850987433, "step": 96410}, {"loss": 0.5657, "grad_norm": 1.1521750688552856, "learning_rate": 0.0002, "epoch": 6.924236983842011, "step": 96420}, {"loss": 0.5682, "grad_norm": 0.921970546245575, "learning_rate": 0.0002, "epoch": 6.924955116696589, "step": 96430}, {"loss": 0.5761, "grad_norm": 1.1277226209640503, "learning_rate": 0.0002, "epoch": 6.925673249551167, "step": 96440}, {"loss": 0.5978, "grad_norm": 1.147425889968872, "learning_rate": 0.0002, "epoch": 6.926391382405745, "step": 96450}, {"loss": 0.6032, "grad_norm": 1.0128270387649536, "learning_rate": 0.0002, "epoch": 6.927109515260323, "step": 96460}, {"loss": 0.5747, "grad_norm": 1.0726343393325806, "learning_rate": 0.0002, "epoch": 6.927827648114901, "step": 96470}, {"loss": 0.6005, "grad_norm": 0.9902656078338623, "learning_rate": 0.0002, "epoch": 6.928545780969479, "step": 96480}, {"loss": 0.5477, "grad_norm": 0.9662004709243774, "learning_rate": 0.0002, "epoch": 6.929263913824057, "step": 96490}, {"loss": 0.5871, "grad_norm": 0.9595714807510376, "learning_rate": 0.0002, "epoch": 6.929982046678636, "step": 96500}, {"loss": 0.6144, "grad_norm": 1.0666614770889282, "learning_rate": 0.0002, "epoch": 6.930700179533214, "step": 96510}, {"loss": 0.5752, "grad_norm": 0.8744403123855591, "learning_rate": 0.0002, "epoch": 6.931418312387792, "step": 96520}, {"loss": 0.6124, "grad_norm": 1.0382628440856934, "learning_rate": 0.0002, "epoch": 6.93213644524237, "step": 96530}, {"loss": 0.5445, "grad_norm": 0.9165884256362915, "learning_rate": 0.0002, "epoch": 6.932854578096948, "step": 96540}, {"loss": 0.5936, "grad_norm": 0.9073842763900757, "learning_rate": 0.0002, "epoch": 6.933572710951526, "step": 96550}, {"loss": 0.5934, "grad_norm": 1.100635051727295, "learning_rate": 0.0002, "epoch": 6.934290843806104, "step": 96560}, {"loss": 0.5869, "grad_norm": 1.1503266096115112, "learning_rate": 0.0002, "epoch": 6.935008976660682, "step": 96570}, {"loss": 0.571, "grad_norm": 0.9526805281639099, "learning_rate": 0.0002, "epoch": 6.93572710951526, "step": 96580}, {"loss": 0.5959, "grad_norm": 1.115716814994812, "learning_rate": 0.0002, "epoch": 6.936445242369839, "step": 96590}, {"loss": 0.6071, "grad_norm": 1.0669193267822266, "learning_rate": 0.0002, "epoch": 6.937163375224417, "step": 96600}, {"loss": 0.6151, "grad_norm": 1.0191189050674438, "learning_rate": 0.0002, "epoch": 6.937881508078995, "step": 96610}, {"loss": 0.5803, "grad_norm": 1.1885946989059448, "learning_rate": 0.0002, "epoch": 6.938599640933573, "step": 96620}, {"loss": 0.5476, "grad_norm": 0.9806031584739685, "learning_rate": 0.0002, "epoch": 6.939317773788151, "step": 96630}, {"loss": 0.5994, "grad_norm": 0.9700000286102295, "learning_rate": 0.0002, "epoch": 6.940035906642729, "step": 96640}, {"loss": 0.5627, "grad_norm": 1.0870105028152466, "learning_rate": 0.0002, "epoch": 6.940754039497307, "step": 96650}, {"loss": 0.6031, "grad_norm": 0.7441867589950562, "learning_rate": 0.0002, "epoch": 6.941472172351885, "step": 96660}, {"loss": 0.5989, "grad_norm": 0.8631957173347473, "learning_rate": 0.0002, "epoch": 6.942190305206463, "step": 96670}, {"loss": 0.6299, "grad_norm": 1.0538444519042969, "learning_rate": 0.0002, "epoch": 6.942908438061041, "step": 96680}, {"loss": 0.5858, "grad_norm": 1.0235437154769897, "learning_rate": 0.0002, "epoch": 6.94362657091562, "step": 96690}, {"loss": 0.5978, "grad_norm": 1.069114089012146, "learning_rate": 0.0002, "epoch": 6.944344703770198, "step": 96700}, {"loss": 0.5613, "grad_norm": 1.0421861410140991, "learning_rate": 0.0002, "epoch": 6.945062836624776, "step": 96710}, {"loss": 0.5694, "grad_norm": 0.9244136810302734, "learning_rate": 0.0002, "epoch": 6.945780969479354, "step": 96720}, {"loss": 0.6043, "grad_norm": 0.962041437625885, "learning_rate": 0.0002, "epoch": 6.946499102333932, "step": 96730}, {"loss": 0.5968, "grad_norm": 1.049677848815918, "learning_rate": 0.0002, "epoch": 6.94721723518851, "step": 96740}, {"loss": 0.5989, "grad_norm": 1.0276710987091064, "learning_rate": 0.0002, "epoch": 6.947935368043088, "step": 96750}, {"loss": 0.5799, "grad_norm": 1.036650538444519, "learning_rate": 0.0002, "epoch": 6.948653500897666, "step": 96760}, {"loss": 0.5631, "grad_norm": 1.0379945039749146, "learning_rate": 0.0002, "epoch": 6.949371633752245, "step": 96770}, {"loss": 0.5439, "grad_norm": 0.9768070578575134, "learning_rate": 0.0002, "epoch": 6.950089766606823, "step": 96780}, {"loss": 0.5646, "grad_norm": 1.0515118837356567, "learning_rate": 0.0002, "epoch": 6.950807899461401, "step": 96790}, {"loss": 0.5513, "grad_norm": 0.9186223149299622, "learning_rate": 0.0002, "epoch": 6.951526032315979, "step": 96800}, {"loss": 0.6109, "grad_norm": 1.0430902242660522, "learning_rate": 0.0002, "epoch": 6.952244165170557, "step": 96810}, {"loss": 0.5823, "grad_norm": 0.7750678658485413, "learning_rate": 0.0002, "epoch": 6.952962298025135, "step": 96820}, {"loss": 0.6031, "grad_norm": 1.1721138954162598, "learning_rate": 0.0002, "epoch": 6.953680430879713, "step": 96830}, {"loss": 0.5527, "grad_norm": 1.2088165283203125, "learning_rate": 0.0002, "epoch": 6.954398563734291, "step": 96840}, {"loss": 0.5768, "grad_norm": 0.9956802129745483, "learning_rate": 0.0002, "epoch": 6.955116696588869, "step": 96850}, {"loss": 0.6052, "grad_norm": 1.0444421768188477, "learning_rate": 0.0002, "epoch": 6.955834829443447, "step": 96860}, {"loss": 0.5615, "grad_norm": 1.2420955896377563, "learning_rate": 0.0002, "epoch": 6.956552962298025, "step": 96870}, {"loss": 0.5377, "grad_norm": 1.0187203884124756, "learning_rate": 0.0002, "epoch": 6.957271095152604, "step": 96880}, {"loss": 0.5683, "grad_norm": 1.0883756875991821, "learning_rate": 0.0002, "epoch": 6.957989228007182, "step": 96890}, {"loss": 0.5406, "grad_norm": 1.1869568824768066, "learning_rate": 0.0002, "epoch": 6.95870736086176, "step": 96900}, {"loss": 0.5901, "grad_norm": 1.242119312286377, "learning_rate": 0.0002, "epoch": 6.959425493716338, "step": 96910}, {"loss": 0.5901, "grad_norm": 1.0262869596481323, "learning_rate": 0.0002, "epoch": 6.960143626570916, "step": 96920}, {"loss": 0.5633, "grad_norm": 0.9577149152755737, "learning_rate": 0.0002, "epoch": 6.960861759425494, "step": 96930}, {"loss": 0.5805, "grad_norm": 0.9224622249603271, "learning_rate": 0.0002, "epoch": 6.961579892280072, "step": 96940}, {"loss": 0.6157, "grad_norm": 1.0761854648590088, "learning_rate": 0.0002, "epoch": 6.96229802513465, "step": 96950}, {"loss": 0.6142, "grad_norm": 1.1029279232025146, "learning_rate": 0.0002, "epoch": 6.9630161579892285, "step": 96960}, {"loss": 0.5857, "grad_norm": 1.1132091283798218, "learning_rate": 0.0002, "epoch": 6.9637342908438065, "step": 96970}, {"loss": 0.5777, "grad_norm": 0.9723706245422363, "learning_rate": 0.0002, "epoch": 6.9644524236983845, "step": 96980}, {"loss": 0.5966, "grad_norm": 1.0453037023544312, "learning_rate": 0.0002, "epoch": 6.9651705565529625, "step": 96990}, {"loss": 0.5808, "grad_norm": 1.16423499584198, "learning_rate": 0.0002, "epoch": 6.9658886894075405, "step": 97000}, {"loss": 0.5734, "grad_norm": 1.1522771120071411, "learning_rate": 0.0002, "epoch": 6.9666068222621185, "step": 97010}, {"loss": 0.6009, "grad_norm": 1.020828127861023, "learning_rate": 0.0002, "epoch": 6.9673249551166965, "step": 97020}, {"loss": 0.6043, "grad_norm": 1.0301889181137085, "learning_rate": 0.0002, "epoch": 6.9680430879712745, "step": 97030}, {"loss": 0.6041, "grad_norm": 1.0615862607955933, "learning_rate": 0.0002, "epoch": 6.9687612208258525, "step": 97040}, {"loss": 0.5875, "grad_norm": 1.1750848293304443, "learning_rate": 0.0002, "epoch": 6.9694793536804305, "step": 97050}, {"loss": 0.5812, "grad_norm": 0.916283905506134, "learning_rate": 0.0002, "epoch": 6.9701974865350085, "step": 97060}, {"loss": 0.6158, "grad_norm": 1.0715203285217285, "learning_rate": 0.0002, "epoch": 6.970915619389587, "step": 97070}, {"loss": 0.6152, "grad_norm": 1.1171340942382812, "learning_rate": 0.0002, "epoch": 6.971633752244165, "step": 97080}, {"loss": 0.6361, "grad_norm": 0.886015772819519, "learning_rate": 0.0002, "epoch": 6.972351885098743, "step": 97090}, {"loss": 0.5934, "grad_norm": 0.9498746991157532, "learning_rate": 0.0002, "epoch": 6.973070017953321, "step": 97100}, {"loss": 0.5951, "grad_norm": 1.1563011407852173, "learning_rate": 0.0002, "epoch": 6.973788150807899, "step": 97110}, {"loss": 0.5966, "grad_norm": 0.9086321592330933, "learning_rate": 0.0002, "epoch": 6.974506283662477, "step": 97120}, {"loss": 0.6268, "grad_norm": 0.9804864525794983, "learning_rate": 0.0002, "epoch": 6.975224416517055, "step": 97130}, {"loss": 0.5282, "grad_norm": 1.5005993843078613, "learning_rate": 0.0002, "epoch": 6.975942549371633, "step": 97140}, {"loss": 0.5446, "grad_norm": 1.1720819473266602, "learning_rate": 0.0002, "epoch": 6.976660682226212, "step": 97150}, {"loss": 0.5325, "grad_norm": 1.095572590827942, "learning_rate": 0.0002, "epoch": 6.97737881508079, "step": 97160}, {"loss": 0.5721, "grad_norm": 1.1880861520767212, "learning_rate": 0.0002, "epoch": 6.978096947935368, "step": 97170}, {"loss": 0.5611, "grad_norm": 1.0959832668304443, "learning_rate": 0.0002, "epoch": 6.978815080789946, "step": 97180}, {"loss": 0.5834, "grad_norm": 1.2158745527267456, "learning_rate": 0.0002, "epoch": 6.979533213644524, "step": 97190}, {"loss": 0.5937, "grad_norm": 1.0073821544647217, "learning_rate": 0.0002, "epoch": 6.980251346499102, "step": 97200}, {"loss": 0.6035, "grad_norm": 0.8503464460372925, "learning_rate": 0.0002, "epoch": 6.98096947935368, "step": 97210}, {"loss": 0.651, "grad_norm": 0.9399861097335815, "learning_rate": 0.0002, "epoch": 6.981687612208258, "step": 97220}, {"loss": 0.6135, "grad_norm": 1.1167447566986084, "learning_rate": 0.0002, "epoch": 6.982405745062836, "step": 97230}, {"loss": 0.5575, "grad_norm": 1.2710384130477905, "learning_rate": 0.0002, "epoch": 6.983123877917414, "step": 97240}, {"loss": 0.5905, "grad_norm": 0.8514767289161682, "learning_rate": 0.0002, "epoch": 6.983842010771993, "step": 97250}, {"loss": 0.5932, "grad_norm": 0.9983348846435547, "learning_rate": 0.0002, "epoch": 6.984560143626571, "step": 97260}, {"loss": 0.5975, "grad_norm": 1.1713277101516724, "learning_rate": 0.0002, "epoch": 6.985278276481149, "step": 97270}, {"loss": 0.5297, "grad_norm": 1.346272349357605, "learning_rate": 0.0002, "epoch": 6.985996409335727, "step": 97280}, {"loss": 0.5847, "grad_norm": 1.0687556266784668, "learning_rate": 0.0002, "epoch": 6.986714542190305, "step": 97290}, {"loss": 0.5938, "grad_norm": 1.035805106163025, "learning_rate": 0.0002, "epoch": 6.987432675044883, "step": 97300}, {"loss": 0.5907, "grad_norm": 1.149027705192566, "learning_rate": 0.0002, "epoch": 6.988150807899461, "step": 97310}, {"loss": 0.5534, "grad_norm": 0.9672921895980835, "learning_rate": 0.0002, "epoch": 6.988868940754039, "step": 97320}, {"loss": 0.552, "grad_norm": 1.0306763648986816, "learning_rate": 0.0002, "epoch": 6.989587073608618, "step": 97330}, {"loss": 0.5705, "grad_norm": 1.1457809209823608, "learning_rate": 0.0002, "epoch": 6.990305206463196, "step": 97340}, {"loss": 0.5767, "grad_norm": 0.9718224406242371, "learning_rate": 0.0002, "epoch": 6.991023339317774, "step": 97350}, {"loss": 0.571, "grad_norm": 0.9872630834579468, "learning_rate": 0.0002, "epoch": 6.991741472172352, "step": 97360}, {"loss": 0.611, "grad_norm": 1.0302132368087769, "learning_rate": 0.0002, "epoch": 6.99245960502693, "step": 97370}, {"loss": 0.6, "grad_norm": 1.001103162765503, "learning_rate": 0.0002, "epoch": 6.993177737881508, "step": 97380}, {"loss": 0.5612, "grad_norm": 0.9207047820091248, "learning_rate": 0.0002, "epoch": 6.993895870736086, "step": 97390}, {"loss": 0.5752, "grad_norm": 1.1986219882965088, "learning_rate": 0.0002, "epoch": 6.994614003590664, "step": 97400}, {"loss": 0.5938, "grad_norm": 1.343885064125061, "learning_rate": 0.0002, "epoch": 6.995332136445242, "step": 97410}, {"loss": 0.5869, "grad_norm": 1.0611628293991089, "learning_rate": 0.0002, "epoch": 6.99605026929982, "step": 97420}, {"loss": 0.6378, "grad_norm": 0.9514605402946472, "learning_rate": 0.0002, "epoch": 6.996768402154398, "step": 97430}, {"loss": 0.5726, "grad_norm": 1.0259917974472046, "learning_rate": 0.0002, "epoch": 6.997486535008977, "step": 97440}, {"loss": 0.5762, "grad_norm": 1.0735033750534058, "learning_rate": 0.0002, "epoch": 6.998204667863555, "step": 97450}, {"loss": 0.6173, "grad_norm": 1.053984522819519, "learning_rate": 0.0002, "epoch": 6.998922800718133, "step": 97460}, {"loss": 0.581, "grad_norm": 1.0285807847976685, "learning_rate": 0.0002, "epoch": 6.999640933572711, "step": 97470}, {"eval_loss": 1.168665885925293, "eval_runtime": 55.1686, "eval_samples_per_second": 13.287, "eval_steps_per_second": 1.668, "epoch": 7.0, "step": 97475}, {"loss": 0.5596, "grad_norm": 1.0394084453582764, "learning_rate": 0.0002, "epoch": 7.000359066427289, "step": 97480}, {"loss": 0.5048, "grad_norm": 1.0377404689788818, "learning_rate": 0.0002, "epoch": 7.001077199281867, "step": 97490}, {"loss": 0.502, "grad_norm": 1.143609642982483, "learning_rate": 0.0002, "epoch": 7.001795332136445, "step": 97500}, {"loss": 0.5071, "grad_norm": 0.9544180035591125, "learning_rate": 0.0002, "epoch": 7.002513464991023, "step": 97510}, {"loss": 0.5249, "grad_norm": 1.1849734783172607, "learning_rate": 0.0002, "epoch": 7.003231597845601, "step": 97520}, {"loss": 0.5095, "grad_norm": 1.0769017934799194, "learning_rate": 0.0002, "epoch": 7.00394973070018, "step": 97530}, {"loss": 0.5238, "grad_norm": 1.2054177522659302, "learning_rate": 0.0002, "epoch": 7.004667863554758, "step": 97540}, {"loss": 0.4639, "grad_norm": 0.800378680229187, "learning_rate": 0.0002, "epoch": 7.005385996409336, "step": 97550}, {"loss": 0.5107, "grad_norm": 1.0197957754135132, "learning_rate": 0.0002, "epoch": 7.006104129263914, "step": 97560}, {"loss": 0.5382, "grad_norm": 1.1266579627990723, "learning_rate": 0.0002, "epoch": 7.006822262118492, "step": 97570}, {"loss": 0.5312, "grad_norm": 0.9955291152000427, "learning_rate": 0.0002, "epoch": 7.00754039497307, "step": 97580}, {"loss": 0.5332, "grad_norm": 1.1531357765197754, "learning_rate": 0.0002, "epoch": 7.008258527827648, "step": 97590}, {"loss": 0.5027, "grad_norm": 1.1159368753433228, "learning_rate": 0.0002, "epoch": 7.008976660682226, "step": 97600}, {"loss": 0.5304, "grad_norm": 1.2170041799545288, "learning_rate": 0.0002, "epoch": 7.009694793536804, "step": 97610}, {"loss": 0.527, "grad_norm": 1.2761963605880737, "learning_rate": 0.0002, "epoch": 7.010412926391383, "step": 97620}, {"loss": 0.4874, "grad_norm": 1.1703165769577026, "learning_rate": 0.0002, "epoch": 7.011131059245961, "step": 97630}, {"loss": 0.5225, "grad_norm": 1.0011869668960571, "learning_rate": 0.0002, "epoch": 7.011849192100539, "step": 97640}, {"loss": 0.4728, "grad_norm": 1.2599170207977295, "learning_rate": 0.0002, "epoch": 7.012567324955117, "step": 97650}, {"loss": 0.5147, "grad_norm": 0.9646086692810059, "learning_rate": 0.0002, "epoch": 7.013285457809695, "step": 97660}, {"loss": 0.5032, "grad_norm": 1.067461609840393, "learning_rate": 0.0002, "epoch": 7.014003590664273, "step": 97670}, {"loss": 0.5079, "grad_norm": 0.9157150983810425, "learning_rate": 0.0002, "epoch": 7.014721723518851, "step": 97680}, {"loss": 0.5466, "grad_norm": 1.5808709859848022, "learning_rate": 0.0002, "epoch": 7.015439856373429, "step": 97690}, {"loss": 0.4598, "grad_norm": 1.069395661354065, "learning_rate": 0.0002, "epoch": 7.016157989228007, "step": 97700}, {"loss": 0.5123, "grad_norm": 1.180887222290039, "learning_rate": 0.0002, "epoch": 7.016876122082586, "step": 97710}, {"loss": 0.5059, "grad_norm": 1.0960854291915894, "learning_rate": 0.0002, "epoch": 7.017594254937164, "step": 97720}, {"loss": 0.516, "grad_norm": 0.9090136885643005, "learning_rate": 0.0002, "epoch": 7.018312387791742, "step": 97730}, {"loss": 0.5025, "grad_norm": 0.992369532585144, "learning_rate": 0.0002, "epoch": 7.01903052064632, "step": 97740}, {"loss": 0.5225, "grad_norm": 1.1090840101242065, "learning_rate": 0.0002, "epoch": 7.019748653500898, "step": 97750}, {"loss": 0.4926, "grad_norm": 1.173752784729004, "learning_rate": 0.0002, "epoch": 7.020466786355476, "step": 97760}, {"loss": 0.496, "grad_norm": 1.1630373001098633, "learning_rate": 0.0002, "epoch": 7.021184919210054, "step": 97770}, {"loss": 0.4946, "grad_norm": 1.34774649143219, "learning_rate": 0.0002, "epoch": 7.021903052064632, "step": 97780}, {"loss": 0.4801, "grad_norm": 1.0631234645843506, "learning_rate": 0.0002, "epoch": 7.02262118491921, "step": 97790}, {"loss": 0.4986, "grad_norm": 1.1396355628967285, "learning_rate": 0.0002, "epoch": 7.023339317773788, "step": 97800}, {"loss": 0.5313, "grad_norm": 1.0061511993408203, "learning_rate": 0.0002, "epoch": 7.024057450628367, "step": 97810}, {"loss": 0.4896, "grad_norm": 0.8545233607292175, "learning_rate": 0.0002, "epoch": 7.024775583482945, "step": 97820}, {"loss": 0.4886, "grad_norm": 1.1746221780776978, "learning_rate": 0.0002, "epoch": 7.025493716337523, "step": 97830}, {"loss": 0.5056, "grad_norm": 0.9705178737640381, "learning_rate": 0.0002, "epoch": 7.026211849192101, "step": 97840}, {"loss": 0.5133, "grad_norm": 0.9517123103141785, "learning_rate": 0.0002, "epoch": 7.026929982046679, "step": 97850}, {"loss": 0.4859, "grad_norm": 1.0428272485733032, "learning_rate": 0.0002, "epoch": 7.027648114901257, "step": 97860}, {"loss": 0.5108, "grad_norm": 1.020277976989746, "learning_rate": 0.0002, "epoch": 7.028366247755835, "step": 97870}, {"loss": 0.5698, "grad_norm": 1.1434438228607178, "learning_rate": 0.0002, "epoch": 7.029084380610413, "step": 97880}, {"loss": 0.5312, "grad_norm": 0.8937026858329773, "learning_rate": 0.0002, "epoch": 7.029802513464991, "step": 97890}, {"loss": 0.4948, "grad_norm": 0.9241712093353271, "learning_rate": 0.0002, "epoch": 7.0305206463195695, "step": 97900}, {"loss": 0.4972, "grad_norm": 1.0576003789901733, "learning_rate": 0.0002, "epoch": 7.0312387791741475, "step": 97910}, {"loss": 0.483, "grad_norm": 0.9046192765235901, "learning_rate": 0.0002, "epoch": 7.0319569120287255, "step": 97920}, {"loss": 0.5153, "grad_norm": 0.9557563662528992, "learning_rate": 0.0002, "epoch": 7.0326750448833035, "step": 97930}, {"loss": 0.5237, "grad_norm": 1.0260612964630127, "learning_rate": 0.0002, "epoch": 7.0333931777378815, "step": 97940}, {"loss": 0.5197, "grad_norm": 1.005668044090271, "learning_rate": 0.0002, "epoch": 7.0341113105924595, "step": 97950}, {"loss": 0.5306, "grad_norm": 1.0715222358703613, "learning_rate": 0.0002, "epoch": 7.0348294434470375, "step": 97960}, {"loss": 0.5024, "grad_norm": 0.9782606363296509, "learning_rate": 0.0002, "epoch": 7.0355475763016155, "step": 97970}, {"loss": 0.467, "grad_norm": 0.970796525478363, "learning_rate": 0.0002, "epoch": 7.0362657091561935, "step": 97980}, {"loss": 0.4933, "grad_norm": 1.0109657049179077, "learning_rate": 0.0002, "epoch": 7.036983842010772, "step": 97990}, {"loss": 0.5153, "grad_norm": 1.0419244766235352, "learning_rate": 0.0002, "epoch": 7.03770197486535, "step": 98000}, {"loss": 0.5009, "grad_norm": 1.140035629272461, "learning_rate": 0.0002, "epoch": 7.038420107719928, "step": 98010}, {"loss": 0.4934, "grad_norm": 1.148266315460205, "learning_rate": 0.0002, "epoch": 7.039138240574506, "step": 98020}, {"loss": 0.5445, "grad_norm": 1.0584349632263184, "learning_rate": 0.0002, "epoch": 7.039856373429084, "step": 98030}, {"loss": 0.5111, "grad_norm": 1.0054830312728882, "learning_rate": 0.0002, "epoch": 7.040574506283662, "step": 98040}, {"loss": 0.5541, "grad_norm": 1.3186599016189575, "learning_rate": 0.0002, "epoch": 7.04129263913824, "step": 98050}, {"loss": 0.5054, "grad_norm": 1.5720367431640625, "learning_rate": 0.0002, "epoch": 7.042010771992818, "step": 98060}, {"loss": 0.4977, "grad_norm": 1.0619040727615356, "learning_rate": 0.0002, "epoch": 7.042728904847396, "step": 98070}, {"loss": 0.4769, "grad_norm": 1.1936930418014526, "learning_rate": 0.0002, "epoch": 7.0434470377019744, "step": 98080}, {"loss": 0.476, "grad_norm": 1.1437066793441772, "learning_rate": 0.0002, "epoch": 7.044165170556553, "step": 98090}, {"loss": 0.5046, "grad_norm": 1.1040478944778442, "learning_rate": 0.0002, "epoch": 7.044883303411131, "step": 98100}, {"loss": 0.5473, "grad_norm": 1.2150214910507202, "learning_rate": 0.0002, "epoch": 7.045601436265709, "step": 98110}, {"loss": 0.5467, "grad_norm": 1.1224234104156494, "learning_rate": 0.0002, "epoch": 7.046319569120287, "step": 98120}, {"loss": 0.5171, "grad_norm": 1.256640076637268, "learning_rate": 0.0002, "epoch": 7.047037701974865, "step": 98130}, {"loss": 0.5008, "grad_norm": 1.2098320722579956, "learning_rate": 0.0002, "epoch": 7.047755834829443, "step": 98140}, {"loss": 0.5187, "grad_norm": 1.0719431638717651, "learning_rate": 0.0002, "epoch": 7.048473967684021, "step": 98150}, {"loss": 0.5047, "grad_norm": 1.5370041131973267, "learning_rate": 0.0002, "epoch": 7.049192100538599, "step": 98160}, {"loss": 0.5036, "grad_norm": 1.166554570198059, "learning_rate": 0.0002, "epoch": 7.049910233393177, "step": 98170}, {"loss": 0.476, "grad_norm": 0.927842378616333, "learning_rate": 0.0002, "epoch": 7.050628366247756, "step": 98180}, {"loss": 0.4905, "grad_norm": 0.9756902456283569, "learning_rate": 0.0002, "epoch": 7.051346499102334, "step": 98190}, {"loss": 0.489, "grad_norm": 0.994195282459259, "learning_rate": 0.0002, "epoch": 7.052064631956912, "step": 98200}, {"loss": 0.5208, "grad_norm": 1.1864269971847534, "learning_rate": 0.0002, "epoch": 7.05278276481149, "step": 98210}, {"loss": 0.4897, "grad_norm": 0.8431169390678406, "learning_rate": 0.0002, "epoch": 7.053500897666068, "step": 98220}, {"loss": 0.4939, "grad_norm": 1.233312726020813, "learning_rate": 0.0002, "epoch": 7.054219030520646, "step": 98230}, {"loss": 0.5496, "grad_norm": 1.0040699243545532, "learning_rate": 0.0002, "epoch": 7.054937163375224, "step": 98240}, {"loss": 0.5197, "grad_norm": 1.004325032234192, "learning_rate": 0.0002, "epoch": 7.055655296229802, "step": 98250}, {"loss": 0.5465, "grad_norm": 1.1213003396987915, "learning_rate": 0.0002, "epoch": 7.05637342908438, "step": 98260}, {"loss": 0.5126, "grad_norm": 1.115504264831543, "learning_rate": 0.0002, "epoch": 7.057091561938959, "step": 98270}, {"loss": 0.4699, "grad_norm": 0.9618098139762878, "learning_rate": 0.0002, "epoch": 7.057809694793537, "step": 98280}, {"loss": 0.5442, "grad_norm": 0.9967533946037292, "learning_rate": 0.0002, "epoch": 7.058527827648115, "step": 98290}, {"loss": 0.5162, "grad_norm": 1.061136245727539, "learning_rate": 0.0002, "epoch": 7.059245960502693, "step": 98300}, {"loss": 0.5206, "grad_norm": 1.3787742853164673, "learning_rate": 0.0002, "epoch": 7.059964093357271, "step": 98310}, {"loss": 0.5003, "grad_norm": 1.0541613101959229, "learning_rate": 0.0002, "epoch": 7.060682226211849, "step": 98320}, {"loss": 0.5684, "grad_norm": 1.3264026641845703, "learning_rate": 0.0002, "epoch": 7.061400359066427, "step": 98330}, {"loss": 0.4889, "grad_norm": 0.9874539375305176, "learning_rate": 0.0002, "epoch": 7.062118491921005, "step": 98340}, {"loss": 0.513, "grad_norm": 0.8959392309188843, "learning_rate": 0.0002, "epoch": 7.062836624775583, "step": 98350}, {"loss": 0.5031, "grad_norm": 0.9952960014343262, "learning_rate": 0.0002, "epoch": 7.063554757630161, "step": 98360}, {"loss": 0.5264, "grad_norm": 1.0395413637161255, "learning_rate": 0.0002, "epoch": 7.06427289048474, "step": 98370}, {"loss": 0.4778, "grad_norm": 0.9314938187599182, "learning_rate": 0.0002, "epoch": 7.064991023339318, "step": 98380}, {"loss": 0.5607, "grad_norm": 1.0952500104904175, "learning_rate": 0.0002, "epoch": 7.065709156193896, "step": 98390}, {"loss": 0.5263, "grad_norm": 0.8393705487251282, "learning_rate": 0.0002, "epoch": 7.066427289048474, "step": 98400}, {"loss": 0.5338, "grad_norm": 1.0407543182373047, "learning_rate": 0.0002, "epoch": 7.067145421903052, "step": 98410}, {"loss": 0.524, "grad_norm": 1.015194296836853, "learning_rate": 0.0002, "epoch": 7.06786355475763, "step": 98420}, {"loss": 0.5486, "grad_norm": 1.0878134965896606, "learning_rate": 0.0002, "epoch": 7.068581687612208, "step": 98430}, {"loss": 0.5176, "grad_norm": 1.0402575731277466, "learning_rate": 0.0002, "epoch": 7.069299820466786, "step": 98440}, {"loss": 0.4895, "grad_norm": 0.8770583271980286, "learning_rate": 0.0002, "epoch": 7.070017953321364, "step": 98450}, {"loss": 0.4816, "grad_norm": 1.0066659450531006, "learning_rate": 0.0002, "epoch": 7.070736086175943, "step": 98460}, {"loss": 0.5185, "grad_norm": 1.1627628803253174, "learning_rate": 0.0002, "epoch": 7.071454219030521, "step": 98470}, {"loss": 0.5193, "grad_norm": 1.1217474937438965, "learning_rate": 0.0002, "epoch": 7.072172351885099, "step": 98480}, {"loss": 0.5621, "grad_norm": 1.1825461387634277, "learning_rate": 0.0002, "epoch": 7.072890484739677, "step": 98490}, {"loss": 0.5012, "grad_norm": 1.2198481559753418, "learning_rate": 0.0002, "epoch": 7.073608617594255, "step": 98500}, {"loss": 0.5059, "grad_norm": 1.0615922212600708, "learning_rate": 0.0002, "epoch": 7.074326750448833, "step": 98510}, {"loss": 0.5176, "grad_norm": 1.1725428104400635, "learning_rate": 0.0002, "epoch": 7.075044883303411, "step": 98520}, {"loss": 0.4844, "grad_norm": 1.0269757509231567, "learning_rate": 0.0002, "epoch": 7.075763016157989, "step": 98530}, {"loss": 0.53, "grad_norm": 0.9191881418228149, "learning_rate": 0.0002, "epoch": 7.076481149012567, "step": 98540}, {"loss": 0.4974, "grad_norm": 1.2156354188919067, "learning_rate": 0.0002, "epoch": 7.077199281867145, "step": 98550}, {"loss": 0.4933, "grad_norm": 1.1455811262130737, "learning_rate": 0.0002, "epoch": 7.077917414721724, "step": 98560}, {"loss": 0.524, "grad_norm": 1.1971662044525146, "learning_rate": 0.0002, "epoch": 7.078635547576302, "step": 98570}, {"loss": 0.5287, "grad_norm": 1.1876308917999268, "learning_rate": 0.0002, "epoch": 7.07935368043088, "step": 98580}, {"loss": 0.5429, "grad_norm": 1.0847078561782837, "learning_rate": 0.0002, "epoch": 7.080071813285458, "step": 98590}, {"loss": 0.5082, "grad_norm": 1.1745446920394897, "learning_rate": 0.0002, "epoch": 7.080789946140036, "step": 98600}, {"loss": 0.5145, "grad_norm": 1.133808970451355, "learning_rate": 0.0002, "epoch": 7.081508078994614, "step": 98610}, {"loss": 0.5054, "grad_norm": 0.8598989248275757, "learning_rate": 0.0002, "epoch": 7.082226211849192, "step": 98620}, {"loss": 0.5301, "grad_norm": 0.9775993824005127, "learning_rate": 0.0002, "epoch": 7.08294434470377, "step": 98630}, {"loss": 0.499, "grad_norm": 1.1053773164749146, "learning_rate": 0.0002, "epoch": 7.083662477558348, "step": 98640}, {"loss": 0.4975, "grad_norm": 1.1902083158493042, "learning_rate": 0.0002, "epoch": 7.084380610412927, "step": 98650}, {"loss": 0.5432, "grad_norm": 1.2208364009857178, "learning_rate": 0.0002, "epoch": 7.085098743267505, "step": 98660}, {"loss": 0.5078, "grad_norm": 1.3565878868103027, "learning_rate": 0.0002, "epoch": 7.085816876122083, "step": 98670}, {"loss": 0.5183, "grad_norm": 1.1915233135223389, "learning_rate": 0.0002, "epoch": 7.086535008976661, "step": 98680}, {"loss": 0.4765, "grad_norm": 0.7820531725883484, "learning_rate": 0.0002, "epoch": 7.087253141831239, "step": 98690}, {"loss": 0.5264, "grad_norm": 1.3015085458755493, "learning_rate": 0.0002, "epoch": 7.087971274685817, "step": 98700}, {"loss": 0.524, "grad_norm": 1.1178984642028809, "learning_rate": 0.0002, "epoch": 7.088689407540395, "step": 98710}, {"loss": 0.4689, "grad_norm": 1.0407224893569946, "learning_rate": 0.0002, "epoch": 7.089407540394973, "step": 98720}, {"loss": 0.5082, "grad_norm": 1.070882797241211, "learning_rate": 0.0002, "epoch": 7.090125673249551, "step": 98730}, {"loss": 0.511, "grad_norm": 1.0723912715911865, "learning_rate": 0.0002, "epoch": 7.09084380610413, "step": 98740}, {"loss": 0.5322, "grad_norm": 0.9973018169403076, "learning_rate": 0.0002, "epoch": 7.091561938958708, "step": 98750}, {"loss": 0.5346, "grad_norm": 1.2216873168945312, "learning_rate": 0.0002, "epoch": 7.092280071813286, "step": 98760}, {"loss": 0.5175, "grad_norm": 0.9081874489784241, "learning_rate": 0.0002, "epoch": 7.092998204667864, "step": 98770}, {"loss": 0.5165, "grad_norm": 1.141811490058899, "learning_rate": 0.0002, "epoch": 7.093716337522442, "step": 98780}, {"loss": 0.4975, "grad_norm": 0.9687919020652771, "learning_rate": 0.0002, "epoch": 7.09443447037702, "step": 98790}, {"loss": 0.5328, "grad_norm": 1.0691136121749878, "learning_rate": 0.0002, "epoch": 7.095152603231598, "step": 98800}, {"loss": 0.5087, "grad_norm": 1.100003957748413, "learning_rate": 0.0002, "epoch": 7.095870736086176, "step": 98810}, {"loss": 0.4859, "grad_norm": 1.0004968643188477, "learning_rate": 0.0002, "epoch": 7.096588868940754, "step": 98820}, {"loss": 0.532, "grad_norm": 1.0497100353240967, "learning_rate": 0.0002, "epoch": 7.097307001795333, "step": 98830}, {"loss": 0.4909, "grad_norm": 1.0173693895339966, "learning_rate": 0.0002, "epoch": 7.098025134649911, "step": 98840}, {"loss": 0.4948, "grad_norm": 1.3046447038650513, "learning_rate": 0.0002, "epoch": 7.098743267504489, "step": 98850}, {"loss": 0.4968, "grad_norm": 1.1587737798690796, "learning_rate": 0.0002, "epoch": 7.099461400359067, "step": 98860}, {"loss": 0.5003, "grad_norm": 0.9734950661659241, "learning_rate": 0.0002, "epoch": 7.100179533213645, "step": 98870}, {"loss": 0.489, "grad_norm": 1.2131417989730835, "learning_rate": 0.0002, "epoch": 7.100897666068223, "step": 98880}, {"loss": 0.5111, "grad_norm": 1.2643247842788696, "learning_rate": 0.0002, "epoch": 7.101615798922801, "step": 98890}, {"loss": 0.474, "grad_norm": 1.0531554222106934, "learning_rate": 0.0002, "epoch": 7.102333931777379, "step": 98900}, {"loss": 0.5315, "grad_norm": 1.0205429792404175, "learning_rate": 0.0002, "epoch": 7.103052064631957, "step": 98910}, {"loss": 0.5239, "grad_norm": 1.1247005462646484, "learning_rate": 0.0002, "epoch": 7.103770197486535, "step": 98920}, {"loss": 0.5491, "grad_norm": 1.1993550062179565, "learning_rate": 0.0002, "epoch": 7.1044883303411135, "step": 98930}, {"loss": 0.5339, "grad_norm": 1.1030243635177612, "learning_rate": 0.0002, "epoch": 7.1052064631956915, "step": 98940}, {"loss": 0.5329, "grad_norm": 1.134373426437378, "learning_rate": 0.0002, "epoch": 7.1059245960502695, "step": 98950}, {"loss": 0.4968, "grad_norm": 1.0449906587600708, "learning_rate": 0.0002, "epoch": 7.1066427289048475, "step": 98960}, {"loss": 0.5109, "grad_norm": 0.9911691546440125, "learning_rate": 0.0002, "epoch": 7.1073608617594255, "step": 98970}, {"loss": 0.522, "grad_norm": 1.2021015882492065, "learning_rate": 0.0002, "epoch": 7.1080789946140035, "step": 98980}, {"loss": 0.5523, "grad_norm": 1.1013414859771729, "learning_rate": 0.0002, "epoch": 7.1087971274685815, "step": 98990}, {"loss": 0.519, "grad_norm": 1.0632404088974, "learning_rate": 0.0002, "epoch": 7.1095152603231595, "step": 99000}, {"loss": 0.5624, "grad_norm": 1.1499850749969482, "learning_rate": 0.0002, "epoch": 7.1102333931777375, "step": 99010}, {"loss": 0.525, "grad_norm": 1.1187937259674072, "learning_rate": 0.0002, "epoch": 7.110951526032316, "step": 99020}, {"loss": 0.4913, "grad_norm": 1.109269618988037, "learning_rate": 0.0002, "epoch": 7.111669658886894, "step": 99030}, {"loss": 0.5087, "grad_norm": 1.04684317111969, "learning_rate": 0.0002, "epoch": 7.112387791741472, "step": 99040}, {"loss": 0.5409, "grad_norm": 1.142975926399231, "learning_rate": 0.0002, "epoch": 7.11310592459605, "step": 99050}, {"loss": 0.5021, "grad_norm": 1.0006840229034424, "learning_rate": 0.0002, "epoch": 7.113824057450628, "step": 99060}, {"loss": 0.4859, "grad_norm": 1.1721967458724976, "learning_rate": 0.0002, "epoch": 7.114542190305206, "step": 99070}, {"loss": 0.5333, "grad_norm": 1.0295040607452393, "learning_rate": 0.0002, "epoch": 7.115260323159784, "step": 99080}, {"loss": 0.5251, "grad_norm": 1.2406680583953857, "learning_rate": 0.0002, "epoch": 7.115978456014362, "step": 99090}, {"loss": 0.513, "grad_norm": 1.2812756299972534, "learning_rate": 0.0002, "epoch": 7.11669658886894, "step": 99100}, {"loss": 0.5016, "grad_norm": 0.9559424519538879, "learning_rate": 0.0002, "epoch": 7.117414721723518, "step": 99110}, {"loss": 0.5077, "grad_norm": 1.2253276109695435, "learning_rate": 0.0002, "epoch": 7.118132854578097, "step": 99120}, {"loss": 0.4637, "grad_norm": 0.9636382460594177, "learning_rate": 0.0002, "epoch": 7.118850987432675, "step": 99130}, {"loss": 0.481, "grad_norm": 0.9765542149543762, "learning_rate": 0.0002, "epoch": 7.119569120287253, "step": 99140}, {"loss": 0.5167, "grad_norm": 0.8722323775291443, "learning_rate": 0.0002, "epoch": 7.120287253141831, "step": 99150}, {"loss": 0.5575, "grad_norm": 1.2198525667190552, "learning_rate": 0.0002, "epoch": 7.121005385996409, "step": 99160}, {"loss": 0.5219, "grad_norm": 0.9809777140617371, "learning_rate": 0.0002, "epoch": 7.121723518850987, "step": 99170}, {"loss": 0.529, "grad_norm": 0.9328579902648926, "learning_rate": 0.0002, "epoch": 7.122441651705565, "step": 99180}, {"loss": 0.5258, "grad_norm": 1.0994173288345337, "learning_rate": 0.0002, "epoch": 7.123159784560143, "step": 99190}, {"loss": 0.5413, "grad_norm": 0.9433317184448242, "learning_rate": 0.0002, "epoch": 7.123877917414721, "step": 99200}, {"loss": 0.5414, "grad_norm": 0.9754116535186768, "learning_rate": 0.0002, "epoch": 7.1245960502693, "step": 99210}, {"loss": 0.5615, "grad_norm": 1.3194613456726074, "learning_rate": 0.0002, "epoch": 7.125314183123878, "step": 99220}, {"loss": 0.5409, "grad_norm": 1.166597604751587, "learning_rate": 0.0002, "epoch": 7.126032315978456, "step": 99230}, {"loss": 0.5384, "grad_norm": 1.1221239566802979, "learning_rate": 0.0002, "epoch": 7.126750448833034, "step": 99240}, {"loss": 0.488, "grad_norm": 1.1992909908294678, "learning_rate": 0.0002, "epoch": 7.127468581687612, "step": 99250}, {"loss": 0.5576, "grad_norm": 1.0624475479125977, "learning_rate": 0.0002, "epoch": 7.12818671454219, "step": 99260}, {"loss": 0.4927, "grad_norm": 0.9556567668914795, "learning_rate": 0.0002, "epoch": 7.128904847396768, "step": 99270}, {"loss": 0.4834, "grad_norm": 1.3168047666549683, "learning_rate": 0.0002, "epoch": 7.129622980251346, "step": 99280}, {"loss": 0.5186, "grad_norm": 1.0971012115478516, "learning_rate": 0.0002, "epoch": 7.130341113105924, "step": 99290}, {"loss": 0.5029, "grad_norm": 1.287570595741272, "learning_rate": 0.0002, "epoch": 7.131059245960503, "step": 99300}, {"loss": 0.5503, "grad_norm": 1.4277496337890625, "learning_rate": 0.0002, "epoch": 7.131777378815081, "step": 99310}, {"loss": 0.5517, "grad_norm": 0.933844268321991, "learning_rate": 0.0002, "epoch": 7.132495511669659, "step": 99320}, {"loss": 0.5313, "grad_norm": 1.0423851013183594, "learning_rate": 0.0002, "epoch": 7.133213644524237, "step": 99330}, {"loss": 0.521, "grad_norm": 1.0162577629089355, "learning_rate": 0.0002, "epoch": 7.133931777378815, "step": 99340}, {"loss": 0.5316, "grad_norm": 1.0845975875854492, "learning_rate": 0.0002, "epoch": 7.134649910233393, "step": 99350}, {"loss": 0.5378, "grad_norm": 1.0210866928100586, "learning_rate": 0.0002, "epoch": 7.135368043087971, "step": 99360}, {"loss": 0.5562, "grad_norm": 0.9540662169456482, "learning_rate": 0.0002, "epoch": 7.136086175942549, "step": 99370}, {"loss": 0.5092, "grad_norm": 0.9962146878242493, "learning_rate": 0.0002, "epoch": 7.136804308797127, "step": 99380}, {"loss": 0.5008, "grad_norm": 1.021399736404419, "learning_rate": 0.0002, "epoch": 7.137522441651706, "step": 99390}, {"loss": 0.5455, "grad_norm": 1.227946400642395, "learning_rate": 0.0002, "epoch": 7.138240574506284, "step": 99400}, {"loss": 0.5189, "grad_norm": 1.2851567268371582, "learning_rate": 0.0002, "epoch": 7.138958707360862, "step": 99410}, {"loss": 0.5434, "grad_norm": 0.9820418953895569, "learning_rate": 0.0002, "epoch": 7.13967684021544, "step": 99420}, {"loss": 0.5228, "grad_norm": 0.9503002762794495, "learning_rate": 0.0002, "epoch": 7.140394973070018, "step": 99430}, {"loss": 0.547, "grad_norm": 0.924704372882843, "learning_rate": 0.0002, "epoch": 7.141113105924596, "step": 99440}, {"loss": 0.4548, "grad_norm": 1.1376171112060547, "learning_rate": 0.0002, "epoch": 7.141831238779174, "step": 99450}, {"loss": 0.535, "grad_norm": 1.2862539291381836, "learning_rate": 0.0002, "epoch": 7.142549371633752, "step": 99460}, {"loss": 0.5078, "grad_norm": 1.1068240404129028, "learning_rate": 0.0002, "epoch": 7.14326750448833, "step": 99470}, {"loss": 0.5136, "grad_norm": 1.3112517595291138, "learning_rate": 0.0002, "epoch": 7.143985637342908, "step": 99480}, {"loss": 0.4944, "grad_norm": 1.0884982347488403, "learning_rate": 0.0002, "epoch": 7.144703770197487, "step": 99490}, {"loss": 0.5378, "grad_norm": 1.2093886137008667, "learning_rate": 0.0002, "epoch": 7.145421903052065, "step": 99500}, {"loss": 0.5375, "grad_norm": 0.9628178477287292, "learning_rate": 0.0002, "epoch": 7.146140035906643, "step": 99510}, {"loss": 0.527, "grad_norm": 1.1300674676895142, "learning_rate": 0.0002, "epoch": 7.146858168761221, "step": 99520}, {"loss": 0.5116, "grad_norm": 0.8746275901794434, "learning_rate": 0.0002, "epoch": 7.147576301615799, "step": 99530}, {"loss": 0.5821, "grad_norm": 1.034233808517456, "learning_rate": 0.0002, "epoch": 7.148294434470377, "step": 99540}, {"loss": 0.5131, "grad_norm": 1.0235376358032227, "learning_rate": 0.0002, "epoch": 7.149012567324955, "step": 99550}, {"loss": 0.4908, "grad_norm": 1.048659324645996, "learning_rate": 0.0002, "epoch": 7.149730700179533, "step": 99560}, {"loss": 0.5346, "grad_norm": 1.278841495513916, "learning_rate": 0.0002, "epoch": 7.150448833034111, "step": 99570}, {"loss": 0.4864, "grad_norm": 1.0460485219955444, "learning_rate": 0.0002, "epoch": 7.15116696588869, "step": 99580}, {"loss": 0.5243, "grad_norm": 1.070234775543213, "learning_rate": 0.0002, "epoch": 7.151885098743268, "step": 99590}, {"loss": 0.5592, "grad_norm": 1.1036664247512817, "learning_rate": 0.0002, "epoch": 7.152603231597846, "step": 99600}, {"loss": 0.5364, "grad_norm": 1.212744116783142, "learning_rate": 0.0002, "epoch": 7.153321364452424, "step": 99610}, {"loss": 0.5074, "grad_norm": 1.1095936298370361, "learning_rate": 0.0002, "epoch": 7.154039497307002, "step": 99620}, {"loss": 0.4783, "grad_norm": 1.1953791379928589, "learning_rate": 0.0002, "epoch": 7.15475763016158, "step": 99630}, {"loss": 0.511, "grad_norm": 1.3188790082931519, "learning_rate": 0.0002, "epoch": 7.155475763016158, "step": 99640}, {"loss": 0.487, "grad_norm": 0.8723140358924866, "learning_rate": 0.0002, "epoch": 7.156193895870736, "step": 99650}, {"loss": 0.4973, "grad_norm": 0.9156793355941772, "learning_rate": 0.0002, "epoch": 7.156912028725314, "step": 99660}, {"loss": 0.5254, "grad_norm": 0.9418860673904419, "learning_rate": 0.0002, "epoch": 7.157630161579892, "step": 99670}, {"loss": 0.5173, "grad_norm": 1.0322530269622803, "learning_rate": 0.0002, "epoch": 7.158348294434471, "step": 99680}, {"loss": 0.5107, "grad_norm": 1.0246423482894897, "learning_rate": 0.0002, "epoch": 7.159066427289049, "step": 99690}, {"loss": 0.5136, "grad_norm": 0.8930608630180359, "learning_rate": 0.0002, "epoch": 7.159784560143627, "step": 99700}, {"loss": 0.5274, "grad_norm": 1.038223385810852, "learning_rate": 0.0002, "epoch": 7.160502692998205, "step": 99710}, {"loss": 0.5627, "grad_norm": 1.1020445823669434, "learning_rate": 0.0002, "epoch": 7.161220825852783, "step": 99720}, {"loss": 0.4598, "grad_norm": 0.9623728394508362, "learning_rate": 0.0002, "epoch": 7.161938958707361, "step": 99730}, {"loss": 0.5135, "grad_norm": 1.0490144491195679, "learning_rate": 0.0002, "epoch": 7.162657091561939, "step": 99740}, {"loss": 0.4951, "grad_norm": 1.039595127105713, "learning_rate": 0.0002, "epoch": 7.163375224416517, "step": 99750}, {"loss": 0.5366, "grad_norm": 1.2656937837600708, "learning_rate": 0.0002, "epoch": 7.164093357271095, "step": 99760}, {"loss": 0.5206, "grad_norm": 1.469683289527893, "learning_rate": 0.0002, "epoch": 7.164811490125674, "step": 99770}, {"loss": 0.5348, "grad_norm": 1.1830174922943115, "learning_rate": 0.0002, "epoch": 7.165529622980252, "step": 99780}, {"loss": 0.5431, "grad_norm": 1.144771933555603, "learning_rate": 0.0002, "epoch": 7.16624775583483, "step": 99790}, {"loss": 0.5047, "grad_norm": 0.8902682662010193, "learning_rate": 0.0002, "epoch": 7.166965888689408, "step": 99800}, {"loss": 0.5045, "grad_norm": 1.0538955926895142, "learning_rate": 0.0002, "epoch": 7.167684021543986, "step": 99810}, {"loss": 0.531, "grad_norm": 1.3387681245803833, "learning_rate": 0.0002, "epoch": 7.168402154398564, "step": 99820}, {"loss": 0.536, "grad_norm": 1.1162230968475342, "learning_rate": 0.0002, "epoch": 7.169120287253142, "step": 99830}, {"loss": 0.5601, "grad_norm": 0.9946745038032532, "learning_rate": 0.0002, "epoch": 7.16983842010772, "step": 99840}, {"loss": 0.5217, "grad_norm": 1.0431642532348633, "learning_rate": 0.0002, "epoch": 7.170556552962298, "step": 99850}, {"loss": 0.537, "grad_norm": 1.1344799995422363, "learning_rate": 0.0002, "epoch": 7.1712746858168765, "step": 99860}, {"loss": 0.5404, "grad_norm": 0.8978185653686523, "learning_rate": 0.0002, "epoch": 7.1719928186714546, "step": 99870}, {"loss": 0.5468, "grad_norm": 1.2808794975280762, "learning_rate": 0.0002, "epoch": 7.1727109515260326, "step": 99880}, {"loss": 0.5222, "grad_norm": 1.0654441118240356, "learning_rate": 0.0002, "epoch": 7.1734290843806106, "step": 99890}, {"loss": 0.5411, "grad_norm": 1.2751258611679077, "learning_rate": 0.0002, "epoch": 7.174147217235189, "step": 99900}, {"loss": 0.5175, "grad_norm": 0.9488890171051025, "learning_rate": 0.0002, "epoch": 7.174865350089767, "step": 99910}, {"loss": 0.536, "grad_norm": 1.2057361602783203, "learning_rate": 0.0002, "epoch": 7.175583482944345, "step": 99920}, {"loss": 0.5416, "grad_norm": 1.2620776891708374, "learning_rate": 0.0002, "epoch": 7.176301615798923, "step": 99930}, {"loss": 0.5371, "grad_norm": 1.0042833089828491, "learning_rate": 0.0002, "epoch": 7.177019748653501, "step": 99940}, {"loss": 0.5185, "grad_norm": 0.9716517329216003, "learning_rate": 0.0002, "epoch": 7.177737881508079, "step": 99950}, {"loss": 0.4859, "grad_norm": 0.9876767992973328, "learning_rate": 0.0002, "epoch": 7.1784560143626575, "step": 99960}, {"loss": 0.5351, "grad_norm": 1.0020827054977417, "learning_rate": 0.0002, "epoch": 7.1791741472172355, "step": 99970}, {"loss": 0.539, "grad_norm": 1.0674978494644165, "learning_rate": 0.0002, "epoch": 7.1798922800718135, "step": 99980}, {"loss": 0.4997, "grad_norm": 1.3148112297058105, "learning_rate": 0.0002, "epoch": 7.1806104129263915, "step": 99990}, {"loss": 0.5155, "grad_norm": 1.048911690711975, "learning_rate": 0.0002, "epoch": 7.1813285457809695, "step": 100000}, {"loss": 0.5144, "grad_norm": 1.0747761726379395, "learning_rate": 0.0002, "epoch": 7.1820466786355475, "step": 100010}, {"loss": 0.4882, "grad_norm": 1.1818102598190308, "learning_rate": 0.0002, "epoch": 7.1827648114901255, "step": 100020}, {"loss": 0.5178, "grad_norm": 0.9548772573471069, "learning_rate": 0.0002, "epoch": 7.1834829443447035, "step": 100030}, {"loss": 0.568, "grad_norm": 1.2127790451049805, "learning_rate": 0.0002, "epoch": 7.1842010771992815, "step": 100040}, {"loss": 0.5658, "grad_norm": 1.1227222681045532, "learning_rate": 0.0002, "epoch": 7.18491921005386, "step": 100050}, {"loss": 0.5749, "grad_norm": 1.1687812805175781, "learning_rate": 0.0002, "epoch": 7.185637342908438, "step": 100060}, {"loss": 0.5171, "grad_norm": 0.9948291182518005, "learning_rate": 0.0002, "epoch": 7.186355475763016, "step": 100070}, {"loss": 0.5405, "grad_norm": 1.140623688697815, "learning_rate": 0.0002, "epoch": 7.187073608617594, "step": 100080}, {"loss": 0.4886, "grad_norm": 1.0152307748794556, "learning_rate": 0.0002, "epoch": 7.187791741472172, "step": 100090}, {"loss": 0.5302, "grad_norm": 1.049146056175232, "learning_rate": 0.0002, "epoch": 7.18850987432675, "step": 100100}, {"loss": 0.4833, "grad_norm": 0.9283392429351807, "learning_rate": 0.0002, "epoch": 7.189228007181328, "step": 100110}, {"loss": 0.5124, "grad_norm": 0.9900078177452087, "learning_rate": 0.0002, "epoch": 7.189946140035906, "step": 100120}, {"loss": 0.5715, "grad_norm": 0.9017449021339417, "learning_rate": 0.0002, "epoch": 7.190664272890484, "step": 100130}, {"loss": 0.508, "grad_norm": 1.0106319189071655, "learning_rate": 0.0002, "epoch": 7.191382405745063, "step": 100140}, {"loss": 0.4737, "grad_norm": 0.985713541507721, "learning_rate": 0.0002, "epoch": 7.192100538599641, "step": 100150}, {"loss": 0.5136, "grad_norm": 1.074846863746643, "learning_rate": 0.0002, "epoch": 7.192818671454219, "step": 100160}, {"loss": 0.542, "grad_norm": 1.1982495784759521, "learning_rate": 0.0002, "epoch": 7.193536804308797, "step": 100170}, {"loss": 0.5155, "grad_norm": 0.9354469180107117, "learning_rate": 0.0002, "epoch": 7.194254937163375, "step": 100180}, {"loss": 0.5808, "grad_norm": 1.289989948272705, "learning_rate": 0.0002, "epoch": 7.194973070017953, "step": 100190}, {"loss": 0.5174, "grad_norm": 1.2959555387496948, "learning_rate": 0.0002, "epoch": 7.195691202872531, "step": 100200}, {"loss": 0.5454, "grad_norm": 1.127426266670227, "learning_rate": 0.0002, "epoch": 7.196409335727109, "step": 100210}, {"loss": 0.5587, "grad_norm": 1.1479859352111816, "learning_rate": 0.0002, "epoch": 7.197127468581687, "step": 100220}, {"loss": 0.5357, "grad_norm": 0.9798394441604614, "learning_rate": 0.0002, "epoch": 7.197845601436265, "step": 100230}, {"loss": 0.5362, "grad_norm": 1.155127763748169, "learning_rate": 0.0002, "epoch": 7.198563734290844, "step": 100240}, {"loss": 0.5531, "grad_norm": 1.051482081413269, "learning_rate": 0.0002, "epoch": 7.199281867145422, "step": 100250}, {"loss": 0.5973, "grad_norm": 1.0441079139709473, "learning_rate": 0.0002, "epoch": 7.2, "step": 100260}, {"loss": 0.4961, "grad_norm": 0.9930968284606934, "learning_rate": 0.0002, "epoch": 7.200718132854578, "step": 100270}, {"loss": 0.504, "grad_norm": 1.001161813735962, "learning_rate": 0.0002, "epoch": 7.201436265709156, "step": 100280}, {"loss": 0.544, "grad_norm": 1.075697898864746, "learning_rate": 0.0002, "epoch": 7.202154398563734, "step": 100290}, {"loss": 0.5232, "grad_norm": 1.359117031097412, "learning_rate": 0.0002, "epoch": 7.202872531418312, "step": 100300}, {"loss": 0.5157, "grad_norm": 0.9824917316436768, "learning_rate": 0.0002, "epoch": 7.20359066427289, "step": 100310}, {"loss": 0.528, "grad_norm": 1.0275092124938965, "learning_rate": 0.0002, "epoch": 7.204308797127468, "step": 100320}, {"loss": 0.5513, "grad_norm": 1.1662230491638184, "learning_rate": 0.0002, "epoch": 7.205026929982047, "step": 100330}, {"loss": 0.5178, "grad_norm": 1.0671597719192505, "learning_rate": 0.0002, "epoch": 7.205745062836625, "step": 100340}, {"loss": 0.5157, "grad_norm": 1.6219303607940674, "learning_rate": 0.0002, "epoch": 7.206463195691203, "step": 100350}, {"loss": 0.5404, "grad_norm": 1.098658561706543, "learning_rate": 0.0002, "epoch": 7.207181328545781, "step": 100360}, {"loss": 0.5247, "grad_norm": 1.1623865365982056, "learning_rate": 0.0002, "epoch": 7.207899461400359, "step": 100370}, {"loss": 0.51, "grad_norm": 0.9317528009414673, "learning_rate": 0.0002, "epoch": 7.208617594254937, "step": 100380}, {"loss": 0.5142, "grad_norm": 1.1576400995254517, "learning_rate": 0.0002, "epoch": 7.209335727109515, "step": 100390}, {"loss": 0.5276, "grad_norm": 1.111785888671875, "learning_rate": 0.0002, "epoch": 7.210053859964093, "step": 100400}, {"loss": 0.5607, "grad_norm": 1.0347126722335815, "learning_rate": 0.0002, "epoch": 7.210771992818671, "step": 100410}, {"loss": 0.5527, "grad_norm": 1.2763441801071167, "learning_rate": 0.0002, "epoch": 7.211490125673249, "step": 100420}, {"loss": 0.4983, "grad_norm": 1.4479249715805054, "learning_rate": 0.0002, "epoch": 7.212208258527828, "step": 100430}, {"loss": 0.493, "grad_norm": 1.0243892669677734, "learning_rate": 0.0002, "epoch": 7.212926391382406, "step": 100440}, {"loss": 0.5152, "grad_norm": 1.099047064781189, "learning_rate": 0.0002, "epoch": 7.213644524236984, "step": 100450}, {"loss": 0.5124, "grad_norm": 0.9364129900932312, "learning_rate": 0.0002, "epoch": 7.214362657091562, "step": 100460}, {"loss": 0.5462, "grad_norm": 0.9328993558883667, "learning_rate": 0.0002, "epoch": 7.21508078994614, "step": 100470}, {"loss": 0.5446, "grad_norm": 1.336569905281067, "learning_rate": 0.0002, "epoch": 7.215798922800718, "step": 100480}, {"loss": 0.5545, "grad_norm": 1.090484380722046, "learning_rate": 0.0002, "epoch": 7.216517055655296, "step": 100490}, {"loss": 0.5563, "grad_norm": 0.8246992826461792, "learning_rate": 0.0002, "epoch": 7.217235188509874, "step": 100500}, {"loss": 0.5427, "grad_norm": 1.1569660902023315, "learning_rate": 0.0002, "epoch": 7.217953321364452, "step": 100510}, {"loss": 0.4926, "grad_norm": 0.9871801733970642, "learning_rate": 0.0002, "epoch": 7.218671454219031, "step": 100520}, {"loss": 0.52, "grad_norm": 0.9819903373718262, "learning_rate": 0.0002, "epoch": 7.219389587073609, "step": 100530}, {"loss": 0.4942, "grad_norm": 1.251344919204712, "learning_rate": 0.0002, "epoch": 7.220107719928187, "step": 100540}, {"loss": 0.5344, "grad_norm": 1.2649824619293213, "learning_rate": 0.0002, "epoch": 7.220825852782765, "step": 100550}, {"loss": 0.5205, "grad_norm": 1.1401978731155396, "learning_rate": 0.0002, "epoch": 7.221543985637343, "step": 100560}, {"loss": 0.5384, "grad_norm": 1.1615785360336304, "learning_rate": 0.0002, "epoch": 7.222262118491921, "step": 100570}, {"loss": 0.5467, "grad_norm": 1.1743568181991577, "learning_rate": 0.0002, "epoch": 7.222980251346499, "step": 100580}, {"loss": 0.5526, "grad_norm": 1.1526521444320679, "learning_rate": 0.0002, "epoch": 7.223698384201077, "step": 100590}, {"loss": 0.5571, "grad_norm": 1.1919556856155396, "learning_rate": 0.0002, "epoch": 7.224416517055655, "step": 100600}, {"loss": 0.5372, "grad_norm": 1.1855655908584595, "learning_rate": 0.0002, "epoch": 7.225134649910234, "step": 100610}, {"loss": 0.5372, "grad_norm": 1.1512478590011597, "learning_rate": 0.0002, "epoch": 7.225852782764812, "step": 100620}, {"loss": 0.5179, "grad_norm": 0.8307192325592041, "learning_rate": 0.0002, "epoch": 7.22657091561939, "step": 100630}, {"loss": 0.5591, "grad_norm": 1.269504189491272, "learning_rate": 0.0002, "epoch": 7.227289048473968, "step": 100640}, {"loss": 0.5256, "grad_norm": 1.2145130634307861, "learning_rate": 0.0002, "epoch": 7.228007181328546, "step": 100650}, {"loss": 0.5407, "grad_norm": 1.0325201749801636, "learning_rate": 0.0002, "epoch": 7.228725314183124, "step": 100660}, {"loss": 0.5081, "grad_norm": 0.9242451190948486, "learning_rate": 0.0002, "epoch": 7.229443447037702, "step": 100670}, {"loss": 0.4692, "grad_norm": 1.3832745552062988, "learning_rate": 0.0002, "epoch": 7.23016157989228, "step": 100680}, {"loss": 0.519, "grad_norm": 0.9716517925262451, "learning_rate": 0.0002, "epoch": 7.230879712746858, "step": 100690}, {"loss": 0.5359, "grad_norm": 1.0162315368652344, "learning_rate": 0.0002, "epoch": 7.231597845601437, "step": 100700}, {"loss": 0.5363, "grad_norm": 1.1335854530334473, "learning_rate": 0.0002, "epoch": 7.232315978456015, "step": 100710}, {"loss": 0.5136, "grad_norm": 0.9655877947807312, "learning_rate": 0.0002, "epoch": 7.233034111310593, "step": 100720}, {"loss": 0.5621, "grad_norm": 1.373853087425232, "learning_rate": 0.0002, "epoch": 7.233752244165171, "step": 100730}, {"loss": 0.5577, "grad_norm": 1.14335298538208, "learning_rate": 0.0002, "epoch": 7.234470377019749, "step": 100740}, {"loss": 0.5305, "grad_norm": 1.0966235399246216, "learning_rate": 0.0002, "epoch": 7.235188509874327, "step": 100750}, {"loss": 0.5865, "grad_norm": 1.1448538303375244, "learning_rate": 0.0002, "epoch": 7.235906642728905, "step": 100760}, {"loss": 0.5756, "grad_norm": 1.431077003479004, "learning_rate": 0.0002, "epoch": 7.236624775583483, "step": 100770}, {"loss": 0.5334, "grad_norm": 1.148725986480713, "learning_rate": 0.0002, "epoch": 7.237342908438061, "step": 100780}, {"loss": 0.539, "grad_norm": 1.2375414371490479, "learning_rate": 0.0002, "epoch": 7.238061041292639, "step": 100790}, {"loss": 0.526, "grad_norm": 1.0722655057907104, "learning_rate": 0.0002, "epoch": 7.238779174147218, "step": 100800}, {"loss": 0.5255, "grad_norm": 1.1120193004608154, "learning_rate": 0.0002, "epoch": 7.239497307001796, "step": 100810}, {"loss": 0.5304, "grad_norm": 1.1200876235961914, "learning_rate": 0.0002, "epoch": 7.240215439856374, "step": 100820}, {"loss": 0.5537, "grad_norm": 0.9498430490493774, "learning_rate": 0.0002, "epoch": 7.240933572710952, "step": 100830}, {"loss": 0.5844, "grad_norm": 1.0005161762237549, "learning_rate": 0.0002, "epoch": 7.24165170556553, "step": 100840}, {"loss": 0.5393, "grad_norm": 1.1116056442260742, "learning_rate": 0.0002, "epoch": 7.242369838420108, "step": 100850}, {"loss": 0.5064, "grad_norm": 1.2970526218414307, "learning_rate": 0.0002, "epoch": 7.243087971274686, "step": 100860}, {"loss": 0.5245, "grad_norm": 0.9523774981498718, "learning_rate": 0.0002, "epoch": 7.243806104129264, "step": 100870}, {"loss": 0.5059, "grad_norm": 1.0484211444854736, "learning_rate": 0.0002, "epoch": 7.244524236983842, "step": 100880}, {"loss": 0.5118, "grad_norm": 1.2013362646102905, "learning_rate": 0.0002, "epoch": 7.2452423698384205, "step": 100890}, {"loss": 0.5744, "grad_norm": 1.0352288484573364, "learning_rate": 0.0002, "epoch": 7.2459605026929985, "step": 100900}, {"loss": 0.535, "grad_norm": 1.2752721309661865, "learning_rate": 0.0002, "epoch": 7.2466786355475765, "step": 100910}, {"loss": 0.5401, "grad_norm": 0.9587982892990112, "learning_rate": 0.0002, "epoch": 7.2473967684021545, "step": 100920}, {"loss": 0.5751, "grad_norm": 1.57708740234375, "learning_rate": 0.0002, "epoch": 7.2481149012567325, "step": 100930}, {"loss": 0.5068, "grad_norm": 1.1802852153778076, "learning_rate": 0.0002, "epoch": 7.2488330341113105, "step": 100940}, {"loss": 0.5178, "grad_norm": 1.192427396774292, "learning_rate": 0.0002, "epoch": 7.2495511669658885, "step": 100950}, {"loss": 0.526, "grad_norm": 1.138766884803772, "learning_rate": 0.0002, "epoch": 7.2502692998204665, "step": 100960}, {"loss": 0.5322, "grad_norm": 1.1480544805526733, "learning_rate": 0.0002, "epoch": 7.2509874326750445, "step": 100970}, {"loss": 0.5247, "grad_norm": 1.096941351890564, "learning_rate": 0.0002, "epoch": 7.2517055655296225, "step": 100980}, {"loss": 0.5767, "grad_norm": 1.16941499710083, "learning_rate": 0.0002, "epoch": 7.252423698384201, "step": 100990}, {"loss": 0.5308, "grad_norm": 1.138398289680481, "learning_rate": 0.0002, "epoch": 7.253141831238779, "step": 101000}, {"loss": 0.5366, "grad_norm": 0.9534326791763306, "learning_rate": 0.0002, "epoch": 7.253859964093357, "step": 101010}, {"loss": 0.5659, "grad_norm": 1.2834177017211914, "learning_rate": 0.0002, "epoch": 7.254578096947935, "step": 101020}, {"loss": 0.5326, "grad_norm": 1.0083826780319214, "learning_rate": 0.0002, "epoch": 7.255296229802513, "step": 101030}, {"loss": 0.5128, "grad_norm": 0.8869968056678772, "learning_rate": 0.0002, "epoch": 7.256014362657091, "step": 101040}, {"loss": 0.528, "grad_norm": 1.1779630184173584, "learning_rate": 0.0002, "epoch": 7.256732495511669, "step": 101050}, {"loss": 0.5422, "grad_norm": 0.9937887787818909, "learning_rate": 0.0002, "epoch": 7.257450628366247, "step": 101060}, {"loss": 0.5799, "grad_norm": 0.9739404916763306, "learning_rate": 0.0002, "epoch": 7.258168761220825, "step": 101070}, {"loss": 0.5833, "grad_norm": 0.9721621870994568, "learning_rate": 0.0002, "epoch": 7.258886894075404, "step": 101080}, {"loss": 0.5711, "grad_norm": 1.0670732259750366, "learning_rate": 0.0002, "epoch": 7.259605026929982, "step": 101090}, {"loss": 0.5656, "grad_norm": 1.0157248973846436, "learning_rate": 0.0002, "epoch": 7.26032315978456, "step": 101100}, {"loss": 0.5114, "grad_norm": 0.6791224479675293, "learning_rate": 0.0002, "epoch": 7.261041292639138, "step": 101110}, {"loss": 0.5095, "grad_norm": 1.168717622756958, "learning_rate": 0.0002, "epoch": 7.261759425493716, "step": 101120}, {"loss": 0.5926, "grad_norm": 1.1143511533737183, "learning_rate": 0.0002, "epoch": 7.262477558348294, "step": 101130}, {"loss": 0.5454, "grad_norm": 1.088230013847351, "learning_rate": 0.0002, "epoch": 7.263195691202872, "step": 101140}, {"loss": 0.5116, "grad_norm": 1.1834399700164795, "learning_rate": 0.0002, "epoch": 7.26391382405745, "step": 101150}, {"loss": 0.5519, "grad_norm": 1.0157420635223389, "learning_rate": 0.0002, "epoch": 7.264631956912028, "step": 101160}, {"loss": 0.5379, "grad_norm": 1.103623390197754, "learning_rate": 0.0002, "epoch": 7.265350089766607, "step": 101170}, {"loss": 0.5621, "grad_norm": 1.2007834911346436, "learning_rate": 0.0002, "epoch": 7.266068222621185, "step": 101180}, {"loss": 0.4982, "grad_norm": 1.204030156135559, "learning_rate": 0.0002, "epoch": 7.266786355475763, "step": 101190}, {"loss": 0.5361, "grad_norm": 1.0954475402832031, "learning_rate": 0.0002, "epoch": 7.267504488330341, "step": 101200}, {"loss": 0.5657, "grad_norm": 1.0195337533950806, "learning_rate": 0.0002, "epoch": 7.268222621184919, "step": 101210}, {"loss": 0.5176, "grad_norm": 1.0377559661865234, "learning_rate": 0.0002, "epoch": 7.268940754039497, "step": 101220}, {"loss": 0.5177, "grad_norm": 1.1147254705429077, "learning_rate": 0.0002, "epoch": 7.269658886894075, "step": 101230}, {"loss": 0.547, "grad_norm": 1.0451658964157104, "learning_rate": 0.0002, "epoch": 7.270377019748653, "step": 101240}, {"loss": 0.5045, "grad_norm": 1.2418344020843506, "learning_rate": 0.0002, "epoch": 7.271095152603231, "step": 101250}, {"loss": 0.545, "grad_norm": 1.100477933883667, "learning_rate": 0.0002, "epoch": 7.27181328545781, "step": 101260}, {"loss": 0.5741, "grad_norm": 1.0112155675888062, "learning_rate": 0.0002, "epoch": 7.272531418312388, "step": 101270}, {"loss": 0.5522, "grad_norm": 1.3673237562179565, "learning_rate": 0.0002, "epoch": 7.273249551166966, "step": 101280}, {"loss": 0.563, "grad_norm": 1.0272409915924072, "learning_rate": 0.0002, "epoch": 7.273967684021544, "step": 101290}, {"loss": 0.515, "grad_norm": 1.1041511297225952, "learning_rate": 0.0002, "epoch": 7.274685816876122, "step": 101300}, {"loss": 0.5555, "grad_norm": 1.1367343664169312, "learning_rate": 0.0002, "epoch": 7.2754039497307, "step": 101310}, {"loss": 0.5595, "grad_norm": 0.936102569103241, "learning_rate": 0.0002, "epoch": 7.276122082585278, "step": 101320}, {"loss": 0.5732, "grad_norm": 1.1409412622451782, "learning_rate": 0.0002, "epoch": 7.276840215439856, "step": 101330}, {"loss": 0.5731, "grad_norm": 1.103954553604126, "learning_rate": 0.0002, "epoch": 7.277558348294434, "step": 101340}, {"loss": 0.5614, "grad_norm": 1.0316593647003174, "learning_rate": 0.0002, "epoch": 7.278276481149012, "step": 101350}, {"loss": 0.5273, "grad_norm": 1.2040457725524902, "learning_rate": 0.0002, "epoch": 7.278994614003591, "step": 101360}, {"loss": 0.5339, "grad_norm": 1.0609431266784668, "learning_rate": 0.0002, "epoch": 7.279712746858169, "step": 101370}, {"loss": 0.5196, "grad_norm": 1.0759286880493164, "learning_rate": 0.0002, "epoch": 7.280430879712747, "step": 101380}, {"loss": 0.495, "grad_norm": 1.128455400466919, "learning_rate": 0.0002, "epoch": 7.281149012567325, "step": 101390}, {"loss": 0.574, "grad_norm": 1.2482393980026245, "learning_rate": 0.0002, "epoch": 7.281867145421903, "step": 101400}, {"loss": 0.5087, "grad_norm": 1.216482400894165, "learning_rate": 0.0002, "epoch": 7.282585278276481, "step": 101410}, {"loss": 0.5262, "grad_norm": 1.1360549926757812, "learning_rate": 0.0002, "epoch": 7.283303411131059, "step": 101420}, {"loss": 0.5385, "grad_norm": 1.1246616840362549, "learning_rate": 0.0002, "epoch": 7.284021543985637, "step": 101430}, {"loss": 0.5394, "grad_norm": 1.2419198751449585, "learning_rate": 0.0002, "epoch": 7.284739676840215, "step": 101440}, {"loss": 0.4876, "grad_norm": 1.169204831123352, "learning_rate": 0.0002, "epoch": 7.285457809694794, "step": 101450}, {"loss": 0.562, "grad_norm": 0.988856852054596, "learning_rate": 0.0002, "epoch": 7.286175942549372, "step": 101460}, {"loss": 0.5678, "grad_norm": 1.0422797203063965, "learning_rate": 0.0002, "epoch": 7.28689407540395, "step": 101470}, {"loss": 0.5188, "grad_norm": 0.9522702097892761, "learning_rate": 0.0002, "epoch": 7.287612208258528, "step": 101480}, {"loss": 0.5611, "grad_norm": 1.2551125288009644, "learning_rate": 0.0002, "epoch": 7.288330341113106, "step": 101490}, {"loss": 0.4991, "grad_norm": 1.4335172176361084, "learning_rate": 0.0002, "epoch": 7.289048473967684, "step": 101500}, {"loss": 0.4907, "grad_norm": 1.1649556159973145, "learning_rate": 0.0002, "epoch": 7.289766606822262, "step": 101510}, {"loss": 0.5433, "grad_norm": 1.1837944984436035, "learning_rate": 0.0002, "epoch": 7.29048473967684, "step": 101520}, {"loss": 0.5456, "grad_norm": 1.1103264093399048, "learning_rate": 0.0002, "epoch": 7.291202872531418, "step": 101530}, {"loss": 0.5623, "grad_norm": 1.0029321908950806, "learning_rate": 0.0002, "epoch": 7.291921005385996, "step": 101540}, {"loss": 0.5334, "grad_norm": 1.1226013898849487, "learning_rate": 0.0002, "epoch": 7.292639138240575, "step": 101550}, {"loss": 0.5532, "grad_norm": 1.368054986000061, "learning_rate": 0.0002, "epoch": 7.293357271095153, "step": 101560}, {"loss": 0.5781, "grad_norm": 1.20630943775177, "learning_rate": 0.0002, "epoch": 7.294075403949731, "step": 101570}, {"loss": 0.6068, "grad_norm": 1.004388689994812, "learning_rate": 0.0002, "epoch": 7.294793536804309, "step": 101580}, {"loss": 0.5245, "grad_norm": 1.029399037361145, "learning_rate": 0.0002, "epoch": 7.295511669658887, "step": 101590}, {"loss": 0.5552, "grad_norm": 1.1087204217910767, "learning_rate": 0.0002, "epoch": 7.296229802513465, "step": 101600}, {"loss": 0.5264, "grad_norm": 1.1086976528167725, "learning_rate": 0.0002, "epoch": 7.296947935368043, "step": 101610}, {"loss": 0.5264, "grad_norm": 1.2080177068710327, "learning_rate": 0.0002, "epoch": 7.297666068222621, "step": 101620}, {"loss": 0.5326, "grad_norm": 1.0005929470062256, "learning_rate": 0.0002, "epoch": 7.298384201077199, "step": 101630}, {"loss": 0.5452, "grad_norm": 1.0818030834197998, "learning_rate": 0.0002, "epoch": 7.299102333931778, "step": 101640}, {"loss": 0.5332, "grad_norm": 1.3539172410964966, "learning_rate": 0.0002, "epoch": 7.299820466786356, "step": 101650}, {"loss": 0.5434, "grad_norm": 1.2323400974273682, "learning_rate": 0.0002, "epoch": 7.300538599640934, "step": 101660}, {"loss": 0.5308, "grad_norm": 1.0842500925064087, "learning_rate": 0.0002, "epoch": 7.301256732495512, "step": 101670}, {"loss": 0.5385, "grad_norm": 1.0156948566436768, "learning_rate": 0.0002, "epoch": 7.30197486535009, "step": 101680}, {"loss": 0.5225, "grad_norm": 0.9736073613166809, "learning_rate": 0.0002, "epoch": 7.302692998204668, "step": 101690}, {"loss": 0.5467, "grad_norm": 1.130902886390686, "learning_rate": 0.0002, "epoch": 7.303411131059246, "step": 101700}, {"loss": 0.5118, "grad_norm": 1.0969539880752563, "learning_rate": 0.0002, "epoch": 7.304129263913824, "step": 101710}, {"loss": 0.5992, "grad_norm": 1.1104915142059326, "learning_rate": 0.0002, "epoch": 7.304847396768402, "step": 101720}, {"loss": 0.5227, "grad_norm": 1.3659855127334595, "learning_rate": 0.0002, "epoch": 7.30556552962298, "step": 101730}, {"loss": 0.56, "grad_norm": 1.1095956563949585, "learning_rate": 0.0002, "epoch": 7.306283662477559, "step": 101740}, {"loss": 0.5553, "grad_norm": 1.1549444198608398, "learning_rate": 0.0002, "epoch": 7.307001795332137, "step": 101750}, {"loss": 0.5197, "grad_norm": 1.0718402862548828, "learning_rate": 0.0002, "epoch": 7.307719928186715, "step": 101760}, {"loss": 0.4963, "grad_norm": 1.151033639907837, "learning_rate": 0.0002, "epoch": 7.308438061041293, "step": 101770}, {"loss": 0.5533, "grad_norm": 0.9531689882278442, "learning_rate": 0.0002, "epoch": 7.309156193895871, "step": 101780}, {"loss": 0.5533, "grad_norm": 1.3025462627410889, "learning_rate": 0.0002, "epoch": 7.309874326750449, "step": 101790}, {"loss": 0.5453, "grad_norm": 1.062644600868225, "learning_rate": 0.0002, "epoch": 7.310592459605027, "step": 101800}, {"loss": 0.5364, "grad_norm": 1.1687922477722168, "learning_rate": 0.0002, "epoch": 7.311310592459605, "step": 101810}, {"loss": 0.5705, "grad_norm": 1.2879260778427124, "learning_rate": 0.0002, "epoch": 7.312028725314184, "step": 101820}, {"loss": 0.5358, "grad_norm": 0.9876636862754822, "learning_rate": 0.0002, "epoch": 7.312746858168762, "step": 101830}, {"loss": 0.5424, "grad_norm": 0.8604402542114258, "learning_rate": 0.0002, "epoch": 7.31346499102334, "step": 101840}, {"loss": 0.4947, "grad_norm": 1.1162822246551514, "learning_rate": 0.0002, "epoch": 7.314183123877918, "step": 101850}, {"loss": 0.4865, "grad_norm": 1.095772624015808, "learning_rate": 0.0002, "epoch": 7.314901256732496, "step": 101860}, {"loss": 0.5175, "grad_norm": 1.0100891590118408, "learning_rate": 0.0002, "epoch": 7.315619389587074, "step": 101870}, {"loss": 0.5223, "grad_norm": 0.9602094888687134, "learning_rate": 0.0002, "epoch": 7.316337522441652, "step": 101880}, {"loss": 0.5379, "grad_norm": 1.2045155763626099, "learning_rate": 0.0002, "epoch": 7.31705565529623, "step": 101890}, {"loss": 0.5607, "grad_norm": 1.014012098312378, "learning_rate": 0.0002, "epoch": 7.317773788150808, "step": 101900}, {"loss": 0.5184, "grad_norm": 1.0581108331680298, "learning_rate": 0.0002, "epoch": 7.318491921005386, "step": 101910}, {"loss": 0.5088, "grad_norm": 0.9462026953697205, "learning_rate": 0.0002, "epoch": 7.3192100538599645, "step": 101920}, {"loss": 0.5253, "grad_norm": 1.0593115091323853, "learning_rate": 0.0002, "epoch": 7.3199281867145425, "step": 101930}, {"loss": 0.5499, "grad_norm": 1.1326113939285278, "learning_rate": 0.0002, "epoch": 7.3206463195691205, "step": 101940}, {"loss": 0.5418, "grad_norm": 0.933236300945282, "learning_rate": 0.0002, "epoch": 7.3213644524236985, "step": 101950}, {"loss": 0.5267, "grad_norm": 0.9311601519584656, "learning_rate": 0.0002, "epoch": 7.3220825852782765, "step": 101960}, {"loss": 0.5146, "grad_norm": 1.2303248643875122, "learning_rate": 0.0002, "epoch": 7.3228007181328545, "step": 101970}, {"loss": 0.4947, "grad_norm": 1.1904213428497314, "learning_rate": 0.0002, "epoch": 7.3235188509874325, "step": 101980}, {"loss": 0.5409, "grad_norm": 1.281388759613037, "learning_rate": 0.0002, "epoch": 7.3242369838420105, "step": 101990}, {"loss": 0.5765, "grad_norm": 1.0551466941833496, "learning_rate": 0.0002, "epoch": 7.3249551166965885, "step": 102000}, {"loss": 0.539, "grad_norm": 1.3299282789230347, "learning_rate": 0.0002, "epoch": 7.325673249551167, "step": 102010}, {"loss": 0.5212, "grad_norm": 1.2172462940216064, "learning_rate": 0.0002, "epoch": 7.326391382405745, "step": 102020}, {"loss": 0.5308, "grad_norm": 1.0828213691711426, "learning_rate": 0.0002, "epoch": 7.327109515260323, "step": 102030}, {"loss": 0.5355, "grad_norm": 1.336836338043213, "learning_rate": 0.0002, "epoch": 7.327827648114901, "step": 102040}, {"loss": 0.5458, "grad_norm": 1.1681890487670898, "learning_rate": 0.0002, "epoch": 7.328545780969479, "step": 102050}, {"loss": 0.5227, "grad_norm": 0.9713141918182373, "learning_rate": 0.0002, "epoch": 7.329263913824057, "step": 102060}, {"loss": 0.5543, "grad_norm": 0.919150710105896, "learning_rate": 0.0002, "epoch": 7.329982046678635, "step": 102070}, {"loss": 0.5241, "grad_norm": 1.1288635730743408, "learning_rate": 0.0002, "epoch": 7.330700179533213, "step": 102080}, {"loss": 0.5273, "grad_norm": 1.1016335487365723, "learning_rate": 0.0002, "epoch": 7.331418312387791, "step": 102090}, {"loss": 0.5234, "grad_norm": 0.8584099411964417, "learning_rate": 0.0002, "epoch": 7.332136445242369, "step": 102100}, {"loss": 0.5383, "grad_norm": 1.1394617557525635, "learning_rate": 0.0002, "epoch": 7.332854578096948, "step": 102110}, {"loss": 0.573, "grad_norm": 1.0681827068328857, "learning_rate": 0.0002, "epoch": 7.333572710951526, "step": 102120}, {"loss": 0.5049, "grad_norm": 1.1277847290039062, "learning_rate": 0.0002, "epoch": 7.334290843806104, "step": 102130}, {"loss": 0.5124, "grad_norm": 1.093695044517517, "learning_rate": 0.0002, "epoch": 7.335008976660682, "step": 102140}, {"loss": 0.5298, "grad_norm": 1.2288036346435547, "learning_rate": 0.0002, "epoch": 7.33572710951526, "step": 102150}, {"loss": 0.5564, "grad_norm": 1.0734258890151978, "learning_rate": 0.0002, "epoch": 7.336445242369838, "step": 102160}, {"loss": 0.5119, "grad_norm": 1.1947388648986816, "learning_rate": 0.0002, "epoch": 7.337163375224416, "step": 102170}, {"loss": 0.5718, "grad_norm": 0.9444851279258728, "learning_rate": 0.0002, "epoch": 7.337881508078994, "step": 102180}, {"loss": 0.5298, "grad_norm": 1.0540008544921875, "learning_rate": 0.0002, "epoch": 7.338599640933572, "step": 102190}, {"loss": 0.5039, "grad_norm": 1.1238518953323364, "learning_rate": 0.0002, "epoch": 7.339317773788151, "step": 102200}, {"loss": 0.5599, "grad_norm": 1.129989743232727, "learning_rate": 0.0002, "epoch": 7.340035906642729, "step": 102210}, {"loss": 0.5158, "grad_norm": 0.8847355842590332, "learning_rate": 0.0002, "epoch": 7.340754039497307, "step": 102220}, {"loss": 0.5429, "grad_norm": 1.1628837585449219, "learning_rate": 0.0002, "epoch": 7.341472172351885, "step": 102230}, {"loss": 0.533, "grad_norm": 1.1139917373657227, "learning_rate": 0.0002, "epoch": 7.342190305206463, "step": 102240}, {"loss": 0.5758, "grad_norm": 1.113997220993042, "learning_rate": 0.0002, "epoch": 7.342908438061041, "step": 102250}, {"loss": 0.5468, "grad_norm": 1.2163578271865845, "learning_rate": 0.0002, "epoch": 7.343626570915619, "step": 102260}, {"loss": 0.5417, "grad_norm": 1.0641776323318481, "learning_rate": 0.0002, "epoch": 7.344344703770197, "step": 102270}, {"loss": 0.5349, "grad_norm": 1.2397149801254272, "learning_rate": 0.0002, "epoch": 7.345062836624775, "step": 102280}, {"loss": 0.5697, "grad_norm": 1.3043087720870972, "learning_rate": 0.0002, "epoch": 7.345780969479353, "step": 102290}, {"loss": 0.5203, "grad_norm": 1.0568885803222656, "learning_rate": 0.0002, "epoch": 7.346499102333932, "step": 102300}, {"loss": 0.5307, "grad_norm": 1.1168477535247803, "learning_rate": 0.0002, "epoch": 7.34721723518851, "step": 102310}, {"loss": 0.5357, "grad_norm": 1.0510926246643066, "learning_rate": 0.0002, "epoch": 7.347935368043088, "step": 102320}, {"loss": 0.557, "grad_norm": 1.0340518951416016, "learning_rate": 0.0002, "epoch": 7.348653500897666, "step": 102330}, {"loss": 0.5381, "grad_norm": 1.0256576538085938, "learning_rate": 0.0002, "epoch": 7.349371633752244, "step": 102340}, {"loss": 0.5382, "grad_norm": 1.1578398942947388, "learning_rate": 0.0002, "epoch": 7.350089766606822, "step": 102350}, {"loss": 0.5265, "grad_norm": 0.9840098023414612, "learning_rate": 0.0002, "epoch": 7.3508078994614, "step": 102360}, {"loss": 0.5525, "grad_norm": 1.1200997829437256, "learning_rate": 0.0002, "epoch": 7.351526032315978, "step": 102370}, {"loss": 0.5396, "grad_norm": 1.3507630825042725, "learning_rate": 0.0002, "epoch": 7.352244165170557, "step": 102380}, {"loss": 0.5486, "grad_norm": 1.156908631324768, "learning_rate": 0.0002, "epoch": 7.352962298025135, "step": 102390}, {"loss": 0.5111, "grad_norm": 1.2381980419158936, "learning_rate": 0.0002, "epoch": 7.353680430879713, "step": 102400}, {"loss": 0.5405, "grad_norm": 1.2751537561416626, "learning_rate": 0.0002, "epoch": 7.354398563734291, "step": 102410}, {"loss": 0.5659, "grad_norm": 1.2542656660079956, "learning_rate": 0.0002, "epoch": 7.355116696588869, "step": 102420}, {"loss": 0.537, "grad_norm": 1.1342339515686035, "learning_rate": 0.0002, "epoch": 7.355834829443447, "step": 102430}, {"loss": 0.5698, "grad_norm": 1.1476532220840454, "learning_rate": 0.0002, "epoch": 7.356552962298025, "step": 102440}, {"loss": 0.5952, "grad_norm": 1.0370854139328003, "learning_rate": 0.0002, "epoch": 7.357271095152603, "step": 102450}, {"loss": 0.5693, "grad_norm": 1.137521505355835, "learning_rate": 0.0002, "epoch": 7.357989228007181, "step": 102460}, {"loss": 0.5792, "grad_norm": 1.1226446628570557, "learning_rate": 0.0002, "epoch": 7.358707360861759, "step": 102470}, {"loss": 0.5395, "grad_norm": 0.975045382976532, "learning_rate": 0.0002, "epoch": 7.359425493716338, "step": 102480}, {"loss": 0.5589, "grad_norm": 1.0371936559677124, "learning_rate": 0.0002, "epoch": 7.360143626570916, "step": 102490}, {"loss": 0.5502, "grad_norm": 1.264593482017517, "learning_rate": 0.0002, "epoch": 7.360861759425494, "step": 102500}, {"loss": 0.5315, "grad_norm": 1.2820146083831787, "learning_rate": 0.0002, "epoch": 7.361579892280072, "step": 102510}, {"loss": 0.5531, "grad_norm": 1.3086479902267456, "learning_rate": 0.0002, "epoch": 7.36229802513465, "step": 102520}, {"loss": 0.5438, "grad_norm": 1.1097291707992554, "learning_rate": 0.0002, "epoch": 7.363016157989228, "step": 102530}, {"loss": 0.5208, "grad_norm": 1.3544751405715942, "learning_rate": 0.0002, "epoch": 7.363734290843806, "step": 102540}, {"loss": 0.5441, "grad_norm": 1.2640280723571777, "learning_rate": 0.0002, "epoch": 7.364452423698384, "step": 102550}, {"loss": 0.5655, "grad_norm": 0.932267963886261, "learning_rate": 0.0002, "epoch": 7.365170556552962, "step": 102560}, {"loss": 0.5, "grad_norm": 1.259298324584961, "learning_rate": 0.0002, "epoch": 7.365888689407541, "step": 102570}, {"loss": 0.5067, "grad_norm": 1.0883609056472778, "learning_rate": 0.0002, "epoch": 7.366606822262119, "step": 102580}, {"loss": 0.5719, "grad_norm": 1.5364124774932861, "learning_rate": 0.0002, "epoch": 7.367324955116697, "step": 102590}, {"loss": 0.5429, "grad_norm": 1.2528936862945557, "learning_rate": 0.0002, "epoch": 7.368043087971275, "step": 102600}, {"loss": 0.5283, "grad_norm": 0.9821929335594177, "learning_rate": 0.0002, "epoch": 7.368761220825853, "step": 102610}, {"loss": 0.5553, "grad_norm": 1.284264326095581, "learning_rate": 0.0002, "epoch": 7.369479353680431, "step": 102620}, {"loss": 0.5027, "grad_norm": 0.941703736782074, "learning_rate": 0.0002, "epoch": 7.370197486535009, "step": 102630}, {"loss": 0.5133, "grad_norm": 1.121385931968689, "learning_rate": 0.0002, "epoch": 7.370915619389587, "step": 102640}, {"loss": 0.5602, "grad_norm": 1.0397694110870361, "learning_rate": 0.0002, "epoch": 7.371633752244165, "step": 102650}, {"loss": 0.5267, "grad_norm": 1.0811786651611328, "learning_rate": 0.0002, "epoch": 7.372351885098743, "step": 102660}, {"loss": 0.5628, "grad_norm": 1.2080687284469604, "learning_rate": 0.0002, "epoch": 7.373070017953322, "step": 102670}, {"loss": 0.5475, "grad_norm": 1.0456428527832031, "learning_rate": 0.0002, "epoch": 7.3737881508079, "step": 102680}, {"loss": 0.5406, "grad_norm": 1.1772913932800293, "learning_rate": 0.0002, "epoch": 7.374506283662478, "step": 102690}, {"loss": 0.5528, "grad_norm": 1.209205150604248, "learning_rate": 0.0002, "epoch": 7.375224416517056, "step": 102700}, {"loss": 0.5432, "grad_norm": 1.220784068107605, "learning_rate": 0.0002, "epoch": 7.375942549371634, "step": 102710}, {"loss": 0.5084, "grad_norm": 1.0235114097595215, "learning_rate": 0.0002, "epoch": 7.376660682226212, "step": 102720}, {"loss": 0.5561, "grad_norm": 1.13937246799469, "learning_rate": 0.0002, "epoch": 7.37737881508079, "step": 102730}, {"loss": 0.5711, "grad_norm": 1.1369940042495728, "learning_rate": 0.0002, "epoch": 7.378096947935368, "step": 102740}, {"loss": 0.5559, "grad_norm": 0.9204146265983582, "learning_rate": 0.0002, "epoch": 7.378815080789946, "step": 102750}, {"loss": 0.5524, "grad_norm": 1.0428136587142944, "learning_rate": 0.0002, "epoch": 7.379533213644525, "step": 102760}, {"loss": 0.5428, "grad_norm": 1.3043127059936523, "learning_rate": 0.0002, "epoch": 7.380251346499103, "step": 102770}, {"loss": 0.5217, "grad_norm": 1.1984827518463135, "learning_rate": 0.0002, "epoch": 7.380969479353681, "step": 102780}, {"loss": 0.5534, "grad_norm": 1.169627070426941, "learning_rate": 0.0002, "epoch": 7.381687612208259, "step": 102790}, {"loss": 0.5194, "grad_norm": 0.9647679924964905, "learning_rate": 0.0002, "epoch": 7.382405745062837, "step": 102800}, {"loss": 0.5697, "grad_norm": 1.1284246444702148, "learning_rate": 0.0002, "epoch": 7.383123877917415, "step": 102810}, {"loss": 0.5203, "grad_norm": 0.9789248704910278, "learning_rate": 0.0002, "epoch": 7.383842010771993, "step": 102820}, {"loss": 0.545, "grad_norm": 1.191469669342041, "learning_rate": 0.0002, "epoch": 7.384560143626571, "step": 102830}, {"loss": 0.5451, "grad_norm": 1.0203280448913574, "learning_rate": 0.0002, "epoch": 7.385278276481149, "step": 102840}, {"loss": 0.5322, "grad_norm": 1.1877976655960083, "learning_rate": 0.0002, "epoch": 7.385996409335727, "step": 102850}, {"loss": 0.5238, "grad_norm": 1.2310867309570312, "learning_rate": 0.0002, "epoch": 7.3867145421903055, "step": 102860}, {"loss": 0.5864, "grad_norm": 1.0421714782714844, "learning_rate": 0.0002, "epoch": 7.3874326750448835, "step": 102870}, {"loss": 0.5595, "grad_norm": 1.2161095142364502, "learning_rate": 0.0002, "epoch": 7.3881508078994615, "step": 102880}, {"loss": 0.5307, "grad_norm": 0.9794706106185913, "learning_rate": 0.0002, "epoch": 7.3888689407540395, "step": 102890}, {"loss": 0.5399, "grad_norm": 1.2623358964920044, "learning_rate": 0.0002, "epoch": 7.3895870736086176, "step": 102900}, {"loss": 0.5065, "grad_norm": 0.9731680750846863, "learning_rate": 0.0002, "epoch": 7.3903052064631956, "step": 102910}, {"loss": 0.5521, "grad_norm": 1.2712689638137817, "learning_rate": 0.0002, "epoch": 7.3910233393177736, "step": 102920}, {"loss": 0.5296, "grad_norm": 0.9469414949417114, "learning_rate": 0.0002, "epoch": 7.391741472172352, "step": 102930}, {"loss": 0.5252, "grad_norm": 1.238718867301941, "learning_rate": 0.0002, "epoch": 7.3924596050269304, "step": 102940}, {"loss": 0.5143, "grad_norm": 1.262328028678894, "learning_rate": 0.0002, "epoch": 7.3931777378815084, "step": 102950}, {"loss": 0.5146, "grad_norm": 0.9899580478668213, "learning_rate": 0.0002, "epoch": 7.3938958707360865, "step": 102960}, {"loss": 0.5628, "grad_norm": 1.1182234287261963, "learning_rate": 0.0002, "epoch": 7.3946140035906645, "step": 102970}, {"loss": 0.5026, "grad_norm": 1.0213241577148438, "learning_rate": 0.0002, "epoch": 7.3953321364452425, "step": 102980}, {"loss": 0.5255, "grad_norm": 1.3077130317687988, "learning_rate": 0.0002, "epoch": 7.3960502692998205, "step": 102990}, {"loss": 0.54, "grad_norm": 0.8821753263473511, "learning_rate": 0.0002, "epoch": 7.3967684021543985, "step": 103000}, {"loss": 0.5547, "grad_norm": 1.1906793117523193, "learning_rate": 0.0002, "epoch": 7.3974865350089765, "step": 103010}, {"loss": 0.5544, "grad_norm": 0.9587275981903076, "learning_rate": 0.0002, "epoch": 7.3982046678635545, "step": 103020}, {"loss": 0.534, "grad_norm": 1.1806607246398926, "learning_rate": 0.0002, "epoch": 7.3989228007181325, "step": 103030}, {"loss": 0.4866, "grad_norm": 1.0863158702850342, "learning_rate": 0.0002, "epoch": 7.399640933572711, "step": 103040}, {"loss": 0.5455, "grad_norm": 1.3175718784332275, "learning_rate": 0.0002, "epoch": 7.400359066427289, "step": 103050}, {"loss": 0.557, "grad_norm": 1.0932444334030151, "learning_rate": 0.0002, "epoch": 7.401077199281867, "step": 103060}, {"loss": 0.5684, "grad_norm": 1.079542636871338, "learning_rate": 0.0002, "epoch": 7.401795332136445, "step": 103070}, {"loss": 0.5792, "grad_norm": 0.9434978365898132, "learning_rate": 0.0002, "epoch": 7.402513464991023, "step": 103080}, {"loss": 0.5267, "grad_norm": 1.2751423120498657, "learning_rate": 0.0002, "epoch": 7.403231597845601, "step": 103090}, {"loss": 0.5499, "grad_norm": 1.232871413230896, "learning_rate": 0.0002, "epoch": 7.403949730700179, "step": 103100}, {"loss": 0.5247, "grad_norm": 0.9898984432220459, "learning_rate": 0.0002, "epoch": 7.404667863554757, "step": 103110}, {"loss": 0.4788, "grad_norm": 0.8187330961227417, "learning_rate": 0.0002, "epoch": 7.405385996409335, "step": 103120}, {"loss": 0.5316, "grad_norm": 1.0267345905303955, "learning_rate": 0.0002, "epoch": 7.406104129263914, "step": 103130}, {"loss": 0.5209, "grad_norm": 1.018702507019043, "learning_rate": 0.0002, "epoch": 7.406822262118492, "step": 103140}, {"loss": 0.5288, "grad_norm": 1.2904773950576782, "learning_rate": 0.0002, "epoch": 7.40754039497307, "step": 103150}, {"loss": 0.5515, "grad_norm": 1.0485228300094604, "learning_rate": 0.0002, "epoch": 7.408258527827648, "step": 103160}, {"loss": 0.5778, "grad_norm": 1.112001895904541, "learning_rate": 0.0002, "epoch": 7.408976660682226, "step": 103170}, {"loss": 0.5881, "grad_norm": 0.9980560541152954, "learning_rate": 0.0002, "epoch": 7.409694793536804, "step": 103180}, {"loss": 0.5537, "grad_norm": 1.002909541130066, "learning_rate": 0.0002, "epoch": 7.410412926391382, "step": 103190}, {"loss": 0.5463, "grad_norm": 1.2632182836532593, "learning_rate": 0.0002, "epoch": 7.41113105924596, "step": 103200}, {"loss": 0.5731, "grad_norm": 0.8257913589477539, "learning_rate": 0.0002, "epoch": 7.411849192100538, "step": 103210}, {"loss": 0.5323, "grad_norm": 0.9777436852455139, "learning_rate": 0.0002, "epoch": 7.412567324955116, "step": 103220}, {"loss": 0.5396, "grad_norm": 1.1428900957107544, "learning_rate": 0.0002, "epoch": 7.413285457809695, "step": 103230}, {"loss": 0.5458, "grad_norm": 1.2036991119384766, "learning_rate": 0.0002, "epoch": 7.414003590664273, "step": 103240}, {"loss": 0.5478, "grad_norm": 1.0227148532867432, "learning_rate": 0.0002, "epoch": 7.414721723518851, "step": 103250}, {"loss": 0.5345, "grad_norm": 1.160910964012146, "learning_rate": 0.0002, "epoch": 7.415439856373429, "step": 103260}, {"loss": 0.5761, "grad_norm": 1.2486878633499146, "learning_rate": 0.0002, "epoch": 7.416157989228007, "step": 103270}, {"loss": 0.5296, "grad_norm": 0.9630030393600464, "learning_rate": 0.0002, "epoch": 7.416876122082585, "step": 103280}, {"loss": 0.5487, "grad_norm": 1.4181947708129883, "learning_rate": 0.0002, "epoch": 7.417594254937163, "step": 103290}, {"loss": 0.5168, "grad_norm": 1.173350214958191, "learning_rate": 0.0002, "epoch": 7.418312387791741, "step": 103300}, {"loss": 0.5549, "grad_norm": 1.2790213823318481, "learning_rate": 0.0002, "epoch": 7.419030520646319, "step": 103310}, {"loss": 0.5557, "grad_norm": 1.3033418655395508, "learning_rate": 0.0002, "epoch": 7.419748653500898, "step": 103320}, {"loss": 0.558, "grad_norm": 1.1796131134033203, "learning_rate": 0.0002, "epoch": 7.420466786355476, "step": 103330}, {"loss": 0.5436, "grad_norm": 1.2483408451080322, "learning_rate": 0.0002, "epoch": 7.421184919210054, "step": 103340}, {"loss": 0.5103, "grad_norm": 1.174924373626709, "learning_rate": 0.0002, "epoch": 7.421903052064632, "step": 103350}, {"loss": 0.5249, "grad_norm": 0.9597971439361572, "learning_rate": 0.0002, "epoch": 7.42262118491921, "step": 103360}, {"loss": 0.5652, "grad_norm": 1.029307246208191, "learning_rate": 0.0002, "epoch": 7.423339317773788, "step": 103370}, {"loss": 0.5484, "grad_norm": 1.2511323690414429, "learning_rate": 0.0002, "epoch": 7.424057450628366, "step": 103380}, {"loss": 0.5315, "grad_norm": 0.9973678588867188, "learning_rate": 0.0002, "epoch": 7.424775583482944, "step": 103390}, {"loss": 0.5346, "grad_norm": 1.248966932296753, "learning_rate": 0.0002, "epoch": 7.425493716337522, "step": 103400}, {"loss": 0.5386, "grad_norm": 1.1157349348068237, "learning_rate": 0.0002, "epoch": 7.4262118491921, "step": 103410}, {"loss": 0.5498, "grad_norm": 1.268991470336914, "learning_rate": 0.0002, "epoch": 7.426929982046679, "step": 103420}, {"loss": 0.5492, "grad_norm": 1.163036823272705, "learning_rate": 0.0002, "epoch": 7.427648114901257, "step": 103430}, {"loss": 0.5246, "grad_norm": 1.136313796043396, "learning_rate": 0.0002, "epoch": 7.428366247755835, "step": 103440}, {"loss": 0.5586, "grad_norm": 1.3698488473892212, "learning_rate": 0.0002, "epoch": 7.429084380610413, "step": 103450}, {"loss": 0.5405, "grad_norm": 1.136257290840149, "learning_rate": 0.0002, "epoch": 7.429802513464991, "step": 103460}, {"loss": 0.5278, "grad_norm": 1.236160397529602, "learning_rate": 0.0002, "epoch": 7.430520646319569, "step": 103470}, {"loss": 0.5341, "grad_norm": 1.1289445161819458, "learning_rate": 0.0002, "epoch": 7.431238779174147, "step": 103480}, {"loss": 0.5168, "grad_norm": 1.197693943977356, "learning_rate": 0.0002, "epoch": 7.431956912028725, "step": 103490}, {"loss": 0.5645, "grad_norm": 1.2970328330993652, "learning_rate": 0.0002, "epoch": 7.432675044883303, "step": 103500}, {"loss": 0.5763, "grad_norm": 1.1042685508728027, "learning_rate": 0.0002, "epoch": 7.433393177737882, "step": 103510}, {"loss": 0.5959, "grad_norm": 1.1035256385803223, "learning_rate": 0.0002, "epoch": 7.43411131059246, "step": 103520}, {"loss": 0.5472, "grad_norm": 1.210533618927002, "learning_rate": 0.0002, "epoch": 7.434829443447038, "step": 103530}, {"loss": 0.5242, "grad_norm": 1.0207868814468384, "learning_rate": 0.0002, "epoch": 7.435547576301616, "step": 103540}, {"loss": 0.4768, "grad_norm": 1.023432970046997, "learning_rate": 0.0002, "epoch": 7.436265709156194, "step": 103550}, {"loss": 0.5641, "grad_norm": 1.1517932415008545, "learning_rate": 0.0002, "epoch": 7.436983842010772, "step": 103560}, {"loss": 0.4931, "grad_norm": 1.2798852920532227, "learning_rate": 0.0002, "epoch": 7.43770197486535, "step": 103570}, {"loss": 0.5622, "grad_norm": 0.9245955348014832, "learning_rate": 0.0002, "epoch": 7.438420107719928, "step": 103580}, {"loss": 0.5333, "grad_norm": 1.0329653024673462, "learning_rate": 0.0002, "epoch": 7.439138240574506, "step": 103590}, {"loss": 0.5558, "grad_norm": 0.9156534671783447, "learning_rate": 0.0002, "epoch": 7.439856373429085, "step": 103600}, {"loss": 0.5469, "grad_norm": 1.0112179517745972, "learning_rate": 0.0002, "epoch": 7.440574506283663, "step": 103610}, {"loss": 0.5172, "grad_norm": 1.0597492456436157, "learning_rate": 0.0002, "epoch": 7.441292639138241, "step": 103620}, {"loss": 0.5406, "grad_norm": 1.0997483730316162, "learning_rate": 0.0002, "epoch": 7.442010771992819, "step": 103630}, {"loss": 0.5415, "grad_norm": 1.0250455141067505, "learning_rate": 0.0002, "epoch": 7.442728904847397, "step": 103640}, {"loss": 0.5897, "grad_norm": 1.0806883573532104, "learning_rate": 0.0002, "epoch": 7.443447037701975, "step": 103650}, {"loss": 0.5336, "grad_norm": 1.2387017011642456, "learning_rate": 0.0002, "epoch": 7.444165170556553, "step": 103660}, {"loss": 0.5084, "grad_norm": 1.0246366262435913, "learning_rate": 0.0002, "epoch": 7.444883303411131, "step": 103670}, {"loss": 0.5586, "grad_norm": 1.071362853050232, "learning_rate": 0.0002, "epoch": 7.445601436265709, "step": 103680}, {"loss": 0.5296, "grad_norm": 1.1581261157989502, "learning_rate": 0.0002, "epoch": 7.446319569120288, "step": 103690}, {"loss": 0.5622, "grad_norm": 1.1136809587478638, "learning_rate": 0.0002, "epoch": 7.447037701974866, "step": 103700}, {"loss": 0.5665, "grad_norm": 1.3133236169815063, "learning_rate": 0.0002, "epoch": 7.447755834829444, "step": 103710}, {"loss": 0.5433, "grad_norm": 1.163678765296936, "learning_rate": 0.0002, "epoch": 7.448473967684022, "step": 103720}, {"loss": 0.5397, "grad_norm": 1.121063232421875, "learning_rate": 0.0002, "epoch": 7.4491921005386, "step": 103730}, {"loss": 0.541, "grad_norm": 1.1806761026382446, "learning_rate": 0.0002, "epoch": 7.449910233393178, "step": 103740}, {"loss": 0.5757, "grad_norm": 0.9124397039413452, "learning_rate": 0.0002, "epoch": 7.450628366247756, "step": 103750}, {"loss": 0.5288, "grad_norm": 1.0819965600967407, "learning_rate": 0.0002, "epoch": 7.451346499102334, "step": 103760}, {"loss": 0.5307, "grad_norm": 1.260360836982727, "learning_rate": 0.0002, "epoch": 7.452064631956912, "step": 103770}, {"loss": 0.5322, "grad_norm": 1.3185076713562012, "learning_rate": 0.0002, "epoch": 7.45278276481149, "step": 103780}, {"loss": 0.5146, "grad_norm": 1.182569146156311, "learning_rate": 0.0002, "epoch": 7.453500897666069, "step": 103790}, {"loss": 0.5678, "grad_norm": 1.42801034450531, "learning_rate": 0.0002, "epoch": 7.454219030520647, "step": 103800}, {"loss": 0.5685, "grad_norm": 1.1232067346572876, "learning_rate": 0.0002, "epoch": 7.454937163375225, "step": 103810}, {"loss": 0.5321, "grad_norm": 0.9760740399360657, "learning_rate": 0.0002, "epoch": 7.455655296229803, "step": 103820}, {"loss": 0.5533, "grad_norm": 1.1086724996566772, "learning_rate": 0.0002, "epoch": 7.456373429084381, "step": 103830}, {"loss": 0.5383, "grad_norm": 1.293244481086731, "learning_rate": 0.0002, "epoch": 7.457091561938959, "step": 103840}, {"loss": 0.4926, "grad_norm": 1.0689499378204346, "learning_rate": 0.0002, "epoch": 7.457809694793537, "step": 103850}, {"loss": 0.5391, "grad_norm": 1.208716869354248, "learning_rate": 0.0002, "epoch": 7.458527827648115, "step": 103860}, {"loss": 0.5233, "grad_norm": 1.0105576515197754, "learning_rate": 0.0002, "epoch": 7.459245960502693, "step": 103870}, {"loss": 0.5266, "grad_norm": 1.1546603441238403, "learning_rate": 0.0002, "epoch": 7.4599640933572715, "step": 103880}, {"loss": 0.5695, "grad_norm": 1.258599042892456, "learning_rate": 0.0002, "epoch": 7.4606822262118495, "step": 103890}, {"loss": 0.6184, "grad_norm": 1.2506718635559082, "learning_rate": 0.0002, "epoch": 7.4614003590664275, "step": 103900}, {"loss": 0.528, "grad_norm": 1.0375752449035645, "learning_rate": 0.0002, "epoch": 7.4621184919210055, "step": 103910}, {"loss": 0.5307, "grad_norm": 1.0918235778808594, "learning_rate": 0.0002, "epoch": 7.4628366247755835, "step": 103920}, {"loss": 0.5573, "grad_norm": 1.2511614561080933, "learning_rate": 0.0002, "epoch": 7.4635547576301615, "step": 103930}, {"loss": 0.5446, "grad_norm": 0.9855675101280212, "learning_rate": 0.0002, "epoch": 7.4642728904847395, "step": 103940}, {"loss": 0.5811, "grad_norm": 1.1818993091583252, "learning_rate": 0.0002, "epoch": 7.4649910233393175, "step": 103950}, {"loss": 0.5746, "grad_norm": 1.2684056758880615, "learning_rate": 0.0002, "epoch": 7.4657091561938955, "step": 103960}, {"loss": 0.5906, "grad_norm": 1.3526806831359863, "learning_rate": 0.0002, "epoch": 7.4664272890484735, "step": 103970}, {"loss": 0.5551, "grad_norm": 1.1802287101745605, "learning_rate": 0.0002, "epoch": 7.467145421903052, "step": 103980}, {"loss": 0.5698, "grad_norm": 1.0627036094665527, "learning_rate": 0.0002, "epoch": 7.46786355475763, "step": 103990}, {"loss": 0.5266, "grad_norm": 1.2383025884628296, "learning_rate": 0.0002, "epoch": 7.468581687612208, "step": 104000}, {"loss": 0.5236, "grad_norm": 1.2024378776550293, "learning_rate": 0.0002, "epoch": 7.469299820466786, "step": 104010}, {"loss": 0.5523, "grad_norm": 0.8383823037147522, "learning_rate": 0.0002, "epoch": 7.470017953321364, "step": 104020}, {"loss": 0.5469, "grad_norm": 1.0333143472671509, "learning_rate": 0.0002, "epoch": 7.470736086175942, "step": 104030}, {"loss": 0.5519, "grad_norm": 1.232338309288025, "learning_rate": 0.0002, "epoch": 7.47145421903052, "step": 104040}, {"loss": 0.5317, "grad_norm": 1.1523895263671875, "learning_rate": 0.0002, "epoch": 7.472172351885098, "step": 104050}, {"loss": 0.5546, "grad_norm": 1.2198411226272583, "learning_rate": 0.0002, "epoch": 7.472890484739676, "step": 104060}, {"loss": 0.5488, "grad_norm": 1.1921417713165283, "learning_rate": 0.0002, "epoch": 7.473608617594255, "step": 104070}, {"loss": 0.5126, "grad_norm": 1.174011468887329, "learning_rate": 0.0002, "epoch": 7.474326750448833, "step": 104080}, {"loss": 0.5629, "grad_norm": 1.3201649188995361, "learning_rate": 0.0002, "epoch": 7.475044883303411, "step": 104090}, {"loss": 0.5775, "grad_norm": 0.9371066689491272, "learning_rate": 0.0002, "epoch": 7.475763016157989, "step": 104100}, {"loss": 0.5705, "grad_norm": 1.4846594333648682, "learning_rate": 0.0002, "epoch": 7.476481149012567, "step": 104110}, {"loss": 0.5591, "grad_norm": 1.1780450344085693, "learning_rate": 0.0002, "epoch": 7.477199281867145, "step": 104120}, {"loss": 0.5518, "grad_norm": 1.2080824375152588, "learning_rate": 0.0002, "epoch": 7.477917414721723, "step": 104130}, {"loss": 0.5683, "grad_norm": 1.0390220880508423, "learning_rate": 0.0002, "epoch": 7.478635547576301, "step": 104140}, {"loss": 0.5582, "grad_norm": 0.8703257441520691, "learning_rate": 0.0002, "epoch": 7.479353680430879, "step": 104150}, {"loss": 0.5255, "grad_norm": 1.017080307006836, "learning_rate": 0.0002, "epoch": 7.480071813285457, "step": 104160}, {"loss": 0.5541, "grad_norm": 1.2483022212982178, "learning_rate": 0.0002, "epoch": 7.480789946140036, "step": 104170}, {"loss": 0.5793, "grad_norm": 1.0958250761032104, "learning_rate": 0.0002, "epoch": 7.481508078994614, "step": 104180}, {"loss": 0.5221, "grad_norm": 1.1949903964996338, "learning_rate": 0.0002, "epoch": 7.482226211849192, "step": 104190}, {"loss": 0.5627, "grad_norm": 1.2361127138137817, "learning_rate": 0.0002, "epoch": 7.48294434470377, "step": 104200}, {"loss": 0.5388, "grad_norm": 1.2279026508331299, "learning_rate": 0.0002, "epoch": 7.483662477558348, "step": 104210}, {"loss": 0.5319, "grad_norm": 1.0336331129074097, "learning_rate": 0.0002, "epoch": 7.484380610412926, "step": 104220}, {"loss": 0.5505, "grad_norm": 1.0021189451217651, "learning_rate": 0.0002, "epoch": 7.485098743267504, "step": 104230}, {"loss": 0.5398, "grad_norm": 1.1586246490478516, "learning_rate": 0.0002, "epoch": 7.485816876122082, "step": 104240}, {"loss": 0.538, "grad_norm": 0.9006508588790894, "learning_rate": 0.0002, "epoch": 7.486535008976661, "step": 104250}, {"loss": 0.5436, "grad_norm": 1.2152459621429443, "learning_rate": 0.0002, "epoch": 7.487253141831239, "step": 104260}, {"loss": 0.5437, "grad_norm": 1.0048519372940063, "learning_rate": 0.0002, "epoch": 7.487971274685817, "step": 104270}, {"loss": 0.5663, "grad_norm": 1.1151599884033203, "learning_rate": 0.0002, "epoch": 7.488689407540395, "step": 104280}, {"loss": 0.554, "grad_norm": 0.9922400116920471, "learning_rate": 0.0002, "epoch": 7.489407540394973, "step": 104290}, {"loss": 0.5033, "grad_norm": 1.137277364730835, "learning_rate": 0.0002, "epoch": 7.490125673249551, "step": 104300}, {"loss": 0.5838, "grad_norm": 1.381284475326538, "learning_rate": 0.0002, "epoch": 7.490843806104129, "step": 104310}, {"loss": 0.5262, "grad_norm": 1.0104176998138428, "learning_rate": 0.0002, "epoch": 7.491561938958707, "step": 104320}, {"loss": 0.507, "grad_norm": 1.1292575597763062, "learning_rate": 0.0002, "epoch": 7.492280071813285, "step": 104330}, {"loss": 0.5807, "grad_norm": 1.0010626316070557, "learning_rate": 0.0002, "epoch": 7.492998204667863, "step": 104340}, {"loss": 0.5438, "grad_norm": 0.9468943476676941, "learning_rate": 0.0002, "epoch": 7.493716337522442, "step": 104350}, {"loss": 0.5451, "grad_norm": 1.0348953008651733, "learning_rate": 0.0002, "epoch": 7.49443447037702, "step": 104360}, {"loss": 0.5596, "grad_norm": 1.0347660779953003, "learning_rate": 0.0002, "epoch": 7.495152603231598, "step": 104370}, {"loss": 0.5381, "grad_norm": 1.1240533590316772, "learning_rate": 0.0002, "epoch": 7.495870736086176, "step": 104380}, {"loss": 0.5195, "grad_norm": 0.8433300852775574, "learning_rate": 0.0002, "epoch": 7.496588868940754, "step": 104390}, {"loss": 0.5776, "grad_norm": 1.0124489068984985, "learning_rate": 0.0002, "epoch": 7.497307001795332, "step": 104400}, {"loss": 0.5264, "grad_norm": 1.050297498703003, "learning_rate": 0.0002, "epoch": 7.49802513464991, "step": 104410}, {"loss": 0.5595, "grad_norm": 1.226494312286377, "learning_rate": 0.0002, "epoch": 7.498743267504488, "step": 104420}, {"loss": 0.5377, "grad_norm": 1.0367873907089233, "learning_rate": 0.0002, "epoch": 7.499461400359066, "step": 104430}, {"loss": 0.5176, "grad_norm": 1.2138985395431519, "learning_rate": 0.0002, "epoch": 7.500179533213645, "step": 104440}, {"loss": 0.5786, "grad_norm": 1.2024848461151123, "learning_rate": 0.0002, "epoch": 7.500897666068223, "step": 104450}, {"loss": 0.54, "grad_norm": 0.9568573832511902, "learning_rate": 0.0002, "epoch": 7.501615798922801, "step": 104460}, {"loss": 0.5243, "grad_norm": 0.959540605545044, "learning_rate": 0.0002, "epoch": 7.502333931777379, "step": 104470}, {"loss": 0.5211, "grad_norm": 1.1272302865982056, "learning_rate": 0.0002, "epoch": 7.503052064631957, "step": 104480}, {"loss": 0.5761, "grad_norm": 1.1625477075576782, "learning_rate": 0.0002, "epoch": 7.503770197486535, "step": 104490}, {"loss": 0.5805, "grad_norm": 1.1393729448318481, "learning_rate": 0.0002, "epoch": 7.504488330341113, "step": 104500}, {"loss": 0.5449, "grad_norm": 1.1496871709823608, "learning_rate": 0.0002, "epoch": 7.505206463195691, "step": 104510}, {"loss": 0.5212, "grad_norm": 1.10691237449646, "learning_rate": 0.0002, "epoch": 7.505924596050269, "step": 104520}, {"loss": 0.5687, "grad_norm": 1.1505173444747925, "learning_rate": 0.0002, "epoch": 7.506642728904847, "step": 104530}, {"loss": 0.5575, "grad_norm": 1.2328600883483887, "learning_rate": 0.0002, "epoch": 7.507360861759426, "step": 104540}, {"loss": 0.5457, "grad_norm": 1.0103087425231934, "learning_rate": 0.0002, "epoch": 7.508078994614004, "step": 104550}, {"loss": 0.561, "grad_norm": 1.1978994607925415, "learning_rate": 0.0002, "epoch": 7.508797127468582, "step": 104560}, {"loss": 0.552, "grad_norm": 1.070842981338501, "learning_rate": 0.0002, "epoch": 7.50951526032316, "step": 104570}, {"loss": 0.5441, "grad_norm": 1.1058868169784546, "learning_rate": 0.0002, "epoch": 7.510233393177738, "step": 104580}, {"loss": 0.5784, "grad_norm": 1.383592963218689, "learning_rate": 0.0002, "epoch": 7.510951526032316, "step": 104590}, {"loss": 0.5358, "grad_norm": 1.2177189588546753, "learning_rate": 0.0002, "epoch": 7.511669658886894, "step": 104600}, {"loss": 0.5565, "grad_norm": 1.7231167554855347, "learning_rate": 0.0002, "epoch": 7.512387791741472, "step": 104610}, {"loss": 0.5547, "grad_norm": 0.9763862490653992, "learning_rate": 0.0002, "epoch": 7.513105924596051, "step": 104620}, {"loss": 0.5567, "grad_norm": 1.242191195487976, "learning_rate": 0.0002, "epoch": 7.513824057450629, "step": 104630}, {"loss": 0.5051, "grad_norm": 0.9510217308998108, "learning_rate": 0.0002, "epoch": 7.514542190305207, "step": 104640}, {"loss": 0.5258, "grad_norm": 1.260542631149292, "learning_rate": 0.0002, "epoch": 7.515260323159785, "step": 104650}, {"loss": 0.5777, "grad_norm": 0.9604901075363159, "learning_rate": 0.0002, "epoch": 7.515978456014363, "step": 104660}, {"loss": 0.5461, "grad_norm": 1.0860100984573364, "learning_rate": 0.0002, "epoch": 7.516696588868941, "step": 104670}, {"loss": 0.5467, "grad_norm": 0.9627196192741394, "learning_rate": 0.0002, "epoch": 7.517414721723519, "step": 104680}, {"loss": 0.5461, "grad_norm": 1.0736050605773926, "learning_rate": 0.0002, "epoch": 7.518132854578097, "step": 104690}, {"loss": 0.5639, "grad_norm": 1.150801420211792, "learning_rate": 0.0002, "epoch": 7.518850987432675, "step": 104700}, {"loss": 0.5358, "grad_norm": 1.1193088293075562, "learning_rate": 0.0002, "epoch": 7.519569120287253, "step": 104710}, {"loss": 0.5807, "grad_norm": 1.0462759733200073, "learning_rate": 0.0002, "epoch": 7.520287253141831, "step": 104720}, {"loss": 0.5622, "grad_norm": 0.8539935946464539, "learning_rate": 0.0002, "epoch": 7.52100538599641, "step": 104730}, {"loss": 0.568, "grad_norm": 1.1345696449279785, "learning_rate": 0.0002, "epoch": 7.521723518850988, "step": 104740}, {"loss": 0.4941, "grad_norm": 1.0367025136947632, "learning_rate": 0.0002, "epoch": 7.522441651705566, "step": 104750}, {"loss": 0.5748, "grad_norm": 1.3531326055526733, "learning_rate": 0.0002, "epoch": 7.523159784560144, "step": 104760}, {"loss": 0.5197, "grad_norm": 0.8530771136283875, "learning_rate": 0.0002, "epoch": 7.523877917414722, "step": 104770}, {"loss": 0.5566, "grad_norm": 1.0597292184829712, "learning_rate": 0.0002, "epoch": 7.5245960502693, "step": 104780}, {"loss": 0.5435, "grad_norm": 1.0896775722503662, "learning_rate": 0.0002, "epoch": 7.525314183123878, "step": 104790}, {"loss": 0.508, "grad_norm": 1.3138227462768555, "learning_rate": 0.0002, "epoch": 7.526032315978456, "step": 104800}, {"loss": 0.5367, "grad_norm": 0.9158141016960144, "learning_rate": 0.0002, "epoch": 7.526750448833035, "step": 104810}, {"loss": 0.5281, "grad_norm": 1.1566123962402344, "learning_rate": 0.0002, "epoch": 7.527468581687613, "step": 104820}, {"loss": 0.5687, "grad_norm": 1.138040542602539, "learning_rate": 0.0002, "epoch": 7.528186714542191, "step": 104830}, {"loss": 0.5471, "grad_norm": 1.0407382249832153, "learning_rate": 0.0002, "epoch": 7.528904847396769, "step": 104840}, {"loss": 0.5291, "grad_norm": 1.104064702987671, "learning_rate": 0.0002, "epoch": 7.529622980251347, "step": 104850}, {"loss": 0.5153, "grad_norm": 1.040507435798645, "learning_rate": 0.0002, "epoch": 7.530341113105925, "step": 104860}, {"loss": 0.5347, "grad_norm": 1.146317958831787, "learning_rate": 0.0002, "epoch": 7.531059245960503, "step": 104870}, {"loss": 0.5184, "grad_norm": 1.0730783939361572, "learning_rate": 0.0002, "epoch": 7.531777378815081, "step": 104880}, {"loss": 0.5566, "grad_norm": 1.2540011405944824, "learning_rate": 0.0002, "epoch": 7.532495511669659, "step": 104890}, {"loss": 0.5615, "grad_norm": 1.0158214569091797, "learning_rate": 0.0002, "epoch": 7.533213644524237, "step": 104900}, {"loss": 0.5518, "grad_norm": 1.0645452737808228, "learning_rate": 0.0002, "epoch": 7.533931777378815, "step": 104910}, {"loss": 0.5675, "grad_norm": 1.1173311471939087, "learning_rate": 0.0002, "epoch": 7.5346499102333935, "step": 104920}, {"loss": 0.5692, "grad_norm": 1.091782808303833, "learning_rate": 0.0002, "epoch": 7.5353680430879715, "step": 104930}, {"loss": 0.5221, "grad_norm": 1.1219462156295776, "learning_rate": 0.0002, "epoch": 7.5360861759425495, "step": 104940}, {"loss": 0.6008, "grad_norm": 1.2164716720581055, "learning_rate": 0.0002, "epoch": 7.5368043087971275, "step": 104950}, {"loss": 0.5186, "grad_norm": 1.0167542695999146, "learning_rate": 0.0002, "epoch": 7.5375224416517055, "step": 104960}, {"loss": 0.5614, "grad_norm": 1.029844045639038, "learning_rate": 0.0002, "epoch": 7.5382405745062835, "step": 104970}, {"loss": 0.574, "grad_norm": 1.004914402961731, "learning_rate": 0.0002, "epoch": 7.5389587073608615, "step": 104980}, {"loss": 0.5399, "grad_norm": 1.151977300643921, "learning_rate": 0.0002, "epoch": 7.5396768402154395, "step": 104990}, {"loss": 0.5714, "grad_norm": 1.063069462776184, "learning_rate": 0.0002, "epoch": 7.540394973070018, "step": 105000}, {"loss": 0.5278, "grad_norm": 0.9950627684593201, "learning_rate": 0.0002, "epoch": 7.541113105924596, "step": 105010}, {"loss": 0.5717, "grad_norm": 0.9897221922874451, "learning_rate": 0.0002, "epoch": 7.541831238779174, "step": 105020}, {"loss": 0.5391, "grad_norm": 1.220423698425293, "learning_rate": 0.0002, "epoch": 7.542549371633752, "step": 105030}, {"loss": 0.5194, "grad_norm": 1.0800561904907227, "learning_rate": 0.0002, "epoch": 7.54326750448833, "step": 105040}, {"loss": 0.61, "grad_norm": 1.1115468740463257, "learning_rate": 0.0002, "epoch": 7.543985637342908, "step": 105050}, {"loss": 0.5944, "grad_norm": 1.1754465103149414, "learning_rate": 0.0002, "epoch": 7.544703770197486, "step": 105060}, {"loss": 0.5742, "grad_norm": 0.8769645690917969, "learning_rate": 0.0002, "epoch": 7.545421903052064, "step": 105070}, {"loss": 0.5784, "grad_norm": 1.0276274681091309, "learning_rate": 0.0002, "epoch": 7.546140035906642, "step": 105080}, {"loss": 0.5585, "grad_norm": 1.2642459869384766, "learning_rate": 0.0002, "epoch": 7.54685816876122, "step": 105090}, {"loss": 0.5418, "grad_norm": 1.1204240322113037, "learning_rate": 0.0002, "epoch": 7.547576301615799, "step": 105100}, {"loss": 0.551, "grad_norm": 1.1700465679168701, "learning_rate": 0.0002, "epoch": 7.548294434470377, "step": 105110}, {"loss": 0.5494, "grad_norm": 0.921738862991333, "learning_rate": 0.0002, "epoch": 7.549012567324955, "step": 105120}, {"loss": 0.5529, "grad_norm": 1.0517377853393555, "learning_rate": 0.0002, "epoch": 7.549730700179533, "step": 105130}, {"loss": 0.5369, "grad_norm": 0.8750519156455994, "learning_rate": 0.0002, "epoch": 7.550448833034111, "step": 105140}, {"loss": 0.5576, "grad_norm": 0.9947483539581299, "learning_rate": 0.0002, "epoch": 7.551166965888689, "step": 105150}, {"loss": 0.58, "grad_norm": 1.133035659790039, "learning_rate": 0.0002, "epoch": 7.551885098743267, "step": 105160}, {"loss": 0.53, "grad_norm": 1.0302581787109375, "learning_rate": 0.0002, "epoch": 7.552603231597845, "step": 105170}, {"loss": 0.5127, "grad_norm": 1.0290307998657227, "learning_rate": 0.0002, "epoch": 7.553321364452424, "step": 105180}, {"loss": 0.5489, "grad_norm": 1.2476361989974976, "learning_rate": 0.0002, "epoch": 7.554039497307002, "step": 105190}, {"loss": 0.5683, "grad_norm": 1.1051201820373535, "learning_rate": 0.0002, "epoch": 7.55475763016158, "step": 105200}, {"loss": 0.5408, "grad_norm": 1.4432711601257324, "learning_rate": 0.0002, "epoch": 7.555475763016158, "step": 105210}, {"loss": 0.5647, "grad_norm": 1.1134647130966187, "learning_rate": 0.0002, "epoch": 7.556193895870736, "step": 105220}, {"loss": 0.5852, "grad_norm": 1.2649270296096802, "learning_rate": 0.0002, "epoch": 7.556912028725314, "step": 105230}, {"loss": 0.5352, "grad_norm": 0.9547544717788696, "learning_rate": 0.0002, "epoch": 7.557630161579892, "step": 105240}, {"loss": 0.5416, "grad_norm": 1.153113842010498, "learning_rate": 0.0002, "epoch": 7.55834829443447, "step": 105250}, {"loss": 0.5366, "grad_norm": 1.0354572534561157, "learning_rate": 0.0002, "epoch": 7.559066427289048, "step": 105260}, {"loss": 0.5673, "grad_norm": 1.2131483554840088, "learning_rate": 0.0002, "epoch": 7.559784560143626, "step": 105270}, {"loss": 0.5389, "grad_norm": 0.9127926826477051, "learning_rate": 0.0002, "epoch": 7.560502692998204, "step": 105280}, {"loss": 0.5691, "grad_norm": 1.1065036058425903, "learning_rate": 0.0002, "epoch": 7.561220825852783, "step": 105290}, {"loss": 0.5586, "grad_norm": 1.133322834968567, "learning_rate": 0.0002, "epoch": 7.561938958707361, "step": 105300}, {"loss": 0.531, "grad_norm": 0.9822283387184143, "learning_rate": 0.0002, "epoch": 7.562657091561939, "step": 105310}, {"loss": 0.5406, "grad_norm": 1.0777708292007446, "learning_rate": 0.0002, "epoch": 7.563375224416517, "step": 105320}, {"loss": 0.5663, "grad_norm": 1.0826656818389893, "learning_rate": 0.0002, "epoch": 7.564093357271095, "step": 105330}, {"loss": 0.5497, "grad_norm": 1.1842281818389893, "learning_rate": 0.0002, "epoch": 7.564811490125673, "step": 105340}, {"loss": 0.553, "grad_norm": 1.1248035430908203, "learning_rate": 0.0002, "epoch": 7.565529622980251, "step": 105350}, {"loss": 0.603, "grad_norm": 0.9905921220779419, "learning_rate": 0.0002, "epoch": 7.566247755834829, "step": 105360}, {"loss": 0.5162, "grad_norm": 1.0215412378311157, "learning_rate": 0.0002, "epoch": 7.566965888689408, "step": 105370}, {"loss": 0.5871, "grad_norm": 1.2403844594955444, "learning_rate": 0.0002, "epoch": 7.567684021543986, "step": 105380}, {"loss": 0.6078, "grad_norm": 1.2371299266815186, "learning_rate": 0.0002, "epoch": 7.568402154398564, "step": 105390}, {"loss": 0.5637, "grad_norm": 1.2021104097366333, "learning_rate": 0.0002, "epoch": 7.569120287253142, "step": 105400}, {"loss": 0.5439, "grad_norm": 1.1641038656234741, "learning_rate": 0.0002, "epoch": 7.56983842010772, "step": 105410}, {"loss": 0.5238, "grad_norm": 1.1443949937820435, "learning_rate": 0.0002, "epoch": 7.570556552962298, "step": 105420}, {"loss": 0.57, "grad_norm": 1.1318271160125732, "learning_rate": 0.0002, "epoch": 7.571274685816876, "step": 105430}, {"loss": 0.5552, "grad_norm": 1.3928632736206055, "learning_rate": 0.0002, "epoch": 7.571992818671454, "step": 105440}, {"loss": 0.5304, "grad_norm": 1.1141331195831299, "learning_rate": 0.0002, "epoch": 7.572710951526032, "step": 105450}, {"loss": 0.6009, "grad_norm": 1.301546573638916, "learning_rate": 0.0002, "epoch": 7.57342908438061, "step": 105460}, {"loss": 0.5569, "grad_norm": 1.1085830926895142, "learning_rate": 0.0002, "epoch": 7.574147217235188, "step": 105470}, {"loss": 0.532, "grad_norm": 0.9858543872833252, "learning_rate": 0.0002, "epoch": 7.574865350089767, "step": 105480}, {"loss": 0.5367, "grad_norm": 1.0768673419952393, "learning_rate": 0.0002, "epoch": 7.575583482944345, "step": 105490}, {"loss": 0.5315, "grad_norm": 1.0940971374511719, "learning_rate": 0.0002, "epoch": 7.576301615798923, "step": 105500}, {"loss": 0.5451, "grad_norm": 1.2131849527359009, "learning_rate": 0.0002, "epoch": 7.577019748653501, "step": 105510}, {"loss": 0.5529, "grad_norm": 1.139255166053772, "learning_rate": 0.0002, "epoch": 7.577737881508079, "step": 105520}, {"loss": 0.5532, "grad_norm": 1.1880031824111938, "learning_rate": 0.0002, "epoch": 7.578456014362657, "step": 105530}, {"loss": 0.5759, "grad_norm": 1.1227078437805176, "learning_rate": 0.0002, "epoch": 7.579174147217235, "step": 105540}, {"loss": 0.5522, "grad_norm": 0.9665518999099731, "learning_rate": 0.0002, "epoch": 7.579892280071813, "step": 105550}, {"loss": 0.5285, "grad_norm": 1.2579736709594727, "learning_rate": 0.0002, "epoch": 7.580610412926392, "step": 105560}, {"loss": 0.5399, "grad_norm": 1.3003990650177002, "learning_rate": 0.0002, "epoch": 7.58132854578097, "step": 105570}, {"loss": 0.523, "grad_norm": 1.0537091493606567, "learning_rate": 0.0002, "epoch": 7.582046678635548, "step": 105580}, {"loss": 0.545, "grad_norm": 1.2199420928955078, "learning_rate": 0.0002, "epoch": 7.582764811490126, "step": 105590}, {"loss": 0.5701, "grad_norm": 1.1907626390457153, "learning_rate": 0.0002, "epoch": 7.583482944344704, "step": 105600}, {"loss": 0.5403, "grad_norm": 1.0684664249420166, "learning_rate": 0.0002, "epoch": 7.584201077199282, "step": 105610}, {"loss": 0.5393, "grad_norm": 1.1190338134765625, "learning_rate": 0.0002, "epoch": 7.58491921005386, "step": 105620}, {"loss": 0.5435, "grad_norm": 1.0873574018478394, "learning_rate": 0.0002, "epoch": 7.585637342908438, "step": 105630}, {"loss": 0.5581, "grad_norm": 1.0512418746948242, "learning_rate": 0.0002, "epoch": 7.586355475763016, "step": 105640}, {"loss": 0.5936, "grad_norm": 1.3036644458770752, "learning_rate": 0.0002, "epoch": 7.587073608617594, "step": 105650}, {"loss": 0.5598, "grad_norm": 1.037948489189148, "learning_rate": 0.0002, "epoch": 7.587791741472173, "step": 105660}, {"loss": 0.5646, "grad_norm": 0.987514317035675, "learning_rate": 0.0002, "epoch": 7.588509874326751, "step": 105670}, {"loss": 0.5683, "grad_norm": 1.2718415260314941, "learning_rate": 0.0002, "epoch": 7.589228007181329, "step": 105680}, {"loss": 0.5591, "grad_norm": 1.2168786525726318, "learning_rate": 0.0002, "epoch": 7.589946140035907, "step": 105690}, {"loss": 0.5536, "grad_norm": 1.0258911848068237, "learning_rate": 0.0002, "epoch": 7.590664272890485, "step": 105700}, {"loss": 0.5486, "grad_norm": 1.0203795433044434, "learning_rate": 0.0002, "epoch": 7.591382405745063, "step": 105710}, {"loss": 0.5411, "grad_norm": 1.1677968502044678, "learning_rate": 0.0002, "epoch": 7.592100538599641, "step": 105720}, {"loss": 0.5308, "grad_norm": 1.4036188125610352, "learning_rate": 0.0002, "epoch": 7.592818671454219, "step": 105730}, {"loss": 0.5896, "grad_norm": 1.0176831483840942, "learning_rate": 0.0002, "epoch": 7.593536804308797, "step": 105740}, {"loss": 0.5493, "grad_norm": 1.1458805799484253, "learning_rate": 0.0002, "epoch": 7.594254937163376, "step": 105750}, {"loss": 0.5547, "grad_norm": 1.038974642753601, "learning_rate": 0.0002, "epoch": 7.594973070017954, "step": 105760}, {"loss": 0.5709, "grad_norm": 1.247301697731018, "learning_rate": 0.0002, "epoch": 7.595691202872532, "step": 105770}, {"loss": 0.5504, "grad_norm": 0.8886832594871521, "learning_rate": 0.0002, "epoch": 7.59640933572711, "step": 105780}, {"loss": 0.5249, "grad_norm": 1.1210025548934937, "learning_rate": 0.0002, "epoch": 7.597127468581688, "step": 105790}, {"loss": 0.5422, "grad_norm": 1.1681327819824219, "learning_rate": 0.0002, "epoch": 7.597845601436266, "step": 105800}, {"loss": 0.5383, "grad_norm": 1.1547762155532837, "learning_rate": 0.0002, "epoch": 7.598563734290844, "step": 105810}, {"loss": 0.5183, "grad_norm": 1.1720976829528809, "learning_rate": 0.0002, "epoch": 7.599281867145422, "step": 105820}, {"loss": 0.5529, "grad_norm": 1.0706144571304321, "learning_rate": 0.0002, "epoch": 7.6, "step": 105830}, {"loss": 0.5805, "grad_norm": 1.031205415725708, "learning_rate": 0.0002, "epoch": 7.600718132854578, "step": 105840}, {"loss": 0.5672, "grad_norm": 1.1801010370254517, "learning_rate": 0.0002, "epoch": 7.6014362657091565, "step": 105850}, {"loss": 0.533, "grad_norm": 1.0154755115509033, "learning_rate": 0.0002, "epoch": 7.6021543985637345, "step": 105860}, {"loss": 0.5452, "grad_norm": 1.0330030918121338, "learning_rate": 0.0002, "epoch": 7.6028725314183125, "step": 105870}, {"loss": 0.5362, "grad_norm": 0.9404476881027222, "learning_rate": 0.0002, "epoch": 7.6035906642728905, "step": 105880}, {"loss": 0.5516, "grad_norm": 1.0264246463775635, "learning_rate": 0.0002, "epoch": 7.6043087971274685, "step": 105890}, {"loss": 0.6151, "grad_norm": 1.154560923576355, "learning_rate": 0.0002, "epoch": 7.6050269299820465, "step": 105900}, {"loss": 0.5418, "grad_norm": 0.8954422473907471, "learning_rate": 0.0002, "epoch": 7.6057450628366245, "step": 105910}, {"loss": 0.5368, "grad_norm": 0.9354978799819946, "learning_rate": 0.0002, "epoch": 7.6064631956912026, "step": 105920}, {"loss": 0.5594, "grad_norm": 1.2349580526351929, "learning_rate": 0.0002, "epoch": 7.607181328545781, "step": 105930}, {"loss": 0.5792, "grad_norm": 1.0203192234039307, "learning_rate": 0.0002, "epoch": 7.607899461400359, "step": 105940}, {"loss": 0.5231, "grad_norm": 0.8431771397590637, "learning_rate": 0.0002, "epoch": 7.608617594254937, "step": 105950}, {"loss": 0.5758, "grad_norm": 1.1733695268630981, "learning_rate": 0.0002, "epoch": 7.6093357271095154, "step": 105960}, {"loss": 0.5093, "grad_norm": 0.965118408203125, "learning_rate": 0.0002, "epoch": 7.6100538599640934, "step": 105970}, {"loss": 0.5359, "grad_norm": 0.987450897693634, "learning_rate": 0.0002, "epoch": 7.6107719928186714, "step": 105980}, {"loss": 0.5475, "grad_norm": 1.2337433099746704, "learning_rate": 0.0002, "epoch": 7.6114901256732495, "step": 105990}, {"loss": 0.5633, "grad_norm": 1.2976964712142944, "learning_rate": 0.0002, "epoch": 7.6122082585278275, "step": 106000}, {"loss": 0.6028, "grad_norm": 1.0748823881149292, "learning_rate": 0.0002, "epoch": 7.6129263913824055, "step": 106010}, {"loss": 0.5737, "grad_norm": 1.2771751880645752, "learning_rate": 0.0002, "epoch": 7.6136445242369835, "step": 106020}, {"loss": 0.608, "grad_norm": 0.9651449918746948, "learning_rate": 0.0002, "epoch": 7.6143626570915615, "step": 106030}, {"loss": 0.592, "grad_norm": 1.4248602390289307, "learning_rate": 0.0002, "epoch": 7.61508078994614, "step": 106040}, {"loss": 0.5814, "grad_norm": 1.1568830013275146, "learning_rate": 0.0002, "epoch": 7.615798922800718, "step": 106050}, {"loss": 0.5636, "grad_norm": 1.2090665102005005, "learning_rate": 0.0002, "epoch": 7.616517055655296, "step": 106060}, {"loss": 0.5405, "grad_norm": 1.0982604026794434, "learning_rate": 0.0002, "epoch": 7.617235188509874, "step": 106070}, {"loss": 0.551, "grad_norm": 1.0705735683441162, "learning_rate": 0.0002, "epoch": 7.617953321364452, "step": 106080}, {"loss": 0.5595, "grad_norm": 1.1313707828521729, "learning_rate": 0.0002, "epoch": 7.61867145421903, "step": 106090}, {"loss": 0.5578, "grad_norm": 1.2538282871246338, "learning_rate": 0.0002, "epoch": 7.619389587073608, "step": 106100}, {"loss": 0.5528, "grad_norm": 1.374280571937561, "learning_rate": 0.0002, "epoch": 7.620107719928186, "step": 106110}, {"loss": 0.5602, "grad_norm": 1.024248719215393, "learning_rate": 0.0002, "epoch": 7.620825852782765, "step": 106120}, {"loss": 0.5681, "grad_norm": 0.9976266622543335, "learning_rate": 0.0002, "epoch": 7.621543985637343, "step": 106130}, {"loss": 0.5338, "grad_norm": 1.2104789018630981, "learning_rate": 0.0002, "epoch": 7.622262118491921, "step": 106140}, {"loss": 0.53, "grad_norm": 1.154041051864624, "learning_rate": 0.0002, "epoch": 7.622980251346499, "step": 106150}, {"loss": 0.5538, "grad_norm": 1.1514118909835815, "learning_rate": 0.0002, "epoch": 7.623698384201077, "step": 106160}, {"loss": 0.5556, "grad_norm": 0.9994077086448669, "learning_rate": 0.0002, "epoch": 7.624416517055655, "step": 106170}, {"loss": 0.5366, "grad_norm": 1.0648950338363647, "learning_rate": 0.0002, "epoch": 7.625134649910233, "step": 106180}, {"loss": 0.5769, "grad_norm": 1.247307538986206, "learning_rate": 0.0002, "epoch": 7.625852782764811, "step": 106190}, {"loss": 0.5641, "grad_norm": 1.2144126892089844, "learning_rate": 0.0002, "epoch": 7.626570915619389, "step": 106200}, {"loss": 0.5478, "grad_norm": 1.196209192276001, "learning_rate": 0.0002, "epoch": 7.627289048473967, "step": 106210}, {"loss": 0.533, "grad_norm": 1.0064209699630737, "learning_rate": 0.0002, "epoch": 7.628007181328546, "step": 106220}, {"loss": 0.579, "grad_norm": 1.0938220024108887, "learning_rate": 0.0002, "epoch": 7.628725314183124, "step": 106230}, {"loss": 0.6253, "grad_norm": 1.0046473741531372, "learning_rate": 0.0002, "epoch": 7.629443447037702, "step": 106240}, {"loss": 0.5567, "grad_norm": 1.1092835664749146, "learning_rate": 0.0002, "epoch": 7.63016157989228, "step": 106250}, {"loss": 0.5647, "grad_norm": 1.0419597625732422, "learning_rate": 0.0002, "epoch": 7.630879712746858, "step": 106260}, {"loss": 0.5807, "grad_norm": 1.115281581878662, "learning_rate": 0.0002, "epoch": 7.631597845601436, "step": 106270}, {"loss": 0.564, "grad_norm": 0.926291823387146, "learning_rate": 0.0002, "epoch": 7.632315978456014, "step": 106280}, {"loss": 0.5694, "grad_norm": 1.2301737070083618, "learning_rate": 0.0002, "epoch": 7.633034111310592, "step": 106290}, {"loss": 0.577, "grad_norm": 1.2254445552825928, "learning_rate": 0.0002, "epoch": 7.63375224416517, "step": 106300}, {"loss": 0.5325, "grad_norm": 0.9048781394958496, "learning_rate": 0.0002, "epoch": 7.634470377019749, "step": 106310}, {"loss": 0.5464, "grad_norm": 0.9848755598068237, "learning_rate": 0.0002, "epoch": 7.635188509874327, "step": 106320}, {"loss": 0.5799, "grad_norm": 1.056156873703003, "learning_rate": 0.0002, "epoch": 7.635906642728905, "step": 106330}, {"loss": 0.5474, "grad_norm": 1.2103949785232544, "learning_rate": 0.0002, "epoch": 7.636624775583483, "step": 106340}, {"loss": 0.5648, "grad_norm": 0.9873999953269958, "learning_rate": 0.0002, "epoch": 7.637342908438061, "step": 106350}, {"loss": 0.4979, "grad_norm": 1.0306750535964966, "learning_rate": 0.0002, "epoch": 7.638061041292639, "step": 106360}, {"loss": 0.5604, "grad_norm": 1.1849476099014282, "learning_rate": 0.0002, "epoch": 7.638779174147217, "step": 106370}, {"loss": 0.5334, "grad_norm": 1.231707215309143, "learning_rate": 0.0002, "epoch": 7.639497307001795, "step": 106380}, {"loss": 0.5533, "grad_norm": 1.194321632385254, "learning_rate": 0.0002, "epoch": 7.640215439856373, "step": 106390}, {"loss": 0.5483, "grad_norm": 1.0539367198944092, "learning_rate": 0.0002, "epoch": 7.640933572710951, "step": 106400}, {"loss": 0.5668, "grad_norm": 1.1701070070266724, "learning_rate": 0.0002, "epoch": 7.64165170556553, "step": 106410}, {"loss": 0.5385, "grad_norm": 1.2178397178649902, "learning_rate": 0.0002, "epoch": 7.642369838420108, "step": 106420}, {"loss": 0.5209, "grad_norm": 0.9702774286270142, "learning_rate": 0.0002, "epoch": 7.643087971274686, "step": 106430}, {"loss": 0.5365, "grad_norm": 1.0613373517990112, "learning_rate": 0.0002, "epoch": 7.643806104129264, "step": 106440}, {"loss": 0.5514, "grad_norm": 1.0604264736175537, "learning_rate": 0.0002, "epoch": 7.644524236983842, "step": 106450}, {"loss": 0.5229, "grad_norm": 0.8836958408355713, "learning_rate": 0.0002, "epoch": 7.64524236983842, "step": 106460}, {"loss": 0.5337, "grad_norm": 1.1939433813095093, "learning_rate": 0.0002, "epoch": 7.645960502692998, "step": 106470}, {"loss": 0.5407, "grad_norm": 1.1198155879974365, "learning_rate": 0.0002, "epoch": 7.646678635547576, "step": 106480}, {"loss": 0.5829, "grad_norm": 1.1567481756210327, "learning_rate": 0.0002, "epoch": 7.647396768402155, "step": 106490}, {"loss": 0.5323, "grad_norm": 1.1108657121658325, "learning_rate": 0.0002, "epoch": 7.648114901256733, "step": 106500}, {"loss": 0.5703, "grad_norm": 1.116945505142212, "learning_rate": 0.0002, "epoch": 7.648833034111311, "step": 106510}, {"loss": 0.5444, "grad_norm": 0.951562762260437, "learning_rate": 0.0002, "epoch": 7.649551166965889, "step": 106520}, {"loss": 0.5798, "grad_norm": 1.1393115520477295, "learning_rate": 0.0002, "epoch": 7.650269299820467, "step": 106530}, {"loss": 0.5453, "grad_norm": 1.0645884275436401, "learning_rate": 0.0002, "epoch": 7.650987432675045, "step": 106540}, {"loss": 0.5584, "grad_norm": 1.0742363929748535, "learning_rate": 0.0002, "epoch": 7.651705565529623, "step": 106550}, {"loss": 0.5221, "grad_norm": 1.2417876720428467, "learning_rate": 0.0002, "epoch": 7.652423698384201, "step": 106560}, {"loss": 0.5232, "grad_norm": 1.1374881267547607, "learning_rate": 0.0002, "epoch": 7.653141831238779, "step": 106570}, {"loss": 0.5997, "grad_norm": 1.0783830881118774, "learning_rate": 0.0002, "epoch": 7.653859964093357, "step": 106580}, {"loss": 0.5451, "grad_norm": 1.014607548713684, "learning_rate": 0.0002, "epoch": 7.654578096947935, "step": 106590}, {"loss": 0.5329, "grad_norm": 0.9155649542808533, "learning_rate": 0.0002, "epoch": 7.655296229802514, "step": 106600}, {"loss": 0.5202, "grad_norm": 1.0671756267547607, "learning_rate": 0.0002, "epoch": 7.656014362657092, "step": 106610}, {"loss": 0.5588, "grad_norm": 0.9360224008560181, "learning_rate": 0.0002, "epoch": 7.65673249551167, "step": 106620}, {"loss": 0.54, "grad_norm": 1.1457395553588867, "learning_rate": 0.0002, "epoch": 7.657450628366248, "step": 106630}, {"loss": 0.5706, "grad_norm": 0.9849295020103455, "learning_rate": 0.0002, "epoch": 7.658168761220826, "step": 106640}, {"loss": 0.5683, "grad_norm": 1.0622800588607788, "learning_rate": 0.0002, "epoch": 7.658886894075404, "step": 106650}, {"loss": 0.5494, "grad_norm": 0.8352060914039612, "learning_rate": 0.0002, "epoch": 7.659605026929982, "step": 106660}, {"loss": 0.6049, "grad_norm": 1.1975891590118408, "learning_rate": 0.0002, "epoch": 7.66032315978456, "step": 106670}, {"loss": 0.5588, "grad_norm": 1.1585075855255127, "learning_rate": 0.0002, "epoch": 7.661041292639139, "step": 106680}, {"loss": 0.512, "grad_norm": 1.1387015581130981, "learning_rate": 0.0002, "epoch": 7.661759425493717, "step": 106690}, {"loss": 0.5552, "grad_norm": 1.2752996683120728, "learning_rate": 0.0002, "epoch": 7.662477558348295, "step": 106700}, {"loss": 0.5639, "grad_norm": 1.1885957717895508, "learning_rate": 0.0002, "epoch": 7.663195691202873, "step": 106710}, {"loss": 0.5622, "grad_norm": 0.9355967044830322, "learning_rate": 0.0002, "epoch": 7.663913824057451, "step": 106720}, {"loss": 0.5205, "grad_norm": 1.0528348684310913, "learning_rate": 0.0002, "epoch": 7.664631956912029, "step": 106730}, {"loss": 0.5985, "grad_norm": 1.1075369119644165, "learning_rate": 0.0002, "epoch": 7.665350089766607, "step": 106740}, {"loss": 0.5606, "grad_norm": 1.2078553438186646, "learning_rate": 0.0002, "epoch": 7.666068222621185, "step": 106750}, {"loss": 0.5493, "grad_norm": 0.9850115776062012, "learning_rate": 0.0002, "epoch": 7.666786355475763, "step": 106760}, {"loss": 0.5743, "grad_norm": 1.1855263710021973, "learning_rate": 0.0002, "epoch": 7.667504488330341, "step": 106770}, {"loss": 0.618, "grad_norm": 1.3375587463378906, "learning_rate": 0.0002, "epoch": 7.66822262118492, "step": 106780}, {"loss": 0.548, "grad_norm": 0.8773086071014404, "learning_rate": 0.0002, "epoch": 7.668940754039498, "step": 106790}, {"loss": 0.5948, "grad_norm": 1.293311595916748, "learning_rate": 0.0002, "epoch": 7.669658886894076, "step": 106800}, {"loss": 0.5772, "grad_norm": 1.1973644495010376, "learning_rate": 0.0002, "epoch": 7.670377019748654, "step": 106810}, {"loss": 0.5888, "grad_norm": 1.0847374200820923, "learning_rate": 0.0002, "epoch": 7.671095152603232, "step": 106820}, {"loss": 0.5787, "grad_norm": 0.98153156042099, "learning_rate": 0.0002, "epoch": 7.67181328545781, "step": 106830}, {"loss": 0.5471, "grad_norm": 1.049188494682312, "learning_rate": 0.0002, "epoch": 7.672531418312388, "step": 106840}, {"loss": 0.589, "grad_norm": 1.0110270977020264, "learning_rate": 0.0002, "epoch": 7.673249551166966, "step": 106850}, {"loss": 0.5826, "grad_norm": 1.046575903892517, "learning_rate": 0.0002, "epoch": 7.673967684021544, "step": 106860}, {"loss": 0.5808, "grad_norm": 0.9939501285552979, "learning_rate": 0.0002, "epoch": 7.6746858168761225, "step": 106870}, {"loss": 0.544, "grad_norm": 1.1165480613708496, "learning_rate": 0.0002, "epoch": 7.6754039497307005, "step": 106880}, {"loss": 0.5573, "grad_norm": 0.8909515738487244, "learning_rate": 0.0002, "epoch": 7.6761220825852785, "step": 106890}, {"loss": 0.5554, "grad_norm": 0.99685138463974, "learning_rate": 0.0002, "epoch": 7.6768402154398565, "step": 106900}, {"loss": 0.5705, "grad_norm": 0.9978061318397522, "learning_rate": 0.0002, "epoch": 7.6775583482944345, "step": 106910}, {"loss": 0.5651, "grad_norm": 1.2148759365081787, "learning_rate": 0.0002, "epoch": 7.6782764811490125, "step": 106920}, {"loss": 0.5784, "grad_norm": 1.2721340656280518, "learning_rate": 0.0002, "epoch": 7.6789946140035905, "step": 106930}, {"loss": 0.5698, "grad_norm": 1.0458247661590576, "learning_rate": 0.0002, "epoch": 7.6797127468581685, "step": 106940}, {"loss": 0.5815, "grad_norm": 0.9900956749916077, "learning_rate": 0.0002, "epoch": 7.6804308797127465, "step": 106950}, {"loss": 0.5816, "grad_norm": 1.0812790393829346, "learning_rate": 0.0002, "epoch": 7.6811490125673245, "step": 106960}, {"loss": 0.5555, "grad_norm": 1.1479923725128174, "learning_rate": 0.0002, "epoch": 7.681867145421903, "step": 106970}, {"loss": 0.5781, "grad_norm": 0.7898157238960266, "learning_rate": 0.0002, "epoch": 7.682585278276481, "step": 106980}, {"loss": 0.6027, "grad_norm": 1.4052869081497192, "learning_rate": 0.0002, "epoch": 7.683303411131059, "step": 106990}, {"loss": 0.5459, "grad_norm": 1.3122624158859253, "learning_rate": 0.0002, "epoch": 7.684021543985637, "step": 107000}, {"loss": 0.6084, "grad_norm": 1.0138102769851685, "learning_rate": 0.0002, "epoch": 7.684739676840215, "step": 107010}, {"loss": 0.5447, "grad_norm": 1.0716434717178345, "learning_rate": 0.0002, "epoch": 7.685457809694793, "step": 107020}, {"loss": 0.5254, "grad_norm": 1.2208350896835327, "learning_rate": 0.0002, "epoch": 7.686175942549371, "step": 107030}, {"loss": 0.5675, "grad_norm": 1.3777594566345215, "learning_rate": 0.0002, "epoch": 7.686894075403949, "step": 107040}, {"loss": 0.5517, "grad_norm": 1.1951156854629517, "learning_rate": 0.0002, "epoch": 7.687612208258528, "step": 107050}, {"loss": 0.5622, "grad_norm": 0.987120509147644, "learning_rate": 0.0002, "epoch": 7.688330341113106, "step": 107060}, {"loss": 0.5047, "grad_norm": 0.9455362558364868, "learning_rate": 0.0002, "epoch": 7.689048473967684, "step": 107070}, {"loss": 0.555, "grad_norm": 0.9832291007041931, "learning_rate": 0.0002, "epoch": 7.689766606822262, "step": 107080}, {"loss": 0.5607, "grad_norm": 1.046239972114563, "learning_rate": 0.0002, "epoch": 7.69048473967684, "step": 107090}, {"loss": 0.5875, "grad_norm": 1.1121305227279663, "learning_rate": 0.0002, "epoch": 7.691202872531418, "step": 107100}, {"loss": 0.5531, "grad_norm": 1.0636173486709595, "learning_rate": 0.0002, "epoch": 7.691921005385996, "step": 107110}, {"loss": 0.6101, "grad_norm": 1.2166199684143066, "learning_rate": 0.0002, "epoch": 7.692639138240574, "step": 107120}, {"loss": 0.5676, "grad_norm": 1.0859293937683105, "learning_rate": 0.0002, "epoch": 7.693357271095152, "step": 107130}, {"loss": 0.5465, "grad_norm": 0.9719768166542053, "learning_rate": 0.0002, "epoch": 7.69407540394973, "step": 107140}, {"loss": 0.5442, "grad_norm": 1.5153313875198364, "learning_rate": 0.0002, "epoch": 7.694793536804308, "step": 107150}, {"loss": 0.5624, "grad_norm": 1.1787729263305664, "learning_rate": 0.0002, "epoch": 7.695511669658887, "step": 107160}, {"loss": 0.5632, "grad_norm": 0.9926921129226685, "learning_rate": 0.0002, "epoch": 7.696229802513465, "step": 107170}, {"loss": 0.5235, "grad_norm": 1.0670396089553833, "learning_rate": 0.0002, "epoch": 7.696947935368043, "step": 107180}, {"loss": 0.539, "grad_norm": 1.022409200668335, "learning_rate": 0.0002, "epoch": 7.697666068222621, "step": 107190}, {"loss": 0.579, "grad_norm": 0.9605807065963745, "learning_rate": 0.0002, "epoch": 7.698384201077199, "step": 107200}, {"loss": 0.6008, "grad_norm": 1.2187163829803467, "learning_rate": 0.0002, "epoch": 7.699102333931777, "step": 107210}, {"loss": 0.5686, "grad_norm": 1.2335593700408936, "learning_rate": 0.0002, "epoch": 7.699820466786355, "step": 107220}, {"loss": 0.5494, "grad_norm": 1.159769892692566, "learning_rate": 0.0002, "epoch": 7.700538599640933, "step": 107230}, {"loss": 0.5304, "grad_norm": 0.9486351013183594, "learning_rate": 0.0002, "epoch": 7.701256732495512, "step": 107240}, {"loss": 0.5702, "grad_norm": 1.2952953577041626, "learning_rate": 0.0002, "epoch": 7.70197486535009, "step": 107250}, {"loss": 0.6003, "grad_norm": 0.9187726974487305, "learning_rate": 0.0002, "epoch": 7.702692998204668, "step": 107260}, {"loss": 0.5774, "grad_norm": 1.0610202550888062, "learning_rate": 0.0002, "epoch": 7.703411131059246, "step": 107270}, {"loss": 0.5334, "grad_norm": 1.0553513765335083, "learning_rate": 0.0002, "epoch": 7.704129263913824, "step": 107280}, {"loss": 0.5396, "grad_norm": 1.0521212816238403, "learning_rate": 0.0002, "epoch": 7.704847396768402, "step": 107290}, {"loss": 0.5501, "grad_norm": 1.197798252105713, "learning_rate": 0.0002, "epoch": 7.70556552962298, "step": 107300}, {"loss": 0.5827, "grad_norm": 1.1656016111373901, "learning_rate": 0.0002, "epoch": 7.706283662477558, "step": 107310}, {"loss": 0.5495, "grad_norm": 1.1318942308425903, "learning_rate": 0.0002, "epoch": 7.707001795332136, "step": 107320}, {"loss": 0.5205, "grad_norm": 1.2302566766738892, "learning_rate": 0.0002, "epoch": 7.707719928186714, "step": 107330}, {"loss": 0.5353, "grad_norm": 1.2854527235031128, "learning_rate": 0.0002, "epoch": 7.708438061041292, "step": 107340}, {"loss": 0.5999, "grad_norm": 1.2395009994506836, "learning_rate": 0.0002, "epoch": 7.709156193895871, "step": 107350}, {"loss": 0.5766, "grad_norm": 1.2834311723709106, "learning_rate": 0.0002, "epoch": 7.709874326750449, "step": 107360}, {"loss": 0.551, "grad_norm": 0.9438875317573547, "learning_rate": 0.0002, "epoch": 7.710592459605027, "step": 107370}, {"loss": 0.5848, "grad_norm": 1.2651551961898804, "learning_rate": 0.0002, "epoch": 7.711310592459605, "step": 107380}, {"loss": 0.5253, "grad_norm": 1.0880811214447021, "learning_rate": 0.0002, "epoch": 7.712028725314183, "step": 107390}, {"loss": 0.532, "grad_norm": 1.077873706817627, "learning_rate": 0.0002, "epoch": 7.712746858168761, "step": 107400}, {"loss": 0.5675, "grad_norm": 1.183581829071045, "learning_rate": 0.0002, "epoch": 7.713464991023339, "step": 107410}, {"loss": 0.5453, "grad_norm": 0.903417706489563, "learning_rate": 0.0002, "epoch": 7.714183123877917, "step": 107420}, {"loss": 0.5549, "grad_norm": 1.0142052173614502, "learning_rate": 0.0002, "epoch": 7.714901256732496, "step": 107430}, {"loss": 0.521, "grad_norm": 1.287375807762146, "learning_rate": 0.0002, "epoch": 7.715619389587074, "step": 107440}, {"loss": 0.6079, "grad_norm": 1.036961555480957, "learning_rate": 0.0002, "epoch": 7.716337522441652, "step": 107450}, {"loss": 0.6028, "grad_norm": 1.053189992904663, "learning_rate": 0.0002, "epoch": 7.71705565529623, "step": 107460}, {"loss": 0.5444, "grad_norm": 1.0782629251480103, "learning_rate": 0.0002, "epoch": 7.717773788150808, "step": 107470}, {"loss": 0.5621, "grad_norm": 1.2815700769424438, "learning_rate": 0.0002, "epoch": 7.718491921005386, "step": 107480}, {"loss": 0.5661, "grad_norm": 1.0254477262496948, "learning_rate": 0.0002, "epoch": 7.719210053859964, "step": 107490}, {"loss": 0.5786, "grad_norm": 1.2113746404647827, "learning_rate": 0.0002, "epoch": 7.719928186714542, "step": 107500}, {"loss": 0.5674, "grad_norm": 1.1663107872009277, "learning_rate": 0.0002, "epoch": 7.72064631956912, "step": 107510}, {"loss": 0.5415, "grad_norm": 1.1120136976242065, "learning_rate": 0.0002, "epoch": 7.721364452423698, "step": 107520}, {"loss": 0.5204, "grad_norm": 0.9561337828636169, "learning_rate": 0.0002, "epoch": 7.722082585278277, "step": 107530}, {"loss": 0.5283, "grad_norm": 1.0723344087600708, "learning_rate": 0.0002, "epoch": 7.722800718132855, "step": 107540}, {"loss": 0.5743, "grad_norm": 1.1457021236419678, "learning_rate": 0.0002, "epoch": 7.723518850987433, "step": 107550}, {"loss": 0.6075, "grad_norm": 1.1626014709472656, "learning_rate": 0.0002, "epoch": 7.724236983842011, "step": 107560}, {"loss": 0.5567, "grad_norm": 1.0837032794952393, "learning_rate": 0.0002, "epoch": 7.724955116696589, "step": 107570}, {"loss": 0.5979, "grad_norm": 1.1355236768722534, "learning_rate": 0.0002, "epoch": 7.725673249551167, "step": 107580}, {"loss": 0.5342, "grad_norm": 0.9753133654594421, "learning_rate": 0.0002, "epoch": 7.726391382405745, "step": 107590}, {"loss": 0.5771, "grad_norm": 1.1424425840377808, "learning_rate": 0.0002, "epoch": 7.727109515260323, "step": 107600}, {"loss": 0.5508, "grad_norm": 0.8058976531028748, "learning_rate": 0.0002, "epoch": 7.727827648114902, "step": 107610}, {"loss": 0.5949, "grad_norm": 1.1998937129974365, "learning_rate": 0.0002, "epoch": 7.72854578096948, "step": 107620}, {"loss": 0.5348, "grad_norm": 1.0383063554763794, "learning_rate": 0.0002, "epoch": 7.729263913824058, "step": 107630}, {"loss": 0.5182, "grad_norm": 1.069886565208435, "learning_rate": 0.0002, "epoch": 7.729982046678636, "step": 107640}, {"loss": 0.5089, "grad_norm": 1.113100290298462, "learning_rate": 0.0002, "epoch": 7.730700179533214, "step": 107650}, {"loss": 0.5637, "grad_norm": 1.1166869401931763, "learning_rate": 0.0002, "epoch": 7.731418312387792, "step": 107660}, {"loss": 0.5489, "grad_norm": 1.3739103078842163, "learning_rate": 0.0002, "epoch": 7.73213644524237, "step": 107670}, {"loss": 0.5801, "grad_norm": 0.9432857036590576, "learning_rate": 0.0002, "epoch": 7.732854578096948, "step": 107680}, {"loss": 0.5258, "grad_norm": 1.0611073970794678, "learning_rate": 0.0002, "epoch": 7.733572710951526, "step": 107690}, {"loss": 0.5937, "grad_norm": 1.052598476409912, "learning_rate": 0.0002, "epoch": 7.734290843806104, "step": 107700}, {"loss": 0.5769, "grad_norm": 1.080534815788269, "learning_rate": 0.0002, "epoch": 7.735008976660682, "step": 107710}, {"loss": 0.5344, "grad_norm": 1.3288558721542358, "learning_rate": 0.0002, "epoch": 7.735727109515261, "step": 107720}, {"loss": 0.5321, "grad_norm": 1.1469939947128296, "learning_rate": 0.0002, "epoch": 7.736445242369839, "step": 107730}, {"loss": 0.5506, "grad_norm": 0.9235124588012695, "learning_rate": 0.0002, "epoch": 7.737163375224417, "step": 107740}, {"loss": 0.5475, "grad_norm": 1.2601470947265625, "learning_rate": 0.0002, "epoch": 7.737881508078995, "step": 107750}, {"loss": 0.5495, "grad_norm": 1.181703805923462, "learning_rate": 0.0002, "epoch": 7.738599640933573, "step": 107760}, {"loss": 0.5641, "grad_norm": 0.9549161195755005, "learning_rate": 0.0002, "epoch": 7.739317773788151, "step": 107770}, {"loss": 0.591, "grad_norm": 1.078458547592163, "learning_rate": 0.0002, "epoch": 7.740035906642729, "step": 107780}, {"loss": 0.5623, "grad_norm": 1.1542205810546875, "learning_rate": 0.0002, "epoch": 7.740754039497307, "step": 107790}, {"loss": 0.6026, "grad_norm": 1.288838505744934, "learning_rate": 0.0002, "epoch": 7.741472172351886, "step": 107800}, {"loss": 0.5281, "grad_norm": 0.972050666809082, "learning_rate": 0.0002, "epoch": 7.742190305206464, "step": 107810}, {"loss": 0.5414, "grad_norm": 0.9113378524780273, "learning_rate": 0.0002, "epoch": 7.742908438061042, "step": 107820}, {"loss": 0.5618, "grad_norm": 1.207448959350586, "learning_rate": 0.0002, "epoch": 7.74362657091562, "step": 107830}, {"loss": 0.5477, "grad_norm": 1.2151618003845215, "learning_rate": 0.0002, "epoch": 7.744344703770198, "step": 107840}, {"loss": 0.5584, "grad_norm": 1.0792107582092285, "learning_rate": 0.0002, "epoch": 7.745062836624776, "step": 107850}, {"loss": 0.5333, "grad_norm": 0.9030680656433105, "learning_rate": 0.0002, "epoch": 7.745780969479354, "step": 107860}, {"loss": 0.6002, "grad_norm": 1.120816707611084, "learning_rate": 0.0002, "epoch": 7.746499102333932, "step": 107870}, {"loss": 0.555, "grad_norm": 1.221238374710083, "learning_rate": 0.0002, "epoch": 7.74721723518851, "step": 107880}, {"loss": 0.5678, "grad_norm": 1.2627668380737305, "learning_rate": 0.0002, "epoch": 7.747935368043088, "step": 107890}, {"loss": 0.5489, "grad_norm": 1.4177098274230957, "learning_rate": 0.0002, "epoch": 7.748653500897666, "step": 107900}, {"loss": 0.5264, "grad_norm": 1.2448033094406128, "learning_rate": 0.0002, "epoch": 7.7493716337522445, "step": 107910}, {"loss": 0.577, "grad_norm": 1.1706769466400146, "learning_rate": 0.0002, "epoch": 7.7500897666068225, "step": 107920}, {"loss": 0.5566, "grad_norm": 0.9637128114700317, "learning_rate": 0.0002, "epoch": 7.7508078994614005, "step": 107930}, {"loss": 0.5351, "grad_norm": 1.129179835319519, "learning_rate": 0.0002, "epoch": 7.7515260323159785, "step": 107940}, {"loss": 0.5569, "grad_norm": 1.3793165683746338, "learning_rate": 0.0002, "epoch": 7.7522441651705565, "step": 107950}, {"loss": 0.5895, "grad_norm": 1.0685398578643799, "learning_rate": 0.0002, "epoch": 7.7529622980251345, "step": 107960}, {"loss": 0.5662, "grad_norm": 0.9382266998291016, "learning_rate": 0.0002, "epoch": 7.7536804308797125, "step": 107970}, {"loss": 0.5575, "grad_norm": 1.0740195512771606, "learning_rate": 0.0002, "epoch": 7.7543985637342905, "step": 107980}, {"loss": 0.5546, "grad_norm": 1.292909860610962, "learning_rate": 0.0002, "epoch": 7.755116696588869, "step": 107990}, {"loss": 0.5968, "grad_norm": 1.2145541906356812, "learning_rate": 0.0002, "epoch": 7.755834829443447, "step": 108000}, {"loss": 0.5443, "grad_norm": 0.9905714988708496, "learning_rate": 0.0002, "epoch": 7.756552962298025, "step": 108010}, {"loss": 0.5594, "grad_norm": 1.1003599166870117, "learning_rate": 0.0002, "epoch": 7.757271095152603, "step": 108020}, {"loss": 0.5631, "grad_norm": 1.0429667234420776, "learning_rate": 0.0002, "epoch": 7.757989228007181, "step": 108030}, {"loss": 0.5511, "grad_norm": 0.8607417941093445, "learning_rate": 0.0002, "epoch": 7.758707360861759, "step": 108040}, {"loss": 0.5861, "grad_norm": 1.0659228563308716, "learning_rate": 0.0002, "epoch": 7.759425493716337, "step": 108050}, {"loss": 0.5507, "grad_norm": 1.0484120845794678, "learning_rate": 0.0002, "epoch": 7.760143626570915, "step": 108060}, {"loss": 0.5115, "grad_norm": 1.1236662864685059, "learning_rate": 0.0002, "epoch": 7.760861759425493, "step": 108070}, {"loss": 0.5506, "grad_norm": 1.0550786256790161, "learning_rate": 0.0002, "epoch": 7.761579892280071, "step": 108080}, {"loss": 0.5225, "grad_norm": 1.178968906402588, "learning_rate": 0.0002, "epoch": 7.76229802513465, "step": 108090}, {"loss": 0.5489, "grad_norm": 0.9117124080657959, "learning_rate": 0.0002, "epoch": 7.763016157989228, "step": 108100}, {"loss": 0.5776, "grad_norm": 1.1276684999465942, "learning_rate": 0.0002, "epoch": 7.763734290843806, "step": 108110}, {"loss": 0.552, "grad_norm": 1.0472416877746582, "learning_rate": 0.0002, "epoch": 7.764452423698384, "step": 108120}, {"loss": 0.5699, "grad_norm": 0.8711934685707092, "learning_rate": 0.0002, "epoch": 7.765170556552962, "step": 108130}, {"loss": 0.5454, "grad_norm": 1.0953301191329956, "learning_rate": 0.0002, "epoch": 7.76588868940754, "step": 108140}, {"loss": 0.5728, "grad_norm": 1.1367015838623047, "learning_rate": 0.0002, "epoch": 7.766606822262118, "step": 108150}, {"loss": 0.5556, "grad_norm": 1.324832797050476, "learning_rate": 0.0002, "epoch": 7.767324955116696, "step": 108160}, {"loss": 0.5749, "grad_norm": 1.0333607196807861, "learning_rate": 0.0002, "epoch": 7.768043087971275, "step": 108170}, {"loss": 0.586, "grad_norm": 1.1580414772033691, "learning_rate": 0.0002, "epoch": 7.768761220825853, "step": 108180}, {"loss": 0.5506, "grad_norm": 1.1693189144134521, "learning_rate": 0.0002, "epoch": 7.769479353680431, "step": 108190}, {"loss": 0.6068, "grad_norm": 1.0650800466537476, "learning_rate": 0.0002, "epoch": 7.770197486535009, "step": 108200}, {"loss": 0.5536, "grad_norm": 1.0890787839889526, "learning_rate": 0.0002, "epoch": 7.770915619389587, "step": 108210}, {"loss": 0.5463, "grad_norm": 1.065359115600586, "learning_rate": 0.0002, "epoch": 7.771633752244165, "step": 108220}, {"loss": 0.5826, "grad_norm": 0.864976704120636, "learning_rate": 0.0002, "epoch": 7.772351885098743, "step": 108230}, {"loss": 0.5042, "grad_norm": 0.9769368171691895, "learning_rate": 0.0002, "epoch": 7.773070017953321, "step": 108240}, {"loss": 0.5493, "grad_norm": 1.2894748449325562, "learning_rate": 0.0002, "epoch": 7.773788150807899, "step": 108250}, {"loss": 0.5609, "grad_norm": 1.1528522968292236, "learning_rate": 0.0002, "epoch": 7.774506283662477, "step": 108260}, {"loss": 0.5725, "grad_norm": 1.1542086601257324, "learning_rate": 0.0002, "epoch": 7.775224416517055, "step": 108270}, {"loss": 0.5255, "grad_norm": 1.3909233808517456, "learning_rate": 0.0002, "epoch": 7.775942549371634, "step": 108280}, {"loss": 0.5497, "grad_norm": 0.9855168461799622, "learning_rate": 0.0002, "epoch": 7.776660682226212, "step": 108290}, {"loss": 0.5353, "grad_norm": 1.0425859689712524, "learning_rate": 0.0002, "epoch": 7.77737881508079, "step": 108300}, {"loss": 0.5721, "grad_norm": 1.0025626420974731, "learning_rate": 0.0002, "epoch": 7.778096947935368, "step": 108310}, {"loss": 0.5875, "grad_norm": 1.036100149154663, "learning_rate": 0.0002, "epoch": 7.778815080789946, "step": 108320}, {"loss": 0.5549, "grad_norm": 0.9820912480354309, "learning_rate": 0.0002, "epoch": 7.779533213644524, "step": 108330}, {"loss": 0.5405, "grad_norm": 1.4552558660507202, "learning_rate": 0.0002, "epoch": 7.780251346499102, "step": 108340}, {"loss": 0.5243, "grad_norm": 1.1851739883422852, "learning_rate": 0.0002, "epoch": 7.78096947935368, "step": 108350}, {"loss": 0.5123, "grad_norm": 0.9678618311882019, "learning_rate": 0.0002, "epoch": 7.781687612208259, "step": 108360}, {"loss": 0.5433, "grad_norm": 1.052158236503601, "learning_rate": 0.0002, "epoch": 7.782405745062837, "step": 108370}, {"loss": 0.5448, "grad_norm": 0.8977556228637695, "learning_rate": 0.0002, "epoch": 7.783123877917415, "step": 108380}, {"loss": 0.5818, "grad_norm": 1.2486764192581177, "learning_rate": 0.0002, "epoch": 7.783842010771993, "step": 108390}, {"loss": 0.553, "grad_norm": 1.020477056503296, "learning_rate": 0.0002, "epoch": 7.784560143626571, "step": 108400}, {"loss": 0.5788, "grad_norm": 1.1957271099090576, "learning_rate": 0.0002, "epoch": 7.785278276481149, "step": 108410}, {"loss": 0.5615, "grad_norm": 1.0586557388305664, "learning_rate": 0.0002, "epoch": 7.785996409335727, "step": 108420}, {"loss": 0.5393, "grad_norm": 0.8806754946708679, "learning_rate": 0.0002, "epoch": 7.786714542190305, "step": 108430}, {"loss": 0.556, "grad_norm": 1.0272849798202515, "learning_rate": 0.0002, "epoch": 7.787432675044883, "step": 108440}, {"loss": 0.5559, "grad_norm": 1.052829623222351, "learning_rate": 0.0002, "epoch": 7.788150807899461, "step": 108450}, {"loss": 0.5903, "grad_norm": 1.276508092880249, "learning_rate": 0.0002, "epoch": 7.788868940754039, "step": 108460}, {"loss": 0.5816, "grad_norm": 0.9878475069999695, "learning_rate": 0.0002, "epoch": 7.789587073608618, "step": 108470}, {"loss": 0.5438, "grad_norm": 0.9568123817443848, "learning_rate": 0.0002, "epoch": 7.790305206463196, "step": 108480}, {"loss": 0.6028, "grad_norm": 1.097121238708496, "learning_rate": 0.0002, "epoch": 7.791023339317774, "step": 108490}, {"loss": 0.5528, "grad_norm": 1.188984751701355, "learning_rate": 0.0002, "epoch": 7.791741472172352, "step": 108500}, {"loss": 0.6019, "grad_norm": 0.9185505509376526, "learning_rate": 0.0002, "epoch": 7.79245960502693, "step": 108510}, {"loss": 0.5659, "grad_norm": 0.9427091479301453, "learning_rate": 0.0002, "epoch": 7.793177737881508, "step": 108520}, {"loss": 0.5709, "grad_norm": 1.0734131336212158, "learning_rate": 0.0002, "epoch": 7.793895870736086, "step": 108530}, {"loss": 0.5138, "grad_norm": 1.1126554012298584, "learning_rate": 0.0002, "epoch": 7.794614003590664, "step": 108540}, {"loss": 0.5766, "grad_norm": 1.1394606828689575, "learning_rate": 0.0002, "epoch": 7.795332136445243, "step": 108550}, {"loss": 0.5843, "grad_norm": 0.9328436851501465, "learning_rate": 0.0002, "epoch": 7.796050269299821, "step": 108560}, {"loss": 0.5675, "grad_norm": 1.1082807779312134, "learning_rate": 0.0002, "epoch": 7.796768402154399, "step": 108570}, {"loss": 0.6251, "grad_norm": 1.1107451915740967, "learning_rate": 0.0002, "epoch": 7.797486535008977, "step": 108580}, {"loss": 0.5478, "grad_norm": 1.1145843267440796, "learning_rate": 0.0002, "epoch": 7.798204667863555, "step": 108590}, {"loss": 0.5781, "grad_norm": 0.9881244897842407, "learning_rate": 0.0002, "epoch": 7.798922800718133, "step": 108600}, {"loss": 0.5513, "grad_norm": 1.022754192352295, "learning_rate": 0.0002, "epoch": 7.799640933572711, "step": 108610}, {"loss": 0.5375, "grad_norm": 1.197089672088623, "learning_rate": 0.0002, "epoch": 7.800359066427289, "step": 108620}, {"loss": 0.5878, "grad_norm": 1.0599340200424194, "learning_rate": 0.0002, "epoch": 7.801077199281867, "step": 108630}, {"loss": 0.6197, "grad_norm": 1.1776701211929321, "learning_rate": 0.0002, "epoch": 7.801795332136445, "step": 108640}, {"loss": 0.5765, "grad_norm": 0.9674487709999084, "learning_rate": 0.0002, "epoch": 7.802513464991024, "step": 108650}, {"loss": 0.5504, "grad_norm": 0.9964252710342407, "learning_rate": 0.0002, "epoch": 7.803231597845602, "step": 108660}, {"loss": 0.5846, "grad_norm": 1.0302894115447998, "learning_rate": 0.0002, "epoch": 7.80394973070018, "step": 108670}, {"loss": 0.5867, "grad_norm": 1.3224111795425415, "learning_rate": 0.0002, "epoch": 7.804667863554758, "step": 108680}, {"loss": 0.5346, "grad_norm": 1.2263908386230469, "learning_rate": 0.0002, "epoch": 7.805385996409336, "step": 108690}, {"loss": 0.5781, "grad_norm": 1.3223700523376465, "learning_rate": 0.0002, "epoch": 7.806104129263914, "step": 108700}, {"loss": 0.5677, "grad_norm": 1.0767865180969238, "learning_rate": 0.0002, "epoch": 7.806822262118492, "step": 108710}, {"loss": 0.5838, "grad_norm": 1.0822714567184448, "learning_rate": 0.0002, "epoch": 7.80754039497307, "step": 108720}, {"loss": 0.5865, "grad_norm": 1.2550771236419678, "learning_rate": 0.0002, "epoch": 7.808258527827648, "step": 108730}, {"loss": 0.5767, "grad_norm": 1.0170459747314453, "learning_rate": 0.0002, "epoch": 7.808976660682227, "step": 108740}, {"loss": 0.5512, "grad_norm": 1.1515722274780273, "learning_rate": 0.0002, "epoch": 7.809694793536805, "step": 108750}, {"loss": 0.5639, "grad_norm": 1.327756643295288, "learning_rate": 0.0002, "epoch": 7.810412926391383, "step": 108760}, {"loss": 0.5896, "grad_norm": 1.0545963048934937, "learning_rate": 0.0002, "epoch": 7.811131059245961, "step": 108770}, {"loss": 0.5727, "grad_norm": 1.0827748775482178, "learning_rate": 0.0002, "epoch": 7.811849192100539, "step": 108780}, {"loss": 0.5547, "grad_norm": 1.010693073272705, "learning_rate": 0.0002, "epoch": 7.812567324955117, "step": 108790}, {"loss": 0.5548, "grad_norm": 1.2254958152770996, "learning_rate": 0.0002, "epoch": 7.813285457809695, "step": 108800}, {"loss": 0.5726, "grad_norm": 0.9775252938270569, "learning_rate": 0.0002, "epoch": 7.814003590664273, "step": 108810}, {"loss": 0.5523, "grad_norm": 0.9968659281730652, "learning_rate": 0.0002, "epoch": 7.814721723518851, "step": 108820}, {"loss": 0.589, "grad_norm": 0.9968136548995972, "learning_rate": 0.0002, "epoch": 7.815439856373429, "step": 108830}, {"loss": 0.5424, "grad_norm": 1.0271786451339722, "learning_rate": 0.0002, "epoch": 7.8161579892280075, "step": 108840}, {"loss": 0.6101, "grad_norm": 1.332309603691101, "learning_rate": 0.0002, "epoch": 7.8168761220825855, "step": 108850}, {"loss": 0.6154, "grad_norm": 1.2836099863052368, "learning_rate": 0.0002, "epoch": 7.8175942549371635, "step": 108860}, {"loss": 0.5984, "grad_norm": 0.9816291332244873, "learning_rate": 0.0002, "epoch": 7.8183123877917415, "step": 108870}, {"loss": 0.6183, "grad_norm": 1.1243056058883667, "learning_rate": 0.0002, "epoch": 7.8190305206463195, "step": 108880}, {"loss": 0.5799, "grad_norm": 1.2360351085662842, "learning_rate": 0.0002, "epoch": 7.8197486535008975, "step": 108890}, {"loss": 0.5254, "grad_norm": 1.2734822034835815, "learning_rate": 0.0002, "epoch": 7.8204667863554755, "step": 108900}, {"loss": 0.5835, "grad_norm": 1.2423732280731201, "learning_rate": 0.0002, "epoch": 7.8211849192100535, "step": 108910}, {"loss": 0.5614, "grad_norm": 0.969839334487915, "learning_rate": 0.0002, "epoch": 7.821903052064632, "step": 108920}, {"loss": 0.6005, "grad_norm": 1.1603267192840576, "learning_rate": 0.0002, "epoch": 7.82262118491921, "step": 108930}, {"loss": 0.5152, "grad_norm": 1.1748993396759033, "learning_rate": 0.0002, "epoch": 7.823339317773788, "step": 108940}, {"loss": 0.5379, "grad_norm": 1.246304988861084, "learning_rate": 0.0002, "epoch": 7.824057450628366, "step": 108950}, {"loss": 0.5609, "grad_norm": 0.9472703337669373, "learning_rate": 0.0002, "epoch": 7.824775583482944, "step": 108960}, {"loss": 0.5421, "grad_norm": 1.22053062915802, "learning_rate": 0.0002, "epoch": 7.825493716337522, "step": 108970}, {"loss": 0.5966, "grad_norm": 1.0310567617416382, "learning_rate": 0.0002, "epoch": 7.8262118491921004, "step": 108980}, {"loss": 0.5646, "grad_norm": 1.1211191415786743, "learning_rate": 0.0002, "epoch": 7.8269299820466784, "step": 108990}, {"loss": 0.5635, "grad_norm": 0.9057613015174866, "learning_rate": 0.0002, "epoch": 7.8276481149012564, "step": 109000}, {"loss": 0.5618, "grad_norm": 1.0615124702453613, "learning_rate": 0.0002, "epoch": 7.8283662477558345, "step": 109010}, {"loss": 0.5547, "grad_norm": 0.9669250845909119, "learning_rate": 0.0002, "epoch": 7.8290843806104125, "step": 109020}, {"loss": 0.5562, "grad_norm": 1.1100435256958008, "learning_rate": 0.0002, "epoch": 7.829802513464991, "step": 109030}, {"loss": 0.5725, "grad_norm": 1.2583600282669067, "learning_rate": 0.0002, "epoch": 7.830520646319569, "step": 109040}, {"loss": 0.5484, "grad_norm": 1.228148102760315, "learning_rate": 0.0002, "epoch": 7.831238779174147, "step": 109050}, {"loss": 0.5884, "grad_norm": 1.0673317909240723, "learning_rate": 0.0002, "epoch": 7.831956912028725, "step": 109060}, {"loss": 0.5265, "grad_norm": 1.169648289680481, "learning_rate": 0.0002, "epoch": 7.832675044883303, "step": 109070}, {"loss": 0.5759, "grad_norm": 1.0065253973007202, "learning_rate": 0.0002, "epoch": 7.833393177737881, "step": 109080}, {"loss": 0.5553, "grad_norm": 1.1310595273971558, "learning_rate": 0.0002, "epoch": 7.834111310592459, "step": 109090}, {"loss": 0.6005, "grad_norm": 0.9469314217567444, "learning_rate": 0.0002, "epoch": 7.834829443447037, "step": 109100}, {"loss": 0.5472, "grad_norm": 1.1143816709518433, "learning_rate": 0.0002, "epoch": 7.835547576301616, "step": 109110}, {"loss": 0.564, "grad_norm": 1.0617737770080566, "learning_rate": 0.0002, "epoch": 7.836265709156194, "step": 109120}, {"loss": 0.556, "grad_norm": 1.0489295721054077, "learning_rate": 0.0002, "epoch": 7.836983842010772, "step": 109130}, {"loss": 0.6081, "grad_norm": 1.2900800704956055, "learning_rate": 0.0002, "epoch": 7.83770197486535, "step": 109140}, {"loss": 0.5699, "grad_norm": 1.1539736986160278, "learning_rate": 0.0002, "epoch": 7.838420107719928, "step": 109150}, {"loss": 0.551, "grad_norm": 1.0503592491149902, "learning_rate": 0.0002, "epoch": 7.839138240574506, "step": 109160}, {"loss": 0.588, "grad_norm": 1.134155035018921, "learning_rate": 0.0002, "epoch": 7.839856373429084, "step": 109170}, {"loss": 0.5629, "grad_norm": 1.042429804801941, "learning_rate": 0.0002, "epoch": 7.840574506283662, "step": 109180}, {"loss": 0.5637, "grad_norm": 1.0549449920654297, "learning_rate": 0.0002, "epoch": 7.84129263913824, "step": 109190}, {"loss": 0.5181, "grad_norm": 0.9603164196014404, "learning_rate": 0.0002, "epoch": 7.842010771992818, "step": 109200}, {"loss": 0.5609, "grad_norm": 1.3291586637496948, "learning_rate": 0.0002, "epoch": 7.842728904847397, "step": 109210}, {"loss": 0.5262, "grad_norm": 0.7739448547363281, "learning_rate": 0.0002, "epoch": 7.843447037701975, "step": 109220}, {"loss": 0.4998, "grad_norm": 1.0020095109939575, "learning_rate": 0.0002, "epoch": 7.844165170556553, "step": 109230}, {"loss": 0.5322, "grad_norm": 0.9480768442153931, "learning_rate": 0.0002, "epoch": 7.844883303411131, "step": 109240}, {"loss": 0.6093, "grad_norm": 1.0376673936843872, "learning_rate": 0.0002, "epoch": 7.845601436265709, "step": 109250}, {"loss": 0.5547, "grad_norm": 0.9776299595832825, "learning_rate": 0.0002, "epoch": 7.846319569120287, "step": 109260}, {"loss": 0.5631, "grad_norm": 1.0477584600448608, "learning_rate": 0.0002, "epoch": 7.847037701974865, "step": 109270}, {"loss": 0.5722, "grad_norm": 1.162746548652649, "learning_rate": 0.0002, "epoch": 7.847755834829443, "step": 109280}, {"loss": 0.5476, "grad_norm": 1.0150725841522217, "learning_rate": 0.0002, "epoch": 7.848473967684021, "step": 109290}, {"loss": 0.6201, "grad_norm": 1.0144163370132446, "learning_rate": 0.0002, "epoch": 7.8491921005386, "step": 109300}, {"loss": 0.549, "grad_norm": 0.9614455103874207, "learning_rate": 0.0002, "epoch": 7.849910233393178, "step": 109310}, {"loss": 0.6078, "grad_norm": 1.223591685295105, "learning_rate": 0.0002, "epoch": 7.850628366247756, "step": 109320}, {"loss": 0.5763, "grad_norm": 1.149753212928772, "learning_rate": 0.0002, "epoch": 7.851346499102334, "step": 109330}, {"loss": 0.5859, "grad_norm": 0.8418117165565491, "learning_rate": 0.0002, "epoch": 7.852064631956912, "step": 109340}, {"loss": 0.5742, "grad_norm": 1.3950735330581665, "learning_rate": 0.0002, "epoch": 7.85278276481149, "step": 109350}, {"loss": 0.6149, "grad_norm": 1.315022587776184, "learning_rate": 0.0002, "epoch": 7.853500897666068, "step": 109360}, {"loss": 0.5632, "grad_norm": 0.9699475765228271, "learning_rate": 0.0002, "epoch": 7.854219030520646, "step": 109370}, {"loss": 0.5677, "grad_norm": 1.0460443496704102, "learning_rate": 0.0002, "epoch": 7.854937163375224, "step": 109380}, {"loss": 0.5915, "grad_norm": 1.0051870346069336, "learning_rate": 0.0002, "epoch": 7.855655296229802, "step": 109390}, {"loss": 0.5358, "grad_norm": 1.1087634563446045, "learning_rate": 0.0002, "epoch": 7.856373429084381, "step": 109400}, {"loss": 0.5705, "grad_norm": 1.0926934480667114, "learning_rate": 0.0002, "epoch": 7.857091561938959, "step": 109410}, {"loss": 0.5581, "grad_norm": 0.9953354597091675, "learning_rate": 0.0002, "epoch": 7.857809694793537, "step": 109420}, {"loss": 0.5602, "grad_norm": 1.170961856842041, "learning_rate": 0.0002, "epoch": 7.858527827648115, "step": 109430}, {"loss": 0.5464, "grad_norm": 1.2087738513946533, "learning_rate": 0.0002, "epoch": 7.859245960502693, "step": 109440}, {"loss": 0.5677, "grad_norm": 0.969118595123291, "learning_rate": 0.0002, "epoch": 7.859964093357271, "step": 109450}, {"loss": 0.5784, "grad_norm": 1.2040046453475952, "learning_rate": 0.0002, "epoch": 7.860682226211849, "step": 109460}, {"loss": 0.566, "grad_norm": 0.9882297515869141, "learning_rate": 0.0002, "epoch": 7.861400359066427, "step": 109470}, {"loss": 0.5676, "grad_norm": 1.0635188817977905, "learning_rate": 0.0002, "epoch": 7.862118491921006, "step": 109480}, {"loss": 0.5927, "grad_norm": 1.174045205116272, "learning_rate": 0.0002, "epoch": 7.862836624775584, "step": 109490}, {"loss": 0.5403, "grad_norm": 0.9702258706092834, "learning_rate": 0.0002, "epoch": 7.863554757630162, "step": 109500}, {"loss": 0.5585, "grad_norm": 0.8843887448310852, "learning_rate": 0.0002, "epoch": 7.86427289048474, "step": 109510}, {"loss": 0.596, "grad_norm": 0.961931049823761, "learning_rate": 0.0002, "epoch": 7.864991023339318, "step": 109520}, {"loss": 0.5994, "grad_norm": 0.9497876763343811, "learning_rate": 0.0002, "epoch": 7.865709156193896, "step": 109530}, {"loss": 0.5666, "grad_norm": 1.0348241329193115, "learning_rate": 0.0002, "epoch": 7.866427289048474, "step": 109540}, {"loss": 0.6202, "grad_norm": 1.0796928405761719, "learning_rate": 0.0002, "epoch": 7.867145421903052, "step": 109550}, {"loss": 0.5302, "grad_norm": 1.2193728685379028, "learning_rate": 0.0002, "epoch": 7.86786355475763, "step": 109560}, {"loss": 0.5264, "grad_norm": 0.8161213994026184, "learning_rate": 0.0002, "epoch": 7.868581687612208, "step": 109570}, {"loss": 0.5618, "grad_norm": 1.062281608581543, "learning_rate": 0.0002, "epoch": 7.869299820466786, "step": 109580}, {"loss": 0.5762, "grad_norm": 1.0982999801635742, "learning_rate": 0.0002, "epoch": 7.870017953321365, "step": 109590}, {"loss": 0.6085, "grad_norm": 1.057931661605835, "learning_rate": 0.0002, "epoch": 7.870736086175943, "step": 109600}, {"loss": 0.5605, "grad_norm": 1.1201120615005493, "learning_rate": 0.0002, "epoch": 7.871454219030521, "step": 109610}, {"loss": 0.6069, "grad_norm": 1.2803348302841187, "learning_rate": 0.0002, "epoch": 7.872172351885099, "step": 109620}, {"loss": 0.5434, "grad_norm": 1.1370888948440552, "learning_rate": 0.0002, "epoch": 7.872890484739677, "step": 109630}, {"loss": 0.5532, "grad_norm": 1.1025199890136719, "learning_rate": 0.0002, "epoch": 7.873608617594255, "step": 109640}, {"loss": 0.5952, "grad_norm": 0.9794017672538757, "learning_rate": 0.0002, "epoch": 7.874326750448833, "step": 109650}, {"loss": 0.5293, "grad_norm": 1.0693902969360352, "learning_rate": 0.0002, "epoch": 7.875044883303411, "step": 109660}, {"loss": 0.56, "grad_norm": 1.1972219944000244, "learning_rate": 0.0002, "epoch": 7.87576301615799, "step": 109670}, {"loss": 0.5743, "grad_norm": 1.5061790943145752, "learning_rate": 0.0002, "epoch": 7.876481149012568, "step": 109680}, {"loss": 0.5557, "grad_norm": 1.194033145904541, "learning_rate": 0.0002, "epoch": 7.877199281867146, "step": 109690}, {"loss": 0.5437, "grad_norm": 1.1381443738937378, "learning_rate": 0.0002, "epoch": 7.877917414721724, "step": 109700}, {"loss": 0.5586, "grad_norm": 1.1147687435150146, "learning_rate": 0.0002, "epoch": 7.878635547576302, "step": 109710}, {"loss": 0.5233, "grad_norm": 1.0469177961349487, "learning_rate": 0.0002, "epoch": 7.87935368043088, "step": 109720}, {"loss": 0.5463, "grad_norm": 1.066167950630188, "learning_rate": 0.0002, "epoch": 7.880071813285458, "step": 109730}, {"loss": 0.5505, "grad_norm": 1.1696351766586304, "learning_rate": 0.0002, "epoch": 7.880789946140036, "step": 109740}, {"loss": 0.5796, "grad_norm": 1.0112557411193848, "learning_rate": 0.0002, "epoch": 7.881508078994614, "step": 109750}, {"loss": 0.5676, "grad_norm": 1.0896331071853638, "learning_rate": 0.0002, "epoch": 7.882226211849192, "step": 109760}, {"loss": 0.5987, "grad_norm": 1.1275625228881836, "learning_rate": 0.0002, "epoch": 7.88294434470377, "step": 109770}, {"loss": 0.5248, "grad_norm": 0.859959602355957, "learning_rate": 0.0002, "epoch": 7.883662477558349, "step": 109780}, {"loss": 0.5811, "grad_norm": 1.1432042121887207, "learning_rate": 0.0002, "epoch": 7.884380610412927, "step": 109790}, {"loss": 0.5697, "grad_norm": 1.0156069993972778, "learning_rate": 0.0002, "epoch": 7.885098743267505, "step": 109800}, {"loss": 0.5698, "grad_norm": 0.8594014048576355, "learning_rate": 0.0002, "epoch": 7.885816876122083, "step": 109810}, {"loss": 0.5177, "grad_norm": 0.8861605525016785, "learning_rate": 0.0002, "epoch": 7.886535008976661, "step": 109820}, {"loss": 0.5863, "grad_norm": 0.9504907131195068, "learning_rate": 0.0002, "epoch": 7.887253141831239, "step": 109830}, {"loss": 0.542, "grad_norm": 1.0248312950134277, "learning_rate": 0.0002, "epoch": 7.887971274685817, "step": 109840}, {"loss": 0.5852, "grad_norm": 1.1179074048995972, "learning_rate": 0.0002, "epoch": 7.888689407540395, "step": 109850}, {"loss": 0.5844, "grad_norm": 0.9005255103111267, "learning_rate": 0.0002, "epoch": 7.8894075403949735, "step": 109860}, {"loss": 0.5233, "grad_norm": 1.0487693548202515, "learning_rate": 0.0002, "epoch": 7.8901256732495515, "step": 109870}, {"loss": 0.5368, "grad_norm": 1.2038270235061646, "learning_rate": 0.0002, "epoch": 7.8908438061041295, "step": 109880}, {"loss": 0.5373, "grad_norm": 0.9288236498832703, "learning_rate": 0.0002, "epoch": 7.8915619389587075, "step": 109890}, {"loss": 0.5547, "grad_norm": 0.959175169467926, "learning_rate": 0.0002, "epoch": 7.8922800718132855, "step": 109900}, {"loss": 0.5269, "grad_norm": 0.9703200459480286, "learning_rate": 0.0002, "epoch": 7.8929982046678635, "step": 109910}, {"loss": 0.5543, "grad_norm": 1.2670199871063232, "learning_rate": 0.0002, "epoch": 7.8937163375224415, "step": 109920}, {"loss": 0.6126, "grad_norm": 1.3127061128616333, "learning_rate": 0.0002, "epoch": 7.8944344703770195, "step": 109930}, {"loss": 0.5317, "grad_norm": 1.072664737701416, "learning_rate": 0.0002, "epoch": 7.8951526032315975, "step": 109940}, {"loss": 0.5385, "grad_norm": 1.0517730712890625, "learning_rate": 0.0002, "epoch": 7.8958707360861755, "step": 109950}, {"loss": 0.6131, "grad_norm": 0.8665887713432312, "learning_rate": 0.0002, "epoch": 7.896588868940754, "step": 109960}, {"loss": 0.5495, "grad_norm": 1.2894970178604126, "learning_rate": 0.0002, "epoch": 7.897307001795332, "step": 109970}, {"loss": 0.5664, "grad_norm": 1.1201982498168945, "learning_rate": 0.0002, "epoch": 7.89802513464991, "step": 109980}, {"loss": 0.5454, "grad_norm": 1.0165940523147583, "learning_rate": 0.0002, "epoch": 7.898743267504488, "step": 109990}, {"loss": 0.5538, "grad_norm": 1.1439729928970337, "learning_rate": 0.0002, "epoch": 7.899461400359066, "step": 110000}, {"loss": 0.56, "grad_norm": 1.0404242277145386, "learning_rate": 0.0002, "epoch": 7.900179533213644, "step": 110010}, {"loss": 0.5582, "grad_norm": 1.015904426574707, "learning_rate": 0.0002, "epoch": 7.900897666068222, "step": 110020}, {"loss": 0.5395, "grad_norm": 1.1397117376327515, "learning_rate": 0.0002, "epoch": 7.9016157989228, "step": 110030}, {"loss": 0.6062, "grad_norm": 1.5121701955795288, "learning_rate": 0.0002, "epoch": 7.902333931777379, "step": 110040}, {"loss": 0.5359, "grad_norm": 1.1664289236068726, "learning_rate": 0.0002, "epoch": 7.903052064631957, "step": 110050}, {"loss": 0.5873, "grad_norm": 1.1808925867080688, "learning_rate": 0.0002, "epoch": 7.903770197486535, "step": 110060}, {"loss": 0.5807, "grad_norm": 0.997465968132019, "learning_rate": 0.0002, "epoch": 7.904488330341113, "step": 110070}, {"loss": 0.5785, "grad_norm": 1.164481520652771, "learning_rate": 0.0002, "epoch": 7.905206463195691, "step": 110080}, {"loss": 0.5534, "grad_norm": 1.3008257150650024, "learning_rate": 0.0002, "epoch": 7.905924596050269, "step": 110090}, {"loss": 0.5542, "grad_norm": 1.067894697189331, "learning_rate": 0.0002, "epoch": 7.906642728904847, "step": 110100}, {"loss": 0.5849, "grad_norm": 1.0160772800445557, "learning_rate": 0.0002, "epoch": 7.907360861759425, "step": 110110}, {"loss": 0.5493, "grad_norm": 1.0485782623291016, "learning_rate": 0.0002, "epoch": 7.908078994614003, "step": 110120}, {"loss": 0.632, "grad_norm": 1.2126682996749878, "learning_rate": 0.0002, "epoch": 7.908797127468581, "step": 110130}, {"loss": 0.5652, "grad_norm": 1.124619722366333, "learning_rate": 0.0002, "epoch": 7.909515260323159, "step": 110140}, {"loss": 0.5664, "grad_norm": 1.1250736713409424, "learning_rate": 0.0002, "epoch": 7.910233393177738, "step": 110150}, {"loss": 0.59, "grad_norm": 0.9558429718017578, "learning_rate": 0.0002, "epoch": 7.910951526032316, "step": 110160}, {"loss": 0.5726, "grad_norm": 1.1605639457702637, "learning_rate": 0.0002, "epoch": 7.911669658886894, "step": 110170}, {"loss": 0.6007, "grad_norm": 1.4227420091629028, "learning_rate": 0.0002, "epoch": 7.912387791741472, "step": 110180}, {"loss": 0.5475, "grad_norm": 1.1452029943466187, "learning_rate": 0.0002, "epoch": 7.91310592459605, "step": 110190}, {"loss": 0.6068, "grad_norm": 0.9975438714027405, "learning_rate": 0.0002, "epoch": 7.913824057450628, "step": 110200}, {"loss": 0.5602, "grad_norm": 1.0418251752853394, "learning_rate": 0.0002, "epoch": 7.914542190305206, "step": 110210}, {"loss": 0.6304, "grad_norm": 1.2578071355819702, "learning_rate": 0.0002, "epoch": 7.915260323159784, "step": 110220}, {"loss": 0.551, "grad_norm": 0.9857864379882812, "learning_rate": 0.0002, "epoch": 7.915978456014363, "step": 110230}, {"loss": 0.5608, "grad_norm": 1.2045122385025024, "learning_rate": 0.0002, "epoch": 7.916696588868941, "step": 110240}, {"loss": 0.555, "grad_norm": 1.0540096759796143, "learning_rate": 0.0002, "epoch": 7.917414721723519, "step": 110250}, {"loss": 0.6137, "grad_norm": 1.3578428030014038, "learning_rate": 0.0002, "epoch": 7.918132854578097, "step": 110260}, {"loss": 0.6152, "grad_norm": 1.1917411088943481, "learning_rate": 0.0002, "epoch": 7.918850987432675, "step": 110270}, {"loss": 0.5896, "grad_norm": 0.953195333480835, "learning_rate": 0.0002, "epoch": 7.919569120287253, "step": 110280}, {"loss": 0.5826, "grad_norm": 1.060767650604248, "learning_rate": 0.0002, "epoch": 7.920287253141831, "step": 110290}, {"loss": 0.593, "grad_norm": 1.0920186042785645, "learning_rate": 0.0002, "epoch": 7.921005385996409, "step": 110300}, {"loss": 0.6003, "grad_norm": 1.0263668298721313, "learning_rate": 0.0002, "epoch": 7.921723518850987, "step": 110310}, {"loss": 0.5302, "grad_norm": 1.0305999517440796, "learning_rate": 0.0002, "epoch": 7.922441651705565, "step": 110320}, {"loss": 0.5746, "grad_norm": 1.2554773092269897, "learning_rate": 0.0002, "epoch": 7.923159784560143, "step": 110330}, {"loss": 0.5372, "grad_norm": 1.1688004732131958, "learning_rate": 0.0002, "epoch": 7.923877917414722, "step": 110340}, {"loss": 0.5498, "grad_norm": 0.996721625328064, "learning_rate": 0.0002, "epoch": 7.9245960502693, "step": 110350}, {"loss": 0.5664, "grad_norm": 1.000508427619934, "learning_rate": 0.0002, "epoch": 7.925314183123878, "step": 110360}, {"loss": 0.5564, "grad_norm": 1.0895634889602661, "learning_rate": 0.0002, "epoch": 7.926032315978456, "step": 110370}, {"loss": 0.5983, "grad_norm": 0.9376350045204163, "learning_rate": 0.0002, "epoch": 7.926750448833034, "step": 110380}, {"loss": 0.5933, "grad_norm": 0.9476872086524963, "learning_rate": 0.0002, "epoch": 7.927468581687612, "step": 110390}, {"loss": 0.5511, "grad_norm": 1.142225742340088, "learning_rate": 0.0002, "epoch": 7.92818671454219, "step": 110400}, {"loss": 0.6069, "grad_norm": 1.2613552808761597, "learning_rate": 0.0002, "epoch": 7.928904847396768, "step": 110410}, {"loss": 0.5513, "grad_norm": 1.0425217151641846, "learning_rate": 0.0002, "epoch": 7.929622980251347, "step": 110420}, {"loss": 0.5709, "grad_norm": 1.1250224113464355, "learning_rate": 0.0002, "epoch": 7.930341113105925, "step": 110430}, {"loss": 0.5816, "grad_norm": 1.1487616300582886, "learning_rate": 0.0002, "epoch": 7.931059245960503, "step": 110440}, {"loss": 0.5986, "grad_norm": 1.009817123413086, "learning_rate": 0.0002, "epoch": 7.931777378815081, "step": 110450}, {"loss": 0.5414, "grad_norm": 1.0866706371307373, "learning_rate": 0.0002, "epoch": 7.932495511669659, "step": 110460}, {"loss": 0.5875, "grad_norm": 0.9821379780769348, "learning_rate": 0.0002, "epoch": 7.933213644524237, "step": 110470}, {"loss": 0.5282, "grad_norm": 1.042220115661621, "learning_rate": 0.0002, "epoch": 7.933931777378815, "step": 110480}, {"loss": 0.5712, "grad_norm": 1.018154263496399, "learning_rate": 0.0002, "epoch": 7.934649910233393, "step": 110490}, {"loss": 0.5508, "grad_norm": 1.0129317045211792, "learning_rate": 0.0002, "epoch": 7.935368043087971, "step": 110500}, {"loss": 0.5797, "grad_norm": 1.0918302536010742, "learning_rate": 0.0002, "epoch": 7.936086175942549, "step": 110510}, {"loss": 0.5849, "grad_norm": 1.3739500045776367, "learning_rate": 0.0002, "epoch": 7.936804308797128, "step": 110520}, {"loss": 0.5356, "grad_norm": 0.9313759803771973, "learning_rate": 0.0002, "epoch": 7.937522441651706, "step": 110530}, {"loss": 0.5892, "grad_norm": 1.0325546264648438, "learning_rate": 0.0002, "epoch": 7.938240574506284, "step": 110540}, {"loss": 0.5487, "grad_norm": 1.0858685970306396, "learning_rate": 0.0002, "epoch": 7.938958707360862, "step": 110550}, {"loss": 0.5843, "grad_norm": 0.9607970118522644, "learning_rate": 0.0002, "epoch": 7.93967684021544, "step": 110560}, {"loss": 0.5648, "grad_norm": 1.2014137506484985, "learning_rate": 0.0002, "epoch": 7.940394973070018, "step": 110570}, {"loss": 0.5827, "grad_norm": 1.0917125940322876, "learning_rate": 0.0002, "epoch": 7.941113105924596, "step": 110580}, {"loss": 0.5428, "grad_norm": 1.0328655242919922, "learning_rate": 0.0002, "epoch": 7.941831238779174, "step": 110590}, {"loss": 0.5481, "grad_norm": 0.9071711897850037, "learning_rate": 0.0002, "epoch": 7.942549371633753, "step": 110600}, {"loss": 0.5578, "grad_norm": 1.0363129377365112, "learning_rate": 0.0002, "epoch": 7.943267504488331, "step": 110610}, {"loss": 0.549, "grad_norm": 1.1908930540084839, "learning_rate": 0.0002, "epoch": 7.943985637342909, "step": 110620}, {"loss": 0.5677, "grad_norm": 1.1436357498168945, "learning_rate": 0.0002, "epoch": 7.944703770197487, "step": 110630}, {"loss": 0.5578, "grad_norm": 1.2671914100646973, "learning_rate": 0.0002, "epoch": 7.945421903052065, "step": 110640}, {"loss": 0.6233, "grad_norm": 1.0665358304977417, "learning_rate": 0.0002, "epoch": 7.946140035906643, "step": 110650}, {"loss": 0.602, "grad_norm": 1.065150499343872, "learning_rate": 0.0002, "epoch": 7.946858168761221, "step": 110660}, {"loss": 0.5545, "grad_norm": 1.3114454746246338, "learning_rate": 0.0002, "epoch": 7.947576301615799, "step": 110670}, {"loss": 0.5459, "grad_norm": 1.439401388168335, "learning_rate": 0.0002, "epoch": 7.948294434470377, "step": 110680}, {"loss": 0.6127, "grad_norm": 1.0176633596420288, "learning_rate": 0.0002, "epoch": 7.949012567324955, "step": 110690}, {"loss": 0.6169, "grad_norm": 1.2536396980285645, "learning_rate": 0.0002, "epoch": 7.949730700179533, "step": 110700}, {"loss": 0.5768, "grad_norm": 1.1297016143798828, "learning_rate": 0.0002, "epoch": 7.950448833034112, "step": 110710}, {"loss": 0.5725, "grad_norm": 0.9819521307945251, "learning_rate": 0.0002, "epoch": 7.95116696588869, "step": 110720}, {"loss": 0.5475, "grad_norm": 1.0327529907226562, "learning_rate": 0.0002, "epoch": 7.951885098743268, "step": 110730}, {"loss": 0.5547, "grad_norm": 1.003000259399414, "learning_rate": 0.0002, "epoch": 7.952603231597846, "step": 110740}, {"loss": 0.5807, "grad_norm": 0.9818766117095947, "learning_rate": 0.0002, "epoch": 7.953321364452424, "step": 110750}, {"loss": 0.546, "grad_norm": 1.1950650215148926, "learning_rate": 0.0002, "epoch": 7.954039497307002, "step": 110760}, {"loss": 0.5199, "grad_norm": 1.1700283288955688, "learning_rate": 0.0002, "epoch": 7.95475763016158, "step": 110770}, {"loss": 0.5975, "grad_norm": 0.8310879468917847, "learning_rate": 0.0002, "epoch": 7.955475763016158, "step": 110780}, {"loss": 0.5867, "grad_norm": 1.3428716659545898, "learning_rate": 0.0002, "epoch": 7.9561938958707366, "step": 110790}, {"loss": 0.5137, "grad_norm": 1.2581387758255005, "learning_rate": 0.0002, "epoch": 7.956912028725315, "step": 110800}, {"loss": 0.5565, "grad_norm": 1.0624088048934937, "learning_rate": 0.0002, "epoch": 7.957630161579893, "step": 110810}, {"loss": 0.5577, "grad_norm": 1.0604743957519531, "learning_rate": 0.0002, "epoch": 7.958348294434471, "step": 110820}, {"loss": 0.5931, "grad_norm": 1.3024394512176514, "learning_rate": 0.0002, "epoch": 7.959066427289049, "step": 110830}, {"loss": 0.5808, "grad_norm": 0.9976829886436462, "learning_rate": 0.0002, "epoch": 7.959784560143627, "step": 110840}, {"loss": 0.5629, "grad_norm": 1.2092949151992798, "learning_rate": 0.0002, "epoch": 7.960502692998205, "step": 110850}, {"loss": 0.5698, "grad_norm": 1.0752426385879517, "learning_rate": 0.0002, "epoch": 7.961220825852783, "step": 110860}, {"loss": 0.5789, "grad_norm": 0.9072325229644775, "learning_rate": 0.0002, "epoch": 7.961938958707361, "step": 110870}, {"loss": 0.5532, "grad_norm": 1.1252259016036987, "learning_rate": 0.0002, "epoch": 7.962657091561939, "step": 110880}, {"loss": 0.5733, "grad_norm": 1.002448558807373, "learning_rate": 0.0002, "epoch": 7.963375224416517, "step": 110890}, {"loss": 0.563, "grad_norm": 0.9354956150054932, "learning_rate": 0.0002, "epoch": 7.9640933572710955, "step": 110900}, {"loss": 0.5599, "grad_norm": 1.1560840606689453, "learning_rate": 0.0002, "epoch": 7.9648114901256735, "step": 110910}, {"loss": 0.5929, "grad_norm": 1.169173240661621, "learning_rate": 0.0002, "epoch": 7.9655296229802515, "step": 110920}, {"loss": 0.5824, "grad_norm": 1.169741153717041, "learning_rate": 0.0002, "epoch": 7.9662477558348295, "step": 110930}, {"loss": 0.5722, "grad_norm": 1.092739224433899, "learning_rate": 0.0002, "epoch": 7.9669658886894075, "step": 110940}, {"loss": 0.5266, "grad_norm": 0.901034414768219, "learning_rate": 0.0002, "epoch": 7.9676840215439855, "step": 110950}, {"loss": 0.603, "grad_norm": 1.1143959760665894, "learning_rate": 0.0002, "epoch": 7.9684021543985635, "step": 110960}, {"loss": 0.5886, "grad_norm": 1.1839512586593628, "learning_rate": 0.0002, "epoch": 7.9691202872531415, "step": 110970}, {"loss": 0.5671, "grad_norm": 0.9340457320213318, "learning_rate": 0.0002, "epoch": 7.96983842010772, "step": 110980}, {"loss": 0.5296, "grad_norm": 1.0368584394454956, "learning_rate": 0.0002, "epoch": 7.970556552962298, "step": 110990}, {"loss": 0.5723, "grad_norm": 1.0153379440307617, "learning_rate": 0.0002, "epoch": 7.971274685816876, "step": 111000}, {"loss": 0.5945, "grad_norm": 1.0815552473068237, "learning_rate": 0.0002, "epoch": 7.971992818671454, "step": 111010}, {"loss": 0.542, "grad_norm": 1.0502792596817017, "learning_rate": 0.0002, "epoch": 7.972710951526032, "step": 111020}, {"loss": 0.5841, "grad_norm": 1.3402234315872192, "learning_rate": 0.0002, "epoch": 7.97342908438061, "step": 111030}, {"loss": 0.5902, "grad_norm": 1.155196189880371, "learning_rate": 0.0002, "epoch": 7.974147217235188, "step": 111040}, {"loss": 0.5544, "grad_norm": 1.2841416597366333, "learning_rate": 0.0002, "epoch": 7.974865350089766, "step": 111050}, {"loss": 0.5572, "grad_norm": 1.1467466354370117, "learning_rate": 0.0002, "epoch": 7.975583482944344, "step": 111060}, {"loss": 0.6115, "grad_norm": 1.1308223009109497, "learning_rate": 0.0002, "epoch": 7.976301615798922, "step": 111070}, {"loss": 0.5639, "grad_norm": 1.0641266107559204, "learning_rate": 0.0002, "epoch": 7.977019748653501, "step": 111080}, {"loss": 0.5154, "grad_norm": 1.0808128118515015, "learning_rate": 0.0002, "epoch": 7.977737881508079, "step": 111090}, {"loss": 0.5546, "grad_norm": 1.2631522417068481, "learning_rate": 0.0002, "epoch": 7.978456014362657, "step": 111100}, {"loss": 0.6013, "grad_norm": 1.1176106929779053, "learning_rate": 0.0002, "epoch": 7.979174147217235, "step": 111110}, {"loss": 0.5364, "grad_norm": 1.183842658996582, "learning_rate": 0.0002, "epoch": 7.979892280071813, "step": 111120}, {"loss": 0.5809, "grad_norm": 0.9207148551940918, "learning_rate": 0.0002, "epoch": 7.980610412926391, "step": 111130}, {"loss": 0.6112, "grad_norm": 1.314513087272644, "learning_rate": 0.0002, "epoch": 7.981328545780969, "step": 111140}, {"loss": 0.6035, "grad_norm": 1.4508297443389893, "learning_rate": 0.0002, "epoch": 7.982046678635547, "step": 111150}, {"loss": 0.5971, "grad_norm": 1.1941379308700562, "learning_rate": 0.0002, "epoch": 7.982764811490125, "step": 111160}, {"loss": 0.5181, "grad_norm": 1.0326071977615356, "learning_rate": 0.0002, "epoch": 7.983482944344704, "step": 111170}, {"loss": 0.5534, "grad_norm": 1.1843258142471313, "learning_rate": 0.0002, "epoch": 7.984201077199282, "step": 111180}, {"loss": 0.5658, "grad_norm": 0.98868727684021, "learning_rate": 0.0002, "epoch": 7.98491921005386, "step": 111190}, {"loss": 0.5603, "grad_norm": 1.0722097158432007, "learning_rate": 0.0002, "epoch": 7.985637342908438, "step": 111200}, {"loss": 0.5485, "grad_norm": 1.254882574081421, "learning_rate": 0.0002, "epoch": 7.986355475763016, "step": 111210}, {"loss": 0.5192, "grad_norm": 1.1299649477005005, "learning_rate": 0.0002, "epoch": 7.987073608617594, "step": 111220}, {"loss": 0.5268, "grad_norm": 1.0343568325042725, "learning_rate": 0.0002, "epoch": 7.987791741472172, "step": 111230}, {"loss": 0.5803, "grad_norm": 1.173403024673462, "learning_rate": 0.0002, "epoch": 7.98850987432675, "step": 111240}, {"loss": 0.5788, "grad_norm": 1.2749351263046265, "learning_rate": 0.0002, "epoch": 7.989228007181328, "step": 111250}, {"loss": 0.5979, "grad_norm": 1.1579365730285645, "learning_rate": 0.0002, "epoch": 7.989946140035906, "step": 111260}, {"loss": 0.5371, "grad_norm": 1.2069926261901855, "learning_rate": 0.0002, "epoch": 7.990664272890485, "step": 111270}, {"loss": 0.5563, "grad_norm": 1.1962283849716187, "learning_rate": 0.0002, "epoch": 7.991382405745063, "step": 111280}, {"loss": 0.5858, "grad_norm": 0.9776540398597717, "learning_rate": 0.0002, "epoch": 7.992100538599641, "step": 111290}, {"loss": 0.5696, "grad_norm": 0.9829531311988831, "learning_rate": 0.0002, "epoch": 7.992818671454219, "step": 111300}, {"loss": 0.5805, "grad_norm": 1.3035449981689453, "learning_rate": 0.0002, "epoch": 7.993536804308797, "step": 111310}, {"loss": 0.5829, "grad_norm": 1.3423140048980713, "learning_rate": 0.0002, "epoch": 7.994254937163375, "step": 111320}, {"loss": 0.5904, "grad_norm": 1.1216566562652588, "learning_rate": 0.0002, "epoch": 7.994973070017953, "step": 111330}, {"loss": 0.6022, "grad_norm": 1.0143498182296753, "learning_rate": 0.0002, "epoch": 7.995691202872531, "step": 111340}, {"loss": 0.572, "grad_norm": 1.0691397190093994, "learning_rate": 0.0002, "epoch": 7.99640933572711, "step": 111350}, {"loss": 0.5412, "grad_norm": 1.3484272956848145, "learning_rate": 0.0002, "epoch": 7.997127468581688, "step": 111360}, {"loss": 0.5635, "grad_norm": 0.9939428567886353, "learning_rate": 0.0002, "epoch": 7.997845601436266, "step": 111370}, {"loss": 0.568, "grad_norm": 1.0009615421295166, "learning_rate": 0.0002, "epoch": 7.998563734290844, "step": 111380}, {"loss": 0.5718, "grad_norm": 0.986566424369812, "learning_rate": 0.0002, "epoch": 7.999281867145422, "step": 111390}, {"loss": 0.5632, "grad_norm": 0.9135745167732239, "learning_rate": 0.0002, "epoch": 8.0, "step": 111400}]}